From d4fd805dda7c60a2c09983a9cd5aa1b04d9477d1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Feb 2026 09:57:04 -0800 Subject: [PATCH 001/145] Rename folder dpctl to dpctl_ext --- .../tensor/libtensor/include/kernels/alignment.hpp | 0 .../tensor/libtensor/include/kernels/dpctl_tensor_types.hpp | 0 .../libtensor/include/kernels/elementwise_functions/common.hpp | 0 .../include/kernels/elementwise_functions/common_detail.hpp | 0 .../include/kernels/elementwise_functions/logaddexp.hpp | 0 .../libtensor/include/kernels/elementwise_functions/maximum.hpp | 0 .../libtensor/include/kernels/elementwise_functions/minimum.hpp | 0 .../include/kernels/elementwise_functions/sycl_complex.hpp | 0 .../include/kernels/elementwise_functions/vec_size_util.hpp | 0 .../tensor/libtensor/include/utils/indexing_utils.hpp | 0 .../tensor/libtensor/include/utils/math_utils.hpp | 0 .../tensor/libtensor/include/utils/memory_overlap.hpp | 0 .../tensor/libtensor/include/utils/offset_utils.hpp | 0 .../tensor/libtensor/include/utils/output_validation.hpp | 0 .../tensor/libtensor/include/utils/strided_iters.hpp | 0 .../tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 0 .../tensor/libtensor/include/utils/sycl_utils.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch.hpp | 0 .../tensor/libtensor/include/utils/type_dispatch_building.hpp | 0 .../tensor/libtensor/include/utils/type_utils.hpp | 0 dpnp/backend/extensions/blas/CMakeLists.txt | 2 +- dpnp/backend/extensions/fft/CMakeLists.txt | 2 +- dpnp/backend/extensions/indexing/CMakeLists.txt | 2 +- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 +- dpnp/backend/extensions/statistics/CMakeLists.txt | 2 +- dpnp/backend/extensions/ufunc/CMakeLists.txt | 2 +- dpnp/backend/extensions/vm/CMakeLists.txt | 2 +- dpnp/backend/extensions/window/CMakeLists.txt | 2 +- 28 files changed, 8 insertions(+), 8 deletions(-) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/alignment.hpp (100%) rename {dpctl => 
dpctl_ext}/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/indexing_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/math_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/memory_overlap.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/offset_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/output_validation.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/strided_iters.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_alloc_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/sycl_utils.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_dispatch_building.hpp (100%) rename {dpctl => dpctl_ext}/tensor/libtensor/include/utils/type_utils.hpp (100%) diff --git a/dpctl/tensor/libtensor/include/kernels/alignment.hpp b/dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/alignment.hpp rename to 
dpctl_ext/tensor/libtensor/include/kernels/alignment.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/common_detail.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp 
b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/sycl_complex.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp rename to dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/vec_size_util.hpp diff --git a/dpctl/tensor/libtensor/include/utils/indexing_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/indexing_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/indexing_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/math_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/math_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/memory_overlap.hpp rename to dpctl_ext/tensor/libtensor/include/utils/memory_overlap.hpp diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp 
b/dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/offset_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/offset_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/output_validation.hpp b/dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/output_validation.hpp rename to dpctl_ext/tensor/libtensor/include/utils/output_validation.hpp diff --git a/dpctl/tensor/libtensor/include/utils/strided_iters.hpp b/dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/strided_iters.hpp rename to dpctl_ext/tensor/libtensor/include/utils/strided_iters.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_alloc_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/sycl_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/sycl_utils.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch.hpp diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_dispatch_building.hpp diff 
--git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp similarity index 100% rename from dpctl/tensor/libtensor/include/utils/type_utils.hpp rename to dpctl_ext/tensor/libtensor/include/utils/type_utils.hpp diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt index 0015eda84843..cbc3e31d923b 100644 --- a/dpnp/backend/extensions/blas/CMakeLists.txt +++ b/dpnp/backend/extensions/blas/CMakeLists.txt @@ -68,7 +68,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt index 0569ecc8bca4..edc7bff7dce4 100644 --- a/dpnp/backend/extensions/fft/CMakeLists.txt +++ b/dpnp/backend/extensions/fft/CMakeLists.txt @@ -61,7 +61,7 @@ target_include_directories( ${python_module_name} PRIVATE ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt index c0de75ae3146..39f68ffba846 100644 --- a/dpnp/backend/extensions/indexing/CMakeLists.txt +++ b/dpnp/backend/extensions/indexing/CMakeLists.txt @@ -65,7 +65,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git 
a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 76b25c3a6d10..59499a3b28f8 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -86,7 +86,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt index e04279b75e49..8544e816e8d6 100644 --- a/dpnp/backend/extensions/statistics/CMakeLists.txt +++ b/dpnp/backend/extensions/statistics/CMakeLists.txt @@ -70,7 +70,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt index 55a750f8423f..293cef0ab326 100644 --- a/dpnp/backend/extensions/ufunc/CMakeLists.txt +++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt @@ -88,7 +88,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt index 32d6a6765a00..551c43842af2 100644 --- a/dpnp/backend/extensions/vm/CMakeLists.txt +++ 
b/dpnp/backend/extensions/vm/CMakeLists.txt @@ -110,7 +110,7 @@ target_include_directories( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt index 6fe04e334f42..01274317782d 100644 --- a/dpnp/backend/extensions/window/CMakeLists.txt +++ b/dpnp/backend/extensions/window/CMakeLists.txt @@ -66,7 +66,7 @@ target_include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/../../ ${CMAKE_CURRENT_SOURCE_DIR}/../common ${CMAKE_SOURCE_DIR}/dpnp/backend/include - ${CMAKE_SOURCE_DIR}/dpctl/tensor/libtensor/include + ${CMAKE_SOURCE_DIR}/dpctl_ext/tensor/libtensor/include ) target_include_directories(${python_module_name} PUBLIC ${Dpctl_INCLUDE_DIR}) From c040713d50cd10c628990b628cb74b0a5029f99b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:04:36 -0800 Subject: [PATCH 002/145] Add simplify_iteration_space implementation to libtensor --- .../source/simplify_iteration_space.cpp | 544 ++++++++++++++++++ .../source/simplify_iteration_space.hpp | 130 +++++ 2 files changed, 674 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp new file mode 100644 index 000000000000..2526f022e0ac --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ -0,0 +1,544 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "simplify_iteration_space.hpp" +#include "utils/strided_iters.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &simplified_shape, + std::vector &simplified_strides, + py::ssize_t &offset) +{ + using dpctl::tensor::strides::simplify_iteration_stride; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + + simplified_strides.reserve(nd); + simplified_strides.insert(std::end(simplified_strides), + std::begin(strides), std::end(strides)); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + int contracted_nd = simplify_iteration_stride( + nd, simplified_shape.data(), simplified_strides.data(), + offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + + simplified_strides.reserve(nd); + simplified_strides.push_back((strides[0] >= 0) ? 
strides[0] + : -strides[0]); + if ((strides[0] < 0) && (shape[0] > 1)) { + offset += (shape[0] - 1) * strides[0]; + } + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &src_strides, + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_two_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::begin(simplified_shape), shape, + shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + simplified_src_strides.insert(std::end(simplified_src_strides), + std::begin(src_strides), + std::end(src_strides)); + assert(simplified_src_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_two_strides( + nd, simplified_shape.data(), simplified_src_strides.data(), + simplified_dst_strides.data(), + src_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src_strides.reserve(nd); + 
simplified_dst_strides.reserve(nd); + + if (src_strides[0] < 0 && dst_strides[0] < 0) { + simplified_src_strides.push_back(-src_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src_offset += (shape[0] - 1) * src_strides[0]; + dst_offset += (shape[0] - 1) * dst_strides[0]; + } + } + else { + simplified_src_strides.push_back(src_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_3( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // dst + std::vector const &dst_strides, + // output + std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_three_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + 
assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_three_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); + simplified_src2_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (dst_strides[0] < 0)) { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void simplify_iteration_space_4( + int &nd, + const py::ssize_t *const &shape, + // src1 + std::vector const &src1_strides, + // src2 + std::vector const &src2_strides, + // src3 + std::vector const &src3_strides, + // dst + std::vector const &dst_strides, + // output + 
std::vector &simplified_shape, + std::vector &simplified_src1_strides, + std::vector &simplified_src2_strides, + std::vector &simplified_src3_strides, + std::vector &simplified_dst_strides, + py::ssize_t &src1_offset, + py::ssize_t &src2_offset, + py::ssize_t &src3_offset, + py::ssize_t &dst_offset) +{ + using dpctl::tensor::strides::simplify_iteration_four_strides; + if (nd > 1) { + // Simplify iteration space to reduce dimensionality + // and improve access pattern + simplified_shape.reserve(nd); + simplified_shape.insert(std::end(simplified_shape), shape, shape + nd); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src1_strides.insert(std::end(simplified_src1_strides), + std::begin(src1_strides), + std::end(src1_strides)); + assert(simplified_src1_strides.size() == static_cast(nd)); + + simplified_src2_strides.reserve(nd); + simplified_src2_strides.insert(std::end(simplified_src2_strides), + std::begin(src2_strides), + std::end(src2_strides)); + assert(simplified_src2_strides.size() == static_cast(nd)); + + simplified_src3_strides.reserve(nd); + simplified_src3_strides.insert(std::end(simplified_src3_strides), + std::begin(src3_strides), + std::end(src3_strides)); + assert(simplified_src3_strides.size() == static_cast(nd)); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.insert(std::end(simplified_dst_strides), + std::begin(dst_strides), + std::end(dst_strides)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + int contracted_nd = simplify_iteration_four_strides( + nd, simplified_shape.data(), simplified_src1_strides.data(), + simplified_src2_strides.data(), simplified_src3_strides.data(), + simplified_dst_strides.data(), + src1_offset, // modified by reference + src2_offset, // modified by reference + src3_offset, // modified by reference + dst_offset // modified by reference + ); + simplified_shape.resize(contracted_nd); + simplified_src1_strides.resize(contracted_nd); 
+ simplified_src2_strides.resize(contracted_nd); + simplified_src3_strides.resize(contracted_nd); + simplified_dst_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + src1_offset = 0; + src2_offset = 0; + src3_offset = 0; + dst_offset = 0; + // Populate vectors + simplified_shape.reserve(nd); + simplified_shape.push_back(shape[0]); + assert(simplified_shape.size() == static_cast(nd)); + + simplified_src1_strides.reserve(nd); + simplified_src2_strides.reserve(nd); + simplified_src3_strides.reserve(nd); + simplified_dst_strides.reserve(nd); + + if ((src1_strides[0] < 0) && (src2_strides[0] < 0) && + (src3_strides[0] < 0) && (dst_strides[0] < 0)) + { + simplified_src1_strides.push_back(-src1_strides[0]); + simplified_src2_strides.push_back(-src2_strides[0]); + simplified_src3_strides.push_back(-src3_strides[0]); + simplified_dst_strides.push_back(-dst_strides[0]); + if (shape[0] > 1) { + src1_offset += src1_strides[0] * (shape[0] - 1); + src2_offset += src2_strides[0] * (shape[0] - 1); + src3_offset += src3_strides[0] * (shape[0] - 1); + dst_offset += dst_strides[0] * (shape[0] - 1); + } + } + else { + simplified_src1_strides.push_back(src1_strides[0]); + simplified_src2_strides.push_back(src2_strides[0]); + simplified_src3_strides.push_back(src3_strides[0]); + simplified_dst_strides.push_back(dst_strides[0]); + } + + assert(simplified_src1_strides.size() == static_cast(nd)); + assert(simplified_src2_strides.size() == static_cast(nd)); + assert(simplified_src3_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + } +} + +void compact_iteration_space(int &nd, + const py::ssize_t *const &shape, + std::vector const &strides, + // output + std::vector &compact_shape, + std::vector &compact_strides) +{ + using dpctl::tensor::strides::compact_iteration; + if (nd > 1) { + // Compact iteration space to reduce dimensionality + // and improve access pattern + compact_shape.reserve(nd); + 
compact_shape.insert(std::begin(compact_shape), shape, shape + nd); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.insert(std::end(compact_strides), std::begin(strides), + std::end(strides)); + assert(compact_strides.size() == static_cast(nd)); + + int contracted_nd = + compact_iteration(nd, compact_shape.data(), compact_strides.data()); + compact_shape.resize(contracted_nd); + compact_strides.resize(contracted_nd); + + nd = contracted_nd; + } + else if (nd == 1) { + // Populate vectors + compact_shape.reserve(nd); + compact_shape.push_back(shape[0]); + assert(compact_shape.size() == static_cast(nd)); + + compact_strides.reserve(nd); + compact_strides.push_back(strides[0]); + assert(compact_strides.size() == static_cast(nd)); + } +} + +/* @brief Split shape/strides into dir1 (complementary to axis_start <= i < + * axis_end) and dir2 (along given set of axes) + */ +void split_iteration_space(const std::vector &shape_vec, + const std::vector &strides_vec, + int axis_start, + int axis_end, + std::vector &dir1_shape_vec, + std::vector &dir2_shape_vec, + std::vector &dir1_strides_vec, + std::vector &dir2_strides_vec) +{ + int nd = static_cast(shape_vec.size()); + int dir2_sz = axis_end - axis_start; + int dir1_sz = nd - dir2_sz; + + assert(dir1_sz > 0); + assert(dir2_sz > 0); + + dir1_shape_vec.resize(dir1_sz); + dir2_shape_vec.resize(dir2_sz); + + std::copy(shape_vec.begin(), shape_vec.begin() + axis_start, + dir1_shape_vec.begin()); + std::copy(shape_vec.begin() + axis_end, shape_vec.end(), + dir1_shape_vec.begin() + axis_start); + + std::copy(shape_vec.begin() + axis_start, shape_vec.begin() + axis_end, + dir2_shape_vec.begin()); + + dir1_strides_vec.resize(dir1_sz); + dir2_strides_vec.resize(dir2_sz); + + std::copy(strides_vec.begin(), strides_vec.begin() + axis_start, + dir1_strides_vec.begin()); + std::copy(strides_vec.begin() + axis_end, strides_vec.end(), + dir1_strides_vec.begin() + axis_start); + + 
std::copy(strides_vec.begin() + axis_start, strides_vec.begin() + axis_end, + dir2_strides_vec.begin()); + + return; +} + +py::ssize_t _ravel_multi_index_c(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(nd - 1 - i) * s; + s *= shape.at(nd - 1 - i); + } + + return flat_index; +} + +py::ssize_t _ravel_multi_index_f(std::vector const &mi, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + if (nd != mi.size()) { + throw py::value_error( + "Multi-index and shape vectors must have the same length."); + } + + py::ssize_t flat_index = 0; + py::ssize_t s = 1; + for (std::size_t i = 0; i < nd; ++i) { + flat_index += mi.at(i) * s; + s *= shape.at(i); + } + + return flat_index; +} + +std::vector _unravel_index_c(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[nd - 1 - dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[nd - 1 - dim] = r; + i_ = q; + } + if (nd) { + mi[0] = i_; + } + return mi; +} + +std::vector _unravel_index_f(py::ssize_t flat_index, + std::vector const &shape) +{ + std::size_t nd = shape.size(); + std::vector mi; + mi.resize(nd); + + py::ssize_t i_ = flat_index; + for (std::size_t dim = 0; dim + 1 < nd; ++dim) { + const py::ssize_t si = shape[dim]; + const py::ssize_t q = i_ / si; + const py::ssize_t r = (i_ - q * si); + mi[dim] = r; + i_ = q; + } + if (nd) { + mi[nd - 1] = i_; + } + return mi; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp 
b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp new file mode 100644 index 000000000000..d3448ee1f5fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -0,0 +1,130 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace py = pybind11; + +void simplify_iteration_space_1(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector &, + std::vector &, + py::ssize_t &); + +void simplify_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + std::vector const &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_3(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void simplify_iteration_space_4(int &, + const py::ssize_t *const &, + // src1 + std::vector const &, + // src2 + std::vector const &, + // src3 + std::vector const &, + // dst + std::vector const &, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &, + std::vector &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &, + py::ssize_t &); + +void compact_iteration_space(int &, + const py::ssize_t *const &, + std::vector const &, + // output + std::vector &, + std::vector &); + +void split_iteration_space(const std::vector &, + const std::vector &, + int, + int, + // output + std::vector &, + std::vector &, + std::vector &, + std::vector &); + +py::ssize_t _ravel_multi_index_c(std::vector const &, + std::vector const &); +py::ssize_t _ravel_multi_index_f(std::vector const &, + std::vector const &); +std::vector _unravel_index_c(py::ssize_t, + 
std::vector const &); +std::vector _unravel_index_f(py::ssize_t, + std::vector const &); +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 14b466facfe6b23f92113ccc2dbb224e2727bf3c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:14:43 -0800 Subject: [PATCH 003/145] Extend codespell ignore list for libtensor --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cdf592535d11..67fb75cb5f54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT" quiet-level = 3 [tool.coverage.report] From dcc421bc61c36549d3e6865927f495abab15d078 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:15:09 -0800 Subject: [PATCH 004/145] Add copy_and_cast kernels to libtensor --- .../include/kernels/copy_and_cast.hpp | 1288 +++++++++++++++++ .../include/kernels/copy_as_contiguous.hpp | 655 +++++++++ .../libtensor/source/copy_as_contig.cpp | 758 ++++++++++ .../libtensor/source/copy_as_contig.hpp | 61 + 4 files changed, 2762 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp new file mode 100644 index 000000000000..a07d311a7fcb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp 
@@ -0,0 +1,1288 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_and_cast +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class copy_cast_generic_kernel; + +template +class copy_cast_contig_kernel; + +template +class copy_cast_from_host_kernel; + +template +class copy_cast_from_host_contig_kernel; + +template +class Caster +{ +public: + Caster() = default; + dstTy operator()(const srcTy &src) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(src); + } +}; + +template +class GenericCopyFunctor +{ +private: + const srcT *src_ = nullptr; + dstT *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFunctor(const srcT *src_p, dstT *dst_p, const IndexerT &indexer) + : src_(src_p), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + static constexpr CastFnT fn{}; + dst_[dst_offset] = fn(src_[src_offset]); + } +}; + +/*! + @defgroup CopyAndCastKernels + */ + +/*! + * @brief Function pointer type for generic array cast and copying function. 
+ */ +typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Generic function to copy `nelems` elements from `src` usm_ndarray to + `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have array dimensionality specified via argument `nd`. The + `shape_and_strides` is kernel accessible USM array of length `3*nd`, where the + first `nd` elements encode common shape, second `nd` elements contain strides + of `src` array, and the trailing `nd` elements contain strides of `dst` array. + `src_p` and `dst_p` represent pointers into respective arrays, but the start of + iteration begins at offset of `src_offset` elements for `src` array and at + offset `dst_offset` elements for `dst` array. Kernel is submitted to sycl queue + `q` with events `depends` and `additional_depends` as dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param nd Array dimensionality, i.e. number of indices needed to + identify an element of each array. + @param shape_and_strides Kernel accessible USM pointer to packed shape and + strides. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of + elements of source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of + elements of destination array from `dst_p`. + @param depends List of events to wait for before starting computations, if + any. + @param additional_depends Additional list of events to wait for before + starting computations, if any. + + @return Event to wait on to ensure that computation completes. 
+ @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_generic_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + const TwoOffsets_StridedIndexer indexer{nd, src_offset, dst_offset, + shape_and_strides}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFunctor, + TwoOffsets_StridedIndexer>(src_tp, dst_tp, + indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get generic function pointer of type `fnT` for given source + * data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastGenericFactory +{ + fnT get() + { + fnT f = copy_and_cast_generic_impl; + return f; + } +}; + +// Specialization of copy_and_cast for contiguous arrays + +template +class ContigCopyFunctor +{ +private: + std::size_t nelems; + const srcT *src_p = nullptr; + dstT *dst_p = nullptr; + +public: + ContigCopyFunctor(const std::size_t nelems_, + const srcT *src_p_, + dstT *dst_p_) + : nelems(nelems_), src_p(src_p_), dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr CastFnT fn{}; + + static constexpr std::uint8_t elems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex_v; + if constexpr (!enable_sg_loadstore || is_complex_v || + is_complex_v) { + std::uint16_t sgSize = ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * elems_per_sg + (gid % sgSize) + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = fn(src_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + elems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t offset = base + it * sgSize; + auto src_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&src_p[offset]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[offset]); + 
+ const sycl::vec src_vec = + sub_group_load(sg, src_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; k++) { + dst_vec[k] = fn(src_vec[k]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t start = base + sg.get_local_id()[0]; + for (std::size_t k = start; k < nelems; k += sgSize) { + dst_p[k] = fn(src_p[k]); + } + } + } + } +}; + +/*! + * @brief Function pointer type for contiguous array cast and copy function. + */ +typedef sycl::event (*copy_and_cast_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const std::vector &); + +/*! + * @brief Function to copy `nelems` elements from contiguous `src` usm_ndarray + to contiguous `dst` usm_ndarray while casting from `srcTy` to `dstTy`. + + Both arrays have the same number of elements `nelems`. + `src_cp` and `dst_cp` represent char pointers to the start of respective + arrays. Kernel is submitted to sycl queue `q` with events `depends` as + dependencies. + + @param q Sycl queue to which the kernel is submitted. + @param nelems Number of elements to cast and copy. + @param src_p Kernel accessible USM pointer for the source array + @param dst_p Kernel accessible USM pointer for the destination array + @param depends List of events to wait for before starting computations, if + any. + + @return Event to wait on to ensure that computation completes. 
+ @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *src_cp, + char *dst_cp, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const srcTy *src_tp = reinterpret_cast(src_cp); + dstTy *dst_tp = reinterpret_cast(dst_cp); + + std::size_t lws = 64; + static constexpr std::uint32_t vec_sz = 4; + static constexpr std::uint32_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(src_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = + copy_cast_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, enable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + copy_cast_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + ContigCopyFunctor, vec_sz, + n_vecs, disable_sg_loadstore>(nelems, src_tp, + dst_tp)); + } + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get specialized function pointer for casting and copying + * contiguous arrays. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCastContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_contig_impl; + return f; + } +}; + +// Specialization of copy_and_cast for 1D arrays + +/*! + * @brief Factory to get function pointer for casting and copying 1D arrays. 
+ * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Factory to get function pointer for casting and copying 2D arrays. + * @ingroup CopyAndCastKernels + */ +typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const std::array &, + const std::array &, + const std::array &, + const char *, + ssize_t, + char *, + ssize_t, + const std::vector &); + +/*! + * @brief Specialized for given array dimension function to copy `nelems` + elements from `src` usm_ndarray to `dst` usm_ndarray while casting from `srcTy` + to `dstTy`. + + Both arrays have array dimensionality known at compile time and specified in + template parameters `nd`. Arrays' shape and strides are provided as + `std::array`. `src_p` and `dst_p` represent pointers into respective arrays, + but the start of iteration begins at offset of `src_offset` elements for `src` + array and at offset `dst_offset` elements for `dst` array. Kernel is submitted + to sycl queue `q` with events `depends` as dependencies. + + @param q The queue where the routine should be executed. + @param nelems Number of elements to cast and copy. + @param shape Common shape of the arrays. + @param src_strides Strides of the source array. + @param dst_strides Strides of the destination array. + @param src_p Kernel accessible USM pointer for the source array + @param src_offset Offset to the beginning of iteration in number of elements + of the source array from `src_p`. + @param dst_p Kernel accessible USM pointer for the destination array + @param dst_offset Offset to the beginning of iteration in number of elements + of the destination array from `src_p`. + @param depends List of events to wait for before starting computations, if + any. 
+ + @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_and_cast_nd_specialized_impl( + sycl::queue &q, + std::size_t nelems, + const std::array &shape, + const std::array &src_strides, + const std::array &dst_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_and_cast_ev = q.submit([&](sycl::handler &cgh) { + using IndexerT = TwoOffsets_FixedDimStridedIndexer; + const IndexerT indexer{shape, src_strides, dst_strides, src_offset, + dst_offset}; + const srcTy *src_tp = reinterpret_cast(src_p); + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.depends_on(depends); + cgh.parallel_for< + class copy_cast_generic_kernel>( + sycl::range<1>(nelems), + GenericCopyFunctor, IndexerT>( + src_tp, dst_tp, indexer)); + }); + + return copy_and_cast_ev; +} + +/*! + * @brief Factory to get 1D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast1DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +/*! + * @brief Factory to get 2D-specialized function pointer of type `fnT` for given + * source data type `S` and destination data type `D`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyAndCast2DFactory +{ + fnT get() + { + fnT f = copy_and_cast_nd_specialized_impl; + return f; + } +}; + +// ====================== Copying from host to USM + +template +class GenericCopyFromHostFunctor +{ +private: + AccessorT src_acc_; + dstTy *dst_ = nullptr; + IndexerT indexer_; + +public: + GenericCopyFromHostFunctor(const AccessorT &src_acc, + dstTy *dst_p, + const IndexerT &indexer) + : src_acc_(src_acc), dst_(dst_p), indexer_(indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); + + CastFnT fn{}; + dst_[dst_offset] = fn(src_acc_[src_offset]); + } +}; + +typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + ssize_t, + ssize_t, + ssize_t, + char *, + ssize_t, + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `srcTy`. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Arrays' metadata are given in packed USM vector of length `3*nd` whose first + * `nd` elements contain arrays' shape, next `nd` elements specify source + * strides in elements (not bytes), and trailing `nd` elements specify + * destination array strides. Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. 
+ * @param nd The dimensionality of arrays + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides. + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param src_min_nelem_offset Smallest value of offset relative to + * `host_src_p` in number of elements attained while iterating over elements of + * the source array. + * @param src_max_nelem_offset Largest value of offset relative to `host_src_p` + * in number of elements attained while iterating over elements of the source + * array. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. 
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *host_src_p, + ssize_t src_offset, + ssize_t src_min_nelem_offset, + ssize_t src_max_nelem_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_min_nelem_offset, + sycl::range<1>(nelems_range), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + const TwoOffsets_StridedIndexer indexer{ + nd, src_offset - src_min_nelem_offset, dst_offset, + const_cast(shape_and_strides)}; + + dstTy *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, + TwoOffsets_StridedIndexer>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. 
+ * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_impl; + return f; + } +}; + +typedef void (*copy_and_cast_from_host_contig_blocking_fn_ptr_t)( + sycl::queue &, + std::size_t, /* nelems */ + const char *, /* src_pointer */ + ssize_t, /* src_offset */ + char *, /* dst_pointer */ + ssize_t, /* dst_offset */ + const std::vector &); + +/*! + * @brief Function to copy from NumPy's ndarray with elements of type `srcTy` + * into usm_ndarray with elements of type `dstTy` for contiguous arrays. + * + * Function to cast and copy elements from numpy.ndarray specified by typeless + * `host_src_p` and the `src_offset` given in the number of array elements. + * Kernel dependencies are given by two vectors of + * events: `depends` and `additional_depends`. The function execution is + * complete at the return. + * + * @param q The queue where the routine should be executed. + * @param nelems Number of elements to cast and copy. + * @param src_stride The stride of source array in elements + * @param dst_stride The stride of destination array in elements + * @param host_src_p Host (not USM allocated) pointer associated with the + * source array. + * @param src_offset Offset to the beginning of iteration in number of elements + * of the source array from `host_src_p`. + * @param dst_p USM pointer associated with the destination array. + * @param dst_offset Offset to the beginning of iteration in number of elements + * of the destination array from `dst_p`. + * @param depends List of events to wait for before starting computations, if + * any.
+ * + * @ingroup CopyAndCastKernels + */ +template +void copy_and_cast_from_host_contig_impl( + sycl::queue &q, + std::size_t nelems, + const char *host_src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::buffer npy_buf( + reinterpret_cast(host_src_p) + src_offset, + sycl::range<1>(nelems), {sycl::property::buffer::use_host_ptr{}}); + + sycl::event copy_and_cast_from_host_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::accessor npy_acc(npy_buf, cgh, sycl::read_only); + + using IndexerT = TwoOffsets_CombinedIndexer; + static constexpr NoOpIndexer src_indexer{}; + static constexpr NoOpIndexer dst_indexer{}; + static constexpr TwoOffsets_CombinedIndexer indexer{src_indexer, + dst_indexer}; + + dstTy *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for< + copy_cast_from_host_contig_kernel>( + sycl::range<1>(nelems), + GenericCopyFromHostFunctor, IndexerT>( + npy_acc, dst_tp, indexer)); + }); + + // perform explicit synchronization. Implicit synchronization would be + // performed by sycl::buffer destructor. + copy_and_cast_from_host_ev.wait(); + + return; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given NumPy array + * source data type `S` and destination data type `D`. 
+ * @defgroup CopyAndCastKernels + */ +template +struct CopyAndCastFromHostContigFactory +{ + fnT get() + { + fnT f = copy_and_cast_from_host_contig_impl; + return f; + } +}; + +// =============== Copying for reshape ================== // + +template +class copy_for_reshape_generic_kernel; + +template +class GenericCopyForReshapeFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + GenericCopyForReshapeFunctor(const char *src_ptr, + char *dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(reinterpret_cast(src_ptr)), + dst_p(reinterpret_cast(dst_ptr)), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const ssize_t src_offset = src_indexer_(wiid.get(0)); + const ssize_t dst_offset = dst_indexer_(wiid.get(0)); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_reshape_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // src_nd + int, // dst_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to copy content of array while reshaping. + * + * Submits a kernel to perform a copy `dst[unravel_index(i, + * dst.shape)] = src[unravel_index(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param nelems The number of elements to copy + * @param src_nd Array dimension of the source array + * @param dst_nd Array dimension of the destination array + * @param packed_shapes_and_strides Kernel accessible USM array of size + * `2*src_nd + 2*dst_nd` with content `[src_shape, src_strides, dst_shape, + * dst_strides]`.
+ * @param src_p Typeless USM pointer to the buffer of the source array + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event + copy_for_reshape_generic_impl(sycl::queue &q, + std::size_t nelems, + int src_nd, + int dst_nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_reshape_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 2*(src_nd + dst_nd) + // [ src_shape; src_strides; dst_shape; dst_strides ] + + const ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); + + const ssize_t *dst_shape_and_strides = const_cast( + packed_shapes_and_strides + (2 * src_nd)); + + const StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; + const StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; + + using KernelName = + copy_for_reshape_generic_kernel; + + cgh.parallel_for( + sycl::range<1>(nelems), + GenericCopyForReshapeFunctor( + src_p, dst_p, src_indexer, dst_indexer)); + }); + + return copy_for_reshape_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForReshapeGenericFactory +{ + fnT get() + { + fnT f = copy_for_reshape_generic_impl; + return f; + } +}; + +// ================== Copying for roll ================== // + +/*! 
@brief Functor to cyclically roll global_id to the left */ +struct LeftRolled1DTransformer +{ + LeftRolled1DTransformer(std::size_t offset, std::size_t size) + : offset_(offset), size_(size) + { + } + + std::size_t operator()(std::size_t gid) const + { + const std::size_t shifted_gid = + ((gid < offset_) ? gid + size_ - offset_ : gid - offset_); + return shifted_gid; + } + +private: + std::size_t offset_ = 0; + std::size_t size_ = 1; +}; + +/*! @brief Indexer functor to compose indexer and transformer */ +template +struct CompositionIndexer +{ + CompositionIndexer(IndexerT f, TransformerT t) : f_(f), t_(t) {} + + auto operator()(std::size_t gid) const + { + return f_(t_(gid)); + } + +private: + IndexerT f_; + TransformerT t_; +}; + +/*! @brief Indexer functor to find offset for nd-shifted indices lifted from + * iteration id */ +struct RolledNDIndexer +{ + RolledNDIndexer(int nd, + const ssize_t *shape, + const ssize_t *strides, + const ssize_t *ndshifts, + ssize_t starting_offset) + : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), + starting_offset_(starting_offset) + { + } + + ssize_t operator()(std::size_t gid) const + { + return compute_offset(gid); + } + +private: + int nd_ = -1; + const ssize_t *shape_ = nullptr; + const ssize_t *strides_ = nullptr; + const ssize_t *ndshifts_ = nullptr; + ssize_t starting_offset_ = 0; + + ssize_t compute_offset(ssize_t gid) const + { + using dpctl::tensor::strides::CIndexer_vector; + + CIndexer_vector _ind(nd_); + ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( + gid, + shape_, // shape ptr + strides_, // strides ptr + ndshifts_, // shifts ptr + relative_offset_); + return starting_offset_ + relative_offset_; + } +}; + +template +class copy_for_roll_strided_kernel; + +template +class StridedCopyForRollFunctor +{ +private: + const Ty *src_p = nullptr; + Ty *dst_p = nullptr; + SrcIndexerT src_indexer_; + DstIndexerT dst_indexer_; + +public: + StridedCopyForRollFunctor(const Ty *src_ptr, + Ty 
*dst_ptr, + const SrcIndexerT &src_indexer, + const DstIndexerT &dst_indexer) + : src_p(src_ptr), dst_p(dst_ptr), src_indexer_(src_indexer), + dst_indexer_(dst_indexer) + { + } + + void operator()(sycl::id<1> wiid) const + { + const std::size_t gid = wiid.get(0); + + const ssize_t src_offset = src_indexer_(gid); + const ssize_t dst_offset = dst_indexer_(gid); + + dst_p[dst_offset] = src_p[src_offset]; + } +}; + +// define function type +typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. + * @param nelems The number of elements to copy + * @param nd Array dimensionality of the destination and source arrays + * @param packed_shapes_and_strides Kernel accessible USM array + * of size `3*nd` with content `[common_shape, src_strides, dst_strides]`. + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of first element of src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of first element of dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_strided_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides: + // USM array of size 3 * nd + // [ common_shape; src_strides; dst_strides ] + + const StridedIndexer src_indexer{nd, src_offset, + packed_shapes_and_strides}; + const LeftRolled1DTransformer left_roll_transformer{shift, nelems}; + + using CompositeIndexerT = + CompositionIndexer; + + const CompositeIndexerT rolled_src_indexer(src_indexer, + left_roll_transformer); + + UnpackedStridedIndexer dst_indexer{nd, dst_offset, + packed_shapes_and_strides, + packed_shapes_and_strides + 2 * nd}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, rolled_src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +// define function type +typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( + sycl::queue &, + std::size_t, // shift + std::size_t, // num_elements + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +class copy_for_roll_contig_kernel; + +/*! + * @brief Function to copy content of array with a shift. + * + * Submits a kernel to perform a copy `dst[unravel_index((i + shift) % nelems , + * dst.shape)] = src[unravel_undex(i, src.shape)]`. + * + * @param q The execution queue where kernel is submitted. + * @param shift The shift in flat indexing, must be non-negative. 
+ * @param nelems The number of elements to copy + * @param src_p Typeless USM pointer to the buffer of the source array + * @param src_offset Displacement of the start of array src relative src_p in + * elements + * @param dst_p Typeless USM pointer to the buffer of the destination array + * @param dst_offset Displacement of the start of array dst relative dst_p in + * elements + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @ingroup CopyAndCastKernels + */ +template +sycl::event copy_for_roll_contig_impl(sycl::queue &q, + std::size_t shift, + std::size_t nelems, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + static constexpr NoOpIndexer src_indexer{}; + const LeftRolled1DTransformer roller{shift, nelems}; + + const CompositionIndexer + left_rolled_src_indexer{src_indexer, roller}; + static constexpr NoOpIndexer dst_indexer{}; + + using KernelName = copy_for_roll_contig_kernel; + + const Ty *src_tp = reinterpret_cast(src_p) + src_offset; + Ty *dst_tp = reinterpret_cast(dst_p) + dst_offset; + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor< + Ty, CompositionIndexer, + NoOpIndexer>(src_tp, dst_tp, left_rolled_src_indexer, + dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollStridedFactory +{ + fnT get() + { + fnT f = copy_for_roll_strided_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. 
+ * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollContigFactory +{ + fnT get() + { + fnT f = copy_for_roll_contig_impl; + return f; + } +}; + +template +class copy_for_roll_ndshift_strided_kernel; + +// define function type +typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset + const std::vector &); + +template +sycl::event copy_for_roll_ndshift_strided_impl( + sycl::queue &q, + std::size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides_and_shifts, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event copy_for_roll_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + // packed_shapes_and_strides_and_shifts: + // USM array of size 4 * nd + // [ common_shape; src_strides; dst_strides; shifts ] + + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = + packed_shapes_and_strides_and_shifts + nd; + const ssize_t *dst_strides_ptr = + packed_shapes_and_strides_and_shifts + 2 * nd; + const ssize_t *shifts_ptr = + packed_shapes_and_strides_and_shifts + 3 * nd; + + const RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, + shifts_ptr, src_offset}; + + const UnpackedStridedIndexer dst_indexer{nd, dst_offset, shape_ptr, + dst_strides_ptr}; + + using KernelName = copy_for_roll_strided_kernel; + + const Ty *src_tp = reinterpret_cast(src_p); + Ty *dst_tp = reinterpret_cast(dst_p); + + cgh.parallel_for( + sycl::range<1>(nelems), + StridedCopyForRollFunctor( + src_tp, dst_tp, src_indexer, dst_indexer)); + }); + + return copy_for_roll_ev; +} + +/*! 
+ * @brief Factory to get function pointer of type `fnT` for given array data + * type `Ty`. + * @ingroup CopyAndCastKernels + */ +template +struct CopyForRollNDShiftFactory +{ + fnT get() + { + fnT f = copy_for_roll_ndshift_strided_impl; + return f; + } +}; + +} // namespace copy_and_cast +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp new file mode 100644 index 000000000000..b4f367448758 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -0,0 +1,655 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor copying and value casting. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copy_as_contig +{ + +using dpctl::tensor::ssize_t; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class CopyAsCContigFunctor +{ +private: + std::size_t nelems; + const T *src_p = nullptr; + T *dst_p = nullptr; + IndexerT src_indexer; + +public: + CopyAsCContigFunctor(std::size_t n, + const T *src_, + T *dst_, + const IndexerT &src_indexer_) + : nelems(n), src_p(src_), dst_p(dst_), src_indexer(src_indexer_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::uint8_t elems_per_wi = vec_sz * n_vecs; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_max_local_range()[0]; + 
const std::size_t gid = ndit.get_global_linear_id(); + + // start = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) + // gid % sgSize == gid - (gid / sgSize) * sgSize + const std::uint16_t elems_per_sg = sgSize * elems_per_wi; + const std::size_t start = + (gid / sgSize) * (elems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + elems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + auto src_offset = src_indexer(offset); + dst_p[offset] = src_p[src_offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + const std::size_t base = + elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + const std::uint16_t elems_per_sg = elems_per_wi * sgSize; + + if (base + elems_per_sg < nelems) { +#pragma unroll + for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) { + // it == vec_id * vec_sz, for 0 <= vec_id < n_vecs + const std::size_t block_start_id = base + it * sgSize; + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[block_start_id]); + + const std::size_t elem_id0 = + block_start_id + sg.get_local_id(); + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + const std::size_t elem_id = elem_id0 + k * sgSize; + const ssize_t src_offset = src_indexer(elem_id); + dst_vec[k] = src_p[src_offset]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + const std::size_t k0 = base + lane_id; + for (std::size_t k = k0; k < nelems; k += sgSize) { + const ssize_t src_offset = src_indexer(k); + dst_p[k] = src_p[src_offset]; + } + } + } + } +}; + +template +sycl::event submit_c_contiguous_copy(sycl::queue &exec_q, + std::size_t nelems, + const T *src, + T *dst, + const IndexerT &src_indexer, + const std::vector &depends) +{ + 
static_assert(vec_sz > 0); + static_assert(n_vecs > 0); + + static constexpr std::size_t preferred_lws = 256; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + const std::size_t lws = + ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size; + + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + const std::size_t nelems_per_group = nelems_per_wi * lws; + const std::size_t n_groups = + (nelems + nelems_per_group - 1) / (nelems_per_group); + + sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.use_kernel_bundle(kb); + + const sycl::range<1> gRange{n_groups * lws}; + const sycl::range<1> lRange{lws}; + + cgh.parallel_for( + sycl::nd_range<1>(gRange, lRange), + CopyAsCContigFunctor( + nelems, src, dst, src_indexer)); + }); + return copy_ev; +} + +template +class as_contig_krn; + +template +sycl::event + as_c_contiguous_array_generic_impl(sycl::queue &exec_q, + std::size_t nelems, + int nd, + const ssize_t *shape_and_strides, + const char *src_p, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + using IndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides); + + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + + using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; + using dpctl::tensor::kernels::alignment_utils::is_aligned; + using dpctl::tensor::kernels::alignment_utils::required_alignment; + + 
sycl::event copy_ev; + if (is_aligned(dst_p)) { + static constexpr bool enable_sg_load = true; + using KernelName = + as_contig_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + else { + static constexpr bool disable_sg_load = false; + using InnerKernelName = + as_contig_krn; + using KernelName = disabled_sg_loadstore_wrapper_krn; + copy_ev = submit_c_contiguous_copy( + exec_q, nelems, src_tp, dst_tp, src_indexer, depends); + } + + return copy_ev; +} + +typedef sycl::event (*as_c_contiguous_array_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const ssize_t *, + const char *, + char *, + const std::vector &); + +template +struct AsCContigFactory +{ + fnT get() + { + return as_c_contiguous_array_generic_impl; + } +}; + +template +class as_contig_batch_of_square_matrices_krn; + +namespace detail +{ +/*! @brief batch of matrices (n, n), source strides (1, src_ld), destination + strides (dst_ld, 1) src and destination arrays must be disjoint memory blocks + to avoid race condition + */ +template +sycl::event as_c_contiguous_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + const BatchIndexerT &batch_two_offsets_indexer, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const T *src_tp = reinterpret_cast(src_p); + T *dst_tp = reinterpret_cast(dst_p); + + static constexpr std::uint16_t private_tile_size = 4; + static constexpr std::uint16_t n_lines = 2; + static constexpr std::uint16_t block_size = + n_lines * private_tile_size * private_tile_size; + + static constexpr std::uint16_t lws0 = block_size; + static constexpr std::uint16_t lws1 = n_lines; + static constexpr std::uint16_t nelems_per_wi = (block_size / lws1); + + static_assert(nelems_per_wi * lws1 == block_size); + static_assert(nelems_per_wi == private_tile_size * 
private_tile_size); + + static constexpr std::uint32_t lws = lws0 * lws1; + + const std::size_t n_tiles = (n + block_size - 1) / block_size; + + const ssize_t src_stride = src_ld; + const ssize_t dst_stride = dst_ld; + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{batch_nelems * n_tiles * n_tiles * lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + using KernelName = + as_contig_batch_of_square_matrices_krn; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::local_accessor local_block(block_size * block_size, cgh); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> nd_it) { + // 1. Read block from source array into SLM + const std::uint32_t lid_lin = nd_it.get_local_linear_id(); + const std::size_t gr_id_lin = nd_it.get_group_linear_id(); + + const std::size_t batch_id = gr_id_lin / (n_tiles * n_tiles); + const std::size_t rem = gr_id_lin - batch_id * (n_tiles * n_tiles); + + const auto &batch_two_offsets = batch_two_offsets_indexer(batch_id); + const auto &src_batch_offset = batch_two_offsets.get_first_offset(); + const auto &dst_batch_offset = + batch_two_offsets.get_second_offset(); + + // Block id + /* 0 <= src_gr_i1 < n_groups_n1 */ + const std::size_t src_tile_i1 = rem / n_tiles; + /* 0 <= src_gr_i0 < n_groups_n0 */ + const std::size_t src_tile_i0 = rem - src_tile_i1 * n_tiles; + + // ID of element within the block + /* 0 <= src_i1 < lws1 */ + const std::uint32_t src_i1 = lid_lin / lws0; + /* 0 <= src_i0 < lws0 */ + const std::uint32_t src_i0 = lid_lin - src_i1 * lws0; + + // Matrix element ID + const std::size_t src_tile_start0 = src_tile_i0 * block_size; + const std::size_t src_tile_start1 = src_tile_i1 * block_size; + const std::size_t src_gid0 = (src_tile_start0 + src_i0); + const std::size_t src_gid1 = (src_tile_start1 + src_i1); + + // src_offset = src_gid0 * 1 + (src_gid1 + pr_id * lws1) * + // src_stride + const std::size_t src_offset0 = + src_batch_offset + src_gid0 * 1 + src_gid1 * 
src_stride; + const std::size_t pr_step_src = lws1 * src_stride; + + const std::uint32_t local_offset0 = src_i0 + src_i1 * block_size; + const std::uint32_t pr_step_local = lws1 * block_size; + + for (std::uint32_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + local_block[local_offset0 + pr_step_local * pr_id] = + (src_gid0 < n && src_gid1 + pr_id * lws1 < n) + ? src_tp[src_offset0 + pr_step_src * pr_id] + : T(0); + } + + const std::uint32_t local_dim0 = static_cast( + std::min(src_tile_start0 + block_size, n) - + src_tile_start0); + const std::uint32_t local_dim1 = static_cast( + std::min(src_tile_start1 + block_size, n) - + src_tile_start1); + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 2. Permute the block matrix in SLM using two private arrays + std::array private_block_01 = {T(0)}; + std::array private_block_10 = {T(0)}; + + // 0 <= lid_lin < lws0 * lws1 == + // (block_size * block_size / nelems_per_wi) == + // (block_size/private_tile_size)**2 + static constexpr std::uint16_t n_private_tiles_per_axis = + block_size / private_tile_size; + const std::uint16_t local_tile_id0 = + lid_lin / n_private_tiles_per_axis; + const std::uint16_t local_tile_id1 = + lid_lin - local_tile_id0 * n_private_tiles_per_axis; + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + + const std::uint16_t pr_offset = + pr_i1 * private_tile_size + pr_i0; + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // read (local_tile_id0, local_tile_id1) + const std::uint16_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + private_block_01[pr_offset] = + local_block[local_01_offset]; + + // read (local_tile_id1, local_tile_id0) + const 
std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + private_block_10[pr_offset] = + local_block[local_10_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + if (local_tile_id0 <= local_tile_id1) { + for (std::uint16_t pr_i0 = 0; pr_i0 < private_tile_size; + ++pr_i0) { + for (std::uint16_t pr_i1 = 0; pr_i1 < private_tile_size; + ++pr_i1) { + const std::uint16_t t0_offset = + local_tile_id0 * private_tile_size; + const std::uint16_t t1_offset = + local_tile_id1 * private_tile_size; + const std::uint16_t pr_offset = + pr_i0 * private_tile_size + pr_i1; + + const std::uint16_t rel_offset = + pr_i0 + pr_i1 * block_size; + + // write back permuted private blocks + const std::uint32_t local_01_offset = + (t0_offset + t1_offset * block_size) + rel_offset; + local_block[local_01_offset] = + private_block_10[pr_offset]; + + const std::uint16_t local_10_offset = + (t1_offset + t0_offset * block_size) + rel_offset; + local_block[local_10_offset] = + private_block_01[pr_offset]; + } + } + } + + sycl::group_barrier(nd_it.get_group(), + sycl::memory_scope::work_group); + + // 3. 
Write out permuted SLM to destination array + + const std::size_t dst_tile_start0 = src_tile_start0; + const std::size_t dst_tile_start1 = src_tile_start1; + + if (local_dim0 == block_size && local_dim1 == block_size) { + const std::uint16_t dst_i0 = src_i1; + const std::uint16_t dst_i1 = src_i0; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset0 = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::size_t pr_step_dst = lws1 * dst_stride; + + const std::uint16_t _local_offset0 = + dst_i0 * block_size + dst_i1; + const std::uint16_t _pr_step_local = lws1 * block_size; + + for (std::uint16_t pr_id = 0; pr_id < nelems_per_wi; ++pr_id) { + if ((dst_gid1 < n) && ((dst_gid0 + pr_id * lws1) < n)) { + dst_tp[dst_offset0 + pr_step_dst * pr_id] = + local_block[_local_offset0 + + _pr_step_local * pr_id]; + } + } + } + else { + // map local_linear_id into (local_dim0, local_dim1) + for (std::uint16_t el_id = lid_lin; + el_id < local_dim0 * local_dim1; el_id += lws0 * lws1) + { + + // 0 <= local_i0 < local_dim0 + const std::uint16_t loc_i0 = el_id / local_dim1; + // 0 <= local_i1 < local_dim1 + const std::uint16_t loc_i1 = el_id - loc_i0 * local_dim1; + + const std::uint16_t dst_i0 = loc_i0; + const std::uint16_t dst_i1 = loc_i1; + + const std::size_t dst_gid0 = (dst_tile_start0 + dst_i0); + const std::size_t dst_gid1 = (dst_tile_start1 + dst_i1); + + const std::size_t dst_offset = + dst_batch_offset + dst_gid0 * dst_stride + dst_gid1 * 1; + const std::uint16_t local_offset = + loc_i0 * block_size + loc_i1; + + if ((dst_gid1 < n) && (dst_gid0 < n)) { + dst_tp[dst_offset] = local_block[local_offset]; + } + } + } + }); + }); + + return e; +} + +} // end of namespace detail + +template +sycl::event as_c_contiguous_1d_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + ssize_t src_batch_step, + ssize_t dst_batch_step, + 
std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = + TwoOffsets_CombinedIndexer; + + const auto &src_batch_indexer = + Strided1DIndexer(batch_nelems, src_batch_step); + const auto &dst_batch_indexer = + Strided1DIndexer(batch_nelems, dst_batch_step); + + const BatchIndexerT batch_two_indexer{src_batch_indexer, dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_indexer, n, src_p, src_ld, dst_p, + dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of batch elements */ + ssize_t, /* distance between batches in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* size of square matrices in the batch */ + const char *, + ssize_t, /* untyped pointer to F-contig source array, and matrix leading + dimension */ + char *, + ssize_t, /* untyped pointer to C-contig destination array, and matrix + leading dimension */ + const std::vector &); + +template +struct AsCContig1DBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_1d_batch_of_square_matrices_impl; + } +}; + +template +sycl::event as_c_contiguous_nd_batch_of_square_matrices_impl( + sycl::queue &exec_q, + std::size_t batch_nelems, + int batch_nd, + const ssize_t *src_batch_shape_strides, + const ssize_t dst_batch_step, + std::size_t n, + const char *src_p, + ssize_t src_ld, + char *dst_p, + ssize_t dst_ld, + const std::vector &depends) +{ + using SrcIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using DstIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer; + using BatchIndexerT = 
TwoOffsets_CombinedIndexer; + + static constexpr ssize_t zero_offset{0}; + + const SrcIndexerT src_batch_indexer{batch_nd, zero_offset, + src_batch_shape_strides}; + const DstIndexerT dst_batch_indexer{/* size */ batch_nelems, + /* step */ dst_batch_step}; + + const BatchIndexerT batch_two_offsets_indexer{src_batch_indexer, + dst_batch_indexer}; + + return detail::as_c_contiguous_batch_of_square_matrices_impl( + exec_q, batch_nelems, batch_two_offsets_indexer, n, src_p, src_ld, + dst_p, dst_ld, depends); +} + +typedef sycl::event ( + *as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t)( + sycl::queue &, /* execution queue */ + std::size_t, /* number of matrices in the batch */ + int, + const ssize_t *, /* dimensionality, and packed [shape, src_strides] + describing iteration over batch in source array */ + ssize_t, /* distance between batches in destination array */ + std::size_t, /* matrix size */ + const char *, + ssize_t, /* untyped pointer to source array of F-contig matrices, and + leading dimension of the matrix */ + char *, + ssize_t, /* untyped pointer to destination array of F-contig matrices, and + leading dimension of the matrix */ + const std::vector &); + +template +struct AsCContigNDBatchOfSquareMatricesFactory +{ + fnT get() + { + return as_c_contiguous_nd_batch_of_square_matrices_impl; + } +}; + +} // namespace copy_as_contig +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp new file mode 100644 index 000000000000..53b39ff5874c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -0,0 +1,758 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/copy_as_contiguous.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_array_impl_fn_ptr_t; +using dpctl::tensor::kernels::copy_as_contig:: + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +static as_c_contiguous_array_impl_fn_ptr_t + as_c_contig_array_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_1d_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +static as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[td_ns::num_types]; + +void init_copy_as_contig_dispatch_vectors(void) +{ + + using dpctl::tensor::kernels::copy_as_contig:: + AsCContig1DBatchOfSquareMatricesFactory; + using dpctl::tensor::kernels::copy_as_contig::AsCContigFactory; + using dpctl::tensor::kernels::copy_as_contig:: + AsCContigNDBatchOfSquareMatricesFactory; + using td_ns::DispatchVectorBuilder; + + // Generic to c-contig + 
DispatchVectorBuilder + dtv_as_c_contig_array; + + dtv_as_c_contig_array.populate_dispatch_vector( + as_c_contig_array_dispatch_vector); + + // 1D batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_1d_batch_of_square_matrices_impl_fn_ptr_t, + AsCContig1DBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_1d_batch_of_square_matrices; + + dtv_as_c_contig_1d_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_1d_batch_of_square_matrices_dispatch_vector); + + // ND batch of square views into F-contig matrices to c-contig array + DispatchVectorBuilder< + as_c_contiguous_nd_batch_of_square_matrices_impl_fn_ptr_t, + AsCContigNDBatchOfSquareMatricesFactory, td_ns::num_types> + dtv_as_c_contig_nd_batch_of_square_matrices; + + dtv_as_c_contig_nd_batch_of_square_matrices.populate_dispatch_vector( + as_c_contig_nd_batch_of_square_matrices_dispatch_vector); +} + +namespace +{ + +template +std::size_t get_nelems(const std::vector &shape) +{ + auto mult_fn = [](std::size_t prod, const dimT &term) -> std::size_t { + return prod * static_cast(term); + }; + + static constexpr std::size_t unit{1}; + + const std::size_t nelems = + std::accumulate(std::begin(shape), std::end(shape), unit, mult_fn); + return nelems; +} + +} // end of anonymous namespace + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + const int src_nd = src.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.back(); + if (n == dst_shape_vec[src_nd - 2]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[src_nd - 2] == unit_stride) { + return py_as_c_contig_f2c(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd 
= src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + if (src_shape_vec != dst_shape_vec) { + throw py::value_error("Shapes must be equal"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be F-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_nd >= 2) { + auto n = dst_shape_vec.front(); + if (n == dst_shape_vec[1]) { + static constexpr auto unit_stride = py::ssize_t(1); + if (src_strides_vec[1] == unit_stride) { + return py_as_f_contig_c2f(src, dst, exec_q, depends); + } + } + } + + const std::size_t nelems = get_nelems(src_shape_vec); + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + + using shT = std::vector; + shT simplified_shape; + shT 
simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, src_shape_vec.data(), src_strides_vec, dst.get_strides_vector(), + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + std::vector host_task_events{}; + auto ptr_size_event_tuple = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto shape_stride_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_stride = shape_stride_owner.get(); + + auto ascontig_fn = as_c_contig_array_dispatch_vector[src_type_id]; + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + ascontig_fn(exec_q, nelems, nd, shape_stride, src.get_data(), + dst.get_data(), all_depends); + + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {ascontig_ev}, + shape_stride_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_c_contig_f2c(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is C-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = src_shape_vec.back(); + if (src_shape_vec[src_nd - 2] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[src_nd - 2] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_c_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), 
sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? py::ssize_t(0) : dst_strides_vec[src_nd - 3]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec), + std::end(src_shape_vec) - 2); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec), + std::end(src_strides_vec) - 2); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec), + std::end(dst_strides_vec) - 2); + } + + // simplify batch iteration space + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + 
[src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.back(), dst.get_data(), + dst_strides_vec[src_nd - 2], all_depends); + + // async free of shape_strides temporary + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +std::pair + py_as_f_contig_c2f(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + /* Same dimensions, same shape, same data-type + * dst is F-contiguous. 
+ */ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Number of dimensions must be the same."); + } + if (src_nd < 2) { + throw py::value_error("Arrays must have 2 or more axes"); + } + + // ensures also that destination is plenty ample to accommodate all + // elements of src array + if (!dst.is_f_contiguous()) { + throw py::value_error("Destination array must be C-contiguous"); + } + + const auto &src_shape_vec = src.get_shape_vector(); + const auto &dst_shape_vec = dst.get_shape_vector(); + + std::size_t nelems{1}; + bool equal_shapes = true; + + for (int i = 0; equal_shapes && (i < src_nd); ++i) { + auto sh_i = src_shape_vec[i]; + equal_shapes = equal_shapes && (sh_i == dst_shape_vec[i]); + nelems *= static_cast(sh_i); + } + + if (!equal_shapes) { + throw py::value_error("Shapes must be equal"); + } + + const auto n = dst_shape_vec.front(); + if (dst_shape_vec[1] != n) { + throw py::value_error("Matrices must be square"); + } + + const auto &src_strides_vec = src.get_strides_vector(); + + if (src_strides_vec[1] != py::ssize_t(1)) { + throw py::value_error("Unexpected destination array layout"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + const int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + const int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::value_error( + "Source and destination arrays must have the same data type"); + } + + if (nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + const auto &dst_strides_vec = dst.get_strides_vector(); + + const std::size_t batch_nelems = + (src_nd == 2) ? std::size_t(1) : (nelems / (n * n)); + const py::ssize_t dst_batch_step = + (src_nd == 2) ? 
py::ssize_t(0) : dst_strides_vec[2]; + + std::vector src_batch_strides_vec; + std::vector dst_batch_strides_vec; + std::vector batch_shape_vec; + + if (src_nd == 2) { + batch_shape_vec.push_back(py::ssize_t(1)); + src_batch_strides_vec.push_back(py::ssize_t(0)); + dst_batch_strides_vec.push_back(dst_batch_step); + } + else { + batch_shape_vec.insert(std::end(batch_shape_vec), + std::begin(src_shape_vec) + 2, + std::end(src_shape_vec)); + src_batch_strides_vec.insert(std::end(src_batch_strides_vec), + std::begin(src_strides_vec) + 2, + std::end(src_strides_vec)); + dst_batch_strides_vec.insert(std::end(dst_batch_strides_vec), + std::begin(dst_strides_vec) + 2, + std::end(dst_strides_vec)); + } + + // simplify batch iteration space + // NB: simplification reverses dst strides to C contig, + // it also reverses simplified_shape and simplified_src_strides + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = static_cast(batch_shape_vec.size()); + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, batch_shape_vec.data(), src_batch_strides_vec, + dst_batch_strides_vec, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (!((0 == src_offset) && (0 == dst_offset))) { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 1"); + } + + if (1 == nd) { + const auto expected_dim = static_cast(batch_nelems); + if ((simplified_shape.front() != expected_dim) || + (simplified_dst_strides.front() != dst_batch_step)) + { + throw std::runtime_error( + "Unexpected result of simplifying iteration space, 2"); + } + + auto impl_fn = as_c_contig_1d_batch_of_square_matrices_dispatch_vector + [src_type_id]; + const py::ssize_t src_batch_step = simplified_src_strides.front(); + + sycl::event ascontig_ev = + 
impl_fn(exec_q, batch_nelems, src_batch_step, dst_batch_step, n, + src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {ascontig_ev}), ascontig_ev); + } + + auto impl_fn = + as_c_contig_nd_batch_of_square_matrices_dispatch_vector[src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_depends; + all_depends.reserve(depends.size() + 1); + all_depends.insert(std::end(all_depends), std::begin(depends), + std::end(depends)); + all_depends.push_back(copy_shape_ev); + + sycl::event ascontig_ev = + impl_fn(exec_q, batch_nelems, nd, packed_shape_strides, dst_batch_step, + n, src.get_data(), src_strides_vec.front(), dst.get_data(), + dst_strides_vec[1], all_depends); + + // async free of shape_strides + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {ascontig_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + ascontig_ev); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp new file mode 100644 index 000000000000..2de67098b7fa --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -0,0 +1,61 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +std::pair + py_as_c_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +std::pair + py_as_f_contig(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +void init_copy_as_contig_dispatch_vectors(void); + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl From 5a9c14cd5ac07cf0a79da70e67b1cd9c28f063c6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:16:36 -0800 Subject: [PATCH 005/145] Add copy_usm_ndarray_into_usm_ndarray implementation --- .../source/copy_and_cast_usm_to_usm.cpp | 310 ++++++++++++++++++ .../source/copy_and_cast_usm_to_usm.hpp | 60 ++++ 2 files changed, 370 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp new file mode 100644 index 000000000000..0458aa75ac32 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -0,0 +1,310 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "copy_as_contig.hpp" +#include "simplify_iteration_space.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_1d_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_and_cast_generic_fn_ptr_t; + +static copy_and_cast_generic_fn_ptr_t + copy_and_cast_generic_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_1d_fn_ptr_t + copy_and_cast_1d_dispatch_table[td_ns::num_types][td_ns::num_types]; +static copy_and_cast_contig_fn_ptr_t + copy_and_cast_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + // shapes must be 
the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && (i < src_nd); ++i) { + src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + // check for applicability of special cases: + // (both C-contiguous || both F-contiguous) + bool both_c_contig = (is_src_c_contig && is_dst_c_contig); + bool both_f_contig = (is_src_f_contig && is_dst_f_contig); + if (both_c_contig || both_f_contig) { + + sycl::event copy_ev; + if (src_type_id == dst_type_id) { + + int src_elem_size = src.get_elemsize(); + + copy_ev = exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + } + else { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id][src_type_id]; + copy_ev = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + // make sure src and dst are not GC-ed before copy_ev is complete + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + if ((src_type_id == dst_type_id) && (src_nd > 1)) { + if (is_dst_c_contig) { + return py_as_c_contig(src, dst, exec_q, depends); + } + else if (is_dst_f_contig) { + return py_as_f_contig(src, dst, exec_q, depends); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, 
simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd < 2) { + if (nd == 1) { + std::array shape_arr = {simplified_shape[0]}; + std::array src_strides_arr = { + simplified_src_strides[0]}; + std::array dst_strides_arr = { + simplified_dst_strides[0]}; + + sycl::event copy_and_cast_1d_event; + if ((src_strides_arr[0] == 1) && (dst_strides_arr[0] == 1) && + (src_offset == 0) && (dst_offset == 0)) + { + auto contig_fn = + copy_and_cast_contig_dispatch_table[dst_type_id] + [src_type_id]; + copy_and_cast_1d_event = + contig_fn(exec_q, src_nelems, src_data, dst_data, depends); + } + else { + auto fn = + copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + copy_and_cast_1d_event = + fn(exec_q, src_nelems, shape_arr, src_strides_arr, + dst_strides_arr, src_data, src_offset, dst_data, + dst_offset, depends); + } + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_1d_event}), + copy_and_cast_1d_event); + } + else if (nd == 0) { // case of a scalar + assert(src_nelems == 1); + std::array shape_arr = {1}; + std::array src_strides_arr = {1}; + std::array dst_strides_arr = {1}; + + auto fn = copy_and_cast_1d_dispatch_table[dst_type_id][src_type_id]; + + sycl::event copy_and_cast_0d_event = fn( + exec_q, src_nelems, shape_arr, src_strides_arr, dst_strides_arr, + src_data, src_offset, dst_data, dst_offset, depends); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {copy_and_cast_0d_event}), + copy_and_cast_0d_event); + } + } + + // Generic implementation + auto copy_and_cast_fn = + copy_and_cast_generic_dispatch_table[dst_type_id][src_type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + 
const sycl::event &copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &copy_and_cast_generic_ev = copy_and_cast_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // async free of shape_strides temporary + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_and_cast_generic_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_and_cast_generic_ev); +} + +void init_copy_and_cast_usm_to_usm_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastContigFactory; + DispatchTableBuilder + dtb_contig; + dtb_contig.populate_dispatch_table(copy_and_cast_contig_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastGenericFactory; + DispatchTableBuilder + dtb_generic; + dtb_generic.populate_dispatch_table(copy_and_cast_generic_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast::CopyAndCast1DFactory; + DispatchTableBuilder + dtb_1d; + dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp new file mode 100644 index 000000000000..d2a2dcaf7b85 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair copy_usm_ndarray_into_usm_ndarray( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4f6334054fc08df7c2c2f7657bc5f4569ee4363a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:18:36 -0800 Subject: [PATCH 006/145] Add pybind11 bindings for dpctl_ext.tensor._tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 502 ++++++++++++++++++ 1 file changed, 502 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp new file mode 100644 index 000000000000..b41b5c9ce423 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -0,0 +1,502 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" + +// #include "accumulators.hpp" +// #include "boolean_advanced_indexing.hpp" +// #include "clip.hpp" +#include "copy_and_cast_usm_to_usm.hpp" +#include "copy_as_contig.hpp" +// #include "copy_for_reshape.hpp" +// #include "copy_for_roll.hpp" +// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +// #include "device_support_queries.hpp" +// #include "eye_ctor.hpp" +// #include "full_ctor.hpp" +// #include "integer_advanced_indexing.hpp" +#include "kernels/dpctl_tensor_types.hpp" +// #include "linear_sequences.hpp" +// #include "repeat.hpp" +#include "simplify_iteration_space.hpp" +// #include "triul_ctor.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/strided_iters.hpp" +// #include "where.hpp" +// #include "zeros_ctor.hpp" + +namespace py = pybind11; + +static_assert(std::is_same_v); + +namespace +{ + +using dpctl::tensor::c_contiguous_strides; +using dpctl::tensor::f_contiguous_strides; + +using dpctl::tensor::overlap::MemoryOverlap; +using dpctl::tensor::overlap::SameLogicalTensors; + +using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::py_as_c_contig; +using dpctl::tensor::py_internal::py_as_f_contig; + +/* =========================== Copy for reshape ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; + +/* =========================== Copy for roll ============================= */ + +// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +// using 
dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; + +/* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ + +// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; + +/* ============= linear-sequence ==================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; + +/* ================ Full ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_full; + +/* ================ Zeros ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_zeros; + +/* ============== Advanced Indexing ============= */ +// using dpctl::tensor::py_internal::usm_ndarray_put; +// using dpctl::tensor::py_internal::usm_ndarray_take; + +// using dpctl::tensor::py_internal::py_extract; +// using dpctl::tensor::py_internal::py_mask_positions; +// using dpctl::tensor::py_internal::py_nonzero; +// using dpctl::tensor::py_internal::py_place; + +/* ================= Repeat ====================*/ +// using dpctl::tensor::py_internal::py_cumsum_1d; +// using dpctl::tensor::py_internal::py_repeat_by_scalar; +// using dpctl::tensor::py_internal::py_repeat_by_sequence; + +/* ================ Eye ================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_eye; + +/* =========================== Tril and triu ============================== */ + +// using dpctl::tensor::py_internal::usm_ndarray_triul; + +/* =========================== Where ============================== */ + +// using dpctl::tensor::py_internal::py_where; + +/* =========================== Clip ============================== */ +// using dpctl::tensor::py_internal::py_clip; + +// populate dispatch tables +void init_dispatch_tables(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_and_cast_usm_to_usm_dispatch_tables(); + // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + // 
init_advanced_indexing_dispatch_tables(); + // init_where_dispatch_tables(); + return; +} + +// populate dispatch vectors +void init_dispatch_vectors(void) +{ + using namespace dpctl::tensor::py_internal; + + init_copy_as_contig_dispatch_vectors(); + // init_copy_for_reshape_dispatch_vectors(); + // init_copy_for_roll_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); + // init_full_ctor_dispatch_vectors(); + // init_zeros_ctor_dispatch_vectors(); + // init_eye_ctor_dispatch_vectors(); + // init_triul_ctor_dispatch_vectors(); + + // populate_masked_extract_dispatch_vectors(); + // populate_masked_place_dispatch_vectors(); + + // populate_mask_positions_dispatch_vectors(); + + // populate_cumsum_1d_dispatch_vectors(); + // init_repeat_dispatch_vectors(); + + // init_clip_dispatch_vectors(); + + return; +} + +} // namespace + +PYBIND11_MODULE(_tensor_impl, m) +{ + init_dispatch_tables(); + init_dispatch_vectors(); + + using dpctl::tensor::strides::contract_iter; + m.def( + "_contract_iter", &contract_iter, + "Simplifies iteration of array of given shape & stride. Returns " + "a triple: shape, stride and offset for the new iterator of possible " + "smaller dimension, which traverses the same elements as the original " + "iterator, possibly in a different order."); + + m.def("_copy_usm_ndarray_into_usm_ndarray", + &copy_usm_ndarray_into_usm_ndarray, + "Copies from usm_ndarray `src` into usm_ndarray `dst` of the same " + "shape. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_c_contig", &py_as_c_contig, + "Copies from usm_ndarray `src` into C-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. 
" + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_as_f_contig", &py_as_f_contig, + "Copies from usm_ndarray `src` into F-contiguous usm_ndarray " + "`dst` of the same shape and the same data type. " + "Returns a tuple of events: (host_task_event, compute_task_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + using dpctl::tensor::strides::contract_iter2; + m.def( + "_contract_iter2", &contract_iter2, + "Simplifies iteration over elements of pair of arrays of given shape " + "with strides stride1 and stride2. Returns " + "a 5-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter3; + m.def( + "_contract_iter3", &contract_iter3, + "Simplifies iteration over elements of 3-tuple of arrays of given " + "shape " + "with strides stride1, stride2, and stride3. Returns " + "a 7-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + using dpctl::tensor::strides::contract_iter4; + m.def( + "_contract_iter4", &contract_iter4, + "Simplifies iteration over elements of 4-tuple of arrays of given " + "shape " + "with strides stride1, stride2, stride3, and stride4. 
Returns " + "a 9-tuple: shape, stride and offset for the new iterator of possible " + "smaller dimension for each array, which traverses the same elements " + "as the original " + "iterator, possibly in a different order."); + + static constexpr char orderC = 'C'; + m.def( + "_ravel_multi_index", + [](const std::vector &mi, + const std::vector &shape, char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_ravel_multi_index_c(mi, + shape); + } + else { + return dpctl::tensor::py_internal::_ravel_multi_index_f(mi, + shape); + } + }, + ""); + + m.def( + "_unravel_index", + [](py::ssize_t flat_index, const std::vector &shape, + char order = 'C') { + if (order == orderC) { + return dpctl::tensor::py_internal::_unravel_index_c(flat_index, + shape); + } + else { + return dpctl::tensor::py_internal::_unravel_index_f(flat_index, + shape); + } + }, + ""); + + // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "number of elements using underlying 'C'-contiguous order for + // flat " "traversal. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for flat " + // "traversal with shift. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("src"), py::arg("dst"), py::arg("shift"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same + // " "shapes using underlying 'C'-contiguous order for " "traversal + // with shifts along each axis. 
" "Returns a tuple of events: + // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), + // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and step `dt`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + // "specified by " + // "starting point `start` and end point `end`. " + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_copy_numpy_ndarray_into_usm_ndarray", + // ©_numpy_ndarray_into_usm_ndarray, + // "Copy from numpy array `src` into usm_ndarray `dst` + // synchronously.", py::arg("src"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_full_usm_ndarray", &usm_ndarray_full, + // "Populate usm_ndarray `dst` with given fill_value.", + // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_take", &usm_ndarray_take, + // "Takes elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` from array `src` and copies them " + // "into usm_ndarray `dst` synchronously." 
+ // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("ind"), py::arg("dst"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_put", &usm_ndarray_put, + // "Puts elements at usm_ndarray indices `ind` and axes starting " + // "at axis `axis_start` into array `dst` from " + // "usm_ndarray `val` synchronously." + // "Returns a tuple of events: (hev, ev)", + // py::arg("dst"), py::arg("ind"), py::arg("val"), + // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_eye", &usm_ndarray_eye, + // "Fills input 2D contiguous usm_ndarray `dst` with " + // "zeros outside of the diagonal " + // "specified by " + // "the diagonal index `k` " + // "which is filled with ones." + // "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("default_device_fp_type", + // dpctl::tensor::py_internal::default_device_fp_type, + // "Gives default floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_int_type", + // dpctl::tensor::py_internal::default_device_int_type, + // "Gives default signed integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_uint_type", + // dpctl::tensor::py_internal::default_device_uint_type, + // "Gives default unsigned integer type supported by device.", + // py::arg("dev")); + + // m.def("default_device_bool_type", + // dpctl::tensor::py_internal::default_device_bool_type, + // "Gives default boolean type supported by device.", py::arg("dev")); + + // m.def("default_device_complex_type", + // dpctl::tensor::py_internal::default_device_complex_type, + // "Gives default complex floating point type supported by device.", + // py::arg("dev")); + + // m.def("default_device_index_type", + // 
dpctl::tensor::py_internal::default_device_index_type, + // "Gives default index type supported by device.", py::arg("dev")); + + // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + // }; + // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + // }; + // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + // py::arg("cumsum"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto overlap = [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &overlap = MemoryOverlap(); + return overlap(x1, x2); + }; + m.def("_array_overlap", overlap, + "Determines if the memory regions indexed by each array overlap", + py::arg("array1"), py::arg("array2")); + + // auto same_logical_tensors = + // [](const dpctl::tensor::usm_ndarray &x1, + // const dpctl::tensor::usm_ndarray &x2) -> bool { + // auto const 
&same_logical_tensors = SameLogicalTensors(); + // return same_logical_tensors(x1, x2); + // }; + // m.def("_same_logical_tensors", same_logical_tensors, + // "Determines if the memory regions indexed by each array are the + // same", py::arg("array1"), py::arg("array2")); + + // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + // py::arg("mask_shape"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), + // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const dpctl::tensor::usm_ndarray &reps, + // const dpctl::tensor::usm_ndarray &cumsum, + // std::optional axis, sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_sequence(src, dst, reps, cumsum, + // axis.value(), + // exec_q, depends); + // } + // else { + // return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + // depends); + // } + // }; + // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), + // py::arg("dst"), py::arg("reps"), py::arg("cumsum"), + // py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = + // py::list()); + + // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + // const dpctl::tensor::usm_ndarray &dst, + // const py::ssize_t reps, std::optional axis, + // sycl::queue &exec_q, + // const std::vector depends) + // -> std::pair { + // if (axis) { + // return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + // depends); + // } + // else { + // return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + // } 
+ // }; + // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), + // py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); + + // m.def("_clip", &py_clip, + // "Clamps elements of array `x` to the range " + // "[`min`, `max] and writes the result to the " + // "array `dst` for each element of `x`, `min`, and `max`." + // "Returns a tuple of events: (hev, ev)", + // py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); +} From 634579c5f0d64d44805d0a020cb4ca5ae1d5e774 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:24:11 -0800 Subject: [PATCH 007/145] Add CMake build files for dpctl_ext --- dpctl_ext/CMakeLists.txt | 205 ++++++++++++++++++++++++++++++++ dpctl_ext/tensor/CMakeLists.txt | 175 +++++++++++++++++++++++++++ 2 files changed, 380 insertions(+) create mode 100644 dpctl_ext/CMakeLists.txt create mode 100644 dpctl_ext/tensor/CMakeLists.txt diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt new file mode 100644 index 000000000000..bb33a4f57332 --- /dev/null +++ b/dpctl_ext/CMakeLists.txt @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +find_package(Python REQUIRED COMPONENTS NumPy) + +# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) +# -w is to set working directory (and correctly set __pyx_f[] array of filenames) +set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") +find_package(Cython REQUIRED) + +if(WIN32) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + ) + string(CONCAT SDL_FLAGS "/GS " "/DynamicBase ") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Ox ${WARNING_FLAGS} ${SDL_FLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} 
${WARNING_FLAGS} ${SDL_FLAGS} -O0 -g1 -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE + "${CMAKE_C_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_CXX_FLAGS_COVERAGE + "${CMAKE_CXX_FLAGS_DEBUG} ${WARNING_FLAGS} ${SDL_FLAGS} -O1 -g1 -DDEBUG" + ) + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "/NXCompat;/DynamicBase") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +elseif(UNIX) + string( + CONCAT WARNING_FLAGS + "-Wall " + "-Wextra " + "-Winit-self " + "-Wunused-function " + "-Wuninitialized " + "-Wmissing-declarations " + "-Wstrict-prototypes " + "-Wno-unused-parameter " + "-fdiagnostics-color=auto " + ) + string( + CONCAT SDL_FLAGS + "-fstack-protector " + "-fstack-protector-all " + "-fpic " + "-fPIC " + "-D_FORTIFY_SOURCE=2 " + "-Wformat " + "-Wformat-security " + # "-fno-strict-overflow " # no-strict-overflow is implied by -fwrapv + "-fno-delete-null-pointer-checks " + "-fwrapv " + ) + string(CONCAT CFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") + set(CMAKE_C_FLAGS_DEBUG + "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_CXX_FLAGS_DEBUG + "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O0 -g -DDEBUG -Xsycl-target-frontend=spir64 \"-g0\"" + ) + set(CMAKE_C_FLAGS_COVERAGE "${CMAKE_C_FLAGS_DEBUG} ${CFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_CXX_FLAGS_COVERAGE "${CMAKE_CXX_FLAGS_DEBUG} ${CXXFLAGS} -O1 -g1 -DDEBUG") + set(CMAKE_MODULE_LINKER_FLAGS_COVERAGE "${CMAKE_MODULE_LINKER_FLAGS_DEBUG}") + set(DPCTL_LDFLAGS "-z,noexecstack,-z,relro,-z,now") + mark_as_advanced( + CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_MODULE_LINKER_FLAGS_COVERAGE + ) +else() + message(FATAL_ERROR 
"Unsupported system.") +endif() + +# at build time create include/ directory and copy header files over +set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) + +set(CMAKE_INSTALL_RPATH "$ORIGIN") + +function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) + add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) + set(_cythonize_trgt "${_trgt}_cythonize_pyx") + python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) + if(BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${_trgt} PRIVATE --offload-compress) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${_trgt} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE Python::NumPy) + if(DPCTL_GENERATE_COVERAGE) + target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) + if(BUILD_DPCTL_EXT_SYCL) + target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) + endif() + endif() + target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") + target_link_options(${_trgt} PRIVATE ${_linker_options}) + get_filename_component(_name_wle ${_generated_src} NAME_WLE) + get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) + set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") + set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") + + # TODO: create separate folder inside build folder that contains only + # headers related to this target and appropriate folder structure to + # eliminate shadow dependencies 
+ get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) + # TODO: do not set directory if we did not generate header + target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) + set(_rpath_value "$ORIGIN") + if(BUILD_DPCTL_EXT_RELATIVE_PATH) + set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") + endif() + if(DPCTL_WITH_REDIST) + set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") + endif() + set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) + + install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) + install( + FILES ${_generated_api_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + install( + FILES ${_generated_public_h} + # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` + DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} + OPTIONAL + ) + if(DPCTL_GENERATE_COVERAGE) + get_filename_component(_original_src_dir ${_src} DIRECTORY) + file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) + install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) + endif() + + # Create target with headers only, because python is managing all the + # library imports at runtime + set(_trgt_headers ${_trgt}_headers) + add_library(${_trgt_headers} INTERFACE) + add_dependencies(${_trgt_headers} ${_trgt}) + get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) + target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) +endfunction() + +add_subdirectory(tensor) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt new file mode 100644 index 000000000000..ed8294b76615 --- /dev/null +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# ***************************************************************************** +# Copyright (c) 2026, 
Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +if(WIN32) + if(${CMAKE_VERSION} VERSION_LESS "3.23") + # this is a work-around for target_link_options inserting option after -link option, cause + # linker to ignore it. 
+ set(CMAKE_CXX_LINK_FLAGS + "${CMAKE_CXX_LINK_FLAGS} -fsycl-device-code-split=per_kernel" + ) + endif() +endif() + +set(_static_lib_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp +) +set(_tensor_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp +) + +set(_static_lib_trgt simplify_iteration_space) + +add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) +target_include_directories( + ${_static_lib_trgt} + PRIVATE + ${Python_INCLUDE_DIRS} + ${DPCTL_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include +) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) + +set(_py_trgts) + +set(python_module_name _tensor_impl) 
+pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + +set(_clang_prefix "") +if(WIN32) + set(_clang_prefix "/clang:") +endif() + +set(_no_fast_math_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp +) +list( + APPEND _no_fast_math_sources + # ${_elementwise_sources} + # ${_reduction_sources} + # ${_sorting_sources} + # ${_linalg_sources} + # ${_accumulator_sources} +) + +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() + +set(_compiler_definitions "") + +set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +foreach(python_module_name ${_py_trgts}) + target_compile_options( + ${python_module_name} + PRIVATE -fno-sycl-id-queries-fit-in-int + ) + target_link_options( + ${python_module_name} + PRIVATE -fsycl-device-code-split=per_kernel + ) + if(DPCTL_OFFLOAD_COMPRESS) + target_link_options(${python_module_name} PRIVATE --offload-compress) + endif() + + target_include_directories( + ${python_module_name} + PRIVATE + ${CMAKE_SOURCE_DIR}/dpnp/backend/include + ${Dpctl_INCLUDE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + if(DPCTL_GENERATE_COVERAGE) + 
if(DPCTL_GENERATE_COVERAGE_FOR_PYBIND11_EXTENSIONS) + target_compile_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + target_link_options( + ${python_module_name} + PRIVATE -fprofile-instr-generate -fcoverage-mapping + ) + endif() + if(_dpctl_sycl_targets) + # make fat binary + target_compile_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_compile_options} + ) + target_link_options( + ${python_module_name} + PRIVATE ${_dpctl_sycl_target_link_options} + ) + endif() + # TODO: update source so they reference individual libraries instead of + # dpctl4pybind11.hpp. It will allow to simplify dependency tree + # NOTE: dpctl C-API is resolved at runtime via Python + # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) + if(DPCTL_WITH_REDIST) + set_target_properties( + ${python_module_name} + PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." + ) + endif() + # TODO: revert to `DESTINATION "dpctl/tensor"` + install(TARGETS ${python_module_name} DESTINATION "dpctl_ext/tensor") +endforeach() From 79d40f235d10d1b9d514d9db07939d0bb447086c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:31:12 -0800 Subject: [PATCH 008/145] Add empty __init__ to dpctl_ext/ --- dpctl_ext/__init__.py | 27 +++++++++++++++++++++++++++ dpctl_ext/tensor/__init__.py | 27 +++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dpctl_ext/__init__.py create mode 100644 dpctl_ext/tensor/__init__.py diff --git a/dpctl_ext/__init__.py b/dpctl_ext/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py new file mode 100644 index 000000000000..a71324cb88d8 --- /dev/null +++ b/dpctl_ext/tensor/__init__.py @@ -0,0 +1,27 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** From 7949c17c3586a4ad0222c6abbf3a616202834c68 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 04:53:03 -0800 Subject: [PATCH 009/145] Enable _same_logical_tensors in _tensor_impl --- .../tensor/libtensor/source/tensor_ctors.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b41b5c9ce423..ca3b7bd49116 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -430,15 +430,15 @@ PYBIND11_MODULE(_tensor_impl, m) "Determines if the memory regions indexed by each array overlap", py::arg("array1"), py::arg("array2")); - // auto same_logical_tensors = - // [](const dpctl::tensor::usm_ndarray &x1, - // const dpctl::tensor::usm_ndarray &x2) -> bool { - // auto const &same_logical_tensors = SameLogicalTensors(); - // return same_logical_tensors(x1, x2); - // }; - // m.def("_same_logical_tensors", same_logical_tensors, - // "Determines if the memory regions indexed by each array are the - // same", py::arg("array1"), py::arg("array2")); + auto same_logical_tensors = + [](const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2) -> bool { + auto const &same_logical_tensors = SameLogicalTensors(); + return same_logical_tensors(x1, x2); + }; + m.def("_same_logical_tensors", same_logical_tensors, + "Determines if the memory regions indexed by each array are the same", + py::arg("array1"), py::arg("array2")); // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), From 29d6c029190714cab8a460c02f32130c7ea59cc6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:14:28 -0800 Subject: [PATCH 010/145] Add device_support_queries to enable default 
device types --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../source/device_support_queries.cpp | 184 ++++++++++++++++++ .../source/device_support_queries.hpp | 58 ++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 56 +++--- 4 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/device_support_queries.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ed8294b76615..ee8da2e49506 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -56,7 +56,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp new file mode 100644 index 000000000000..51eb7dba1b6c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace +{ + +std::string _default_device_fp_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "f8"; + } + else { + return "f4"; + } +} + +int get_numpy_major_version() +{ + namespace py = pybind11; + + py::module_ numpy = py::module_::import("numpy"); + py::str version_string = numpy.attr("__version__"); + py::module_ numpy_lib = py::module_::import("numpy.lib"); + + py::object numpy_version = numpy_lib.attr("NumpyVersion")(version_string); + int major_version = numpy_version.attr("major").cast(); + + return major_version; +} + +std::string _default_device_int_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "i8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. + return "l"; + } +} + +std::string _default_device_uint_type(const sycl::device &) +{ + const int np_ver = get_numpy_major_version(); + + if (np_ver >= 2) { + return "u8"; + } + else { + // code for numpy.dtype('long') to be consistent + // with NumPy's default integer type across + // platforms. 
+ return "L"; + } +} + +std::string _default_device_complex_type(const sycl::device &d) +{ + if (d.has(sycl::aspect::fp64)) { + return "c16"; + } + else { + return "c8"; + } +} + +std::string _default_device_bool_type(const sycl::device &) +{ + return "b1"; +} + +std::string _default_device_index_type(const sycl::device &) +{ + return "i8"; +} + +sycl::device _extract_device(const py::object &arg) +{ + auto const &api = dpctl::detail::dpctl_capi::get(); + + PyObject *source = arg.ptr(); + if (api.PySyclQueue_Check_(source)) { + const sycl::queue &q = py::cast(arg); + return q.get_device(); + } + else if (api.PySyclDevice_Check_(source)) { + return py::cast(arg); + } + else { + throw py::type_error( + "Expected type `dpctl.SyclQueue` or `dpctl.SyclDevice`."); + } +} + +} // namespace + +std::string default_device_fp_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_fp_type(d); +} + +std::string default_device_int_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_int_type(d); +} + +std::string default_device_uint_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_uint_type(d); +} + +std::string default_device_bool_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_bool_type(d); +} + +std::string default_device_complex_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_complex_type(d); +} + +std::string default_device_index_type(const py::object &arg) +{ + const sycl::device &d = _extract_device(arg); + return _default_device_index_type(d); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp new file mode 100644 index 000000000000..6ea01dcd49d7 
--- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -0,0 +1,58 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::string default_device_fp_type(const py::object &); +extern std::string default_device_int_type(const py::object &); +extern std::string default_device_uint_type(const py::object &); +extern std::string default_device_bool_type(const py::object &); +extern std::string default_device_complex_type(const py::object &); +extern std::string default_device_index_type(const py::object &); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index ca3b7bd49116..911d75ebd925 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" -// #include "device_support_queries.hpp" +#include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include "full_ctor.hpp" // #include "integer_advanced_indexing.hpp" @@ -360,33 +360,33 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("default_device_fp_type", - // dpctl::tensor::py_internal::default_device_fp_type, - // "Gives default floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_int_type", - // dpctl::tensor::py_internal::default_device_int_type, - // "Gives default 
signed integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_uint_type", - // dpctl::tensor::py_internal::default_device_uint_type, - // "Gives default unsigned integer type supported by device.", - // py::arg("dev")); - - // m.def("default_device_bool_type", - // dpctl::tensor::py_internal::default_device_bool_type, - // "Gives default boolean type supported by device.", py::arg("dev")); - - // m.def("default_device_complex_type", - // dpctl::tensor::py_internal::default_device_complex_type, - // "Gives default complex floating point type supported by device.", - // py::arg("dev")); - - // m.def("default_device_index_type", - // dpctl::tensor::py_internal::default_device_index_type, - // "Gives default index type supported by device.", py::arg("dev")); + m.def("default_device_fp_type", + dpctl::tensor::py_internal::default_device_fp_type, + "Gives default floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_int_type", + dpctl::tensor::py_internal::default_device_int_type, + "Gives default signed integer type supported by device.", + py::arg("dev")); + + m.def("default_device_uint_type", + dpctl::tensor::py_internal::default_device_uint_type, + "Gives default unsigned integer type supported by device.", + py::arg("dev")); + + m.def("default_device_bool_type", + dpctl::tensor::py_internal::default_device_bool_type, + "Gives default boolean type supported by device.", py::arg("dev")); + + m.def("default_device_complex_type", + dpctl::tensor::py_internal::default_device_complex_type, + "Gives default complex floating point type supported by device.", + py::arg("dev")); + + m.def("default_device_index_type", + dpctl::tensor::py_internal::default_device_index_type, + "Gives default index type supported by device.", py::arg("dev")); // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, From 936e7198e2014330b34c5918a63230ea699e063e Mon Sep 17 
00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 05:52:17 -0800 Subject: [PATCH 011/145] Enable building and packaging of dpctl_ext --- CMakeLists.txt | 1 + setup.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 386b17b44294..d2ee5e84c0c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,3 +336,4 @@ if(DEFINED SKBUILD) endif() add_subdirectory(dpnp) +add_subdirectory(dpctl_ext) diff --git a/setup.py b/setup.py index cc21221299c4..a0c54b066dcf 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,9 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", + # dpctl_ext + "dpctl_ext", + "dpctl_ext.tensor", ], package_data={ "dpnp": [ From cd85f1e333bcad154272946f71c127b9ea9a916b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 06:14:39 -0800 Subject: [PATCH 012/145] Use _tensor_impl from dpctl_ext.tensor in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 +- dpnp/dpnp_iface.py | 2 +- dpnp/dpnp_iface_searching.py | 2 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 +- dpnp/scipy/linalg/_utils.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 57bf50422fa0..b63bf61f8dad 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -31,7 +31,6 @@ import dpctl.tensor as dpt import dpctl.tensor._copy_utils as dtc -import dpctl.tensor._tensor_impl as dti import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy @@ -45,6 +44,7 @@ _validate_dtype, ) +import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index fba1a215756a..832446c826ba 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -45,11 +45,11 @@ import dpctl import dpctl.tensor as dpt -import 
dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._device import normalize_queue_device +import dpctl_ext.tensor._tensor_impl as ti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 6eefe010b699..fdbd317d31dd 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -40,8 +40,8 @@ """ import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as dti +import dpctl_ext.tensor._tensor_impl as dti import dpnp from .dpnp_array import dpnp_array diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 30be5d1ff5cb..4d8e3cdfbd0d 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -28,7 +28,6 @@ import dpctl import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -38,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 282c645d1095..8eb9187236bf 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -42,9 +42,9 @@ from warnings import warn -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 0c6780a8f8b45e87263fbf316bc17aac5ed91dc1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:56:50 -0800 Subject: [PATCH 013/145] Move put() and take() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 11 + dpctl_ext/tensor/_indexing_functions.py | 329 
+++++++ dpctl_ext/tensor/_numpy_helper.py | 45 + .../kernels/integer_advanced_indexing.hpp | 427 +++++++++ .../source/integer_advanced_indexing.cpp | 819 ++++++++++++++++++ .../source/integer_advanced_indexing.hpp | 73 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 42 +- 8 files changed, 1726 insertions(+), 22 deletions(-) create mode 100644 dpctl_ext/tensor/_indexing_functions.py create mode 100644 dpctl_ext/tensor/_numpy_helper.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..ae8b72d71873 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a71324cb88d8..35453dbf9a46 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -25,3 +25,14 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** + + +from dpctl_ext.tensor._indexing_functions import ( + put, + take, +) + +__all__ = [ + "put", + "take", +] diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py new file mode 100644 index 000000000000..106df09cf97e --- /dev/null +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -0,0 +1,329 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils + +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index + + +def _get_indexing_mode(name): + modes = {"wrap": 0, "clip": 1} + try: + return modes[name] + except KeyError: + raise ValueError( + "`mode` must be `wrap` or `clip`." "Got `{}`.".format(name) + ) + + +def put(x, indices, vals, /, *, axis=None, mode="wrap"): + """put(x, indices, vals, axis=None, mode="wrap") + + Puts values into an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array the values will be put into. + indices (usm_ndarray): + One-dimensional array of indices. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the result shape + ``x.shape[:axis] + indices.shape + x.shape[axis+1:]``. + axis (int, optional): + The axis along which the values will be placed. + If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. 
note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work, e.g. + + :Example: + + .. code-block:: python + + from dpctl import tensor as dpt + + def put_vec_duplicates(vec, ind, vals): + "Put values into vec, handling possible duplicates in ind" + assert vec.ndim, ind.ndim, vals.ndim == 1, 1, 1 + + # find positions of last occurrences of each + # unique index + ind_flipped = dpt.flip(ind) + ind_uniq = dpt.unique_all(ind_flipped).indices + has_dups = len(ind) != len(ind_uniq) + + if has_dups: + ind_uniq = dpt.subtract(vec.size - 1, ind_uniq) + ind = dpt.take(ind, ind_uniq) + vals = dpt.take(vals, ind_uniq) + + dpt.put(vec, ind, vals) + + n = 512 + ind = dpt.concat((dpt.arange(n), dpt.arange(n, -1, step=-1))) + x = dpt.zeros(ind.size, dtype="int32") + vals = dpt.arange(ind.size, dtype=x.dtype) + + # Values corresponding to last positions of + # duplicate indices are written into the vector x + put_vec_duplicates(x, ind, vals) + + parts = (vals[-1:-n-2:-1], dpt.zeros(n, dtype=x.dtype)) + expected = dpt.concat(parts) + assert dpt.all(x == expected) + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type] + else: + queues_ = [x.sycl_queue, indices.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type] + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + if indices.dtype.kind not in 
"ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + vals_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + val_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + val_shape = indices.shape + + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q + ) + # choose to throw here for consistency with `place` + if vals.size == 0: + raise ValueError( + "cannot put into non-empty indices along an empty axis" + ) + if vals.dtype == x.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, x.dtype) + rhs = dpt.broadcast_to(rhs, val_shape) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, put_ev = ti._put( + x, (indices,), rhs, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, put_ev) + + +def take(x, indices, /, *, axis=None, out=None, mode="wrap"): + """take(x, indices, axis=None, out=None, mode="wrap") + + Takes elements from an array along a given axis at given indices. + + Args: + x (usm_ndarray): + The array that elements will be taken from. + indices (usm_ndarray): + One-dimensional array of indices. + axis (int, optional): + The axis along which the values will be selected. 
+ If ``x`` is one-dimensional, this argument is optional. + Default: ``None``. + out (Optional[usm_ndarray]): + Output array to populate. Array must have the correct + shape and the expected data type. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + Returns: + usm_ndarray: + Array with shape + ``x.shape[:axis] + indices.shape + x.shape[axis + 1:]`` + filled with elements from ``x``. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected instance of `dpt.usm_ndarray`, got `{}`.".format(type(x)) + ) + + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + "`indices` expected `dpt.usm_ndarray`, got `{}`.".format( + type(indices) + ) + ) + if indices.dtype.kind not in "ui": + raise IndexError( + "`indices` expected integer data type, got `{}`".format( + indices.dtype + ) + ) + if indices.ndim != 1: + raise ValueError( + "`indices` expected a 1D array, got `{}`".format(indices.ndim) + ) + exec_q = dpctl.utils.get_execution_queue([x.sycl_queue, indices.sycl_queue]) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + res_usm_type = dpctl.utils.get_coerced_usm_type( + [x.usm_type, indices.usm_type] + ) + + mode = _get_indexing_mode(mode) + + x_ndim = x.ndim + if axis is None: + if x_ndim > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format( + x_ndim + ) + ) + axis = 0 + + if x_ndim > 0: + axis = normalize_axis_index(operator.index(axis), x_ndim) + x_sh = x.shape + if x_sh[axis] == 0 and indices.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = x.shape[:axis] + indices.shape + x.shape[axis + 1 :] + else: + if axis != 0: + raise ValueError("`axis` must be 0 for an array of dimension 0.") + res_shape = indices.shape + + dt = x.dtype + + 
orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + if dt != out.dtype: + raise ValueError( + f"Output array of type {dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise dpctl.utils.ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if ti._array_overlap(x, out): + out = dpt.empty_like(out) + else: + out = dpt.empty( + res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + hev, take_ev = ti._take( + x, (indices,), out, axis, mode, sycl_queue=exec_q, depends=deps_ev + ) + _manager.add_event_pair(hev, take_ev) + + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[take_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_ev) + out = orig_out + + return out diff --git a/dpctl_ext/tensor/_numpy_helper.py b/dpctl_ext/tensor/_numpy_helper.py new file mode 100644 index 000000000000..4ad735823cb3 --- /dev/null +++ b/dpctl_ext/tensor/_numpy_helper.py @@ -0,0 +1,45 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import numpy as np + +_npver = np.lib.NumpyVersion(np.__version__) + +if _npver < "1.25.0": # pragma: no cover + from numpy import AxisError +else: + from numpy.exceptions import AxisError + +if _npver >= "2.0.0": + from numpy._core.numeric import normalize_axis_index, normalize_axis_tuple +else: # pragma: no cover + from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple + + +__all__ = ["AxisError", "normalize_axis_index", "normalize_axis_tuple"] diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..1b2c79d2e2a5 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -0,0 +1,427 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/indexing_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace indexing +{ + +using dpctl::tensor::ssize_t; + +template +class TakeFunctor +{ +private: + const char *src_ = nullptr; + char *dst_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + TakeFunctor(const char *src_cp, + char *dst_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : src_(src_cp), dst_(dst_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + 
axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + const T *src = reinterpret_cast(src_); + T *dst = reinterpret_cast(dst_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + src_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + dst_offset += axes_strider(i_along); + + dst[dst_offset] = src[src_offset]; + } +}; + +template +class take_kernel; + +typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + const char *, + char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event take_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + const char *src_p, + char *dst_p, + char **ind_p, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event take_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, src_offset, dst_offset, + orthog_shape_and_strides}; + + using 
NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + take_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + TakeFunctor( + src_p, dst_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return take_ev; +} + +template +class PutFunctor +{ +private: + char *dst_ = nullptr; + const char *val_ = nullptr; + char **ind_ = nullptr; + int k_ = 0; + std::size_t ind_nelems_ = 0; + const ssize_t *axes_shape_and_strides_ = nullptr; + OrthogIndexer orthog_strider; + IndicesIndexer ind_strider; + AxesIndexer axes_strider; + +public: + PutFunctor(char *dst_cp, + const char *val_cp, + char **ind_cp, + int k, + std::size_t ind_nelems, + const ssize_t *axes_shape_and_strides, + const OrthogIndexer &orthog_strider_, + const IndicesIndexer &ind_strider_, + const AxesIndexer &axes_strider_) + : dst_(dst_cp), val_(val_cp), ind_(ind_cp), k_(k), + ind_nelems_(ind_nelems), + axes_shape_and_strides_(axes_shape_and_strides), + orthog_strider(orthog_strider_), ind_strider(ind_strider_), + axes_strider(axes_strider_) + { + } + + void operator()(sycl::id<1> id) const + { + T *dst = reinterpret_cast(dst_); + const T *val = reinterpret_cast(val_); + + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); + + auto orthog_offsets = orthog_strider(i_orthog); + + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t val_offset = orthog_offsets.get_second_offset(); + + static constexpr ProjectorT proj{}; + for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { + indT *ind_data = reinterpret_cast(ind_[axis_idx]); + + ssize_t ind_offset = ind_strider(i_along, axis_idx); 
+ + // proj produces an index in the range of the given axis + ssize_t projected_idx = + proj(axes_shape_and_strides_[axis_idx], ind_data[ind_offset]); + dst_offset += + projected_idx * axes_shape_and_strides_[k_ + axis_idx]; + } + + val_offset += axes_strider(i_along); + + dst[dst_offset] = val[val_offset]; + } +}; + +template +class put_kernel; + +typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + int, + int, + int, + const ssize_t *, + const ssize_t *, + const ssize_t *, + char *, + const char *, + char **, + ssize_t, + ssize_t, + const ssize_t *, + const std::vector &); + +template +sycl::event put_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t ind_nelems, + int nd, + int ind_nd, + int k, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, + char *dst_p, + const char *val_p, + char **ind_p, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + sycl::event put_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using OrthogIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + const OrthogIndexerT orthog_indexer{nd, dst_offset, val_offset, + orthog_shape_and_strides}; + + using NthStrideIndexerT = dpctl::tensor::offset_utils::NthStrideOffset; + const NthStrideIndexerT indices_indexer{ind_nd, ind_offsets, + ind_shape_and_strides}; + + using AxesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const AxesIndexerT axes_indexer{ind_nd, 0, + axes_shape_and_strides + (2 * k)}; + + using KernelName = + put_kernel; + + const std::size_t gws = orthog_nelems * ind_nelems; + + cgh.parallel_for( + sycl::range<1>(gws), + PutFunctor( + dst_p, val_p, ind_p, k, ind_nelems, axes_shape_and_strides, + orthog_indexer, indices_indexer, axes_indexer)); + }); + + return put_ev; +} + +template +struct 
TakeWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct TakeClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = take_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutWrapFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::WrapIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +template +struct PutClipFactory +{ + fnT get() + { + if constexpr (std::is_integral::value && + !std::is_same::value) { + using dpctl::tensor::indexing_utils::ClipIndex; + fnT fn = put_impl, T, indT>; + return fn; + } + else { + fnT fn = nullptr; + return fn; + } + } +}; + +} // namespace indexing +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp new file mode 100644 index 000000000000..244acfe3955f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -0,0 +1,819 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.take and +/// dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/integer_advanced_indexing.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "integer_advanced_indexing.hpp" + +#define INDEXING_MODES 2 +#define WRAP_MODE 0 +#define CLIP_MODE 1 + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing::put_fn_ptr_t; +using dpctl::tensor::kernels::indexing::take_fn_ptr_t; + +static take_fn_ptr_t take_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +static put_fn_ptr_t put_dispatch_table[INDEXING_MODES][td_ns::num_types] + [td_ns::num_types]; + +namespace py = pybind11; + +using dpctl::utils::keep_args_alive; + +std::vector + _populate_kernel_params(sycl::queue &exec_q, + std::vector &host_task_events, + char **device_ind_ptrs, + py::ssize_t *device_ind_sh_st, + py::ssize_t *device_ind_offsets, + py::ssize_t *device_orthog_sh_st, + py::ssize_t *device_along_sh_st, + const py::ssize_t *inp_shape, + const py::ssize_t *arr_shape, + std::vector &inp_strides, + std::vector &arr_strides, + std::vector &ind_sh_sts, + std::vector &ind_ptrs, + std::vector &ind_offsets, + int axis_start, + int k, + int ind_nd, + int inp_nd, + int orthog_sh_elems, + int ind_sh_elems) +{ + + using usm_host_allocator_T = + 
dpctl::tensor::alloc_utils::usm_host_allocator; + using ptrT = std::vector; + + usm_host_allocator_T ptr_allocator(exec_q); + std::shared_ptr host_ind_ptrs_shp = + std::make_shared(k, ptr_allocator); + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using shT = std::vector; + + usm_host_allocatorT sz_allocator(exec_q); + std::shared_ptr host_ind_sh_st_shp = + std::make_shared(ind_sh_elems * (k + 1), sz_allocator); + + std::shared_ptr host_ind_offsets_shp = + std::make_shared(k, sz_allocator); + + std::shared_ptr host_orthog_sh_st_shp = + std::make_shared(3 * orthog_sh_elems, sz_allocator); + + std::shared_ptr host_along_sh_st_shp = + std::make_shared(2 * (k + ind_sh_elems), sz_allocator); + + std::copy(ind_sh_sts.begin(), ind_sh_sts.end(), + host_ind_sh_st_shp->begin()); + std::copy(ind_ptrs.begin(), ind_ptrs.end(), host_ind_ptrs_shp->begin()); + std::copy(ind_offsets.begin(), ind_offsets.end(), + host_ind_offsets_shp->begin()); + + const sycl::event &device_ind_ptrs_copy_ev = exec_q.copy( + host_ind_ptrs_shp->data(), device_ind_ptrs, host_ind_ptrs_shp->size()); + + const sycl::event &device_ind_sh_st_copy_ev = + exec_q.copy(host_ind_sh_st_shp->data(), device_ind_sh_st, + host_ind_sh_st_shp->size()); + + const sycl::event &device_ind_offsets_copy_ev = exec_q.copy( + host_ind_offsets_shp->data(), device_ind_offsets, + host_ind_offsets_shp->size()); + + int orthog_nd = inp_nd - k; + + if (orthog_nd > 0) { + if (axis_start > 0) { + std::copy(inp_shape, inp_shape + axis_start, + host_orthog_sh_st_shp->begin()); + std::copy(inp_strides.begin(), inp_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + orthog_sh_elems); + std::copy(arr_strides.begin(), arr_strides.begin() + axis_start, + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems); + } + if (inp_nd > (axis_start + k)) { + std::copy(inp_shape + axis_start + k, inp_shape + inp_nd, + host_orthog_sh_st_shp->begin() + axis_start); + std::copy(inp_strides.begin() + 
axis_start + k, inp_strides.end(), + host_orthog_sh_st_shp->begin() + orthog_sh_elems + + axis_start); + + std::copy(arr_strides.begin() + axis_start + ind_nd, + arr_strides.end(), + host_orthog_sh_st_shp->begin() + 2 * orthog_sh_elems + + axis_start); + } + } + + if (inp_nd > 0) { + std::copy(inp_shape + axis_start, inp_shape + axis_start + k, + host_along_sh_st_shp->begin()); + + std::copy(inp_strides.begin() + axis_start, + inp_strides.begin() + axis_start + k, + host_along_sh_st_shp->begin() + k); + } + + if (ind_nd > 0) { + std::copy(arr_shape + axis_start, arr_shape + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k); + std::copy(arr_strides.begin() + axis_start, + arr_strides.begin() + axis_start + ind_nd, + host_along_sh_st_shp->begin() + 2 * k + ind_nd); + } + + const sycl::event &device_orthog_sh_st_copy_ev = exec_q.copy( + host_orthog_sh_st_shp->data(), device_orthog_sh_st, + host_orthog_sh_st_shp->size()); + + const sycl::event &device_along_sh_st_copy_ev = exec_q.copy( + host_along_sh_st_shp->data(), device_along_sh_st, + host_along_sh_st_shp->size()); + + const sycl::event &shared_ptr_cleanup_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on({device_along_sh_st_copy_ev, + device_orthog_sh_st_copy_ev, + device_ind_offsets_copy_ev, + device_ind_sh_st_copy_ev, device_ind_ptrs_copy_ev}); + cgh.host_task( + [host_ind_offsets_shp = std::move(host_ind_offsets_shp), + host_ind_sh_st_shp = std::move(host_ind_sh_st_shp), + host_ind_ptrs_shp = std::move(host_ind_ptrs_shp), + host_orthog_sh_st_shp = std::move(host_orthog_sh_st_shp), + host_along_sh_st_shp = std::move(host_along_sh_st_shp)] {}); + }); + host_task_events.push_back(shared_ptr_cleanup_ev); + + std::vector sh_st_pack_deps{ + device_ind_ptrs_copy_ev, device_ind_sh_st_copy_ev, + device_ind_offsets_copy_ev, device_orthog_sh_st_copy_ev, + device_along_sh_st_copy_ev}; + return sh_st_pack_deps; +} + +/* Utility to parse python object py_ind into vector of `usm_ndarray`s */ 
+std::vector parse_py_ind(const sycl::queue &q, + const py::object &py_ind) +{ + std::size_t ind_count = py::len(py_ind); + std::vector res; + res.reserve(ind_count); + + bool nd_is_known = false; + int nd = -1; + for (std::size_t i = 0; i < ind_count; ++i) { + py::object el_i = py_ind[py::cast(i)]; + dpctl::tensor::usm_ndarray arr_i = + py::cast(el_i); + if (!dpctl::utils::queues_are_compatible(q, {arr_i})) { + throw py::value_error("Index allocation queue is not compatible " + "with execution queue"); + } + if (nd_is_known) { + if (nd != arr_i.get_ndim()) { + throw py::value_error( + "Indices must have the same number of dimensions."); + } + } + else { + nd_is_known = true; + nd = arr_i.get_ndim(); + } + res.push_back(arr_i); + } + + return res; +} + +std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &src, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &dst, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + + int k = ind.size(); + + if (k == 0) { + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(src_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of range for array of dimension " + + std::to_string(src_nd)); + } + if (src_nd == 0) { + if (dst_nd != ind_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for take kernel."); + } + } + else { + if (dst_nd != (src_nd - k + ind_nd)) { + throw py::value_error( + "Destination is not of appropriate dimension for 
take kernel."); + } + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (src_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(src_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (src_shape[idx1] == dst_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Array memory overlap."); + } + + py::ssize_t src_offset = py::ssize_t(0); + py::ssize_t dst_offset = py::ssize_t(0); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_type_id != dst_type_id) { + throw py::type_error("Array data types are not the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == dst_shape[axis_start + i])) { + throw py::value_error( + "Indices shape does not match shape of axis in destination."); + } + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * ind_nelems); + + int ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + + std::vector ind_offsets; + ind_offsets.reserve(k); + + std::vector ind_sh_sts((k + 1) * ind_sh_elems, 0); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_nd, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(dst, ind_)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // rearrange to past where indices shapes are checked + // 
packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(src_nd - k, 1); + + // packed_shapes_strides = [src_shape[:axis] + src_shape[axis+k:], + // src_strides[:axis] + src_strides[axis+k:], + // dst_strides[:axis] + + // dst_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [src_shape[axis:axis+k], + // src_strides[axis:axis+k], + // dst_shape[axis:axis+ind.ndim], + // dst_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + src_shape, dst_shape, src_strides, dst_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, src_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), 
std::begin(depends), std::end(depends)); + + auto fn = take_dispatch_table[mode][src_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event take_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, src_data, dst_data, packed_ind_ptrs, + src_offset, dst_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {take_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, py_ind, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, take_generic_ev); +} + +std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &dst, + const py::object &py_ind, + const dpctl::tensor::usm_ndarray &val, + int axis_start, + std::uint8_t mode, + sycl::queue &exec_q, + const std::vector &depends) +{ + std::vector ind = parse_py_ind(exec_q, py_ind); + int k = ind.size(); + + if (k == 0) { + // no indices to write to + throw py::value_error("List of indices is empty."); + } + + if (axis_start < 0) { + throw py::value_error("Axis cannot be negative."); + } + + if (mode != 0 && mode != 1) { + throw py::value_error("Mode must be 0 or 1."); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const dpctl::tensor::usm_ndarray ind_rep = ind[0]; + + int dst_nd = dst.get_ndim(); + int val_nd = val.get_ndim(); + int ind_nd = ind_rep.get_ndim(); + + auto sh_elems = std::max(dst_nd, 1); + + if (axis_start + k > sh_elems) { + throw py::value_error("Axes are out of 
range for array of dimension " + + std::to_string(dst_nd)); + } + if (dst_nd == 0) { + if (val_nd != ind_nd) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + else { + if (val_nd != (dst_nd - k + ind_nd)) { + throw py::value_error("Destination is not of appropriate dimension " + "for put function."); + } + } + + std::size_t dst_nelems = dst.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *val_shape = val.get_shape_raw(); + + bool orthog_shapes_equal(true); + std::size_t orthog_nelems(1); + for (int i = 0; i < (dst_nd - k); ++i) { + auto idx1 = (i < axis_start) ? i : i + k; + auto idx2 = (i < axis_start) ? i : i + ind_nd; + + orthog_nelems *= static_cast(dst_shape[idx1]); + orthog_shapes_equal = + orthog_shapes_equal && (dst_shape[idx1] == val_shape[idx2]); + } + + if (!orthog_shapes_equal) { + throw py::value_error( + "Axes of basic indices are not of matching shapes."); + } + + if (orthog_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + char *dst_data = dst.get_data(); + char *val_data = val.get_data(); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, val})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(val, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + py::ssize_t dst_offset = py::ssize_t(0); + py::ssize_t val_offset = py::ssize_t(0); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int dst_typenum = dst.get_typenum(); + int val_typenum = val.get_typenum(); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + int val_type_id = array_types.typenum_to_lookup_id(val_typenum); + + if (dst_type_id != val_type_id) { + throw py::type_error("Array data types are not 
the same."); + } + + const py::ssize_t *ind_shape = ind_rep.get_shape_raw(); + + int ind_typenum = ind_rep.get_typenum(); + int ind_type_id = array_types.typenum_to_lookup_id(ind_typenum); + + std::size_t ind_nelems(1); + for (int i = 0; i < ind_nd; ++i) { + ind_nelems *= static_cast(ind_shape[i]); + + if (!(ind_shape[i] == val_shape[axis_start + i])) { + throw py::value_error( + "Indices shapes does not match shape of axis in vals."); + } + } + + auto ind_sh_elems = std::max(ind_nd, 1); + + std::vector ind_ptrs; + ind_ptrs.reserve(k); + std::vector ind_offsets; + ind_offsets.reserve(k); + std::vector ind_sh_sts((k + 1) * ind_sh_elems, py::ssize_t(0)); + if (ind_nd > 0) { + std::copy(ind_shape, ind_shape + ind_sh_elems, ind_sh_sts.begin()); + } + for (int i = 0; i < k; ++i) { + dpctl::tensor::usm_ndarray ind_ = ind[i]; + + if (!dpctl::utils::queues_are_compatible(exec_q, {ind_})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // ndim, type, and shape are checked against the first array + if (i > 0) { + if (!(ind_.get_ndim() == ind_nd)) { + throw py::value_error("Index dimensions are not the same"); + } + + if (!(ind_type_id == + array_types.typenum_to_lookup_id(ind_.get_typenum()))) { + throw py::type_error( + "Indices array data types are not all the same."); + } + + const py::ssize_t *ind_shape_ = ind_.get_shape_raw(); + for (int dim = 0; dim < ind_nd; ++dim) { + if (!(ind_shape[dim] == ind_shape_[dim])) { + throw py::value_error("Indices shapes are not all equal."); + } + } + } + + // check for overlap with destination + if (overlap(ind_, dst)) { + throw py::value_error( + "Arrays index overlapping segments of memory"); + } + + char *ind_data = ind_.get_data(); + + // strides are initialized to 0 for 0D indices, so skip here + if (ind_nd > 0) { + auto ind_strides = ind_.get_strides_vector(); + std::copy(ind_strides.begin(), ind_strides.end(), + ind_sh_sts.begin() + (i + 1) * ind_nd); + } + + 
ind_ptrs.push_back(ind_data); + ind_offsets.push_back(py::ssize_t(0)); + } + + if (ind_nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto packed_ind_ptrs_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + char **packed_ind_ptrs = packed_ind_ptrs_owner.get(); + + // packed_ind_shapes_strides = [ind_shape, + // ind[0] strides, + // ..., + // ind[k] strides] + auto packed_ind_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + (k + 1) * ind_sh_elems, exec_q); + py::ssize_t *packed_ind_shapes_strides = + packed_ind_shapes_strides_owner.get(); + + auto packed_ind_offsets_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(k, exec_q); + py::ssize_t *packed_ind_offsets = packed_ind_offsets_owner.get(); + + int orthog_sh_elems = std::max(dst_nd - k, 1); + + // packed_shapes_strides = [dst_shape[:axis] + dst_shape[axis+k:], + // dst_strides[:axis] + dst_strides[axis+k:], + // val_strides[:axis] + + // val_strides[axis+ind.ndim:]] + auto packed_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 3 * orthog_sh_elems, exec_q); + py::ssize_t *packed_shapes_strides = packed_shapes_strides_owner.get(); + + // packed_axes_shapes_strides = [dst_shape[axis:axis+k], + // dst_strides[axis:axis+k], + // val_shape[axis:axis+ind.ndim], + // val_strides[axis:axis+ind.ndim]] + auto packed_axes_shapes_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 2 * (k + ind_sh_elems), exec_q); + py::ssize_t *packed_axes_shapes_strides = + packed_axes_shapes_strides_owner.get(); + + auto dst_strides = dst.get_strides_vector(); + auto val_strides = val.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + std::vector pack_deps = _populate_kernel_params( + exec_q, host_task_events, packed_ind_ptrs, packed_ind_shapes_strides, + packed_ind_offsets, packed_shapes_strides, packed_axes_shapes_strides, + dst_shape, val_shape, dst_strides, 
val_strides, ind_sh_sts, ind_ptrs, + ind_offsets, axis_start, k, ind_nd, dst_nd, orthog_sh_elems, + ind_sh_elems); + + std::vector all_deps; + all_deps.reserve(depends.size() + pack_deps.size()); + all_deps.insert(std::end(all_deps), std::begin(pack_deps), + std::end(pack_deps)); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + auto fn = put_dispatch_table[mode][dst_type_id][ind_type_id]; + + if (fn == nullptr) { + sycl::event::wait(host_task_events); + throw std::runtime_error("Indices must be integer type, got " + + std::to_string(ind_type_id)); + } + + sycl::event put_generic_ev = + fn(exec_q, orthog_nelems, ind_nelems, orthog_sh_elems, ind_sh_elems, k, + packed_shapes_strides, packed_axes_shapes_strides, + packed_ind_shapes_strides, dst_data, val_data, packed_ind_ptrs, + dst_offset, val_offset, packed_ind_offsets, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {put_generic_ev}, packed_shapes_strides_owner, + packed_axes_shapes_strides_owner, packed_ind_shapes_strides_owner, + packed_ind_ptrs_owner, packed_ind_offsets_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {dst, py_ind, val}, host_task_events); + + return std::make_pair(arg_cleanup_ev, put_generic_ev); +} + +void init_advanced_indexing_dispatch_tables(void) +{ + using namespace td_ns; + + using dpctl::tensor::kernels::indexing::TakeClipFactory; + DispatchTableBuilder + dtb_takeclip; + dtb_takeclip.populate_dispatch_table(take_dispatch_table[CLIP_MODE]); + + using dpctl::tensor::kernels::indexing::TakeWrapFactory; + DispatchTableBuilder + dtb_takewrap; + dtb_takewrap.populate_dispatch_table(take_dispatch_table[WRAP_MODE]); + + using dpctl::tensor::kernels::indexing::PutClipFactory; + DispatchTableBuilder dtb_putclip; + dtb_putclip.populate_dispatch_table(put_dispatch_table[CLIP_MODE]); + + using 
dpctl::tensor::kernels::indexing::PutWrapFactory; + DispatchTableBuilder dtb_putwrap; + dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp new file mode 100644 index 000000000000..57f0ddda132c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -0,0 +1,73 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.take and dpctl.tensor.put +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_take(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern std::pair + usm_ndarray_put(const dpctl::tensor::usm_ndarray &, + const py::object &, + const dpctl::tensor::usm_ndarray &, + int, + std::uint8_t, + sycl::queue &, + const std::vector & = {}); + +extern void init_advanced_indexing_dispatch_tables(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..c18761031fd0 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -55,7 +55,7 @@ #include "device_support_queries.hpp" // #include "eye_ctor.hpp" // #include 
"full_ctor.hpp" -// #include "integer_advanced_indexing.hpp" +#include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" // #include "repeat.hpp" @@ -110,8 +110,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; // using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ -// using dpctl::tensor::py_internal::usm_ndarray_put; -// using dpctl::tensor::py_internal::usm_ndarray_take; +using dpctl::tensor::py_internal::usm_ndarray_put; +using dpctl::tensor::py_internal::usm_ndarray_take; // using dpctl::tensor::py_internal::py_extract; // using dpctl::tensor::py_internal::py_mask_positions; @@ -145,7 +145,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); - // init_advanced_indexing_dispatch_tables(); + init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; } @@ -332,23 +332,23 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_take", &usm_ndarray_take, - // "Takes elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` from array `src` and copies them " - // "into usm_ndarray `dst` synchronously." - // "Returns a tuple of events: (hev, ev)", - // py::arg("src"), py::arg("ind"), py::arg("dst"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); - - // m.def("_put", &usm_ndarray_put, - // "Puts elements at usm_ndarray indices `ind` and axes starting " - // "at axis `axis_start` into array `dst` from " - // "usm_ndarray `val` synchronously." 
- // "Returns a tuple of events: (hev, ev)", - // py::arg("dst"), py::arg("ind"), py::arg("val"), - // py::arg("axis_start"), py::arg("mode"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_take", &usm_ndarray_take, + "Takes elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` from array `src` and copies them " + "into usm_ndarray `dst` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("ind"), py::arg("dst"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + m.def("_put", &usm_ndarray_put, + "Puts elements at usm_ndarray indices `ind` and axes starting " + "at axis `axis_start` into array `dst` from " + "usm_ndarray `val` synchronously." + "Returns a tuple of events: (hev, ev)", + py::arg("dst"), py::arg("ind"), py::arg("val"), py::arg("axis_start"), + py::arg("mode"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_eye", &usm_ndarray_eye, // "Fills input 2D contiguous usm_ndarray `dst` with " From 87e5482f2faf3bff2549b48c999bbab516fce168 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 09:59:18 -0800 Subject: [PATCH 014/145] Use put/take from dpctl_ext.tensor in dpnp --- dpnp/dpnp_iface_indexing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6e7ab778299b..6421f39fd4e4 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -52,6 +52,8 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp # pylint: disable=no-name-in-module @@ -295,7 +297,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti._array_overlap(x, out): 
+ if ti_ext._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. out = dpt.empty_like(out) else: @@ -304,7 +306,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti._take( + h_ev, take_ev = ti_ext._take( src=x, ind=(inds,), dst=out, @@ -813,7 +815,7 @@ def extract(condition, a): usm_a = dpt.reshape(usm_a, -1) usm_cond = dpt.reshape(usm_cond, -1) - usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0]) + usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: usm_a = dpt.reshape(usm_a, -1) @@ -1713,7 +1715,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if axis is None and usm_a.ndim > 1: usm_a = dpt.reshape(usm_a, -1) - dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) + dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) From b537f30115be31858782e6a7ace1fc52f54c5f9d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Feb 2026 10:33:51 -0800 Subject: [PATCH 015/145] Move full() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 169 ++++++++++ .../include/kernels/constructors.hpp | 171 ++++++++++ .../tensor/libtensor/source/full_ctor.cpp | 315 ++++++++++++++++++ .../tensor/libtensor/source/full_ctor.hpp | 60 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 14 +- 7 files changed, 727 insertions(+), 8 deletions(-) create mode 100644 dpctl_ext/tensor/_ctors.py create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/full_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt 
index ae8b72d71873..0c52d766afbf 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -52,7 +52,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 35453dbf9a46..9f4c27608a99 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,12 +27,16 @@ # ***************************************************************************** +from dpctl_ext.tensor._ctors import ( + full, +) from dpctl_ext.tensor._indexing_functions import ( put, take, ) __all__ = [ + "full", "put", "take", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py new file mode 100644 index 000000000000..5caa07099c56 --- /dev/null +++ b/dpctl_ext/tensor/_ctors.py @@ -0,0 +1,169 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +from numbers import Number + +import dpctl +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._data_types import _get_dtype +from dpctl.tensor._device import normalize_queue_device + +import dpctl_ext.tensor._tensor_impl as ti + + +def _cast_fill_val(fill_val, dt): + """ + Casts the Python scalar `fill_val` to another Python type coercible to the + requested data type `dt`, if necessary. + """ + val_type = type(fill_val) + if val_type in [float, complex] and np.issubdtype(dt, np.integer): + return int(fill_val.real) + elif val_type is complex and np.issubdtype(dt, np.floating): + return fill_val.real + elif val_type is int and np.issubdtype(dt, np.integer): + return _to_scalar(fill_val, dt) + else: + return fill_val + + +def _to_scalar(obj, sc_ty): + """A way to convert object to NumPy scalar type. 
+ Raises OverflowError if obj can not be represented + using the requested scalar type. + """ + zd_arr = np.asarray(obj, dtype=sc_ty) + return zd_arr[()] + + +def _validate_fill_value(fill_val): + """Validates that `fill_val` is a numeric or boolean scalar.""" + # TODO: verify if `np.True_` and `np.False_` should be instances of + # Number in NumPy, like other NumPy scalars and like Python bools + # check for `np.bool_` separately as NumPy<2 has no `np.bool` + if not isinstance(fill_val, Number) and not isinstance(fill_val, np.bool_): + raise TypeError( + f"array cannot be filled with scalar of type {type(fill_val)}" + ) + + +def full( + shape, + fill_value, + *, + dtype=None, + order="C", + device=None, + usm_type=None, + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with `fill_value`. + + Args: + shape (tuple): + Dimensions of the array to be created. + fill_value (int,float,complex,usm_ndarray): + fill value + dtype (optional): data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. 
+ If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=True) + + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + if ( + isinstance(fill_value, dpt.usm_ndarray) + and sycl_queue is None + and device is None + ): + sycl_queue = fill_value.sycl_queue + else: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + X = dpt.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + return dpt.copy(dpt.broadcast_to(X, shape), order=order) + else: + _validate_fill_value(fill_value) + + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + usm_type = usm_type if usm_type is not None else "device" + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + fill_value = _cast_fill_val(fill_value, dtype) + + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp new file mode 100644 index 000000000000..dfd1b889aafe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -0,0 +1,171 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor constructors. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/strided_iters.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace constructors +{ + +using dpctl::tensor::ssize_t; + +/*! + @defgroup CtorKernels + */ + +template +class full_strided_kernel; + +using namespace dpctl::tensor::offset_utils; + +/* ================ Full ================== */ + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &q, + std::size_t nelems, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + dstTy *p = reinterpret_cast(dst_p); + cgh.fill(p, fill_v, nelems); + }); + + return fill_ev; +} + +template +class FullStridedFunctor +{ +private: + Ty *p = nullptr; + Ty fill_v; + IndexerT indexer; + +public: + FullStridedFunctor(Ty *p_, const Ty &fill_v_, const IndexerT &indexer_) + : p(p_), fill_v(fill_v_), indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + auto offset = indexer(id.get(0)); + p[offset] = fill_v; + } +}; + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. 
+ * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param fill_v Value to fill the array with + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &q, + int nd, + std::size_t nelems, + const ssize_t *shape_strides, + dstTy fill_v, + char *dst_p, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(q); + + dstTy *dst_tp = reinterpret_cast(dst_p); + + using dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexer strided_indexer(nd, 0, shape_strides); + + sycl::event fill_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = full_strided_kernel; + using Impl = FullStridedFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(dst_tp, fill_v, strided_indexer)); + }); + + return fill_ev; +} + +} // namespace constructors +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp new file mode 100644 index 000000000000..e1f61be4a12a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -0,0 +1,315 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "full_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + sycl::event fill_ev; + + if constexpr (sizeof(dstTy) == sizeof(char)) { + const auto memset_val = sycl::bit_cast(fill_v); + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + bool is_zero = false; + if constexpr (sizeof(dstTy) == 1) { + is_zero = (std::uint8_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 2) { + is_zero = + (std::uint16_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 4) { + is_zero = + (std::uint32_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 8) { + is_zero = + (std::uint64_t{0} == sycl::bit_cast(fill_v)); + } + else if constexpr (sizeof(dstTy) == 16) { + struct UInt128 + { + + constexpr UInt128() : v1{}, v2{} {} + UInt128(const UInt128 &) = default; + + operator bool() const + { + return bool(!v1) && bool(!v2); + } + + std::uint64_t v1; + std::uint64_t v2; + }; + is_zero = static_cast(sycl::bit_cast(fill_v)); + } + + if (is_zero) { + static constexpr int memset_val = 0; + fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + } + else { + using dpctl::tensor::kernels::constructors::full_contig_impl; + + fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + } + } + + return fill_ev; +} + +template +struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; + +typedef sycl::event (*full_strided_fn_ptr_t)(sycl::queue &, + int, + std::size_t, + py::ssize_t *, + const py::object &, + char *, + const std::vector &); + +/*! 
+ * @brief Function to submit kernel to fill given strided memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nd Array dimensionality + * @param nelems Length of the sequence + * @param shape_strides Kernel accessible USM pointer to packed shape and + * strides of array. + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event full_strided_impl(sycl::queue &exec_q, + int nd, + std::size_t nelems, + py::ssize_t *shape_strides, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v = py::cast(py_value); + + using dpctl::tensor::kernels::constructors::full_strided_impl; + sycl::event fill_ev = full_strided_impl( + exec_q, nd, nelems, shape_strides, fill_v, dst_p, depends); + + return fill_ev; +} + +template +struct FullStridedFactory +{ + fnT get() + { + fnT f = full_strided_impl; + return f; + } +}; + +static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; +static full_strided_fn_ptr_t full_strided_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // py_value should be coercible into data type of dst + + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + 
dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = full_contig_dispatch_vector[dst_typeid]; + + sycl::event full_contig_event = + fn(exec_q, static_cast(dst_nelems), py_value, dst_data, + depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {full_contig_event}), + full_contig_event); + } + else { + int nd = dst.get_ndim(); + auto const &dst_shape = dst.get_shape_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + auto fn = full_strided_dispatch_vector[dst_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, dst_shape, dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + py::ssize_t *shape_strides = shape_strides_owner.get(); + + const sycl::event &full_strided_ev = + fn(exec_q, nd, dst_nelems, shape_strides, py_value, dst_data, + {copy_shape_ev}); + + // free shape_strides + const auto &temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {full_strided_ev}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {dst}, host_task_events), + full_strided_ev); + } +} + +void init_full_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(full_contig_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(full_strided_dispatch_vector); +} + +} // namespace py_internal +} // 
namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp new file mode 100644 index 000000000000..d664b2013506 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -0,0 +1,60 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_full(const py::object &py_value, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_full_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c18761031fd0..c72c0b49622a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -54,7 +54,7 @@ // #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" -// #include "full_ctor.hpp" +#include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" @@ -103,7 +103,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ================ Full ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_full; +using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ @@ -159,7 +159,7 @@ void init_dispatch_vectors(void) // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); - // init_full_ctor_dispatch_vectors(); + init_full_ctor_dispatch_vectors(); // init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // 
init_triul_ctor_dispatch_vectors(); @@ -327,10 +327,10 @@ PYBIND11_MODULE(_tensor_impl, m) // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_full_usm_ndarray", &usm_ndarray_full, - // "Populate usm_ndarray `dst` with given fill_value.", - // py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_full_usm_ndarray", &usm_ndarray_full, + "Populate usm_ndarray `dst` with given fill_value.", + py::arg("fill_value"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_take", &usm_ndarray_take, "Takes elements at usm_ndarray indices `ind` and axes starting " From d50f263f089dfd52edb4daa15edd3f86807965e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:06:00 -0800 Subject: [PATCH 016/145] Use full and _full_usm_ndarray from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 6 ++++-- dpnp/dpnp_container.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 112ea3af0fdb..f7e6f0f608b1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,12 +32,14 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val from dpctl.tensor._tensor_impl import ( - _copy_usm_ndarray_into_usm_ndarray, - _full_usm_ndarray, _zeros_usm_ndarray, ) import dpnp +from dpctl_ext.tensor._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _full_usm_ndarray, +) def dpnp_fill(arr, val): diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 4975db17c717..b13bf96cda28 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,7 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -228,7 +229,7 @@ def full( fill_value = fill_value.get_array() """Creates 
`dpnp_array` having a specified shape, filled with fill_value.""" - array_obj = dpt.full( + array_obj = dpt_ext.full( shape, fill_value, dtype=dtype, From f189dc540477ceadf35dcb127325056c5e0c406b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:22:55 -0800 Subject: [PATCH 017/145] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..4ae07ccbbdb9 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,5 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +dpctl_ext/**/*.cpython*.so From f9a181721784c843907c16e2e1d5569c487cf9e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:23:51 -0800 Subject: [PATCH 018/145] Move _zeros_usm_ndarray to dpctl_ext --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/tensor_ctors.cpp | 12 +- .../tensor/libtensor/source/zeros_ctor.cpp | 168 ++++++++++++++++++ .../tensor/libtensor/source/zeros_ctor.hpp | 59 ++++++ 4 files changed, 234 insertions(+), 7 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 0c52d766afbf..cb468b9a226d 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -53,7 +53,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp diff --git 
a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index c72c0b49622a..b55439162f90 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -64,7 +64,7 @@ #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" -// #include "zeros_ctor.hpp" +#include "zeros_ctor.hpp" namespace py = pybind11; @@ -107,7 +107,7 @@ using dpctl::tensor::py_internal::usm_ndarray_full; /* ================ Zeros ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_zeros; +using dpctl::tensor::py_internal::usm_ndarray_zeros; /* ============== Advanced Indexing ============= */ using dpctl::tensor::py_internal::usm_ndarray_put; @@ -160,7 +160,7 @@ void init_dispatch_vectors(void) // init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); - // init_zeros_ctor_dispatch_vectors(); + init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); // init_triul_ctor_dispatch_vectors(); @@ -323,9 +323,9 @@ PYBIND11_MODULE(_tensor_impl, m) // synchronously.", py::arg("src"), py::arg("dst"), // py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, - // "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, + "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); m.def("_full_usm_ndarray", &usm_ndarray_full, "Populate usm_ndarray `dst` with given fill_value.", diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp new file mode 100644 index 000000000000..4558743b3c22 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -0,0 +1,168 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "zeros_ctor.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +typedef sycl::event (*zeros_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with zeros. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event zeros_contig_impl(sycl::queue &exec_q, + std::size_t nelems, + char *dst_p, + const std::vector &depends) +{ + + static constexpr int memset_val(0); + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst_p), memset_val, + nelems * sizeof(dstTy)); + }); + + return fill_ev; +} + +template +struct ZerosContigFactory +{ + fnT get() + { + fnT f = zeros_contig_impl; + return f; + } +}; + +static zeros_contig_fn_ptr_t zeros_contig_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + char *dst_data = dst.get_data(); + + if (dst_nelems == 1 || dst.is_c_contiguous() || dst.is_f_contiguous()) { + auto fn = zeros_contig_dispatch_vector[dst_typeid]; + + sycl::event zeros_contig_event = + fn(exec_q, static_cast(dst_nelems), dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {zeros_contig_event}), + zeros_contig_event); + } + else { + throw std::runtime_error( + "Only population of contiguous usm_ndarray objects is supported."); + } +} + +void init_zeros_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(zeros_contig_dispatch_vector); + + return; +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl 
diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp new file mode 100644 index 000000000000..51270a3443cc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -0,0 +1,59 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_zeros(const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_zeros_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From 4b8505acf111ec2636afa0d2a9a25cf8677e02c7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:25:05 -0800 Subject: [PATCH 019/145] Use _zeros_usm_ndarray from dpctl_ext in dpnp_fill.py --- dpnp/dpnp_algo/dpnp_fill.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index f7e6f0f608b1..0d6640c3b8b5 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -31,14 +31,12 @@ import dpctl.tensor as dpt import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -from dpctl.tensor._tensor_impl import ( - _zeros_usm_ndarray, -) import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, + _zeros_usm_ndarray, ) From 61106b2e208d7f331bebc3335a49bc23212510c1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:39:35 -0800 Subject: [PATCH 020/145] Move linear-sequence implementations to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 178 ++++++++++ .../libtensor/source/linear_sequences.cpp | 312 ++++++++++++++++++ .../libtensor/source/linear_sequences.hpp | 69 ++++ 
.../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 579 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cb468b9a226d..af0e2a7aa49f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index dfd1b889aafe..20775b071ea8 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,11 +58,189 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; template class full_strided_kernel; +// template class eye_kernel; using namespace dpctl::tensor::offset_utils; +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using 
dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Stating value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..02c4a8ad0fa1 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,312 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpnp4pybind11.hpp" +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. 
+ */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..321cd2f23efe --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index b55439162f90..dd660c497f9a 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include 
"kernels/dpctl_tensor_types.hpp" -// #include "linear_sequences.hpp" +#include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" // #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - // init_linear_sequences_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,22 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and step `dt`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("dt"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and step `dt`. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("dt"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - // "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - // "specified by " - // "starting point `start` and end point `end`. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("end"), py::arg("dst"), - // py::arg("include_endpoint"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and end point `end`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("end"), py::arg("dst"), + py::arg("include_endpoint"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From a030579be8525d6f23674d5c9a4a171ab842f500 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 02:40:33 -0800 Subject: [PATCH 021/145] Use _tensor_impl from dpctl_ext in dpnp_utils_fft.py --- dpnp/fft/dpnp_utils_fft.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 4e2b7aaaf842..c692774a424f 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,7 +42,6 @@ from collections.abc import Sequence import dpctl -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -51,6 +50,7 @@ ) from dpctl.utils import ExecutionPlacementError +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.fft._fft_impl as fi From a1d6fa39ba8607b191177d6acb0ca2f3cf8f49fc Mon Sep 17 00:00:00 2001 From: Vladislav 
Perevezentsev Date: Fri, 6 Feb 2026 03:03:08 -0800 Subject: [PATCH 022/145] Move tril()/triu() to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_ctors.py | 157 +++++++++++ .../include/kernels/constructors.hpp | 138 ++++++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 46 ++-- .../tensor/libtensor/source/triul_ctor.cpp | 253 ++++++++++++++++++ .../tensor/libtensor/source/triul_ctor.hpp | 62 +++++ 7 files changed, 638 insertions(+), 24 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/triul_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index af0e2a7aa49f..1375c8316754 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -54,7 +54,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 9f4c27608a99..3c6939eff7a0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -29,6 +29,8 @@ from dpctl_ext.tensor._ctors import ( full, + tril, + triu, ) from dpctl_ext.tensor._indexing_functions import ( put, @@ -39,4 +41,6 @@ "full", "put", "take", + "tril", + "triu", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5caa07099c56..a0e7b28e66ff 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -26,6 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** +import operator from numbers import Number import dpctl @@ -167,3 +168,159 @@ def full( hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) _manager.add_event_pair(hev, full_ev) return res + + +def tril(x, /, *, k=0): + """ + Returns the lower triangular part of a matrix (or a stack of matrices) + ``x``. + + The lower triangular part of the matrix is defined as the elements on and + below the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal above which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + A lower-triangular array or a stack of lower-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." 
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k >= shape[nd - 1] - 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + elif k < -shape[nd - 2]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, tril_ev = ti._tril( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, tril_ev) + + return res + + +def triu(x, /, *, k=0): + """ + Returns the upper triangular part of a matrix (or a stack of matrices) + ``x``. + + The upper triangular part of the matrix is defined as the elements on and + above the specified diagonal ``k``. + + Args: + x (usm_ndarray): + Input array + k (int, optional): + Specifies the diagonal below which to set + elements to zero. If ``k = 0``, the diagonal is the main diagonal. + If ``k < 0``, the diagonal is below the main diagonal. + If ``k > 0``, the diagonal is above the main diagonal. + Default: ``0`` + + Returns: + usm_ndarray: + An upper-triangular array or a stack of upper-triangular arrays. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected argument of type dpctl.tensor.usm_ndarray, " + f"got {type(x)}." 
+ ) + + k = operator.index(k) + + order = "F" if (x.flags.f_contiguous) else "C" + + shape = x.shape + nd = x.ndim + if nd < 2: + raise ValueError("Array dimensions less than 2.") + + q = x.sycl_queue + if k > shape[nd - 1]: + res = dpt.zeros( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + elif k <= -shape[nd - 2] + 1: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=res, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + res = dpt.empty( + x.shape, + dtype=x.dtype, + order=order, + usm_type=x.usm_type, + sycl_queue=q, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hev, triu_ev = ti._triu( + src=x, dst=res, k=k, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hev, triu_ev) + + return res diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 20775b071ea8..8d53655b2754 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -343,6 +343,144 @@ sycl::event full_strided_impl(sycl::queue &q, return fill_ev; } +/* =========================== Tril and triu ============================== */ + +// define function type +typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k + const std::vector &, + const std::vector &); + +/*! + * @brief Function to copy triangular matrices from source stack to destination + * stack. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. 
+ * @param inner_range Number of elements in each matrix. + * @param outer_range Number of matrices to copy. + * @param src_p Kernel accessible USM pointer for the source array. + * @param dst_p Kernel accessible USM pointer for the destination array. + * @param nd The array dimensionality of source and destination arrays. + * @param shape_and_strides Kernel accessible USM pointer to packed shape and + * strides of arrays. + * @param k Position of the diagonal above/below which to copy filling the rest + * with zero elements. + * @param depends List of events to wait for before starting computations, if + * any. + * @param additional_depends List of additional events to wait for before + * starting computations, if any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +class tri_kernel; +template +sycl::event tri_impl(sycl::queue &exec_q, + ssize_t inner_range, + ssize_t outer_range, + char *src_p, + char *dst_p, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, + const std::vector &depends, + const std::vector &additional_depends) +{ + static constexpr int d2 = 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; + Ty *src = reinterpret_cast(src_p); + Ty *dst = reinterpret_cast(dst_p); + + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + sycl::event tri_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.depends_on(additional_depends); + + cgh.parallel_for>( + sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; + + ssize_t src_inner_offset = 0, dst_inner_offset = 0; + bool to_copy{false}; + + { + using dpctl::tensor::strides::CIndexer_array; + CIndexer_array indexer_i( + {shape_and_strides[nd_2], shape_and_strides[nd_1]}); + indexer_i.set(inner_gid); + const std::array &inner = indexer_i.get(); 
+ src_inner_offset = + inner[0] * shape_and_strides[src_s + nd_2] + + inner[1] * shape_and_strides[src_s + nd_1]; + dst_inner_offset = + inner[0] * shape_and_strides[dst_s + nd_2] + + inner[1] * shape_and_strides[dst_s + nd_1]; + + if constexpr (upper) + to_copy = (inner[0] + k >= inner[1]); + else + to_copy = (inner[0] + k <= inner[1]); + } + + ssize_t src_offset = 0; + ssize_t dst_offset = 0; + { + using dpctl::tensor::strides::CIndexer_vector; + CIndexer_vector outer(nd - d2); + outer.get_displacement( + outer_gid, shape_and_strides, shape_and_strides + src_s, + shape_and_strides + dst_s, src_offset, dst_offset); + } + + src_offset += src_inner_offset; + dst_offset += dst_inner_offset; + + dst[dst_offset] = (to_copy) ? src[src_offset] : Ty(0); + }); + }); + return tri_ev; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. + * @ingroup CtorKernels + */ +template +struct TrilGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
+ * @ingroup CtorKernels + */ +template +struct TriuGenericFactory +{ + fnT get() + { + fnT f = tri_impl; + return f; + } +}; + } // namespace constructors } // namespace kernels } // namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index dd660c497f9a..f2afce105f7f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -60,7 +60,7 @@ #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" -// #include "triul_ctor.hpp" +#include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" // #include "where.hpp" @@ -129,7 +129,7 @@ using dpctl::tensor::py_internal::usm_ndarray_take; /* =========================== Tril and triu ============================== */ -// using dpctl::tensor::py_internal::usm_ndarray_triul; +using dpctl::tensor::py_internal::usm_ndarray_triul; /* =========================== Where ============================== */ @@ -162,7 +162,7 @@ void init_dispatch_vectors(void) init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); - // init_triul_ctor_dispatch_vectors(); + init_triul_ctor_dispatch_vectors(); // populate_masked_extract_dispatch_vectors(); // populate_masked_place_dispatch_vectors(); @@ -388,27 +388,27 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::default_device_index_type, "Gives default index type supported by device.", py::arg("dev")); - // auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); - // }; - // m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // 
py::arg("depends") = py::list()); + auto tril_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'l', k, depends); + }; + m.def("_tril", tril_fn, "Tril helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); - // }; - // m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), - // py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto triu_fn = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, py::ssize_t k, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + return usm_ndarray_triul(exec_q, src, dst, 'u', k, depends); + }; + m.def("_triu", triu_fn, "Triu helper function.", py::arg("src"), + py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), // py::arg("cumsum"), py::arg("sycl_queue"), diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp new file mode 100644 index 000000000000..0890dfdb4766 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -0,0 +1,253 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include // for std::copy +#include // for std::size_t +#include // for std::make_shared +#include // for std::runtime_error +#include // for std::pair, std::move +#include // for std::vector, std::begin, std::end + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "kernels/constructors.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::tri_fn_ptr_t; + +static tri_fn_ptr_t tril_generic_dispatch_vector[td_ns::num_types]; +static tri_fn_ptr_t triu_generic_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}) +{ + // array dimensions must be the same + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("Array dimensions are not the same."); + } + + if (src_nd < 2) { + throw py::value_error("Array dimensions less than 2."); + } + + // shapes must be the same + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t src_nelems(1); + + for (int i = 0; shapes_equal && i < src_nd; ++i) { + 
src_nelems *= static_cast(src_shape[i]); + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + } + if (!shapes_equal) { + throw py::value_error("Array shapes are not the same."); + } + + if (src_nelems == 0) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + // TODO: could use a temporary, but this is done by the caller + throw py::value_error("Arrays index overlapping segments of memory"); + } + + auto array_types = td_ns::usm_ndarray_types(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (dst_typeid != src_typeid) { + throw py::value_error("Array dtype are not the same."); + } + + // check same queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation contexts"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto src_strides = src.get_strides_vector(); + auto dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd - 2; + const py::ssize_t *shape = src_shape; + + const shT iter_src_strides(std::begin(src_strides), + std::begin(src_strides) + nd); + const shT iter_dst_strides(std::begin(dst_strides), + std::begin(dst_strides) + nd); + + simplify_iteration_space(nd, shape, iter_src_strides, iter_dst_strides, + // output + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + 
if (src_offset != 0 || dst_offset != 0) { + throw py::value_error("Reversed slice for dst is not supported"); + } + + nd += 2; + + using usm_host_allocatorT = + dpctl::tensor::alloc_utils::usm_host_allocator; + using usmshT = std::vector; + + usm_host_allocatorT allocator(exec_q); + auto shp_host_shape_and_strides = + std::make_shared(3 * nd, allocator); + + std::copy(simplified_shape.begin(), simplified_shape.end(), + shp_host_shape_and_strides->begin()); + (*shp_host_shape_and_strides)[nd - 2] = src_shape[src_nd - 2]; + (*shp_host_shape_and_strides)[nd - 1] = src_shape[src_nd - 1]; + + std::copy(simplified_src_strides.begin(), simplified_src_strides.end(), + shp_host_shape_and_strides->begin() + nd); + (*shp_host_shape_and_strides)[2 * nd - 2] = src_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[2 * nd - 1] = src_strides[src_nd - 1]; + + std::copy(simplified_dst_strides.begin(), simplified_dst_strides.end(), + shp_host_shape_and_strides->begin() + 2 * nd); + (*shp_host_shape_and_strides)[3 * nd - 2] = dst_strides[src_nd - 2]; + (*shp_host_shape_and_strides)[3 * nd - 1] = dst_strides[src_nd - 1]; + + auto dev_shape_and_strides_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(3 * nd, + exec_q); + py::ssize_t *dev_shape_and_strides = dev_shape_and_strides_owner.get(); + + const sycl::event ©_shape_and_strides = exec_q.copy( + shp_host_shape_and_strides->data(), dev_shape_and_strides, 3 * nd); + + py::ssize_t inner_range = src_shape[src_nd - 1] * src_shape[src_nd - 2]; + py::ssize_t outer_range = src_nelems / inner_range; + + sycl::event tri_ev; + if (part == 'l') { + auto fn = tril_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, {copy_shape_and_strides}); + } + else { + auto fn = triu_generic_dispatch_vector[src_typeid]; + tri_ev = + fn(exec_q, inner_range, outer_range, src_data, dst_data, nd, + dev_shape_and_strides, k, depends, 
{copy_shape_and_strides}); + } + + const auto &temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(tri_ev); + const auto &ctx = exec_q.get_context(); + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [shp_host_shape_and_strides = std::move(shp_host_shape_and_strides), + dev_shape_and_strides, ctx]() { + // capture of shp_host_shape_and_strides ensure the underlying + // vector exists for the entire execution of copying kernel + sycl_free_noexcept(dev_shape_and_strides, ctx); + }); + }); + // since host_task now owns USM allocation, release ownership by smart + // pointer + dev_shape_and_strides_owner.release(); + + return std::make_pair( + keep_args_alive(exec_q, {src, dst}, {temporaries_cleanup_ev}), tri_ev); +} + +void init_triul_ctor_dispatch_vectors(void) +{ + + using namespace td_ns; + using dpctl::tensor::kernels::constructors::TrilGenericFactory; + using dpctl::tensor::kernels::constructors::TriuGenericFactory; + + DispatchVectorBuilder dvb1; + dvb1.populate_dispatch_vector(tril_generic_dispatch_vector); + + DispatchVectorBuilder dvb2; + dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp new file mode 100644 index 000000000000..08889df6227f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern std::pair + usm_ndarray_triul(sycl::queue &exec_q, + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + char part, + py::ssize_t k = 0, + const std::vector &depends = {}); + +extern void init_triul_ctor_dispatch_vectors(void); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl From f1d6e5650910eec6f330b2de902a93a1ae95df5f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 6 Feb 2026 03:05:03 -0800 Subject: [PATCH 023/145] Use tril/triu/_tril from dpctl_ext.tensor in dpnp --- dpnp/dpnp_container.py | 4 ++-- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index b13bf96cda28..c8e28529cd57 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -270,13 +270,13 @@ def ones( def tril(x1, /, *, k=0): """Creates `dpnp_array` as lower triangular part of an input array.""" - array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) def triu(x1, /, *, k=0): """Creates `dpnp_array` as upper triangular part of an input array.""" - array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 
196cd2ae9da5..5fb1c099dde2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -42,12 +42,12 @@ from typing import NamedTuple -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li from dpnp.dpnp_utils import get_usm_allocations From 668079060d9ece02fbb6887c2313edca9e6ecbef Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Feb 2026 02:47:35 -0800 Subject: [PATCH 024/145] Disable pylint no-name-in-module for dpctl_ext --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 1 + dpnp/dpnp_iface.py | 3 +-- dpnp/dpnp_iface_searching.py | 1 + dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index b63bf61f8dad..d8235b84e2d0 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -44,6 +44,7 @@ _validate_dtype, ) +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 832446c826ba..6220c61db6d9 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -40,6 +40,7 @@ """ # pylint: disable=protected-access +# pylint: disable=no-name-in-module import os @@ -53,8 +54,6 @@ import dpnp from .dpnp_array import dpnp_array - -# pylint: disable=no-name-in-module from .dpnp_utils import ( dpnp_descriptor, map_dtype_to_device, diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index fdbd317d31dd..74fbc9b37d13 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -41,6 +41,7 @@ import dpctl.tensor as dpt +# pylint: 
disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 4d8e3cdfbd0d..2de2bc15372c 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -37,6 +37,7 @@ ) from dpctl.utils import ExecutionPlacementError +# pylint: disable=no-name-in-module import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi From 263b7175f4aab799cd4fa100602011e8e23d046b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:31:01 -0800 Subject: [PATCH 025/145] Add TODO comments --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 ++ dpnp/dpnp_iface.py | 2 ++ dpnp/dpnp_iface_searching.py | 2 ++ dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 2 ++ dpnp/scipy/linalg/_utils.py | 2 ++ setup.py | 2 +- 6 files changed, 11 insertions(+), 1 deletion(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index d8235b84e2d0..88abcee5035c 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -45,6 +45,8 @@ ) # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 6220c61db6d9..50b474014666 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,8 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 74fbc9b37d13..16ab633d506b 100644 --- a/dpnp/dpnp_iface_searching.py +++ 
b/dpnp/dpnp_iface_searching.py @@ -42,6 +42,8 @@ import dpctl.tensor as dpt # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as dti import dpnp diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 2de2bc15372c..3dfd3c23ee7f 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -38,6 +38,8 @@ from dpctl.utils import ExecutionPlacementError # pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.blas._blas_impl as bi diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index 8eb9187236bf..ce832d8f4529 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,8 @@ import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/setup.py b/setup.py index a0c54b066dcf..7ffef3bed9d8 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ "dpnp.scipy", "dpnp.scipy.linalg", "dpnp.scipy.special", - # dpctl_ext + # TODO: replace with dpctl; dpctl.tensor "dpctl_ext", "dpctl_ext.tensor", ], From 4130c1b80aa108ca127040a6c4ea15bcaa86173f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 04:53:39 -0800 Subject: [PATCH 026/145] Use default_device_complex_type from dpctl_ext on test_array_api_info.py --- dpnp/tests/test_array_api_info.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpnp/tests/test_array_api_info.py b/dpnp/tests/test_array_api_info.py index b310192ffc59..32730c8724dc 100644 --- a/dpnp/tests/test_array_api_info.py +++ 
b/dpnp/tests/test_array_api_info.py @@ -1,9 +1,11 @@ -import numpy import pytest from dpctl import SyclDeviceCreationError, get_devices, select_default_device -from dpctl.tensor._tensor_impl import default_device_complex_type import dpnp + +# TODO: revert to `from dpctl.tensor....` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._tensor_impl import default_device_complex_type from dpnp.tests.helper import ( has_support_aspect64, is_win_platform, From 17ca9ab52368f3bbdbfbdf6410b82823c98c53c0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 06:59:55 -0800 Subject: [PATCH 027/145] Remove unused build_dpctl_ext function --- dpctl_ext/CMakeLists.txt | 80 ---------------------------------------- 1 file changed, 80 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index bb33a4f57332..cdb007a2d230 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -122,84 +122,4 @@ set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") -function(build_dpctl_ext _trgt _src _dest) - set(options SYCL) - cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN}) - add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) - set(_cythonize_trgt "${_trgt}_cythonize_pyx") - python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src}) - if(BUILD_DPCTL_EXT_SYCL) - add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) - target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int) - target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel) - if(DPCTL_OFFLOAD_COMPRESS) - target_link_options(${_trgt} PRIVATE --offload-compress) - endif() - if(_dpctl_sycl_targets) - # make fat binary - target_compile_options( - ${_trgt} - PRIVATE ${_dpctl_sycl_target_compile_options} - ) - target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options}) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE 
Python::NumPy) - if(DPCTL_GENERATE_COVERAGE) - target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1) - if(BUILD_DPCTL_EXT_SYCL) - target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer) - endif() - endif() - target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface) - set(_linker_options "LINKER:${DPCTL_LDFLAGS}") - target_link_options(${_trgt} PRIVATE ${_linker_options}) - get_filename_component(_name_wle ${_generated_src} NAME_WLE) - get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY) - set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h") - set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h") - - # TODO: create separate folder inside build folder that contains only - # headers related to this target and appropriate folder structure to - # eliminate shadow dependencies - get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY) - # TODO: do not set directory if we did not generate header - target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir}) - set(_rpath_value "$ORIGIN") - if(BUILD_DPCTL_EXT_RELATIVE_PATH) - set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}") - endif() - if(DPCTL_WITH_REDIST) - set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..") - endif() - set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value}) - - install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) - install( - FILES ${_generated_api_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - install( - FILES ${_generated_public_h} - # TODO: revert to `${CMAKE_INSTALL_PREFIX}/dpctl/include/${_dest}` - DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest} - OPTIONAL - ) - if(DPCTL_GENERATE_COVERAGE) - get_filename_component(_original_src_dir ${_src} DIRECTORY) - file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir}) - 
install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir}) - endif() - - # Create target with headers only, because python is managing all the - # library imports at runtime - set(_trgt_headers ${_trgt}_headers) - add_library(${_trgt_headers} INTERFACE) - add_dependencies(${_trgt_headers} ${_trgt}) - get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES) - target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir}) -endfunction() - add_subdirectory(tensor) From 79cb2a45f28f5099701c0728a6def5c8961c5279 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 07:51:45 -0800 Subject: [PATCH 028/145] Apply remarks for CMake files --- dpctl_ext/CMakeLists.txt | 10 ++------- dpctl_ext/tensor/CMakeLists.txt | 38 ++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt index cdb007a2d230..e58693091422 100644 --- a/dpctl_ext/CMakeLists.txt +++ b/dpctl_ext/CMakeLists.txt @@ -27,13 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -find_package(Python REQUIRED COMPONENTS NumPy) - -# -t is to only Cythonize sources with timestamps newer than existing CXX files (if present) -# -w is to set working directory (and correctly set __pyx_f[] array of filenames) -set(CYTHON_FLAGS "-t -w \"${CMAKE_SOURCE_DIR}\"") -find_package(Cython REQUIRED) - +# TODO: rework this logic to remove current duplication if(WIN32) string( CONCAT WARNING_FLAGS @@ -118,7 +112,7 @@ else() endif() # at build time create include/ directory and copy header files over -set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_INSTALL_RPATH "$ORIGIN") diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ee8da2e49506..28e7a4cb55f4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,8 +27,10 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +find_package(Python COMPONENTS Development) + if(WIN32) - if(${CMAKE_VERSION} VERSION_LESS "3.23") + if(${CMAKE_VERSION} VERSION_LESS "3.27") # this is a work-around for target_link_options inserting option after -link option, cause # linker to ignore it. 
set(CMAKE_CXX_LINK_FLAGS @@ -37,6 +39,7 @@ if(WIN32) endif() endif() +# TODO: reuse this library for dpnp ufunc extension build set(_static_lib_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp ) @@ -67,11 +70,11 @@ add_library(${_static_lib_trgt} STATIC ${_static_lib_sources}) target_include_directories( ${_static_lib_trgt} PRIVATE - ${Python_INCLUDE_DIRS} - ${DPCTL_INCLUDE_DIR} + # ${Python_INCLUDE_DIRS} + # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers ${Python_LIBRARIES}) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) @@ -94,14 +97,14 @@ set(_no_fast_math_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) -list( - APPEND _no_fast_math_sources - # ${_elementwise_sources} - # ${_reduction_sources} - # ${_sorting_sources} - # ${_linalg_sources} - # ${_accumulator_sources} -) +#list( +#APPEND _no_fast_math_sources +# ${_elementwise_sources} +# ${_reduction_sources} +# ${_sorting_sources} +# ${_linalg_sources} +# ${_accumulator_sources} +#) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) @@ -114,7 +117,7 @@ endforeach() set(_compiler_definitions "") -set(_linker_options "LINKER:${DPCTL_LDFLAGS}") +set(_linker_options "LINKER:${DPNP_LDFLAGS}") foreach(python_module_name ${_py_trgts}) target_compile_options( ${python_module_name} @@ -124,6 +127,7 @@ foreach(python_module_name ${_py_trgts}) ${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel ) + # TODO: expand DPCTL_OFFLOAD_COMPRESS to the whole dpnp level if(DPCTL_OFFLOAD_COMPRESS) target_link_options(${python_module_name} PRIVATE --offload-compress) endif() @@ -149,22 +153,22 @@ foreach(python_module_name 
${_py_trgts}) PRIVATE -fprofile-instr-generate -fcoverage-mapping ) endif() - if(_dpctl_sycl_targets) + if(_dpnp_sycl_targets) # make fat binary target_compile_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_compile_options} + PRIVATE ${_dpnp_sycl_target_compile_options} ) target_link_options( ${python_module_name} - PRIVATE ${_dpctl_sycl_target_link_options} + PRIVATE ${_dpnp_sycl_target_link_options} ) endif() # TODO: update source so they reference individual libraries instead of # dpctl4pybind11.hpp. It will allow to simplify dependency tree # NOTE: dpctl C-API is resolved at runtime via Python # target_link_libraries(${python_module_name} PRIVATE DpctlCAPI) - if(DPCTL_WITH_REDIST) + if(DPNP_WITH_REDIST) set_target_properties( ${python_module_name} PROPERTIES INSTALL_RPATH "$ORIGIN/../../../.." From 4bf080edc0e5d277441fe39b31733571fbad0de3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 12 Feb 2026 08:30:03 -0800 Subject: [PATCH 029/145] Apply remarks for c++ files --- .../include/kernels/copy_and_cast.hpp | 18 ++++----------- .../include/kernels/copy_as_contiguous.hpp | 19 ++++----------- .../source/copy_and_cast_usm_to_usm.cpp | 23 ++++--------------- .../source/copy_and_cast_usm_to_usm.hpp | 11 ++------- .../libtensor/source/copy_as_contig.cpp | 14 ++++------- .../libtensor/source/copy_as_contig.hpp | 11 ++------- .../source/device_support_queries.cpp | 13 ++++------- .../source/device_support_queries.hpp | 12 ++-------- .../source/simplify_iteration_space.cpp | 12 ++++------ .../source/simplify_iteration_space.hpp | 11 +++------ .../tensor/libtensor/source/tensor_ctors.cpp | 10 ++++---- 11 files changed, 43 insertions(+), 111 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp index a07d311a7fcb..d6001a11e471 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ 
b/dpctl_ext/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_and_cast +namespace dpctl::tensor::kernels::copy_and_cast { using dpctl::tensor::ssize_t; @@ -1282,7 +1277,4 @@ struct CopyForRollNDShiftFactory } }; -} // namespace copy_and_cast -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_and_cast diff --git a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp index b4f367448758..37126a22dc64 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/copy_as_contiguous.hpp @@ -33,11 +33,12 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include +#include #include #include #include -#include +#include #include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" @@ -45,13 +46,7 @@ #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace copy_as_contig +namespace dpctl::tensor::kernels::copy_as_contig { using dpctl::tensor::ssize_t; @@ -648,8 +643,4 @@ struct AsCContigNDBatchOfSquareMatricesFactory return as_c_contiguous_nd_batch_of_square_matrices_impl; } }; - -} // namespace copy_as_contig -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::copy_as_contig diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp 
b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp index 0458aa75ac32..3d20be02f885 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp @@ -32,21 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include +#include #include -#include -#include #include -#include -#include +#include #include +#include #include "dpnp4pybind11.hpp" -#include -#include #include -#include #include "kernels/copy_and_cast.hpp" #include "utils/memory_overlap.hpp" @@ -54,16 +48,11 @@ #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -305,6 +294,4 @@ void init_copy_and_cast_usm_to_usm_dispatch_tables(void) dtb_1d.populate_dispatch_table(copy_and_cast_1d_dispatch_table); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp index d2a2dcaf7b85..d2e07b08d38f 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp @@ -38,13 +38,8 @@ #include #include "dpnp4pybind11.hpp" -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair copy_usm_ndarray_into_usm_ndarray( @@ -55,6 +50,4 @@ extern std::pair copy_usm_ndarray_into_usm_ndarray( extern void init_copy_and_cast_usm_to_usm_dispatch_tables(); -} // 
namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 53b39ff5874c..7105202fe2ff 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -32,10 +32,11 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include #include +#include #include #include +#include #include #include @@ -54,13 +55,10 @@ #include "copy_as_contig.hpp" #include "simplify_iteration_space.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::kernels::copy_as_contig:: @@ -753,6 +751,4 @@ std::pair ascontig_ev); } -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp index 2de67098b7fa..bfe3159c8813 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.hpp @@ -32,14 +32,9 @@ #include #include "dpnp4pybind11.hpp" -#include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { std::pair @@ -56,6 +51,4 @@ std::pair void init_copy_as_contig_dispatch_vectors(void); -} // end of namespace py_internal -} // end of namespace tensor -} // end of namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp index 51eb7dba1b6c..97a8ba83831e 100644 --- 
a/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.cpp @@ -39,13 +39,11 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { +namespace py = pybind11; + namespace { @@ -61,7 +59,6 @@ std::string _default_device_fp_type(const sycl::device &d) int get_numpy_major_version() { - namespace py = pybind11; py::module_ numpy = py::module_::import("numpy"); py::str version_string = numpy.attr("__version__"); @@ -179,6 +176,4 @@ std::string default_device_index_type(const py::object &arg) return _default_device_index_type(d); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp index 6ea01dcd49d7..adde7aefe3dd 100644 --- a/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp +++ b/dpctl_ext/tensor/libtensor/source/device_support_queries.hpp @@ -36,14 +36,8 @@ #include "dpnp4pybind11.hpp" #include -#include -#include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::string default_device_fp_type(const py::object &); @@ -53,6 +47,4 @@ extern std::string default_device_bool_type(const py::object &); extern std::string default_device_complex_type(const py::object &); extern std::string default_device_index_type(const py::object &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp index 2526f022e0ac..e3cff701ed50 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.cpp @@ 
-34,15 +34,13 @@ #include "simplify_iteration_space.hpp" #include "utils/strided_iters.hpp" +#include #include +#include #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -539,6 +537,4 @@ std::vector _unravel_index_f(py::ssize_t flat_index, return mi; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp index d3448ee1f5fd..acbc833157d1 100644 --- a/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp +++ b/dpctl_ext/tensor/libtensor/source/simplify_iteration_space.hpp @@ -36,11 +36,7 @@ #include #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace py = pybind11; @@ -125,6 +121,5 @@ std::vector _unravel_index_c(py::ssize_t, std::vector const &); std::vector _unravel_index_f(py::ssize_t, std::vector const &); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 911d75ebd925..be69ee1a8c7e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,15 +32,17 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -#include -#include -#include +// #include +// #include +// #include #include #include #include -#include +// #include +#include #include #include +#include #include "dpnp4pybind11.hpp" From cfa6cd69735591e79ca3437cc05c326ce115ffc9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:31:42 -0800 Subject: 
[PATCH 030/145] Remove linear-sequence implementations --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 177 ---------- .../libtensor/source/linear_sequences.cpp | 312 ------------------ .../libtensor/source/linear_sequences.hpp | 69 ---- .../tensor/libtensor/source/tensor_ctors.cpp | 38 +-- 5 files changed, 19 insertions(+), 579 deletions(-) delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp delete mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 1375c8316754..baf8ef5ce5f6 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 8d53655b2754..f43614e13766 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -58,189 +58,12 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ -template -class linear_sequence_step_kernel; -template -class linear_sequence_affine_kernel; template class full_strided_kernel; // template class eye_kernel; using namespace dpctl::tensor::offset_utils; -template -class LinearSequenceStepFunctor -{ -private: - Ty *p = nullptr; - 
Ty start_v; - Ty step_v; - -public: - LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) - : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - p[i] = Ty{start_v.real() + i * step_v.real(), - start_v.imag() + i * step_v.imag()}; - } - else { - p[i] = start_v + i * step_v; - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting value and - * increment. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start_v Typed starting value of the sequence - * @param step_v Typed increment of the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty step_v, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - cgh.parallel_for>( - sycl::range<1>{nelems}, - LinearSequenceStepFunctor(array_data, start_v, step_v)); - }); - - return lin_space_step_event; -} - -// Constructor to populate tensor with linear sequence defined by -// start and and data - -template -class LinearSequenceAffineFunctor -{ -private: - Ty *p = nullptr; - Ty start_v; - Ty end_v; - std::size_t n; - -public: - LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) - : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), - n((den == 0) ? 
1 : den) - { - } - - void operator()(sycl::id<1> wiid) const - { - auto i = wiid.get(0); - wTy wc = wTy(i) / n; - wTy w = wTy(n - i) / n; - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using reT = typename Ty::value_type; - auto _w = static_cast(w); - auto _wc = static_cast(wc); - auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); - re_comb = - sycl::fma(end_v.real(), _wc, - re_comb); // start_v.real() * _w + end_v.real() * _wc; - auto im_comb = - sycl::fma(start_v.imag(), _w, - reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; - im_comb = sycl::fma(end_v.imag(), _wc, im_comb); - Ty affine_comb = Ty{re_comb, im_comb}; - p[i] = affine_comb; - } - else if constexpr (std::is_floating_point::value) { - Ty _w = static_cast(w); - Ty _wc = static_cast(wc); - auto affine_comb = - sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; - affine_comb = sycl::fma(end_v, _wc, affine_comb); - p[i] = affine_comb; - } - else { - using dpctl::tensor::type_utils::convert_impl; - auto affine_comb = start_v * w + end_v * wc; - p[i] = convert_impl(affine_comb); - } - } -}; - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by typed starting and end values. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence. - * @param start_v Stating value of the sequence. - * @param end_v End-value of the sequence. - * @param include_endpoint Whether the end-value is included in the sequence. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - Ty start_v, - Ty end_v, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - dpctl::tensor::type_utils::validate_type_for_device(exec_q); - - const bool device_supports_doubles = - exec_q.get_device().has(sycl::aspect::fp64); - const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; - - sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - if (device_supports_doubles) { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - else { - using KernelName = linear_sequence_affine_kernel; - using Impl = LinearSequenceAffineFunctor; - - cgh.parallel_for(sycl::range<1>{nelems}, - Impl(array_data, start_v, end_v, den)); - } - }); - - return lin_space_affine_event; -} - /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp deleted file mode 100644 index 02c4a8ad0fa1..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp +++ /dev/null @@ -1,312 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. 
-//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include "dpnp4pybind11.hpp" -#include -#include -#include -#include -#include -#include -#include - -#include "kernels/constructors.hpp" -#include "utils/output_validation.hpp" -#include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" - -#include "linear_sequences.hpp" - -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -// Constructor to populate tensor with linear sequence defined by -// start and step data - -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &step, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting value and increment - * given as Python objects. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start Starting value of the sequence as Python object. Must be - * convertible to array element data type `Ty`. - * @param step Increment of the sequence as Python object. Must be convertible - * to array element data type `Ty`. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &step, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty step_v = py::cast(step); - - using dpctl::tensor::kernels::constructors::lin_space_step_impl; - - auto lin_space_step_event = lin_space_step_impl( - exec_q, nelems, start_v, step_v, array_data, depends); - - return lin_space_step_event; -} - -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue &, - std::size_t, // num_elements - const py::object &start, - const py::object &end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting and end values given - * as Python objects. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence - * @param start Stating value of the sequence as Python object. Must be - * convertible to array data element type `Ty`. - * @param end End-value of the sequence as Python object. Must be convertible - * to array data element type `Ty`. - * @param include_endpoint Whether the end-value is included in the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - std::size_t nelems, - const py::object &start, - const py::object &end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v = py::cast(start); - Ty end_v = py::cast(end); - - using dpctl::tensor::kernels::constructors::lin_space_affine_impl; - - auto lin_space_affine_event = lin_space_affine_impl( - exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); - - return lin_space_affine_event; -} - -using dpctl::utils::keep_args_alive; - -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; - -static lin_space_affine_fn_ptr_t - lin_space_affine_dispatch_vector[td_ns::num_types]; - -std::pair - usm_ndarray_linear_sequence_step(const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue is not compatible with the allocation queue"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_step_event; - - auto fn = lin_space_step_dispatch_vector[dst_typeid]; - - linspace_step_event = - fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); - - return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), - linspace_step_event); -} - -std::pair - usm_ndarray_linear_sequence_affine(const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends) -{ - // dst must be 1D and C-contiguous - // start, end should be coercible into data type of dst - - if (dst.get_ndim() != 1) { - throw py::value_error( - "usm_ndarray_linspace: Expecting 1D array to populate"); - } - - if (!dst.is_c_contiguous()) { - throw py::value_error( - "usm_ndarray_linspace: Non-contiguous arrays are not supported"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { - throw py::value_error( - "Execution queue context is not the same as allocation context"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); - - auto array_types = td_ns::usm_ndarray_types(); - int dst_typenum = dst.get_typenum(); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - py::ssize_t len = dst.get_shape(0); - if (len == 0) { - // nothing to do - return std::make_pair(sycl::event{}, sycl::event{}); - } - - char *dst_data = dst.get_data(); - sycl::event linspace_affine_event; - - auto fn = lin_space_affine_dispatch_vector[dst_typeid]; - - linspace_affine_event = fn(exec_q, static_cast(len), start, - end, include_endpoint, dst_data, depends); - - return std::make_pair( - keep_args_alive(exec_q, {dst}, {linspace_affine_event}), - linspace_affine_event); -} - -/*! - * @brief Factor to get function pointer of type `fnT` for array with elements - * of type `Ty`. - * @defgroup CtorKernels - */ -template -struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - -/*! - * @brief Factory to get function pointer of type `fnT` for array data type - * `Ty`. 
- */ -template -struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - -void init_linear_sequences_dispatch_vectors(void) -{ - using namespace td_ns; - - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); - - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp deleted file mode 100644 index 321cd2f23efe..000000000000 --- a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp +++ /dev/null @@ -1,69 +0,0 @@ -//***************************************************************************** -// Copyright (c) 2026, Intel Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// - Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// - Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF -// THE POSSIBILITY OF SUCH DAMAGE. -//***************************************************************************** -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include -#include - -#include "dpnp4pybind11.hpp" -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern std::pair usm_ndarray_linear_sequence_step( - const py::object &start, - const py::object &dt, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern std::pair usm_ndarray_linear_sequence_affine( - const py::object &start, - const py::object &end, - const dpctl::tensor::usm_ndarray &dst, - bool include_endpoint, - sycl::queue &exec_q, - const std::vector &depends = {}); - -extern void init_linear_sequences_dispatch_vectors(void); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index f2afce105f7f..7e4253c0cbb6 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include 
"kernels/dpctl_tensor_types.hpp" -#include "linear_sequences.hpp" +// #include "linear_sequences.hpp" // #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -98,8 +98,8 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* ============= linear-sequence ==================== */ -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -158,7 +158,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); // init_copy_for_reshape_dispatch_vectors(); // init_copy_for_roll_dispatch_vectors(); - init_linear_sequences_dispatch_vectors(); + // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); // init_eye_ctor_dispatch_vectors(); @@ -300,22 +300,20 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = // py::list()); - m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and step `dt`. " - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("dt"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " - "specified by " - "starting point `start` and end point `end`. 
" - "Returns a tuple of events: (ht_event, comp_event)", - py::arg("start"), py::arg("end"), py::arg("dst"), - py::arg("include_endpoint"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); + // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and step + // `dt`. " "Returns a tuple of events: (ht_event, comp_event)", + // py::arg("start"), py::arg("dt"), py::arg("dst"), + // py::arg("sycl_queue"), py::arg("depends") = py::list()); + + // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + // "Fills input 1D contiguous usm_ndarray `dst` with linear + // sequence " "specified by " "starting point `start` and end + // point `end`. " "Returns a tuple of events: (ht_event, + // comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"), + // py::arg("include_endpoint"), py::arg("sycl_queue"), + // py::arg("depends") = py::list()); // m.def("_copy_numpy_ndarray_into_usm_ndarray", // ©_numpy_ndarray_into_usm_ndarray, From 087a2ecbfff6262224ff115c9948202ecf45e6ba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 11:58:15 -0800 Subject: [PATCH 031/145] Use _tensor_impl from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_fill.py | 3 +++ dpnp/dpnp_iface.py | 1 + dpnp/dpnp_iface_indexing.py | 11 +++++++---- dpnp/fft/dpnp_utils_fft.py | 14 +++++++++++--- dpnp/linalg/dpnp_utils_linalg.py | 3 +++ dpnp/scipy/linalg/_utils.py | 1 + 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 0d6640c3b8b5..4137a2794747 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -33,6 +33,9 @@ from dpctl.tensor._ctors import _cast_fill_val import dpnp + +# TODO: revert to `from dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, 
_full_usm_ndarray, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 50b474014666..533bdc36c617 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -50,6 +50,7 @@ import numpy from dpctl.tensor._device import normalize_queue_device +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6421f39fd4e4..a01a036e16cc 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -45,7 +45,6 @@ from collections.abc import Iterable import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._copy_utils import _nonzero_impl @@ -53,7 +52,11 @@ from dpctl.tensor._numpy_helper import normalize_axis_index import dpctl_ext.tensor as dpt_ext -import dpctl_ext.tensor._tensor_impl as ti_ext + +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti import dpnp # pylint: disable=no-name-in-module @@ -297,7 +300,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): "Input and output allocation queues are not compatible" ) - if ti_ext._array_overlap(x, out): + if ti._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: @@ -306,7 +309,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events - h_ev, take_ev = ti_ext._take( + h_ev, take_ev = ti._take( src=x, ind=(inds,), dst=out, diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index c692774a424f..60f89a933284 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,6 +42,11 @@ from collections.abc import Sequence import dpctl + +# pylint: disable=no-name-in-module +# TODO: remove it when ti.__linspace_step +# is migrated to dpctl_ext/tensor +import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -50,7 +55,10 @@ ) from dpctl.utils import ExecutionPlacementError -import dpctl_ext.tensor._tensor_impl as ti +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti_ext import dpnp import dpnp.backend.extensions.fft._fft_impl as fi @@ -196,7 +204,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): if ( out is not None and out.strides == tuple(out_strides) - and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) + and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = dpnp.get_usm_ndarray(out) result = out @@ -524,7 +532,7 @@ def _truncate_or_pad(a, shape, axes): ) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray( src=dpnp.get_usm_ndarray(a), dst=z.get_array()[tuple(index)], sycl_queue=exec_q, diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 5fb1c099dde2..171ac38a141c 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -47,6 +47,9 @@ 
from dpctl.tensor._numpy_helper import normalize_axis_index from numpy import prod +# pylint: disable=no-name-in-module +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li diff --git a/dpnp/scipy/linalg/_utils.py b/dpnp/scipy/linalg/_utils.py index ce832d8f4529..665a4e1595ad 100644 --- a/dpnp/scipy/linalg/_utils.py +++ b/dpnp/scipy/linalg/_utils.py @@ -44,6 +44,7 @@ import dpctl.utils as dpu +# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti From f4492fbc8048d2fcc598a089715b85ed6504f02d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:28:16 -0800 Subject: [PATCH 032/145] Add missing include --- .../tensor/libtensor/include/kernels/constructors.hpp | 3 ++- .../include/kernels/integer_advanced_indexing.hpp | 4 +--- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 8 ++++---- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 5 ++++- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++++++---- .../libtensor/source/integer_advanced_indexing.hpp | 6 +++++- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 3 +-- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 7 ++----- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 3 ++- 10 files changed, 29 insertions(+), 22 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index f43614e13766..3bc4a1d16271 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -33,8 +33,9 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include 
+#include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 1b2c79d2e2a5..d0ec5227731c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,12 +33,10 @@ //===----------------------------------------------------------------------===// #pragma once -#include -#include #include -#include #include #include +#include #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index e1f61be4a12a..279bb9f470bc 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,15 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include -#include -#include +#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index d664b2013506..43b30fc8341c 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -33,13 +33,16 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 244acfe3955f..ed72096bff8f 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp 
+++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,21 +34,23 @@ //===----------------------------------------------------------------------===// #include -#include #include #include +#include +#include #include -#include +#include #include +#include + +#include #include "dpnp4pybind11.hpp" -#include #include #include #include "kernels/integer_advanced_indexing.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 57f0ddda132c..5dfbd2f04d93 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -34,13 +34,17 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include #include +#include + #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index 0890dfdb4766..f0f592c52938 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -34,8 +34,8 @@ #include // for std::copy #include // for std::size_t +#include // for std::begin, std::end #include // for std::make_shared -#include // for std::runtime_error #include // for std::pair, std::move #include // for std::vector, std::begin, std::end @@ -47,7 +47,6 @@ #include "kernels/constructors.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" diff --git 
a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index 08889df6227f..c61d95eef7ec 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -40,6 +40,8 @@ #include "dpnp4pybind11.hpp" #include +namespace py = pybind11; + namespace dpctl { namespace tensor diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index 4558743b3c22..d7370f55e8cb 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,21 +32,18 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// -#include #include #include -#include #include #include +#include + #include "dpnp4pybind11.hpp" -#include #include -#include "kernels/constructors.hpp" #include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" #include "zeros_ctor.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index 51270a3443cc..ec3bce994ef6 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -33,10 +33,11 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include #include +#include + #include "dpnp4pybind11.hpp" #include From b367c9fd3b4b538e132afb5838584137a6f8a25c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 16 Feb 2026 12:36:24 -0800 Subject: [PATCH 033/145] Use nested namespace syntax --- .../libtensor/include/kernels/constructors.hpp | 13 ++----------- .../include/kernels/integer_advanced_indexing.hpp | 13 ++----------- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/full_ctor.hpp | 10 
++-------- .../libtensor/source/integer_advanced_indexing.cpp | 10 ++-------- .../libtensor/source/integer_advanced_indexing.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/triul_ctor.hpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 10 ++-------- dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp | 10 ++-------- 10 files changed, 20 insertions(+), 86 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 3bc4a1d16271..47726319b3e1 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -44,13 +44,7 @@ #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace constructors +namespace dpctl::tensor::kernels::constructors { using dpctl::tensor::ssize_t; @@ -305,7 +299,4 @@ struct TriuGenericFactory } }; -} // namespace constructors -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::kernels::constructors diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index d0ec5227731c..7351502dbc11 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -43,13 +43,7 @@ #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ -namespace indexing +namespace dpctl::tensor::kernels::indexing { using dpctl::tensor::ssize_t; @@ -419,7 +413,4 @@ struct PutClipFactory } }; -} // namespace indexing -} // namespace kernels -} // namespace tensor -} // namespace dpctl +} // namespace 
dpctl::tensor::kernels::indexing diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index 279bb9f470bc..ca4a17f28f77 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -53,11 +53,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -310,6 +306,4 @@ void init_full_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(full_strided_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp index 43b30fc8341c..18c15de87a40 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.hpp @@ -43,11 +43,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -58,6 +54,4 @@ extern std::pair extern void init_full_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index ed72096bff8f..77322381d517 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -62,11 +62,7 @@ #define WRAP_MODE 0 #define CLIP_MODE 1 -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { namespace td_ns = dpctl::tensor::type_dispatch; @@ -816,6 +812,4 @@ void init_advanced_indexing_dispatch_tables(void) 
dtb_putwrap.populate_dispatch_table(put_dispatch_table[WRAP_MODE]); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp index 5dfbd2f04d93..bc0136288e1c 100644 --- a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.hpp @@ -45,11 +45,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -72,6 +68,4 @@ extern std::pair extern void init_advanced_indexing_dispatch_tables(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp index f0f592c52938..13e909196460 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.cpp @@ -54,11 +54,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -247,6 +243,4 @@ void init_triul_ctor_dispatch_vectors(void) dvb2.populate_dispatch_vector(triu_generic_dispatch_vector); } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp index c61d95eef7ec..47cc4ce8892d 100644 --- a/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/triul_ctor.hpp @@ -42,11 +42,7 @@ namespace py = pybind11; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal 
+namespace dpctl::tensor::py_internal { extern std::pair @@ -59,6 +55,4 @@ extern std::pair extern void init_triul_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index d7370f55e8cb..b9a2e01bea4a 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -50,11 +50,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { using dpctl::utils::keep_args_alive; @@ -160,6 +156,4 @@ void init_zeros_ctor_dispatch_vectors(void) return; } -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp index ec3bce994ef6..51a1903a0f36 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.hpp @@ -41,11 +41,7 @@ #include "dpnp4pybind11.hpp" #include -namespace dpctl -{ -namespace tensor -{ -namespace py_internal +namespace dpctl::tensor::py_internal { extern std::pair @@ -55,6 +51,4 @@ extern std::pair extern void init_zeros_ctor_dispatch_vectors(void); -} // namespace py_internal -} // namespace tensor -} // namespace dpctl +} // namespace dpctl::tensor::py_internal From 3113716a13a131dc44f819140489176be5ff7cba Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 02:50:47 -0800 Subject: [PATCH 034/145] Add missing include complex --- dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp | 1 + .../libtensor/include/kernels/integer_advanced_indexing.hpp | 4 +++- dpctl_ext/tensor/libtensor/source/full_ctor.cpp | 2 ++ 
.../tensor/libtensor/source/integer_advanced_indexing.cpp | 2 ++ dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp | 2 ++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 47726319b3e1..22189ee3129c 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -34,6 +34,7 @@ #pragma once #include +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 7351502dbc11..7be2b3ea8591 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -33,11 +33,13 @@ //===----------------------------------------------------------------------===// #pragma once +#include #include -#include #include #include +#include + #include "dpctl_tensor_types.hpp" #include "utils/indexing_utils.hpp" #include "utils/offset_utils.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp index ca4a17f28f77..aef57836666e 100644 --- a/dpctl_ext/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/full_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -41,6 +42,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "kernels/constructors.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp index 77322381d517..925cc2e895ed 100644 --- 
a/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp +++ b/dpctl_ext/tensor/libtensor/source/integer_advanced_indexing.cpp @@ -34,6 +34,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp index b9a2e01bea4a..2eb05e49f382 100644 --- a/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp +++ b/dpctl_ext/tensor/libtensor/source/zeros_ctor.cpp @@ -32,6 +32,7 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===--------------------------------------------------------------------===// +#include #include #include #include @@ -40,6 +41,7 @@ #include #include "dpnp4pybind11.hpp" +#include #include #include "utils/output_validation.hpp" From 978afee9115d8feaebe72c80ce3e827e13c66770 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 03:13:50 -0800 Subject: [PATCH 035/145] Add missing memory and queue checks --- .../libtensor/source/copy_as_contig.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp index 7105202fe2ff..bbee24c95d4d 100644 --- a/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp +++ b/dpctl_ext/tensor/libtensor/source/copy_as_contig.cpp @@ -189,6 +189,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -314,6 +320,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + const auto &src_strides_vec = src.get_strides_vector(); if (src_nd >= 2) { @@ -459,6 +471,12 @@ std::pair "Execution queue is not compatible with allocation queues"); } + // check that arrays do not overlap, and concurrent copying is safe. + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + if (nelems == 0) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); @@ -624,6 +642,20 @@ std::pair throw py::value_error("Unexpected destination array layout"); } + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // check that arrays do not overlap, and concurrent copying is safe. 
+ auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + int src_typenum = src.get_typenum(); int dst_typenum = dst.get_typenum(); From fec84ec7eafcfbd9a3d0e80f6a1c0e35c5312769 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:17:56 -0800 Subject: [PATCH 036/145] Move ti._copy_numpy_ndarray_into_usm_ndarray() --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../copy_numpy_ndarray_into_usm_ndarray.cpp | 368 ++++++++++++++++++ .../copy_numpy_ndarray_into_usm_ndarray.hpp | 57 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 16 +- 4 files changed, 434 insertions(+), 9 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 53a0dfd27ba3..4e3fa580f99e 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -48,7 +48,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp new file mode 100644 index 000000000000..e97e8aeb1ca1 --- /dev/null +++ 
b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp @@ -0,0 +1,368 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_blocking_fn_ptr_t; + +static copy_and_cast_from_host_blocking_fn_ptr_t + copy_and_cast_from_host_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::copy_and_cast:: + copy_and_cast_from_host_contig_blocking_fn_ptr_t; + +static copy_and_cast_from_host_contig_blocking_fn_ptr_t + copy_and_cast_from_host_contig_blocking_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_ndim = npy_src.ndim(); + int dst_ndim = dst.get_ndim(); + + if (src_ndim != dst_ndim) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "different array ranks, " + "i.e. 
different number of indices needed to " + "address array elements."); + } + + const py::ssize_t *src_shape = npy_src.shape(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool shapes_equal(true); + std::size_t src_nelems(1); + for (int i = 0; shapes_equal && (i < src_ndim); ++i) { + shapes_equal = shapes_equal && (src_shape[i] == dst_shape[i]); + src_nelems *= static_cast(src_shape[i]); + } + + if (!shapes_equal) { + throw py::value_error("Source ndarray and destination usm_ndarray have " + "difference shapes."); + } + + if (src_nelems == 0) { + // nothing to do + return; + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // here we assume that NumPy's type numbers agree with ours for types + // supported in both + int src_typenum = + py::detail::array_descriptor_proxy(npy_src.dtype().ptr())->type_num; + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_type_id = array_types.typenum_to_lookup_id(src_typenum); + int dst_type_id = array_types.typenum_to_lookup_id(dst_typenum); + + py::buffer_info src_pybuf = npy_src.request(); + const char *const src_data = static_cast(src_pybuf.ptr); + char *dst_data = dst.get_data(); + + int src_flags = npy_src.flags(); + + // check for applicability of special cases: + // (same type && (both C-contiguous || both F-contiguous) + const bool both_c_contig = + ((src_flags & py::array::c_style) && dst.is_c_contiguous()); + const bool both_f_contig = + ((src_flags & py::array::f_style) && dst.is_f_contiguous()); + + const bool same_data_types = (src_type_id == dst_type_id); + + if (both_c_contig || both_f_contig) { + if (same_data_types) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev 
= + exec_q.memcpy(static_cast(dst_data), + static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + copy_ev.wait(); + } + + return; + } + else { + py::gil_scoped_release lock{}; + + auto copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table + [dst_type_id][src_type_id]; + + static constexpr py::ssize_t zero_offset(0); + + copy_and_cast_from_host_contig_blocking_fn( + exec_q, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + return; + } + } + + auto const &dst_strides = + dst.get_strides_vector(); // N.B.: strides in elements + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_ndim; + const py::ssize_t *shape = src_shape; + + const py::ssize_t *src_strides_p = + npy_src.strides(); // N.B.: strides in bytes + py::ssize_t src_itemsize = npy_src.itemsize(); // item size in bytes + + bool is_src_c_contig = ((src_flags & py::array::c_style) != 0); + bool is_src_f_contig = ((src_flags & py::array::f_style) != 0); + + shT src_strides_in_elems; + if (src_strides_p) { + src_strides_in_elems.resize(nd); + // copy and convert strides from bytes to elements + std::transform( + src_strides_p, src_strides_p + nd, std::begin(src_strides_in_elems), + [src_itemsize](py::ssize_t el) { + py::ssize_t q = el / src_itemsize; + if (q * src_itemsize != el) { + throw std::runtime_error( + "NumPy array strides are not multiple of itemsize"); + } + return q; + }); + } + else { + if (is_src_c_contig) { + src_strides_in_elems = + dpctl::tensor::c_contiguous_strides(nd, src_shape); + } + else if (is_src_f_contig) { + src_strides_in_elems = + dpctl::tensor::f_contiguous_strides(nd, src_shape); + } + else { + throw 
py::value_error("NumPy source array has null strides but is " + "neither C- nor F-contiguous."); + } + } + + // nd, simplified_* vectors and offsets are modified by reference + simplify_iteration_space(nd, shape, src_strides_in_elems, dst_strides, + // outputs + simplified_shape, simplified_src_strides, + simplified_dst_strides, src_offset, dst_offset); + + assert(simplified_shape.size() == static_cast(nd)); + assert(simplified_src_strides.size() == static_cast(nd)); + assert(simplified_dst_strides.size() == static_cast(nd)); + + // handle nd == 0 + if (nd == 0) { + nd = 1; + simplified_shape.reserve(nd); + simplified_shape.push_back(1); + + simplified_src_strides.reserve(nd); + simplified_src_strides.push_back(1); + + simplified_dst_strides.reserve(nd); + simplified_dst_strides.push_back(1); + } + + const bool is_contig_vector = + ((nd == 1) && (simplified_src_strides.front() == 1) && + (simplified_dst_strides.front() == 1)); + + const bool can_use_memcpy = (same_data_types && is_contig_vector && + (src_offset == 0) && (dst_offset == 0)); + + if (can_use_memcpy) { + int src_elem_size = npy_src.itemsize(); + + sycl::event copy_ev = exec_q.memcpy( + static_cast(dst_data), static_cast(src_data), + src_nelems * src_elem_size, depends); + + { + // wait for copy_ev to complete + // release GIL to allow other threads (host_tasks) + // a chance to acquire GIL + py::gil_scoped_release lock{}; + + copy_ev.wait(); + } + + return; + } + + // Minimum and maximum element offsets for source np.ndarray + py::ssize_t npy_src_min_nelem_offset(src_offset); + py::ssize_t npy_src_max_nelem_offset(src_offset); + for (int i = 0; i < nd; ++i) { + if (simplified_src_strides[i] < 0) { + npy_src_min_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + else { + npy_src_max_nelem_offset += + simplified_src_strides[i] * (simplified_shape[i] - 1); + } + } + + if (is_contig_vector) { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + auto 
copy_and_cast_from_host_contig_blocking_fn = + copy_and_cast_from_host_contig_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_contig_blocking_fn(exec_q, src_nelems, src_data, + src_offset, dst_data, + dst_offset, depends); + + return; + } + + std::vector host_task_events; + host_task_events.reserve(1); + + // Copy shape strides into device memory + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + { + // release GIL for the blocking call + py::gil_scoped_release lock{}; + + // Get implementation function pointer + auto copy_and_cast_from_host_blocking_fn = + copy_and_cast_from_host_blocking_dispatch_table[dst_type_id] + [src_type_id]; + + copy_and_cast_from_host_blocking_fn( + exec_q, src_nelems, nd, shape_strides, src_data, src_offset, + npy_src_min_nelem_offset, npy_src_max_nelem_offset, dst_data, + dst_offset, depends, {copy_shape_ev}); + + // invoke USM deleter in smart pointer while GIL is held + shape_strides_owner.reset(nullptr); + } + + return; +} + +void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyAndCastFromHostFactory; + + DispatchTableBuilder + dtb_copy_from_numpy; + + dtb_copy_from_numpy.populate_dispatch_table( + copy_and_cast_from_host_blocking_dispatch_table); + + using dpctl::tensor::kernels::copy_and_cast:: + CopyAndCastFromHostContigFactory; + + DispatchTableBuilder + dtb_copy_from_numpy_contig; + + dtb_copy_from_numpy_contig.populate_dispatch_table( + copy_and_cast_from_host_contig_blocking_dispatch_table); +} + +} // namespace 
dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp new file mode 100644 index 000000000000..f2de95f97cca --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void copy_numpy_ndarray_into_usm_ndarray( + const py::array &npy_src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 182124ac4ae5..07c8fd1bf99f 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -53,7 +53,7 @@ #include "copy_as_contig.hpp" // #include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" -// #include "copy_numpy_ndarray_into_usm_ndarray.hpp" +#include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" #include "full_ctor.hpp" @@ -96,7 +96,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* 
============= Copy from numpy.ndarray to usm_ndarray ==================== */ -// using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; +using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ @@ -146,7 +146,7 @@ void init_dispatch_tables(void) using namespace dpctl::tensor::py_internal; init_copy_and_cast_usm_to_usm_dispatch_tables(); - // init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); + init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); init_advanced_indexing_dispatch_tables(); // init_where_dispatch_tables(); return; @@ -317,11 +317,11 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("include_endpoint"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // m.def("_copy_numpy_ndarray_into_usm_ndarray", - // ©_numpy_ndarray_into_usm_ndarray, - // "Copy from numpy array `src` into usm_ndarray `dst` - // synchronously.", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_numpy_ndarray_into_usm_ndarray", + ©_numpy_ndarray_into_usm_ndarray, + "Copy from numpy array `src` into usm_ndarray `dst` synchronously.", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_zeros_usm_ndarray", &usm_ndarray_zeros, "Populate usm_ndarray `dst` with zeros.", py::arg("dst"), From 497e81030c9e64b7129191d4347c7516224fe234 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:43:05 -0800 Subject: [PATCH 037/145] Move asnumpy(),from_numpy(), to_numpy() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 8 ++ dpctl_ext/tensor/_copy_utils.py | 202 ++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 dpctl_ext/tensor/_copy_utils.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 3c6939eff7a0..2ce61a9ab242 100644 --- a/dpctl_ext/tensor/__init__.py +++ 
b/dpctl_ext/tensor/__init__.py @@ -27,6 +27,11 @@ # ***************************************************************************** +from dpctl_ext.tensor._copy_utils import ( + asnumpy, + from_numpy, + to_numpy, +) from dpctl_ext.tensor._ctors import ( full, tril, @@ -38,9 +43,12 @@ ) __all__ = [ + "asnumpy", + "from_numpy", "full", "put", "take", + "to_numpy", "tril", "triu", ] diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py new file mode 100644 index 000000000000..9041be7686f6 --- /dev/null +++ b/dpctl_ext/tensor/_copy_utils.py @@ -0,0 +1,202 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import dpctl.memory as dpm +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._device import normalize_queue_device + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +__doc__ = ( + "Implementation module for copy- and cast- operations on " + ":class:`dpctl.tensor.usm_ndarray`." 
+) + +int32_t_max = 1 + np.iinfo(np.int32).max + + +def _copy_to_numpy(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") + if ary.size == 0: + # no data needs to be copied for zero sized array + return np.ndarray(ary.shape, dtype=ary.dtype) + nb = ary.usm_data.nbytes + q = ary.sycl_queue + hh = dpm.MemoryUSMHost(nb, queue=q) + h = np.ndarray(nb, dtype="u1", buffer=hh).view(ary.dtype) + itsz = ary.itemsize + strides_bytes = tuple(si * itsz for si in ary.strides) + offset = ary._element_offset * itsz + # ensure that content of ary.usm_data is final + q.wait() + hh.copy_from_device(ary.usm_data) + return np.ndarray( + ary.shape, + dtype=ary.dtype, + buffer=h, + strides=strides_bytes, + offset=offset, + ) + + +def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): + """Copies numpy array `np_ary` into a new usm_ndarray""" + # This may perform a copy to meet stated requirements + Xnp = np.require(np_ary, requirements=["A", "E"]) + alloc_q = normalize_queue_device(sycl_queue=sycl_queue, device=None) + dt = Xnp.dtype + if dt.char in "dD" and alloc_q.sycl_device.has_aspect_fp64 is False: + Xusm_dtype = ( + dpt.dtype("float32") if dt.char == "d" else dpt.dtype("complex64") + ) + else: + Xusm_dtype = dt + Xusm = dpt.empty( + Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue + ) + _copy_from_numpy_into(Xusm, Xnp) + return Xusm + + +def _copy_from_numpy_into(dst, np_ary): + """Copies `np_ary` into `dst` of type :class:`dpctl.tensor.usm_ndarray""" + if not isinstance(np_ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(np_ary)}") + if not isinstance(dst, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(dst)}") + if np_ary.flags["OWNDATA"]: + Xnp = np_ary + else: + # Determine base of input array + base = np_ary.base + while isinstance(base, np.ndarray): + base = base.base + if isinstance(base, dpm._memory._Memory): + # we must perform 
a copy, since subsequent + # _copy_numpy_ndarray_into_usm_ndarray is implemented using + # sycl::buffer, and using USM-pointers with sycl::buffer + # results is undefined behavior + Xnp = np_ary.copy() + else: + Xnp = np_ary + src_ary = np.broadcast_to(Xnp, dst.shape) + copy_q = dst.sycl_queue + if copy_q.sycl_device.has_aspect_fp64 is False: + src_ary_dt_c = src_ary.dtype.char + if src_ary_dt_c == "d": + src_ary = src_ary.astype(np.float32) + elif src_ary_dt_c == "D": + src_ary = src_ary.astype(np.complex64) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_ev = _manager.submitted_events + # synchronizing call + ti._copy_numpy_ndarray_into_usm_ndarray( + src=src_ary, dst=dst, sycl_queue=copy_q, depends=dep_ev + ) + + +def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): + """ + from_numpy(arg, device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` from instance of + :class:`numpy.ndarray`. + + Args: + arg: + Input convertible to :class:`numpy.ndarray` + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is ``None``, + returned array is created on the default-selected device. + Default: ``None`` + usm_type (str): The requested USM allocation type for the + output array. Recognized values are ``"device"``, + ``"shared"``, or ``"host"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + A SYCL queue that determines output array allocation device + as well as execution placement of data movement operations. + The ``device`` and ``sycl_queue`` arguments + are equivalent. Only one of them should be specified. If both + are provided, they must be consistent and result in using the + same execution queue. 
Default: ``None`` + + The returned array has the same shape, and the same data type kind. + If the device does not support the data type of input array, a + closest support data type of the same kind may be returned, e.g. + input array of type ``float16`` may be upcast to ``float32`` if the + target device does not support 16-bit floating point type. + """ + q = normalize_queue_device(sycl_queue=sycl_queue, device=device) + return _copy_from_numpy(np_ary, usm_type=usm_type, sycl_queue=q) + + +def to_numpy(usm_ary, /): + """ + to_numpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data type. + + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content of + ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) + + +def asnumpy(usm_ary): + """ + asnumpy(usm_ary) + + Copies content of :class:`dpctl.tensor.usm_ndarray` instance ``usm_ary`` + into :class:`numpy.ndarray` instance of the same shape and same data + type. 
+ + Args: + usm_ary (usm_ndarray): + Input array + Returns: + :class:`numpy.ndarray`: + An instance of :class:`numpy.ndarray` populated with content + of ``usm_ary`` + """ + return _copy_to_numpy(usm_ary) From 3be4e1498182891b7d59facdfe4c2a9f6ed3dc9f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 04:45:48 -0800 Subject: [PATCH 038/145] Update dpnp.asnumpy to use dpctl_ext functions --- dpnp/dpnp_array.py | 5 ++++- dpnp/dpnp_iface.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 6a2b2fd1977f..d122aff2c13f 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -41,6 +41,9 @@ import dpctl.tensor._type_utils as dtu from dpctl.tensor._numpy_helper import AxisError +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from . import memory as dpm @@ -764,7 +767,7 @@ def asnumpy(self): """ - return dpt.asnumpy(self._array_obj) + return dpt_ext.asnumpy(self._array_obj) def astype( self, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 533bdc36c617..6c050a208981 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -53,6 +53,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -136,7 +137,7 @@ def asnumpy(a, order="C"): return a.asnumpy() if isinstance(a, dpt.usm_ndarray): - return dpt.asnumpy(a) + return dpt_ext.asnumpy(a) return numpy.asarray(a, order=order) From 1d883652555cac390ba19d6fa294284690688ed1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 05:02:03 -0800 Subject: [PATCH 039/145] Move copy(), astype() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_copy_utils.py | 553 ++++++++++++++++++++++++++++++++ 2 files changed, 557 insertions(+) diff --git 
a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 2ce61a9ab242..a02cf85ed591 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -29,6 +29,8 @@ from dpctl_ext.tensor._copy_utils import ( asnumpy, + astype, + copy, from_numpy, to_numpy, ) @@ -44,6 +46,8 @@ __all__ = [ "asnumpy", + "astype", + "copy", "from_numpy", "full", "put", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 9041be7686f6..c62218893a2c 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -26,12 +26,16 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import builtins + import dpctl import dpctl.memory as dpm import dpctl.tensor as dpt import dpctl.utils import numpy as np +from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._type_utils import _dtype_supported_by_device_impl # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor @@ -200,3 +204,552 @@ def asnumpy(usm_ary): of ``usm_ary`` """ return _copy_to_numpy(usm_ary) + + +class Dummy: + """Helper class with specified ``__sycl_usm_array_interface__`` attribute""" + + def __init__(self, iface): + self.__sycl_usm_array_interface__ = iface + + +def _copy_overlapping(dst, src): + """Assumes src and dst have the same shape.""" + q = normalize_queue_device(sycl_queue=dst.sycl_queue) + tmp = dpt.usm_ndarray( + src.shape, + dtype=src.dtype, + buffer="device", + order="C", + buffer_ctor_kwargs={"queue": q}, + ) + _manager = dpctl.utils.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + hcp1, cp1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(hcp1, cp1) + hcp2, cp2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp, dst=dst, sycl_queue=q, depends=[cp1] + ) + _manager.add_event_pair(hcp2, cp2) 
+ + +def _copy_same_shape(dst, src): + """Assumes src and dst have the same shape.""" + # check that memory regions do not overlap + if ti._array_overlap(dst, src): + if src._pointer == dst._pointer and ( + src is dst + or (src.strides == dst.strides and src.dtype == dst.dtype) + ): + return + _copy_overlapping(src=src, dst=dst) + return + + copy_q = dst.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src, dst=dst, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + + +if hasattr(np, "broadcast_shapes"): + + def _broadcast_shapes(sh1, sh2): + return np.broadcast_shapes(sh1, sh2) + +else: + + def _broadcast_shapes(sh1, sh2): + # use arrays with zero strides, whose memory footprint + # is independent of the number of array elements + return np.broadcast( + np.empty(sh1, dtype=[]), + np.empty(sh2, dtype=[]), + ).shape + + +def _broadcast_strides(X_shape, X_strides, res_ndim): + """ + Broadcasts strides to match the given dimensions; + returns tuple type strides. + """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def _copy_from_usm_ndarray_to_usm_ndarray(dst, src): + if any( + not isinstance(arg, dpt.usm_ndarray) + for arg in ( + dst, + src, + ) + ): + raise TypeError( + "Both types are expected to be dpctl.tensor.usm_ndarray, " + f"got {type(dst)} and {type(src)}." 
+ ) + + if dst.ndim == src.ndim and dst.shape == src.shape: + _copy_same_shape(dst, src) + return + + try: + common_shape = _broadcast_shapes(dst.shape, src.shape) + except ValueError as exc: + raise ValueError("Shapes of two arrays are not compatible") from exc + + if dst.size < src.size and dst.size < np.prod(common_shape): + raise ValueError("Destination is smaller ") + + if len(common_shape) > dst.ndim: + ones_count = len(common_shape) - dst.ndim + for k in range(ones_count): + if common_shape[k] != 1: + raise ValueError + common_shape = common_shape[ones_count:] + + if src.ndim < len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + elif src.ndim == len(common_shape): + new_src_strides = _broadcast_strides( + src.shape, src.strides, len(common_shape) + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src, + strides=new_src_strides, + offset=src._element_offset, + ) + else: + # since broadcasting succeeded, src.ndim is greater because of + # leading sequence of ones, so we trim it + n = len(common_shape) + new_src_strides = _broadcast_strides( + src.shape[-n:], src.strides[-n:], n + ) + src_same_shape = dpt.usm_ndarray( + common_shape, + dtype=src.dtype, + buffer=src.usm_data, + strides=new_src_strides, + offset=src._element_offset, + ) + + _copy_same_shape(dst, src_same_shape) + + +def _make_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty array with shape and strides like `x`, with dtype `dt`, + USM type `usm_type`, on device `dev`. 
+ """ + st = list(x.strides) + perm = sorted( + range(x.ndim), + key=lambda d: builtins.abs(st[d]) if x.shape[d] > 1 else 0, + reverse=True, + ) + inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) + sh = x.shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if min(st) < 0: + st_sorted = [st[i] for i in perm] + sl = tuple( + ( + slice(None, None, -1) + if st_sorted[i] < 0 + else slice(None, None, None) + ) + for i in range(x.ndim) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_orderK(x, dt, usm_type=None, dev=None): + """ + Returns empty array like `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(x)}") + if usm_type is None: + usm_type = x.usm_type + if dev is None: + dev = x.device + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty_like( + x, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): + """ + Returns empty usm_ndarray like NumPy array `x`, using order='K' + + For an array `x` that was obtained by permutation of a contiguous + array the returned array will have the same shape and the same + strides as `x`. 
+ """ + if not isinstance(x, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(x)}") + fl = x.flags + if fl["C"] or x.size <= 1: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + elif fl["F"]: + return dpt.empty( + x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + return _make_empty_like_orderK(x, dt, usm_type, dev) + + +def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + nd1 = X1.ndim + nd2 = X2.ndim + if nd1 > nd2 and X1.shape == res_shape: + return _empty_like_orderK(X1, dt, usm_type, dev) + elif nd1 < nd2 and X2.shape == res_shape: + return _empty_like_orderK(X2, dt, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + if fl1["C"] or fl2["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + max_ndim = max(nd1, nd2) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if (st1_sorted[i] < 0 and st2_sorted[i] < 0) + else slice(None, None, 
None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): + if not isinstance(X1, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X1)}") + if not isinstance(X2, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X2)}") + if not isinstance(X3, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray, got {type(X3)}") + nd1 = X1.ndim + nd2 = X2.ndim + nd3 = X3.ndim + if X1.shape == res_shape and X2.shape == res_shape and len(res_shape) > nd3: + return _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev) + elif ( + X2.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd1 + ): + return _empty_like_pair_orderK(X2, X3, dt, res_shape, usm_type, dev) + elif ( + X1.shape == res_shape and X3.shape == res_shape and len(res_shape) > nd2 + ): + return _empty_like_pair_orderK(X1, X3, dt, res_shape, usm_type, dev) + fl1 = X1.flags + fl2 = X2.flags + fl3 = X3.flags + if fl1["C"] or fl2["C"] or fl3["C"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) + if fl1["F"] and fl2["F"] and fl3["F"]: + return dpt.empty( + res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" + ) + st1 = list(X1.strides) + st2 = list(X2.strides) + st3 = list(X3.strides) + max_ndim = max(nd1, nd2, nd3) + st1 += [0] * (max_ndim - len(st1)) + st2 += [0] * (max_ndim - len(st2)) + st3 += [0] * (max_ndim - len(st3)) + sh1 = list(X1.shape) + [0] * (max_ndim - nd1) + sh2 = list(X2.shape) + [0] * (max_ndim - nd2) + sh3 = list(X3.shape) + [0] * (max_ndim - nd3) + perm = sorted( + range(max_ndim), + key=lambda d: ( + builtins.abs(st1[d]) if sh1[d] > 1 else 0, + builtins.abs(st2[d]) if sh2[d] > 1 else 0, + builtins.abs(st3[d]) if sh3[d] > 1 else 0, + ), + reverse=True, + ) + inv_perm = sorted(range(max_ndim), key=lambda i: perm[i]) + st1_sorted = [st1[i] for i in perm] + st2_sorted = [st2[i] for i in 
perm] + st3_sorted = [st3[i] for i in perm] + sh = res_shape + sh_sorted = tuple(sh[i] for i in perm) + R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: + sl = tuple( + ( + slice(None, None, -1) + if ( + st1_sorted[i] < 0 + and st2_sorted[i] < 0 + and st3_sorted[i] < 0 + ) + else slice(None, None, None) + ) + for i in range(nd1) + ) + R = R[sl] + return dpt.permute_dims(R, inv_perm) + + +def copy(usm_ary, /, *, order="K"): + """copy(ary, order="K") + + Creates a copy of given instance of :class:`dpctl.tensor.usm_ndarray`. + + Args: + ary (usm_ndarray): + Input array + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + Controls the memory layout of the output array + Returns: + usm_ndarray: + A copy of the input array. + + Memory layout of the copy is controlled by ``order`` keyword, + following NumPy's conventions. The ``order`` keywords can be + one of the following: + + .. list-table:: + + * - ``"C"`` + - C-contiguous memory layout + * - ``"F"`` + - Fortran-contiguous memory layout + * - ``"A"`` + - Fortran-contiguous if the input array is also Fortran-contiguous, + otherwise C-contiguous + * - ``"K"`` + - match the layout of ``usm_ary`` as closely as possible. + + """ + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + if not isinstance(usm_ary, dpt.usm_ndarray): + raise TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, usm_ary.dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=usm_ary.dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_same_shape(R, usm_ary) + return R + + +def astype( + usm_ary, newdtype, /, *, order="K", casting="unsafe", copy=True, device=None +): + """astype(array, new_dtype, order="K", casting="unsafe", \ + copy=True, device=None) + + Returns a copy of the :class:`dpctl.tensor.usm_ndarray`, cast to a + specified type. + + Args: + array (usm_ndarray): + An input array. + new_dtype (dtype): + The data type of the resulting array. If `None`, gives default + floating point type supported by device where the resulting array + will be located. + order ({"C", "F", "A", "K"}, optional): + Controls memory layout of the resulting array if a copy + is returned. + casting ({'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional): + Controls what kind of data casting may occur. Please see + :meth:`numpy.ndarray.astype` for description of casting modes. + copy (bool, optional): + By default, `astype` always returns a newly allocated array. + If this keyword is set to `False`, a view of the input array + may be returned when possible. + device (object): array API specification of device where the + output array is created. Device can be specified by + a filter selector string, an instance of + :class:`dpctl.SyclDevice`, an instance of + :class:`dpctl.SyclQueue`, or an instance of + :class:`dpctl.tensor.Device`. If the value is `None`, + returned array is created on the same device as `array`. + Default: `None`. + + Returns: + usm_ndarray: + An array with requested data type. + + A view can be returned, if possible, when `copy=False` is used. 
+ """ + if not isinstance(usm_ary, dpt.usm_ndarray): + return TypeError( + f"Expected object of type dpt.usm_ndarray, got {type(usm_ary)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + ary_dtype = usm_ary.dtype + if device is not None: + if not isinstance(device, dpctl.SyclQueue): + if isinstance(device, dpt.Device): + device = device.sycl_queue + else: + device = dpt.Device.create_device(device).sycl_queue + d = device.sycl_device + target_dtype = _get_dtype(newdtype, device) + if not _dtype_supported_by_device_impl( + target_dtype, d.has_aspect_fp16, d.has_aspect_fp64 + ): + raise ValueError( + f"Requested dtype '{target_dtype}' is not supported by the " + "target device" + ) + usm_ary = usm_ary.to_device(device) + else: + target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) + + if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + raise TypeError( + f"Can not cast from {ary_dtype} to {newdtype} " + f"according to rule {casting}." + ) + c_contig = usm_ary.flags.c_contiguous + f_contig = usm_ary.flags.f_contiguous + needs_copy = copy or not ary_dtype == target_dtype + if not needs_copy and (order != "K"): + # ensure that order="F" for C-contig input triggers copy, + # and order="C" for F-contig input triggers copy too. + # 1D arrays which are both C- and F- contig should not + # force copying for neither order="F", nor order="C", see gh-1926 + needs_copy = ( + c_contig and not f_contig and order not in ["A", "C"] + ) or (not c_contig and f_contig and order not in ["A", "F"]) + if not needs_copy: + return usm_ary + copy_order = "C" + if order == "C": + pass + elif order == "F": + copy_order = order + elif order == "A": + if usm_ary.flags.f_contiguous: + copy_order = "F" + elif order == "K": + if usm_ary.flags.f_contiguous: + copy_order = "F" + else: + raise ValueError( + "Unrecognized value of the order keyword. 
" + "Recognized values are 'A', 'C', 'F', or 'K'" + ) + if order == "K": + R = _empty_like_orderK(usm_ary, target_dtype) + else: + R = dpt.usm_ndarray( + usm_ary.shape, + dtype=target_dtype, + buffer=usm_ary.usm_type, + order=copy_order, + buffer_ctor_kwargs={"queue": usm_ary.sycl_queue}, + ) + _copy_from_usm_ndarray_to_usm_ndarray(R, usm_ary) + return R From fd18db07dc59fa927e14d39f730fce9d6c2b42ec Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 05:24:47 -0800 Subject: [PATCH 040/145] reuse astype(), copy() from dpctl_ext --- dpctl_ext/tensor/_ctors.py | 5 ++++- dpctl_ext/tensor/_indexing_functions.py | 5 ++++- dpnp/dpnp_algo/dpnp_arraycreation.py | 5 ++++- dpnp/dpnp_algo/dpnp_elementwise_common.py | 13 +++++++------ dpnp/dpnp_algo/dpnp_fill.py | 6 +++--- dpnp/dpnp_container.py | 4 +++- dpnp/dpnp_iface_arraycreation.py | 7 +++++-- dpnp/dpnp_iface_indexing.py | 15 ++++++++------- dpnp/dpnp_iface_statistics.py | 5 ++++- 9 files changed, 42 insertions(+), 23 deletions(-) diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index a0e7b28e66ff..5a39e9367e9c 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -36,6 +36,9 @@ from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti @@ -147,7 +150,7 @@ def full( usm_type=usm_type, sycl_queue=sycl_queue, ) - return dpt.copy(dpt.broadcast_to(X, shape), order=order) + return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order) else: _validate_fill_value(fill_value) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 106df09cf97e..df4f3e953042 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -32,6 +32,9 @@ import dpctl.tensor as dpt import 
dpctl.utils +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index @@ -185,7 +188,7 @@ def put_vec_duplicates(vec, ind, vals): if vals.dtype == x.dtype: rhs = vals else: - rhs = dpt.astype(vals, x.dtype) + rhs = dpt_ext.astype(vals, x.dtype) rhs = dpt.broadcast_to(rhs, val_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index d94a031801f3..27f095158a85 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -33,6 +33,9 @@ import dpctl.utils as dpu import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device @@ -256,7 +259,7 @@ def dpnp_linspace( if dpnp.issubdtype(dtype, dpnp.integer): dpt.floor(usm_res, out=usm_res) - res = dpt.astype(usm_res, dtype, copy=False) + res = dpt_ext.astype(usm_res, dtype, copy=False) res = dpnp_array._create_from_usm_ndarray(res) if retstep is True: diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 88abcee5035c..55d74e8c1803 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -47,6 +47,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as dti import dpnp import dpnp.backend.extensions.vm._vm_impl as vmi @@ -212,7 +213,7 @@ def __call__( x_usm = dpnp.get_usm_ndarray(x) if dtype is not None: - x_usm = dpt.astype(x_usm, dtype, copy=False) + x_usm = dpt_ext.astype(x_usm, dtype, copy=False) out = 
self._unpack_out_kw(out) out_usm = None if out is None else dpnp.get_usm_ndarray(out) @@ -718,9 +719,9 @@ def __call__( sycl_queue=x2.sycl_queue, usm_type=x2.usm_type, ) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) elif dpnp.isscalar(x2): - x1_usm = dpt.astype(x1_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) x2_usm = dpt.asarray( x2, dtype=dtype, @@ -728,8 +729,8 @@ def __call__( usm_type=x1.usm_type, ) else: - x1_usm = dpt.astype(x1_usm, dtype, copy=False) - x2_usm = dpt.astype(x2_usm, dtype, copy=False) + x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) + x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) res_usm = super().__call__(x1_usm, x2_usm, out=out_usm, order=order) @@ -1325,7 +1326,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None): res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm) if dtype is not None: - res_usm = dpt.astype(res_usm, dtype, copy=False) + res_usm = dpt_ext.astype(res_usm, dtype, copy=False) if out is not None and isinstance(out, dpnp_array): return out diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index 4137a2794747..ddba9f634cb1 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -32,10 +32,10 @@ import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val -import dpnp - # TODO: revert to `from dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _full_usm_ndarray, @@ -56,7 +56,7 @@ def dpnp_fill(arr, val): raise dpu.ExecutionPlacementError( "Input arrays have incompatible queues." 
) - a_val = dpt.astype(val, arr.dtype) + a_val = dpt_ext.astype(val, arr.dtype) a_val = dpt.broadcast_to(a_val, arr.shape) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index c8e28529cd57..acda579a5f5e 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -38,6 +38,8 @@ import dpctl.tensor as dpt import dpctl.utils as dpu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array @@ -141,7 +143,7 @@ def copy(x1, /, *, order="K"): if order is None: order = "K" - array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order) + array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order) return dpnp_array._create_from_usm_ndarray(array_obj) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 8d4ebdd1a6c2..404d9e0fe37d 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -46,6 +46,9 @@ import dpctl.tensor as dpt import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp import dpnp_container @@ -934,7 +937,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None): order = "K" usm_x = dpnp.get_usm_ndarray(x) - usm_res = dpt.astype( + usm_res = dpt_ext.astype( usm_x, dtype, order=order, casting=casting, copy=copy, device=device ) @@ -3129,7 +3132,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): output = dpt.broadcast_arrays(*output) if copy: - output = [dpt.copy(x) for x in output] + output = [dpt_ext.copy(x) for x in output] return [dpnp_array._create_from_usm_ndarray(x) for x in output] diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a01a036e16cc..a77877d8d685 100644 --- a/dpnp/dpnp_iface_indexing.py +++ 
b/dpnp/dpnp_iface_indexing.py @@ -51,11 +51,10 @@ from dpctl.tensor._indexing_functions import _get_indexing_mode from dpctl.tensor._numpy_helper import normalize_axis_index -import dpctl_ext.tensor as dpt_ext - # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti import dpnp @@ -243,7 +242,7 @@ def choose(a, choices, out=None, mode="wrap"): # NumPy will cast up to int64 in general but # int32 is more than safe for bool if ind_dt == dpnp.bool: - inds = dpt.astype(inds, dpt.int32) + inds = dpt_ext.astype(inds, dpt.int32) else: raise TypeError("input index array must be of integer data type") @@ -256,7 +255,7 @@ def choose(a, choices, out=None, mode="wrap"): choices = tuple( map( lambda chc: ( - chc if chc.dtype == res_dt else dpt.astype(chc, res_dt) + chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt) ), choices, ) @@ -1616,7 +1615,9 @@ def place(a, mask, vals): if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, # while numpy.place does that with "safe" casting rule - usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False) + usm_vals = dpt_ext.astype( + usm_vals, usm_a.dtype, casting="safe", copy=False + ) dpt.place(usm_a, usm_mask, usm_vals) @@ -1712,7 +1713,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe") + usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe") in_usm_a = usm_a if axis is None and usm_a.ndim > 1: @@ -2171,7 +2172,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.take supports only integer dtype for array of indices - usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe") + 
usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe") usm_res = _take_index( usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index 7e092184366c..daff981d5cc4 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -47,6 +47,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -1204,7 +1207,7 @@ def mean(a, /, axis=None, dtype=None, out=None, keepdims=False, *, where=True): usm_a = dpnp.get_usm_ndarray(a) usm_res = dpt.mean(usm_a, axis=axis, keepdims=keepdims) if dtype is not None: - usm_res = dpt.astype(usm_res, dtype) + usm_res = dpt_ext.astype(usm_res, dtype) return dpnp.get_result_array(usm_res, out, casting="unsafe") From 09761710153cee6060ffe27461af6444d3e4699b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:00:04 -0800 Subject: [PATCH 041/145] Move _copy_usm_ndarray_for_reshape --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/source/copy_for_reshape.cpp | 184 ++++++++++++++++++ .../libtensor/source/copy_for_reshape.hpp | 54 +++++ .../tensor/libtensor/source/tensor_ctors.cpp | 17 +- 4 files changed, 248 insertions(+), 9 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 4e3fa580f99e..b30a1ac8b029 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -49,7 +49,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp new file mode 100644 index 000000000000..524bfcfdb98b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.cpp @@ -0,0 +1,184 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_reshape.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_reshape_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_reshape_fn_ptr_t + copy_for_reshape_generic_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + py::ssize_t src_nelems = src.get_size(); + py::ssize_t dst_nelems = dst.get_size(); + + // Must have the same number of elements + if (src_nelems != dst_nelems) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same number of elements."); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_reshape requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + // dimensions may be different + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + auto fn = copy_for_reshape_generic_dispatch_vector[type_id]; + + auto src_shape = src.get_shape_vector(); + 
auto src_strides = src.get_strides_vector(); + + auto dst_shape = dst.get_shape_vector(); + auto dst_strides = dst.get_strides_vector(); + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_shape, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, src_shape, src_strides, dst_shape, + dst_strides); + auto copy_shape_ev = std::get<2>(ptr_size_event_tuple); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_reshape_event = + fn(exec_q, src_nelems, src_nd, dst_nd, shape_strides, src_data, + dst_data, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_reshape_event}, shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_reshape_event); +} + +void init_copy_for_reshape_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForReshapeGenericFactory; + + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(copy_for_reshape_generic_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp new file mode 100644 index 000000000000..c5af885ad6cd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_reshape.hpp @@ -0,0 +1,54 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_reshape(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_reshape_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 07c8fd1bf99f..4903f22bd481 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -51,7 +51,7 @@ // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" -// #include "copy_for_reshape.hpp" +#include "copy_for_reshape.hpp" // #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" @@ -87,7 +87,7 @@ using dpctl::tensor::py_internal::py_as_f_contig; /* =========================== Copy for reshape ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ @@ -279,12 +279,13 @@ PYBIND11_MODULE(_tensor_impl, m) }, ""); - // m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "number of elements using underlying 'C'-contiguous order for - // flat " "traversal. 
" "Returns a tuple of events: (ht_event, - // comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_reshape", ©_usm_ndarray_for_reshape, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "number of elements using underlying 'C'-contiguous order for flat " + "traversal. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same From 318692ef0f33820ddf5850ddf5a9a0a030b139a9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:26:33 -0800 Subject: [PATCH 042/145] Move reshape() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_reshape.py | 206 +++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 dpctl_ext/tensor/_reshape.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a02cf85ed591..416216074f8a 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,7 @@ put, take, ) +from dpctl_ext.tensor._reshape import reshape __all__ = [ "asnumpy", @@ -51,6 +52,7 @@ "from_numpy", "full", "put", + "reshape", "take", "to_numpy", "tril", diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py new file mode 100644 index 000000000000..6afa1dc245c3 --- /dev/null +++ b/dpctl_ext/tensor/_reshape.py @@ -0,0 +1,206 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils +import numpy as np +from dpctl.tensor._tensor_impl import ( + _copy_usm_ndarray_for_reshape, + _ravel_multi_index, + _unravel_index, +) + +__doc__ = "Implementation module for :func:`dpctl.tensor.reshape`." 
+ + +def _make_unit_indexes(shape): + """ + Construct a diagonal matrix with with one on the diagonal + except if the corresponding element of shape is 1. + """ + nd = len(shape) + mi = np.zeros((nd, nd), dtype="u4") + for i, dim in enumerate(shape): + mi[i, i] = 1 if dim > 1 else 0 + return mi + + +def ti_unravel_index(flat_index, shape, order="C"): + return _unravel_index(flat_index, shape, order) + + +def ti_ravel_multi_index(multi_index, shape, order="C"): + return _ravel_multi_index(multi_index, shape, order) + + +def reshaped_strides(old_sh, old_sts, new_sh, order="C"): + """ + When reshaping array with `old_sh` shape and `old_sts` strides + into the new shape `new_sh`, returns the new stride if the reshape + can be a view, otherwise returns `None`. + """ + eye_new_mi = _make_unit_indexes(new_sh) + new_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + old_sts, ti_unravel_index(flat_index, old_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, new_sh, order=order) + for unitvec in eye_new_mi + ] + ] + eye_old_mi = _make_unit_indexes(old_sh) + check_sts = [ + sum( + st_i * ind_i + for st_i, ind_i in zip( + new_sts, ti_unravel_index(flat_index, new_sh, order=order) + ) + ) + for flat_index in [ + ti_ravel_multi_index(unitvec, old_sh, order=order) + for unitvec in eye_old_mi + ] + ] + valid = all( + check_st == old_st or old_dim == 1 + for check_st, old_st, old_dim in zip(check_sts, old_sts, old_sh) + ) + return new_sts if valid else None + + +def reshape(X, /, shape, *, order="C", copy=None): + """reshape(x, shape, order="C") + + Reshapes array ``x`` into new shape. + + Args: + x (usm_ndarray): + input array + shape (Tuple[int]): + the desired shape of the resulting array. + order ("C", "F", optional): + memory layout of the resulting array + if a copy is found to be necessary. Supported + choices are ``"C"`` for C-contiguous, or row-major layout; + and ``"F"`` for F-contiguous, or column-major layout. 
+ + Returns: + usm_ndarray: + Reshaped array is a view, if possible, + and a copy otherwise with memory layout as indicated + by ``order`` keyword. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError + if not isinstance(shape, (list, tuple)): + shape = (shape,) + if order in "cfCF": + order = order.upper() + else: + raise ValueError( + f"Keyword 'order' not recognized. Expecting 'C' or 'F', got {order}" + ) + if copy not in (True, False, None): + raise ValueError( + f"Keyword 'copy' not recognized. Expecting True, False, " + f"or None, got {copy}" + ) + shape = [operator.index(d) for d in shape] + negative_ones_count = 0 + for nshi in shape: + if nshi == -1: + negative_ones_count = negative_ones_count + 1 + if (nshi < -1) or negative_ones_count > 1: + raise ValueError( + "Target shape should have at most 1 negative " + "value which can only be -1" + ) + if negative_ones_count: + sz = -np.prod(shape) + if sz == 0: + raise ValueError( + f"Can not reshape array of size {X.size} into " + f"shape {tuple(i for i in shape if i >= 0)}" + ) + v = X.size // sz + shape = [v if d == -1 else d for d in shape] + if X.size != np.prod(shape): + raise ValueError(f"Can not reshape into {shape}") + if X.size: + newsts = reshaped_strides(X.shape, X.strides, shape, order=order) + else: + newsts = (1,) * len(shape) + copy_required = newsts is None + if copy_required and (copy is False): + raise ValueError( + "Reshaping the array requires a copy, but no copying was " + "requested by using copy=False" + ) + copy_q = X.sycl_queue + if copy_required or (copy is True): + # must perform a copy + copy_q = X.sycl_queue + flat_res = dpt.usm_ndarray( + (X.size,), + dtype=X.dtype, + buffer=X.usm_type, + buffer_ctor_kwargs={"queue": copy_q}, + ) + _manager = dpctl.utils.SequentialOrderManager[copy_q] + dep_evs = _manager.submitted_events + if order == "C": + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + else: + X_t = 
dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + hev, r_e = _copy_usm_ndarray_for_reshape( + src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs + ) + _manager.add_event_pair(hev, r_e) + return dpt.usm_ndarray( + tuple(shape), dtype=X.dtype, buffer=flat_res, order=order + ) + # can form a view + if (len(shape) == X.ndim) and all( + s1 == s2 for s1, s2 in zip(shape, X.shape) + ): + return X + return dpt.usm_ndarray( + shape, + dtype=X.dtype, + buffer=X, + strides=tuple(newsts), + offset=X._element_offset, + ) From 3c0c1132df85fc9ae76e6c953392ba674e0dd4f3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:27:16 -0800 Subject: [PATCH 043/145] Reuse reshape from dpctl_ext in dpnp --- dpnp/dpnp_algo/dpnp_arraycreation.py | 4 +++- dpnp/dpnp_iface_arraycreation.py | 8 ++++---- dpnp/dpnp_iface_indexing.py | 24 ++++++++++++------------ dpnp/dpnp_iface_manipulation.py | 9 ++++++--- dpnp/dpnp_iface_sorting.py | 5 ++++- dpnp/tests/test_arraycreation.py | 5 ++++- 6 files changed, 33 insertions(+), 22 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 27f095158a85..47edf63a68b4 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -233,7 +233,9 @@ def dpnp_linspace( usm_type=_usm_type, sycl_queue=sycl_queue_normalized, ) - usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False) + usm_res = dpt_ext.reshape( + usm_res, (-1,) + (1,) * delta.ndim, copy=False + ) if step_num > 0: step = delta / step_num diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 404d9e0fe37d..c2dd5793f827 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3117,7 +3117,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): s0 = (1,) * ndim output = [ - dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :]) + dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i 
+ 1 :]) for i, x in enumerate(xi) ] @@ -3125,8 +3125,8 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): _, _ = get_usm_allocations(output) if indexing == "xy" and ndim > 1: - output[0] = dpt.reshape(output[0], (1, -1) + s0[2:]) - output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:]) + output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:]) + output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:]) if not sparse: output = dpt.broadcast_arrays(*output) @@ -3932,7 +3932,7 @@ def vander( tmp = m[:, ::-1] if not increasing else m dpnp.power( - dpt.reshape(usm_x, (-1, 1)), + dpt_ext.reshape(usm_x, (-1, 1)), dpt.arange( N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue ), diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a77877d8d685..439ec288ebe2 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -814,14 +814,14 @@ def extract(condition, a): ) if usm_cond.size != usm_a.size: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: - usm_a = dpt.reshape(usm_a, -1) - usm_cond = dpt.reshape(usm_cond, -1) + usm_a = dpt_ext.reshape(usm_a, -1) + usm_cond = dpt_ext.reshape(usm_cond, -1) usm_res = dpt.extract(usm_cond, usm_a) @@ -958,18 +958,18 @@ def fill_diagonal(a, val, wrap=False): # a.flat[:end:step] = val # but need to consider use case when `a` is usm_ndarray also a_sh = a.shape - tmp_a = dpt.reshape(usm_a, -1) + tmp_a = dpt_ext.reshape(usm_a, -1) if dpnp.isscalar(usm_val): tmp_a[:end:step] = usm_val else: - usm_val = dpt.reshape(usm_val, -1) + usm_val = dpt_ext.reshape(usm_val, -1) # Setitem can work only if index size equal val size. # Using loop for general case without dependencies of val size. 
for i in range(0, usm_val.size): tmp_a[step * i : end : step * (i + 1)] = usm_val[i] - tmp_a = dpt.reshape(tmp_a, a_sh) + tmp_a = dpt_ext.reshape(tmp_a, a_sh) usm_a[:] = tmp_a @@ -1610,7 +1610,7 @@ def place(a, mask, vals): if usm_vals.ndim != 1: # dpt.place supports only 1-D array of values - usm_vals = dpt.reshape(usm_vals, -1) + usm_vals = dpt_ext.reshape(usm_vals, -1) if usm_vals.dtype != usm_a.dtype: # dpt.place casts values to a.dtype with "unsafe" rule, @@ -1709,7 +1709,7 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): if usm_ind.ndim != 1: # dpt.put supports only 1-D array of indices - usm_ind = dpt.reshape(usm_ind, -1, copy=False) + usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False) if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer): # dpt.put supports only integer dtype for array of indices @@ -1717,11 +1717,11 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"): in_usm_a = usm_a if axis is None and usm_a.ndim > 1: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode) if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access - in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False) + in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False) def put_along_axis(a, ind, values, axis, mode="wrap"): @@ -2163,7 +2163,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): if axis is None: if a_ndim > 1: # flatten input array - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = 0 elif a_ndim == 0: axis = normalize_axis_index(operator.index(axis), 1) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 9df5278bd16b..f992cc366e20 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -53,6 +53,9 @@ normalize_axis_tuple, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from 
.dpnp_array import dpnp_array @@ -415,7 +418,7 @@ def _get_first_nan_index(usm_a): dpt.place( usm_res.inverse_indices, usm_res.inverse_indices > first_nan, - dpt.reshape(first_nan, 1), + dpt_ext.reshape(first_nan, 1), ) result += (usm_res.inverse_indices,) @@ -3057,7 +3060,7 @@ def reshape(a, /, shape, order="C", *, copy=None): ) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy) + usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy) return dpnp_array._create_from_usm_ndarray(usm_res) @@ -3259,7 +3262,7 @@ def roll(x, shift, axis=None): shift = dpnp.asnumpy(shift) if axis is None: - return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape) + return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) usm_res = dpt.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index db33a88c7488..3e5cdd4da6a6 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -43,6 +43,9 @@ import numpy from dpctl.tensor._numpy_helper import normalize_axis_index +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp # pylint: disable=no-name-in-module @@ -84,7 +87,7 @@ def _wrap_sort_argsort( usm_a = dpnp.get_usm_ndarray(a) if axis is None: - usm_a = dpt.reshape(usm_a, -1) + usm_a = dpt_ext.reshape(usm_a, -1) axis = -1 axis = normalize_axis_index(axis, ndim=usm_a.ndim) diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index eb20f9b3ffe5..423004470bad 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -13,6 +13,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -969,7 +972,7 @@ def test_ones_like(array, dtype, 
order): ], ) def test_dpctl_tensor_input(func, args): - x0 = dpt.reshape(dpt.arange(9), (3, 3)) + x0 = dpt_ext.reshape(dpt.arange(9), (3, 3)) new_args = [eval(val, {"x0": x0}) for val in args] X = getattr(dpt, func)(*new_args) Y = getattr(dpnp, func)(*new_args) From 30f2c53ee0d6dcddfbc8d0933a667705b048e8b1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:44:48 -0800 Subject: [PATCH 044/145] Move _copy_usm_ndarray_for_roll --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../tensor/libtensor/source/copy_for_roll.cpp | 400 ++++++++++++++++++ .../tensor/libtensor/source/copy_for_roll.hpp | 65 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 38 +- 4 files changed, 485 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index b30a1ac8b029..a39368c7baa3 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -50,7 +50,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp new file mode 100644 index 000000000000..a187b2247677 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.cpp @@ -0,0 +1,400 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "copy_for_roll.hpp" +#include "kernels/copy_and_cast.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_contig_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast:: + copy_for_roll_ndshift_strided_fn_ptr_t; +using dpctl::tensor::kernels::copy_and_cast::copy_for_roll_strided_fn_ptr_t; +using dpctl::utils::keep_args_alive; + +// define static vector +static copy_for_roll_strided_fn_ptr_t + copy_for_roll_strided_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_contig_fn_ptr_t + copy_for_roll_contig_dispatch_vector[td_ns::num_types]; + +static copy_for_roll_ndshift_strided_fn_ptr_t + copy_for_roll_ndshift_dispatch_vector[td_ns::num_types]; + +/* + * Copies src into dst (same data type) of different shapes by using flat + * iterations. 
+ * + * Equivalent to the following loop: + * + * for i for range(src.size): + * dst[np.multi_index(i, dst.shape)] = src[np.multi_index(i, src.shape)] + */ +std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same number of dimensions."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_1d requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check same contexts + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, 
{copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + const bool is_src_c_contig = src.is_c_contiguous(); + const bool is_src_f_contig = src.is_f_contiguous(); + + const bool is_dst_c_contig = dst.is_c_contiguous(); + const bool is_dst_f_contig = dst.is_f_contiguous(); + + const bool both_c_contig = is_src_c_contig && is_dst_c_contig; + const bool both_f_contig = is_src_f_contig && is_dst_f_contig; + + // normalize shift parameter to be 0 <= offset < src_nelems + std::size_t offset = + (shift > 0) ? (shift % src_nelems) : src_nelems + (shift % src_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + if (both_c_contig || both_f_contig) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + static constexpr py::ssize_t zero_offset = 0; + + sycl::event copy_for_roll_ev = + fn(exec_q, offset, src_nelems, src_data, zero_offset, dst_data, + zero_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t dst_offset(0); + + int nd = src_nd; + const py::ssize_t *shape = src_shape_ptr; + + // nd, simplified_* and *_offset are modified by reference + dpctl::tensor::py_internal::simplify_iteration_space( + nd, shape, src_strides, dst_strides, + // output + simplified_shape, simplified_src_strides, simplified_dst_strides, + src_offset, dst_offset); + + if (nd == 1 && simplified_src_strides[0] == 1 && + simplified_dst_strides[0] == 1) { + auto fn = copy_for_roll_contig_dispatch_vector[type_id]; + + if (fn != nullptr) { + + sycl::event copy_for_roll_ev = + 
fn(exec_q, offset, src_nelems, src_data, src_offset, dst_data, + dst_offset, depends); + + sycl::event ht_ev = + keep_args_alive(exec_q, {src, dst}, {copy_for_roll_ev}); + + return std::make_pair(ht_ev, copy_for_roll_ev); + } + } + + auto fn = copy_for_roll_strided_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_shape, simplified_src_strides, + simplified_dst_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, offset, src_nelems, src_nd, shape_strides, src_data, + src_offset, dst_data, dst_offset, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + // Must have the same number of dimensions + if (src_nd != dst_nd) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same number of dimensions."); + } + + if (static_cast(src_nd) != shifts.size()) { + throw 
py::value_error( + "copy_usm_ndarray_for_roll_nd requires shifts to " + "contain an integral shift for each array dimension."); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + if (!std::equal(src_shape_ptr, src_shape_ptr + src_nd, dst_shape_ptr)) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same shape."); + } + + py::ssize_t src_nelems = src.get_size(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + // typenames must be the same + if (src_typenum != dst_typenum) { + throw py::value_error( + "copy_usm_ndarray_for_roll_nd requires src and dst to " + "have the same type."); + } + + if (src_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, src_nelems); + + // check for compatible queues + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (src_nelems == 1) { + // handle special case of 1-element array + int src_elemsize = src.get_elemsize(); + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + sycl::event copy_ev = + exec_q.copy(src_data, dst_data, src_elemsize, depends); + return std::make_pair(keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + + auto array_types = td_ns::usm_ndarray_types(); + int type_id = array_types.typenum_to_lookup_id(src_typenum); + + std::vector normalized_shifts{}; + normalized_shifts.reserve(src_nd); + + for (int i = 0; i < src_nd; ++i) { + // normalize shift parameter to be 0 <= offset < dim + py::ssize_t dim = src_shape_ptr[i]; + std::size_t offset = + (shifts[i] >= 0) ? 
(shifts[i] % dim) : dim + (shifts[i] % dim); + + normalized_shifts.push_back(offset); + } + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + auto const &src_strides = src.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + auto const &common_shape = src.get_shape_vector(); + + static constexpr py::ssize_t src_offset = 0; + static constexpr py::ssize_t dst_offset = 0; + + auto fn = copy_for_roll_ndshift_dispatch_vector[type_id]; + + std::vector host_task_events; + host_task_events.reserve(2); + + // shape_strides = [src_shape, src_strides, dst_strides] + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, common_shape, src_strides, dst_strides, + normalized_shifts); + auto shape_strides_shifts_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides_shifts = shape_strides_shifts_owner.get(); + + std::vector all_deps(depends.size() + 1); + all_deps.push_back(copy_shape_ev); + all_deps.insert(std::end(all_deps), std::begin(depends), std::end(depends)); + + sycl::event copy_for_roll_event = + fn(exec_q, src_nelems, src_nd, shape_strides_shifts, src_data, + src_offset, dst_data, dst_offset, all_deps); + + auto temporaries_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {copy_for_roll_event}, shape_strides_shifts_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + return std::make_pair(keep_args_alive(exec_q, {src, dst}, host_task_events), + copy_for_roll_event); +} + +void init_copy_for_roll_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::copy_and_cast::CopyForRollStridedFactory; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(copy_for_roll_strided_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollContigFactory; 
+ DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(copy_for_roll_contig_dispatch_vector); + + using dpctl::tensor::kernels::copy_and_cast::CopyForRollNDShiftFactory; + DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(copy_for_roll_ndshift_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp new file mode 100644 index 000000000000..cffbf9f6f0d6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/copy_for_roll.hpp @@ -0,0 +1,65 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + copy_usm_ndarray_for_roll_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + py::ssize_t shift, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair + copy_usm_ndarray_for_roll_nd(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const std::vector &shifts, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_copy_for_roll_dispatch_vectors(); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 4903f22bd481..c1372c1c2406 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" #include "copy_for_reshape.hpp" -// #include "copy_for_roll.hpp" 
+#include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" // #include "eye_ctor.hpp" @@ -91,8 +91,8 @@ using dpctl::tensor::py_internal::copy_usm_ndarray_for_reshape; /* =========================== Copy for roll ============================= */ -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; -// using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_1d; +using dpctl::tensor::py_internal::copy_usm_ndarray_for_roll_nd; /* ============= Copy from numpy.ndarray to usm_ndarray ==================== */ @@ -158,8 +158,8 @@ void init_dispatch_vectors(void) using namespace dpctl::tensor::py_internal; init_copy_as_contig_dispatch_vectors(); - // init_copy_for_reshape_dispatch_vectors(); - // init_copy_for_roll_dispatch_vectors(); + init_copy_for_reshape_dispatch_vectors(); + init_copy_for_roll_dispatch_vectors(); // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); @@ -287,21 +287,21 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for flat " - // "traversal with shift. " - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("src"), py::arg("dst"), py::arg("shift"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_copy_usm_ndarray_for_roll_1d", ©_usm_ndarray_for_roll_1d, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for flat " + "traversal with shift. 
" + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shift"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, - // "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same - // " "shapes using underlying 'C'-contiguous order for " "traversal - // with shifts along each axis. " "Returns a tuple of events: - // (ht_event, comp_event)", py::arg("src"), py::arg("dst"), - // py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = - // py::list()); + m.def("_copy_usm_ndarray_for_roll_nd", ©_usm_ndarray_for_roll_nd, + "Copies from usm_ndarray `src` into usm_ndarray `dst` with the same " + "shapes using underlying 'C'-contiguous order for " + "traversal with shifts along each axis. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("src"), py::arg("dst"), py::arg("shifts"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, // "Fills input 1D contiguous usm_ndarray `dst` with linear From 85c29daa5b6855db9d3b023e29777950aebaba18 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:57:28 -0800 Subject: [PATCH 045/145] Move roll() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_manipulation_functions.py | 120 ++++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 dpctl_ext/tensor/_manipulation_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 416216074f8a..edb2c096bad1 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,9 @@ put, take, ) +from dpctl_ext.tensor._manipulation_functions import ( + roll, +) from dpctl_ext.tensor._reshape import reshape __all__ = [ @@ -53,6 +56,7 @@ "full", "put", "reshape", + "roll", "take", "to_numpy", "tril", diff --git 
a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py new file mode 100644 index 000000000000..fa8fc27876b3 --- /dev/null +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -0,0 +1,120 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import operator + +import dpctl.tensor as dpt +import dpctl.utils as dputils +import numpy as np + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_tuple + +__doc__ = ( + "Implementation module for array manipulation " + "functions in :module:`dpctl.tensor`" +) + + +def roll(x, /, shift, *, axis=None): + """ + roll(x, shift, axis) + + Rolls array elements along a specified axis. + Array elements that roll beyond the last position are re-introduced + at the first position. Array elements that roll beyond the first position + are re-introduced at the last position. + + Args: + x (usm_ndarray): input array + shift (Union[int, Tuple[int,...]]): number of places by which the + elements are shifted. If `shift` is a tuple, then `axis` must be a + tuple of the same size, and each of the given axes must be shifted + by the corresponding element in `shift`. If `shift` is an `int` + and `axis` a tuple, then the same `shift` must be used for all + specified axes. If a `shift` is positive, then array elements is + shifted positively (toward larger indices) along the dimension of + `axis`. + If a `shift` is negative, then array elements must be shifted + negatively (toward smaller indices) along the dimension of `axis`. + axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along which + elements to shift. If `axis` is `None`, the array is + flattened, shifted, and then restored to its original shape. + Default: `None`. + + Returns: + usm_ndarray: + An array having the same `dtype`, `usm_type` and + `device` attributes as `x` and whose elements are shifted relative + to `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + exec_q = x.sycl_queue + _manager = dputils.SequentialOrderManager[exec_q] + if axis is None: + shift = operator.index(shift) + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + sz = operator.index(x.size) + shift = (shift % sz) if sz > 0 else 0 + dep_evs = _manager.submitted_events + hev, roll_ev = ti._copy_usm_ndarray_for_roll_1d( + src=x, + dst=res, + shift=shift, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, roll_ev) + return res + axis = normalize_axis_tuple(axis, x.ndim, allow_duplicate=True) + broadcasted = np.broadcast(shift, axis) + if broadcasted.ndim > 1: + raise ValueError("'shift' and 'axis' should be scalars or 1D sequences") + shifts = [ + 0, + ] * x.ndim + shape = x.shape + for sh, ax in broadcasted: + n_i = operator.index(shape[ax]) + shifted = shifts[ax] + operator.index(sh) + shifts[ax] = (shifted % n_i) if n_i > 0 else 0 + res = dpt.empty( + x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + dep_evs = _manager.submitted_events + ht_e, roll_ev = ti._copy_usm_ndarray_for_roll_nd( + src=x, dst=res, shifts=shifts, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e, roll_ev) + return res From 6e8d857a5e85894a11c16a728ec437fac629a196 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 17 Feb 2026 06:58:11 -0800 Subject: [PATCH 046/145] Update dpnp.roll to use dpctl_ext --- dpnp/dpnp_iface_manipulation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index f992cc366e20..edd98348afe3 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3264,7 +3264,7 @@ def roll(x, shift, axis=None): if axis is None: return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape) - usm_res = dpt.roll(usm_x, shift=shift, 
axis=axis) + usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From 19e93b99c7c2c238f1b697dfefe5b70525370819 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:34:09 -0800 Subject: [PATCH 047/145] Update .gitignore to ignore .so files in dpctl_ext --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5d2725d3186f..0cfebe53f623 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,7 @@ dpnp/**/*.cpython*.so dpnp/**/*.pyd *~ core + +# TODO: revert to `dpctl/` +# when dpnp fully migrates dpctl/tensor +dpctl_ext/**/*.cpython*.so From b111e49b784168180c835569d5dbe97958521f16 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 04:35:23 -0800 Subject: [PATCH 048/145] Remove unused includes in tensor_ctors.cpp --- dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index be69ee1a8c7e..54d6adbc8f6e 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -32,18 +32,15 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions //===----------------------------------------------------------------------===// -// #include -// #include -// #include -#include -#include -#include -// #include #include #include #include #include +#include +#include +#include + #include "dpnp4pybind11.hpp" // #include "accumulators.hpp" From c082224e07df5e4d4960112ef5ec4e5faef2a452 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 18 Feb 2026 05:40:59 -0800 Subject: [PATCH 049/145] Use Python::Module for dpctl_ext static lib to avoid libpython dependency --- dpctl_ext/tensor/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 28e7a4cb55f4..ed69b4f10cba 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -27,7 +27,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -find_package(Python COMPONENTS Development) +find_package(Python COMPONENTS Development.Module) if(WIN32) if(${CMAKE_VERSION} VERSION_LESS "3.27") @@ -74,7 +74,7 @@ target_include_directories( # ${Dpctl_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include ) -target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Python) +target_link_libraries(${_static_lib_trgt} PRIVATE pybind11::headers Python::Module) set_target_properties(${_static_lib_trgt} PROPERTIES POSITION_INDEPENDENT_CODE ON) set(_py_trgts) From c3bc8ba76728c212cc9117bac72cf4b68f88da92 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 04:58:04 -0800 Subject: [PATCH 050/145] Move ti.mask_positions() and ti._cumsum_1d() --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/accumulators.hpp | 1448 +++++++++++++++++ .../tensor/libtensor/source/accumulators.cpp | 406 +++++ .../tensor/libtensor/source/accumulators.hpp | 62 + .../tensor/libtensor/source/tensor_ctors.cpp | 20 +- 5 files changed, 1927 insertions(+), 11 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 93555981deaa..f18481d08189 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -45,7 +45,7 @@ set(_static_lib_sources ) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_as_contig.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp new file mode 100644 index 000000000000..6451bc950006 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/accumulators.hpp @@ -0,0 +1,1448 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for accumulators (cumulative sum, prod, etc.). +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::accumulators +{ + +namespace su_ns = dpctl::tensor::sycl_utils; + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +T ceiling_quotient(T n, T m) +{ + return (n + m - 1) / m; +} + +template +struct NonZeroIndicator +{ + constexpr NonZeroIndicator() {} + + outputT operator()(const inputT &val) const + { + static constexpr outputT out_one(1); + static constexpr outputT out_zero(0); + static constexpr inputT val_zero(0); + + return (val == val_zero) ? 
out_zero : out_one; + } +}; + +template +struct NoOpTransformer +{ + constexpr NoOpTransformer() {} + + T operator()(const T &val) const + { + return val; + } +}; + +template +struct CastTransformer +{ + constexpr CastTransformer() {} + + dstTy operator()(const srcTy &val) const + { + using dpctl::tensor::type_utils::convert_impl; + return convert_impl(val); + } +}; + +template +struct needs_workaround +{ + // workaround needed due to crash in JITing on CPU + // remove when CMPLRLLVM-65813 is resolved + static constexpr bool value = su_ns::IsSyclLogicalAnd::value || + su_ns::IsSyclLogicalOr::value; +}; + +template +struct can_use_inclusive_scan_over_group +{ + static constexpr bool value = sycl::has_known_identity::value && + !needs_workaround::value; +}; + +namespace detail +{ +template +class stack_t +{ + T *src_; + std::size_t size_; + T *local_scans_; + +public: + stack_t() : src_{}, size_{}, local_scans_{} {} + stack_t(T *src, std::size_t sz, T *local_scans) + : src_(src), size_(sz), local_scans_(local_scans) + { + } + ~stack_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + std::size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } +}; + +template +class stack_strided_t +{ + T *src_; + std::size_t size_; + T *local_scans_; + std::size_t local_stride_; + +public: + stack_strided_t() : src_{}, size_{}, local_scans_{}, local_stride_{} {} + stack_strided_t(T *src, + std::size_t sz, + T *local_scans, + std::size_t local_stride) + : src_(src), size_(sz), local_scans_(local_scans), + local_stride_(local_stride) + { + } + ~stack_strided_t(){}; + + T *get_src_ptr() const + { + return src_; + } + + std::size_t get_size() const + { + return size_; + } + + T *get_local_scans_ptr() const + { + return local_scans_; + } + + std::size_t get_local_stride() const + { + return local_stride_; + } +}; + +} // end of namespace detail + +// Iterative cumulative summation + +using nwiT = std::uint32_t; + 
+template +class inclusive_scan_iter_local_scan_blocked_krn; + +template +class inclusive_scan_iter_local_scan_striped_krn; + +template +sycl::event inclusive_scan_base_step_blocked( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + acc_groups = ceiling_quotient(acc_nelems, n_wi * wg_size); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + auto gws = sycl::range<1>(iter_nelems * acc_groups * wg_size); + auto lws = sycl::range<1>(wg_size); + + auto ndRange = sycl::nd_range<1>(gws, lws); + + slmT slm_iscan_tmp(lws, cgh); + + using KernelName = inclusive_scan_iter_local_scan_blocked_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_id(0); + const std::size_t lid = it.get_local_id(0); + + const std::uint32_t wg_size = it.get_local_range(0); + const std::size_t reduce_chunks = acc_groups * wg_size; + const std::size_t iter_gid = gid / reduce_chunks; + const std::size_t chunk_gid = gid - (iter_gid * reduce_chunks); + + const std::size_t i = chunk_gid * n_wi; + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t i_m_wi = i + m_wi; + if 
constexpr (!include_initial) { + local_iscan[m_wi] = + (i_m_wi < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * i_m_wi)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (i_m_wi < acc_nelems && i_m_wi > 0) + ? transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * i_m_wi) - 1)]) + : identity; + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), it.get_sub_group(), slm_iscan_tmp, + local_iscan.back(), identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? 
identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + const std::size_t start = std::min(i, acc_nelems); + const std::size_t end = std::min(i + n_wi, acc_nelems); + const nwiT m_max = static_cast(end - start); + for (nwiT m_wi = 0; m_wi < m_max; ++m_wi) { + output[out_iter_offset + out_indexer(i + m_wi)] = + local_iscan[m_wi]; + } + }); + }); + + return inc_scan_phase1_ev; +} + +template +sycl::event inclusive_scan_base_step_striped( + sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + const std::uint32_t reduce_nelems_per_wg = n_wi * wg_size; + acc_groups = + ceiling_quotient(acc_nelems, reduce_nelems_per_wg); + + sycl::event inc_scan_phase1_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using slmT = sycl::local_accessor; + + const auto &gRange = sycl::range<1>{iter_nelems * acc_groups * wg_size}; + const auto &lRange = sycl::range<1>{wg_size}; + + const auto &ndRange = sycl::nd_range<1>{gRange, lRange}; + + slmT slm_iscan_tmp(reduce_nelems_per_wg, cgh); + + using KernelName = inclusive_scan_iter_local_scan_striped_krn< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = + std::move(slm_iscan_tmp)]( + sycl::nd_item<1> it) { + const std::uint32_t lid = it.get_local_linear_id(); + const std::uint32_t wg_size = it.get_local_range(0); + + const auto &sg = it.get_sub_group(); + const std::uint32_t sgSize = sg.get_max_local_range()[0]; + 
const std::size_t sgroup_id = sg.get_group_id()[0]; + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t flat_group_id = it.get_group(0); + const std::size_t iter_gid = flat_group_id / acc_groups; + const std::size_t acc_group_id = + flat_group_id - (iter_gid * acc_groups); + + const auto &iter_offsets = iter_indexer(iter_gid); + const auto &inp_iter_offset = iter_offsets.get_first_offset(); + const auto &out_iter_offset = iter_offsets.get_second_offset(); + + std::array local_iscan{}; + + const std::size_t inp_id0 = acc_group_id * n_wi * wg_size + + sgroup_id * n_wi * sgSize + lane_id; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::size_t inp_id = inp_id0 + m_wi * sgSize; + if constexpr (!include_initial) { + local_iscan[m_wi] = + (inp_id < acc_nelems) + ? transformer(input[inp_iter_offset + + inp_indexer(s0 + s1 * inp_id)]) + : identity; + } + else { + // shift input to the left by a single element relative to + // output + local_iscan[m_wi] = + (inp_id < acc_nelems && inp_id > 0) + ? 
transformer( + input[inp_iter_offset + + inp_indexer((s0 + s1 * inp_id) - 1)]) + : identity; + } + } + + // change layout from striped to blocked + { + { + const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + slm_iscan_tmp[local_offset0 + i] = local_iscan[i]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + + { + const std::uint32_t block_offset = + sgroup_id * sgSize * n_wi; + const std::uint32_t disp0 = lane_id * n_wi; +#pragma unroll + for (nwiT i = 0; i < n_wi; ++i) { + const std::uint32_t disp = disp0 + i; + + // disp == lane_id1 + i1 * sgSize; + const std::uint32_t i1 = disp / sgSize; + const std::uint32_t lane_id1 = disp - i1 * sgSize; + + const std::uint32_t disp_exchanged = + (lane_id1 * n_wi + i1); + + local_iscan[i] = + slm_iscan_tmp[block_offset + disp_exchanged]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + +#pragma unroll + for (nwiT m_wi = 1; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = + scan_op(local_iscan[m_wi], local_iscan[m_wi - 1]); + } + // local_iscan is now result of + // inclusive scan of locally stored inputs + + outputT wg_iscan_val; + if constexpr (can_use_inclusive_scan_over_group::value) { + wg_iscan_val = sycl::inclusive_scan_over_group( + it.get_group(), local_iscan.back(), scan_op, identity); + } + else { + wg_iscan_val = su_ns::custom_inclusive_scan_over_group( + it.get_group(), sg, slm_iscan_tmp, local_iscan.back(), + identity, scan_op); + // ensure all finished reading from SLM, to avoid race condition + // with subsequent writes into SLM + it.barrier(sycl::access::fence_space::local_space); + } + + slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val; + it.barrier(sycl::access::fence_space::local_space); + const outputT modifier = (lid == 0) ? 
identity : slm_iscan_tmp[lid]; + +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + local_iscan[m_wi] = scan_op(local_iscan[m_wi], modifier); + } + + it.barrier(sycl::access::fence_space::local_space); + + // convert back to blocked layout + {{const std::uint32_t local_offset0 = lid * n_wi; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + slm_iscan_tmp[local_offset0 + m_wi] = local_iscan[m_wi]; + } + + it.barrier(sycl::access::fence_space::local_space); + } + } + + { + const std::uint32_t block_offset = sgroup_id * sgSize * n_wi + lane_id; +#pragma unroll + for (nwiT m_wi = 0; m_wi < n_wi; ++m_wi) { + const std::uint32_t m_wi_scaled = m_wi * sgSize; + const std::size_t out_id = inp_id0 + m_wi_scaled; + if (out_id < acc_nelems) { + output[out_iter_offset + out_indexer(out_id)] = + slm_iscan_tmp[block_offset + m_wi_scaled]; + } + } + } +}); +}); + +return inc_scan_phase1_ev; +} + +template +sycl::event + inclusive_scan_base_step(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IterIndexerT &iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + TransformerT transformer, + const ScanOpT &scan_op, + outputT identity, + std::size_t &acc_groups, + const std::vector &depends = {}) +{ + // For small stride use striped load/store. + // Threshold value chosen experimentally. 
+ if (s1 <= 16) { + return inclusive_scan_base_step_striped< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } + else { + return inclusive_scan_base_step_blocked< + inputT, outputT, n_wi, IterIndexerT, InpIndexerT, OutIndexerT, + TransformerT, ScanOpT, include_initial>( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + } +} + +template +class inclusive_scan_1d_iter_chunk_update_krn; + +template +sycl::event update_local_chunks_1d(sycl::queue &exec_q, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + const sycl::event &dependent_event) +{ + const auto &ctx = exec_q.get_context(); + const auto &dev = exec_q.get_device(); + + const auto &kernel_id = sycl::get_kernel_id(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // output[ chunk_size * (i + 1) + j] += temp[i] + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + cgh.use_kernel_bundle(kb); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t n_items = + ceiling_quotient(src_size, sg_size * n_wi) * sg_size; + + sycl::range<1> gRange{n_items}; + sycl::range<1> lRange{sg_size}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for( + ndRange, + [chunk_size, src, src_size, local_scans](sycl::nd_item<1> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(0); + const 
std::size_t block_offset = ndit.get_group(0) * n_wi * lws; +#pragma unroll + for (std::size_t i = 0; i < updates_per_wi; ++i) { + const std::size_t src_id = + block_offset + ndit.get_local_id(0) + i * lws; + if (src_id < src_size) { + const std::size_t scan_id = (src_id / chunk_size); + const outputT modifier = + (scan_id > 0) ? local_scans[scan_id - 1] : identity; + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +/* + * output[j] = sum( input[s0 + i * s1], 0 <= i <= j) + * for 0 <= j < n_elems + */ +template +sycl::event inclusive_scan_iter_1d(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t n_elems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const IndexerT &indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + static constexpr std::size_t _iter_nelems = 1; + + using IterIndexerT = dpctl::tensor::offset_utils::TwoZeroOffsets_Indexer; + static constexpr IterIndexerT _no_op_iter_indexer{}; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + + std::size_t n_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, n_elems, input, output, s0, s1, + _no_op_iter_indexer, indexer, _no_op_indexer, transformer, scan_op, + identity, n_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (n_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t n_groups_ = n_groups; + std::size_t temp_size = 0; + while (n_groups_ > 1) { + const std::size_t this_size = (n_groups_ - 1); + temp_size += this_size; + n_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + 
dpctl::tensor::alloc_utils::smart_malloc_device(temp_size, + exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + n_groups_ = n_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = n_elems; + while (n_groups_ > 1) { + + const std::size_t src_size = n_groups_ - 1; + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, _iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, _no_op_iter_indexer, + _no_op_indexer, _no_op_indexer, _no_op_transformer, scan_op, + identity, n_groups_, // n_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans}); + src = local_scans; + local_scans += src_size; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; reverse_stack_id < stack.size(); + ++reverse_stack_id) + { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + const outputT *local_scans = stack_elem.get_local_scans_ptr(); + + using UpdateKernelName = + class inclusive_scan_1d_iter_chunk_update_krn; + + dependent_event = update_local_chunks_1d( + exec_q, src, src_size, local_scans, chunk_size, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dependent_event}, temp_owner); + + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_1d_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_1d_contig_impl(sycl::queue &q, + std::size_t n_elems, + const char *src, + char *dst, + std::vector &host_tasks, + const std::vector 
&depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, src_data_ptr, dst_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + return comp_ev; +} + +template +class inclusive_scan_final_chunk_update_krn; + +template +sycl::event final_update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + const OutIterIndexerT &out_iter_indexer, + const OutIndexerT &out_indexer, + sycl::event dependent_event) +{ + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + static constexpr nwiT updates_per_wi = n_wi; + const std::size_t updates_per_sg = sg_size * updates_per_wi; + const std::size_t 
update_nelems = + ceiling_quotient(src_size, updates_per_sg) * sg_size; + + sycl::range<2> gRange{iter_nelems, update_nelems}; + sycl::range<2> lRange{1, sg_size}; + + sycl::nd_range<2> ndRange{gRange, lRange}; + + sycl::event update_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_event); + + cgh.parallel_for( + ndRange, [chunk_size, src_size, local_stride, src, local_scans, + out_iter_indexer, out_indexer](sycl::nd_item<2> ndit) { + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t iter_gid = ndit.get_group(0); + + const std::size_t src_axis_id0 = + ndit.get_group(1) * updates_per_wi * lws + + ndit.get_local_id(1); + const std::size_t src_iter_id = out_iter_indexer(iter_gid); +#pragma unroll + for (nwiT i = 0; i < updates_per_wi; ++i) { + const std::size_t src_axis_id = src_axis_id0 + i * lws; + const std::size_t src_id = + out_indexer(src_axis_id) + src_iter_id; + + if (src_axis_id < src_size) { + const std::size_t scan_axis_id = + src_axis_id / chunk_size; + const std::size_t scan_id = + scan_axis_id + iter_gid * local_stride; + + const outputT modifier = (scan_axis_id > 0) + ? 
local_scans[scan_id - 1] + : identity; + + src[src_id] = scan_op(src[src_id], modifier); + } + } + }); + }); + + return update_event; +} + +template +class inclusive_scan_iter_chunk_update_krn; + +template +sycl::event update_local_chunks(sycl::queue &exec_q, + std::size_t iter_nelems, + outputT *src, + std::size_t src_size, + const outputT *local_scans, + std::size_t chunk_size, + std::size_t local_stride, + sycl::event dependent_event) +{ + static constexpr NoOpIndexer out_indexer{}; + static constexpr NoOpIndexer iter_out_indexer{}; + + return final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, iter_out_indexer, out_indexer, dependent_event); +} + +template +sycl::event inclusive_scan_iter(sycl::queue &exec_q, + const std::uint32_t wg_size, + const std::size_t iter_nelems, + const std::size_t acc_nelems, + const inputT *input, + outputT *output, + const std::size_t s0, + const std::size_t s1, + const InpIterIndexerT &inp_iter_indexer, + const OutIterIndexerT &out_iter_indexer, + const InpIndexerT &inp_indexer, + const OutIndexerT &out_indexer, + const TransformerT &transformer, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + static constexpr ScanOpT scan_op{}; + static constexpr outputT identity = + su_ns::Identity::value; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InpIterIndexerT, OutIterIndexerT>; + const IterIndexerT iter_indexer{inp_iter_indexer, out_iter_indexer}; + + std::size_t acc_groups; + sycl::event inc_scan_phase1_ev = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, acc_nelems, input, output, s0, s1, + iter_indexer, inp_indexer, out_indexer, transformer, scan_op, + identity, acc_groups, depends); + + sycl::event dependent_event = inc_scan_phase1_ev; + if (acc_groups > 1) { + const std::size_t chunk_size = wg_size * n_wi; + + // how much of temporary allocation do we need + std::size_t acc_groups_ = acc_groups; + std::size_t 
temp_size = 0; + while (acc_groups_ > 1) { + const std::size_t this_size = (acc_groups_ - 1); + temp_size += this_size; + acc_groups_ = ceiling_quotient(this_size, chunk_size); + } + + // allocate + auto temp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * temp_size, exec_q); + outputT *temp = temp_owner.get(); + + std::vector> stack{}; + + // inclusive scans over blocks + acc_groups_ = acc_groups; + outputT *src = output; + outputT *local_scans = temp; + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT _no_op_indexer{}; + using NoOpTransformerT = NoOpTransformer; + static constexpr NoOpTransformerT _no_op_transformer{}; + std::size_t size_to_update = acc_nelems; + + { + std::size_t src_size = acc_groups - 1; + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + OutIterIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{out_iter_indexer, + scan_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, out_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + while (acc_groups_ > 1) { + std::size_t src_size = acc_groups_ - 1; + + using LocalScanIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + const LocalScanIndexerT scan1_iter_indexer{ + /* size */ iter_nelems, + /* step */ size_to_update}; + const LocalScanIndexerT scan2_iter_indexer{/* size */ iter_nelems, + /* step */ src_size}; + + using IterIndexerT = + 
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + LocalScanIndexerT, LocalScanIndexerT>; + const IterIndexerT iter_indexer_{scan1_iter_indexer, + scan2_iter_indexer}; + + dependent_event = + inclusive_scan_base_step( + exec_q, wg_size, iter_nelems, src_size, src, local_scans, + chunk_size - 1, chunk_size, iter_indexer_, _no_op_indexer, + _no_op_indexer, _no_op_transformer, scan_op, identity, + acc_groups_, // acc_groups_ is modified in place + {dependent_event}); + stack.push_back({src, size_to_update, local_scans, src_size}); + src = local_scans; + local_scans += src_size * iter_nelems; + size_to_update = src_size; + } + + for (std::size_t reverse_stack_id = 0; + reverse_stack_id < stack.size() - 1; ++reverse_stack_id) + { + const std::size_t stack_id = stack.size() - 1 - reverse_stack_id; + + const auto &stack_elem = stack[stack_id]; + outputT *src = stack_elem.get_src_ptr(); + std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_iter_chunk_update_krn; + + dependent_event = + update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, dependent_event); + } + + // last stack element is always directly to output + { + const auto &stack_elem = stack[0]; + outputT *src = stack_elem.get_src_ptr(); + const std::size_t src_size = stack_elem.get_size(); + outputT *local_scans = stack_elem.get_local_scans_ptr(); + const std::size_t local_stride = stack_elem.get_local_stride(); + + using UpdateKernelName = + class inclusive_scan_final_chunk_update_krn< + outputT, n_wi, OutIterIndexerT, OutIndexerT, ScanOpT>; + + dependent_event = + final_update_local_chunks( + exec_q, iter_nelems, src, src_size, local_scans, chunk_size, + local_stride, out_iter_indexer, out_indexer, + dependent_event); + } + + sycl::event free_ev = dpctl::tensor::alloc_utils::async_smart_free( + 
exec_q, {dependent_event}, temp_owner); + host_tasks.push_back(free_ev); + } + + return dependent_event; +} + +typedef sycl::event (*accumulate_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +sycl::event + accumulate_strided_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t acc_nelems, + const char *src, + int iter_nd, + const ssize_t *iter_shape_strides, + ssize_t inp_iter_offset, + ssize_t out_iter_offset, + int acc_nd, + const ssize_t *acc_shape_strides, + char *dst, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const srcT *src_data_ptr = reinterpret_cast(src); + dstT *dst_data_ptr = reinterpret_cast(dst); + + using InpIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const InpIndexerT inp_axis_indexer{acc_nd, 0, acc_shape_strides}; + const InpIndexerT inp_iter_indexer{iter_nd, inp_iter_offset, + iter_shape_strides}; + + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + const OutIndexerT out_axis_indexer{acc_nd, 0, acc_shape_strides, + acc_shape_strides + 2 * acc_nd}; + const OutIndexerT out_iter_indexer{iter_nd, out_iter_offset, + iter_shape_strides, + iter_shape_strides + 2 * iter_nd}; + + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly 
+ // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = + inclusive_scan_iter( + q, wg_size, iter_nelems, acc_nelems, src_data_ptr, dst_data_ptr, + s0, s1, inp_iter_indexer, out_iter_indexer, inp_axis_indexer, + out_axis_indexer, transformer, host_tasks, depends); + } + + return comp_ev; +} + +typedef std::size_t (*cumsum_val_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t cumsum_val_contig_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + static constexpr NoOpIndexerT flat_indexer{}; + static constexpr transformerT transformer{}; + + static constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + sycl::event comp_ev; + const sycl::device &dev = q.get_device(); + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 
64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + flat_indexer, transformer, host_tasks, depends); + } + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM host allocation, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsContigFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct MaskPositionsContigFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } +}; + +template +struct Cumsum1DContigFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_contig_impl>; + return fn; + } + else { + return nullptr; + } + } +}; + +typedef std::size_t (*cumsum_val_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + int, + const ssize_t *, + char *, + std::vector &, + const std::vector &); + +template +std::size_t + cumsum_val_strided_impl(sycl::queue &q, + std::size_t n_elems, + const char *mask, + int nd, + const ssize_t *shape_strides, + char *cumsum, + std::vector &host_tasks, + const std::vector &depends = {}) +{ + const maskT *mask_data_ptr = reinterpret_cast(mask); + cumsumT *cumsum_data_ptr = reinterpret_cast(cumsum); + + using StridedIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const StridedIndexerT strided_indexer{nd, 0, shape_strides}; + static constexpr transformerT transformer{}; + + static 
constexpr std::size_t s0 = 0; + static constexpr std::size_t s1 = 1; + static constexpr bool include_initial = false; + using AccumulateOpT = sycl::plus; + + const sycl::device &dev = q.get_device(); + sycl::event comp_ev; + if (dev.has(sycl::aspect::cpu)) { + static constexpr nwiT n_wi_for_cpu = 8; + const std::uint32_t wg_size = 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + else { + static constexpr nwiT n_wi_for_gpu = 4; + // base_scan_striped algorithm does not execute correctly + // on HIP device with wg_size > 64 + const std::uint32_t wg_size = + (q.get_backend() == sycl::backend::ext_oneapi_hip) ? 64 : 256; + comp_ev = inclusive_scan_iter_1d( + q, wg_size, n_elems, mask_data_ptr, cumsum_data_ptr, s0, s1, + strided_indexer, transformer, host_tasks, depends); + } + + cumsumT *last_elem = cumsum_data_ptr + (n_elems - 1); + + auto host_usm_owner = + dpctl::tensor::alloc_utils::smart_malloc_host(1, q); + cumsumT *last_elem_host_usm = host_usm_owner.get(); + + sycl::event copy_e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + cgh.copy(last_elem, last_elem_host_usm, 1); + }); + copy_e.wait(); + std::size_t return_val = static_cast(*last_elem_host_usm); + + // explicitly free USM-host temporary, by invoking deleter of + // the unique_ptr + host_usm_owner.reset(nullptr); + + return return_val; +} + +template +struct MaskPositionsStridedFactoryForInt32 +{ + fnT get() + { + using cumsumT = std::int32_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct MaskPositionsStridedFactoryForInt64 +{ + fnT get() + { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } +}; + +template +struct Cumsum1DStridedFactory +{ + fnT get() + { + if constexpr (std::is_integral_v) { + using cumsumT = std::int64_t; + fnT fn = + cumsum_val_strided_impl>; + return fn; + } + else { + return 
nullptr; + } + } +}; + +} // namespace dpctl::tensor::kernels::accumulators diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.cpp b/dpctl_ext/tensor/libtensor/source/accumulators.cpp new file mode 100644 index 000000000000..82913010755a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators.cpp @@ -0,0 +1,406 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Computation of positions of masked elements + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_contig_impl_fn_ptr_t + mask_positions_contig_i32_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i64_dispatch_vector[td_ns::num_types]; +static cumsum_val_strided_impl_fn_ptr_t + mask_positions_strided_i32_dispatch_vector[td_ns::num_types]; + +void populate_mask_positions_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(mask_positions_contig_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(mask_positions_contig_i32_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + 
MaskPositionsStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector(mask_positions_strided_i64_dispatch_vector); + + using dpctl::tensor::kernels::accumulators:: + MaskPositionsStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(mask_positions_strided_i32_dispatch_vector); + + return; +} + +std::size_t py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("Result array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array must be C-contiguous."); + } + + // cumsum.shape == (mask.size,) + auto mask_size = mask.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != mask_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {mask, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + if (mask_size == 0) { + return 0; + } + + int mask_typenum = mask.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // mask can be any type + const char *mask_data = mask.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int mask_typeid = array_types.typenum_to_lookup_id(mask_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // cumsum must be int32_t/int64_t only + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw 
py::value_error( + "Cumulative sum array must have int32 or int64 data-type."); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + std::vector host_task_events; + + if (mask.is_c_contiguous()) { + auto fn = (use_i32) + ? mask_positions_contig_i32_dispatch_vector[mask_typeid] + : mask_positions_contig_i64_dispatch_vector[mask_typeid]; + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = fn(exec_q, mask_size, mask_data, cumsum_data, + host_task_events, depends); + + sycl::event::wait(host_task_events); + } + return total_set; + } + + const py::ssize_t *shape = mask.get_shape_raw(); + auto const &strides_vector = mask.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int mask_nd = mask.get_ndim(); + int nd = mask_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = + (use_i32) ? mask_positions_strided_i32_dispatch_vector[mask_typeid] + : mask_positions_strided_i64_dispatch_vector[mask_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + 
dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total_set; + + { + py::gil_scoped_release release; + + total_set = strided_fn(exec_q, mask_size, mask_data, nd, shape_strides, + cumsum_data, host_task_events, dependent_events); + + sycl::event::wait(host_task_events); + // ensure deleter of smart pointer is invoked with GIL released + shape_strides_owner.reset(nullptr); + } + + return total_set; +} + +using dpctl::tensor::kernels::accumulators::cumsum_val_strided_impl_fn_ptr_t; +static cumsum_val_strided_impl_fn_ptr_t + cumsum_1d_strided_dispatch_vector[td_ns::num_types]; +using dpctl::tensor::kernels::accumulators::cumsum_val_contig_impl_fn_ptr_t; +static cumsum_val_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_vector[td_ns::num_types]; + +void populate_cumsum_1d_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::accumulators::Cumsum1DContigFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cumsum_1d_contig_dispatch_vector); + + using dpctl::tensor::kernels::accumulators::Cumsum1DStridedFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cumsum_1d_strided_dispatch_vector); + + return; +} + +std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends) +{ + // cumsum is 1D + if (cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be one-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + // cumsum.shape == (src.size,) + auto src_size = src.get_size(); + auto cumsum_size = cumsum.get_shape(0); + if (cumsum_size != src_size) { + throw py::value_error("Inconsistent dimensions"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum})) { + // FIXME: use ExecutionPlacementError + throw py::value_error( + "Execution queue is not compatible 
with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(cumsum); + + if (src_size == 0) { + return 0; + } + + int src_typenum = src.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + // src can be any type + const char *src_data = src.get_data(); + char *cumsum_data = cumsum.get_data(); + + auto const &array_types = td_ns::usm_ndarray_types(); + + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + // this cumsum must be int64_t only + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Cumulative sum array must have int64 data-type."); + } + + std::vector host_task_events; + + if (src.is_c_contiguous()) { + auto fn = cumsum_1d_contig_dispatch_vector[src_typeid]; + if (fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + std::size_t total = fn(exec_q, src_size, src_data, cumsum_data, + host_task_events, depends); + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + } + return total; + } + + const py::ssize_t *shape = src.get_shape_raw(); + auto const &strides_vector = src.get_strides_vector(); + + using shT = std::vector; + shT compact_shape; + shT compact_strides; + + int src_nd = src.get_ndim(); + int nd = src_nd; + + dpctl::tensor::py_internal::compact_iteration_space( + nd, shape, strides_vector, compact_shape, compact_strides); + + // Strided implementation + auto strided_fn = cumsum_1d_strided_dispatch_vector[src_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error( + "this cumsum requires integer type, got src_typeid=" + + std::to_string(src_typeid)); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, 
compact_shape, compact_strides); + auto shape_strides_owner = std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *shape_strides = shape_strides_owner.get(); + + if (2 * static_cast(nd) != std::get<1>(ptr_size_event_tuple)) { + { + py::gil_scoped_release release; + + copy_shape_ev.wait(); + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + throw std::runtime_error("Unexpected error"); + } + + std::vector dependent_events; + dependent_events.reserve(depends.size() + 1); + dependent_events.insert(dependent_events.end(), copy_shape_ev); + dependent_events.insert(dependent_events.end(), depends.begin(), + depends.end()); + + std::size_t total = + strided_fn(exec_q, src_size, src_data, nd, shape_strides, cumsum_data, + host_task_events, dependent_events); + + { + py::gil_scoped_release release; + sycl::event::wait(host_task_events); + + // ensure USM deleter is called with GIL released + shape_strides_owner.reset(nullptr); + } + + return total; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators.hpp b/dpctl_ext/tensor/libtensor/source/accumulators.hpp new file mode 100644 index 000000000000..42503093789b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators.hpp @@ -0,0 +1,62 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace dpctl::tensor::py_internal +{ + +extern void populate_mask_positions_dispatch_vectors(void); + +extern std::size_t + py_mask_positions(const dpctl::tensor::usm_ndarray &mask, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_cumsum_1d_dispatch_vectors(void); + +extern std::size_t py_cumsum_1d(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + std::vector const &depends = {}); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 3e5be4d9e8fe..3c096df8e367 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -43,7 +43,7 @@ #include "dpnp4pybind11.hpp" -// #include "accumulators.hpp" +#include "accumulators.hpp" // #include "boolean_advanced_indexing.hpp" // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" @@ -113,12 +113,12 @@ using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; // using dpctl::tensor::py_internal::py_extract; -// using dpctl::tensor::py_internal::py_mask_positions; +using dpctl::tensor::py_internal::py_mask_positions; // using dpctl::tensor::py_internal::py_nonzero; // using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ -// using dpctl::tensor::py_internal::py_cumsum_1d; +using 
dpctl::tensor::py_internal::py_cumsum_1d; // using dpctl::tensor::py_internal::py_repeat_by_scalar; // using dpctl::tensor::py_internal::py_repeat_by_sequence; @@ -166,9 +166,9 @@ void init_dispatch_vectors(void) // populate_masked_extract_dispatch_vectors(); // populate_masked_place_dispatch_vectors(); - // populate_mask_positions_dispatch_vectors(); + populate_mask_positions_dispatch_vectors(); - // populate_cumsum_1d_dispatch_vectors(); + populate_cumsum_1d_dispatch_vectors(); // init_repeat_dispatch_vectors(); // init_clip_dispatch_vectors(); @@ -408,12 +408,12 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("dst"), py::arg("k") = 0, py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), - // py::arg("cumsum"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("mask_positions", &py_mask_positions, "", py::arg("mask"), + py::arg("cumsum"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); - // m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), py::arg("cumsum"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), From 69b36b2a3bef5c46d5ea56e58afc8e4dcd2366a1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 05:50:52 -0800 Subject: [PATCH 051/145] Move ti._extract(), ti._place(), ti._nonzero() --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../kernels/boolean_advanced_indexing.hpp | 853 +++++++++++++++++ .../source/boolean_advanced_indexing.cpp | 859 ++++++++++++++++++ .../source/boolean_advanced_indexing.hpp | 81 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 30 +- 5 files changed, 1809 insertions(+), 16 deletions(-) create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index f18481d08189..88507aa2192f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -53,7 +53,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp new file mode 100644 index 000000000000..046ad87d7d78 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -0,0 +1,853 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for advanced tensor index operations. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::kernels::indexing +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +struct MaskedExtractStridedFunctor +{ + MaskedExtractStridedFunctor(const dataT *src_data_p, + const indT *cumsum_data_p, + dataT *dst_data_p, + std::size_t masked_iter_size, + const OrthogIndexerT &orthog_src_dst_indexer_, + const MaskedSrcIndexerT &masked_src_indexer_, + const MaskedDstIndexerT &masked_dst_indexer_, + const LocalAccessorT &lacc_) + : src(src_data_p), cumsum(cumsum_data_p), dst(dst_data_p), + masked_nelems(masked_iter_size), + orthog_src_dst_indexer(orthog_src_dst_indexer_), + masked_src_indexer(masked_src_indexer_), + masked_dst_indexer(masked_dst_indexer_), lacc(lacc_) + { + static_assert( + std::is_same_v); + } + + void operator()(sycl::nd_item<2> ndit) const + { + const std::size_t orthog_i = ndit.get_global_id(0); + const std::uint32_t l_i = ndit.get_local_id(1); + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t masked_i = ndit.get_global_id(1); + const std::size_t masked_block_start = masked_i - l_i; + + const std::size_t max_offset = masked_nelems + 1; + for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { + const std::size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT(0) + : (offset < max_offset) ? cumsum[offset - 1] + : cumsum[masked_nelems - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const indT current_running_count = lacc[l_i + 1]; + const bool mask_set = (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == lacc[l_i] + 1); + + // dst[cumsum[i] - 1, j] = src[i, j] + // if cumsum[i] == ((i > 0) ? 
cumsum[i-1] + 1 : 1) + if (mask_set && (masked_i < masked_nelems)) { + const auto &orthog_offsets = orthog_src_dst_indexer(orthog_i); + + const std::size_t total_src_offset = + masked_src_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const std::size_t total_dst_offset = + masked_dst_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst[total_dst_offset] = src[total_src_offset]; + } + } + +private: + const dataT *src = nullptr; + const indT *cumsum = nullptr; + dataT *dst = nullptr; + std::size_t masked_nelems = 0; + // has nd, shape, src_strides, dst_strides for + // dimensions that ARE NOT masked + OrthogIndexerT orthog_src_dst_indexer; + // has nd, shape, src_strides for + // dimensions that ARE masked + MaskedSrcIndexerT masked_src_indexer; + // has 1, dst_strides for dimensions that ARE masked + MaskedDstIndexerT masked_dst_indexer; + LocalAccessorT lacc; +}; + +template +struct MaskedPlaceStridedFunctor +{ + MaskedPlaceStridedFunctor(dataT *dst_data_p, + const indT *cumsum_data_p, + const dataT *rhs_data_p, + std::size_t masked_iter_size, + const OrthogIndexerT &orthog_dst_rhs_indexer_, + const MaskedDstIndexerT &masked_dst_indexer_, + const MaskedRhsIndexerT &masked_rhs_indexer_, + const LocalAccessorT &lacc_) + : dst(dst_data_p), cumsum(cumsum_data_p), rhs(rhs_data_p), + masked_nelems(masked_iter_size), + orthog_dst_rhs_indexer(orthog_dst_rhs_indexer_), + masked_dst_indexer(masked_dst_indexer_), + masked_rhs_indexer(masked_rhs_indexer_), lacc(lacc_) + { + static_assert( + std::is_same_v); + } + + void operator()(sycl::nd_item<2> ndit) const + { + const std::size_t orthog_i = ndit.get_global_id(0); + const std::uint32_t l_i = ndit.get_local_id(1); + const std::uint32_t lws = ndit.get_local_range(1); + + const std::size_t masked_i = ndit.get_global_id(1); + const std::size_t masked_block_start = masked_i - l_i; + + const std::size_t max_offset = masked_nelems + 1; + for (std::uint32_t i = l_i; i < lacc.size(); i += 
lws) { + const std::size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT(0) + : (offset < max_offset) ? cumsum[offset - 1] + : cumsum[masked_nelems - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const indT current_running_count = lacc[l_i + 1]; + const bool mask_set = (masked_i == 0) + ? (current_running_count == 1) + : (current_running_count == lacc[l_i] + 1); + + // src[i, j] = rhs[cumsum[i] - 1, j] + // if cumsum[i] == ((i > 0) ? cumsum[i-1] + 1 : 1) + if (mask_set && (masked_i < masked_nelems)) { + const auto &orthog_offsets = orthog_dst_rhs_indexer(orthog_i); + + const std::size_t total_dst_offset = + masked_dst_indexer(masked_i) + + orthog_offsets.get_first_offset(); + const std::size_t total_rhs_offset = + masked_rhs_indexer(current_running_count - 1) + + orthog_offsets.get_second_offset(); + + dst[total_dst_offset] = rhs[total_rhs_offset]; + } + } + +private: + dataT *dst = nullptr; + const indT *cumsum = nullptr; + const dataT *rhs = nullptr; + std::size_t masked_nelems = 0; + // has nd, shape, dst_strides, rhs_strides for + // dimensions that ARE NOT masked + OrthogIndexerT orthog_dst_rhs_indexer; + // has nd, shape, dst_strides for + // dimensions that ARE masked + MaskedDstIndexerT masked_dst_indexer; + // has 1, rhs_strides for dimensions that ARE masked + MaskedRhsIndexerT masked_rhs_indexer; + LocalAccessorT lacc; +}; + +// ======= Masked extraction ================================ + +namespace detail +{ + +template +std::size_t _get_lws_impl(std::size_t n) +{ + if constexpr (sizeof...(IR) == 0) { + return I; + } + else { + return (n < I) ? 
_get_lws_impl(n) : I; + } +} + +inline std::size_t get_lws(std::size_t n) +{ + static constexpr std::size_t lws0 = 256u; + static constexpr std::size_t lws1 = 128u; + static constexpr std::size_t lws2 = 64u; + return _get_lws_impl(n); +} + +} // end of namespace detail + +template +class masked_extract_all_slices_contig_impl_krn; + +typedef sycl::event (*masked_extract_all_slices_contig_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + const char *, + const char *, + char *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_contig_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + static constexpr NoOpIndexer masked_src_indexer{}; + const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, + /* step */ dst_stride); + + using KernelName = + class masked_extract_all_slices_contig_impl_krn; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_extent = iteration_size; + + const std::size_t lws = detail::get_lws(masked_extent); + + const std::size_t n_groups = (iteration_size + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_extent, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + 
return comp_ev; +} + +template +class masked_extract_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + const char *, + const char *, + char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_extract_all_slices_strided_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int nd, + const ssize_t + *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); + const Strided1DIndexer masked_dst_indexer(/* size */ dst_size, + /* step */ dst_stride); + + using KernelName = class masked_extract_all_slices_strided_impl_krn< + StridedIndexer, Strided1DIndexer, dataT, indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_nelems = iteration_size; + + const std::size_t lws = detail::get_lws(masked_nelems); + + const std::size_t n_groups = (masked_nelems + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_nelems) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, iteration_size, + orthog_src_dst_indexer, masked_src_indexer, + 
masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + const char *, + const char *, + char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_extract_some_slices_strided_impl_krn; + +template +sycl::event masked_extract_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + const char *src_p, + const char *cumsum_p, + char *dst_p, + int orthog_nd, + // [ortho_shape, ortho_src_strides, // ortho_dst_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_src_dst_shape_strides, + ssize_t ortho_src_offset, + ssize_t ortho_dst_offset, + int masked_nd, + // [masked_src_shape, masked_src_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_src_shape_strides, + ssize_t masked_dst_size, + ssize_t masked_dst_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_src_dst_indexer{ + orthog_nd, ortho_src_offset, ortho_dst_offset, + packed_ortho_src_dst_shape_strides}; + + const StridedIndexer masked_src_indexer{masked_nd, 0, + packed_masked_src_shape_strides}; + const Strided1DIndexer masked_dst_indexer{/* size */ masked_dst_size, + /* step */ masked_dst_stride}; + + using KernelName = class masked_extract_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DIndexer, dataT, + indT>; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + struct MaskedExtractStridedFunctor; + + const std::size_t masked_extent = masked_nelems; + + const std::size_t lws = detail::get_lws(masked_extent); + + const std::size_t n_groups = ((masked_extent + lws - 1) / lws); + const std::size_t orthog_extent = static_cast(orthog_nelems); + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + + 
sycl::nd_range<2> ndRange(gRange, lRange); + + const dataT *src_tp = reinterpret_cast(src_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + dataT *dst_tp = reinterpret_cast(dst_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = + std::min(lws, masked_extent) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(src_tp, cumsum_tp, dst_tp, masked_nelems, + orthog_src_dst_indexer, masked_src_indexer, + masked_dst_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskExtractAllSlicesContigFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesContigFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_contig_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskExtractSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_extract_some_slices_strided_impl; + return fn; + } +}; + +// Masked placement + +template +class masked_place_all_slices_strided_impl_krn; + +typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event masked_place_all_slices_strided_impl( + sycl::queue &exec_q, + ssize_t iteration_size, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int nd, + const ssize_t 
+ *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd + ssize_t rhs_size, // rhs is 1D + ssize_t rhs_stride, + const std::vector &depends = {}) +{ + static constexpr TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); + const Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); + + using KernelName = class masked_place_all_slices_strided_impl_krn< + TwoZeroOffsets_Indexer, StridedIndexer, Strided1DCyclicIndexer, dataT, + indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t masked_extent = iteration_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{1, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, iteration_size, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + char *, + const char *, + const char *, + int, + ssize_t const *, + ssize_t, + ssize_t, + int, + ssize_t const *, + ssize_t, + ssize_t, + const std::vector &); + +template +class masked_place_some_slices_strided_impl_krn; + +template +sycl::event 
masked_place_some_slices_strided_impl( + sycl::queue &exec_q, + ssize_t orthog_nelems, + ssize_t masked_nelems, + char *dst_p, + const char *cumsum_p, + const char *rhs_p, + int orthog_nd, + // [ortho_shape, ortho_dst_strides, ortho_rhs_strides], + // length 3*ortho_nd + const ssize_t *packed_ortho_dst_rhs_shape_strides, + ssize_t ortho_dst_offset, + ssize_t ortho_rhs_offset, + int masked_nd, + // [masked_dst_shape, masked_dst_strides], + // length 2*masked_nd, mask_dst is 1D + const ssize_t *packed_masked_dst_shape_strides, + ssize_t masked_rhs_size, + ssize_t masked_rhs_stride, + const std::vector &depends = {}) +{ + const TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ + orthog_nd, ortho_dst_offset, ortho_rhs_offset, + packed_ortho_dst_rhs_shape_strides}; + + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const + * *_packed_shape_strides) */ + const StridedIndexer masked_dst_indexer{masked_nd, 0, + packed_masked_dst_shape_strides}; + const Strided1DCyclicIndexer masked_rhs_indexer{0, masked_rhs_size, + masked_rhs_stride}; + + using KernelName = class masked_place_some_slices_strided_impl_krn< + TwoOffsets_StridedIndexer, StridedIndexer, Strided1DCyclicIndexer, + dataT, indT>; + + static constexpr std::size_t nominal_lws = 256; + const std::size_t orthog_extent = orthog_nelems; + const std::size_t masked_extent = masked_nelems; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + + sycl::range<2> gRange{orthog_extent, n_groups * lws}; + sycl::range<2> lRange{1, lws}; + sycl::nd_range<2> ndRange{gRange, lRange}; + + using LocalAccessorT = sycl::local_accessor; + using Impl = + MaskedPlaceStridedFunctor; + + dataT *dst_tp = reinterpret_cast(dst_p); + const dataT *rhs_tp = reinterpret_cast(rhs_p); + const indT *cumsum_tp = reinterpret_cast(cumsum_p); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = 
std::min(masked_extent, lws) + 1; + LocalAccessorT lacc(lacc_size, cgh); + + cgh.parallel_for( + ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, masked_nelems, + orthog_dst_rhs_indexer, masked_dst_indexer, + masked_rhs_indexer, lacc)); + }); + + return comp_ev; +} + +template +struct MaskPlaceAllSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceAllSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_all_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt32 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +template +struct MaskPlaceSomeSlicesStridedFactoryForInt64 +{ + fnT get() + { + fnT fn = masked_place_some_slices_strided_impl; + return fn; + } +}; + +// Non-zero + +template +class non_zero_indexes_krn; + +typedef sycl::event (*non_zero_indexes_fn_ptr_t)( + sycl::queue &, + ssize_t, + ssize_t, + int, + const char *, + char *, + const ssize_t *, + std::vector const &); + +template +sycl::event non_zero_indexes_impl(sycl::queue &exec_q, + ssize_t iter_size, + ssize_t nz_elems, + int nd, + const char *cumsum_cp, + char *indexes_cp, + const ssize_t *mask_shape, + std::vector const &depends) +{ + const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); + indT2 *indexes_data = reinterpret_cast(indexes_cp); + + static constexpr std::size_t nominal_lws = 256u; + const std::size_t masked_extent = iter_size; + const std::size_t lws = std::min(masked_extent, nominal_lws); + + const std::size_t n_groups = (masked_extent + lws - 1) / lws; + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const std::size_t lacc_size = std::min(lws, masked_extent) + 1; + sycl::local_accessor lacc(lacc_size, cgh); + + using KernelName 
= class non_zero_indexes_krn; + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_i = ndit.get_group(0); + const std::uint32_t l_i = ndit.get_local_id(0); + const std::uint32_t lws = ndit.get_local_range(0); + + const std::size_t masked_block_start = group_i * lws; + + for (std::uint32_t i = l_i; i < lacc.size(); i += lws) { + const std::size_t offset = masked_block_start + i; + lacc[i] = (offset == 0) ? indT1(0) + : (offset - 1 < masked_extent) + ? cumsum_data[offset - 1] + : cumsum_data[masked_extent - 1] + 1; + } + + sycl::group_barrier(ndit.get_group()); + + const std::size_t i = masked_block_start + l_i; + const auto cs_val = lacc[l_i]; + const bool cond = (lacc[l_i + 1] == cs_val + 1); + + if (cond && (i < masked_extent)) { + ssize_t i_ = static_cast(i); + for (int dim = nd; --dim > 0;) { + const auto sd = mask_shape[dim]; + const ssize_t q = i_ / sd; + const ssize_t r = (i_ - q * sd); + indexes_data[cs_val + dim * nz_elems] = + static_cast(r); + i_ = q; + } + indexes_data[cs_val] = static_cast(i_); + } + }); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels::indexing diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp new file mode 100644 index 000000000000..a78cb1750b81 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.cpp @@ -0,0 +1,859 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines implementation functions of dpctl.tensor.place and +/// dpctl.tensor.extract, dpctl.tensor.nonzero +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "boolean_advanced_indexing.hpp" +#include "kernels/boolean_advanced_indexing.hpp" + +namespace dpctl::tensor::py_internal +{ + +// Masked extraction + +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_strided_impl_fn_ptr_t; + +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static masked_extract_all_slices_strided_impl_fn_ptr_t + masked_extract_all_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_all_slices_contig_impl_fn_ptr_t; + +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_extract_all_slices_contig_impl_fn_ptr_t + masked_extract_all_slices_contig_i64_impl_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::indexing:: + masked_extract_some_slices_strided_impl_fn_ptr_t; + +static masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i32_impl_dispatch_vector + [td_ns::num_types]; +static 
masked_extract_some_slices_strided_impl_fn_ptr_t + masked_extract_some_slices_strided_i64_impl_dispatch_vector + [td_ns::num_types]; + +void populate_masked_extract_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt32, td_ns::num_types> + dvb1; + dvb1.populate_dispatch_vector( + masked_extract_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_all_slices_strided_impl_fn_ptr_t, + MaskExtractAllSlicesStridedFactoryForInt64, td_ns::num_types> + dvb2; + dvb2.populate_dispatch_vector( + masked_extract_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt32, td_ns::num_types> + dvb3; + dvb3.populate_dispatch_vector( + masked_extract_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder< + masked_extract_some_slices_strided_impl_fn_ptr_t, + MaskExtractSomeSlicesStridedFactoryForInt64, td_ns::num_types> + dvb4; + dvb4.populate_dispatch_vector( + masked_extract_some_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesContigFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb5; + dvb5.populate_dispatch_vector( + masked_extract_all_slices_contig_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskExtractAllSlicesContigFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb6; + dvb6.populate_dispatch_vector( + 
masked_extract_all_slices_contig_i64_impl_dispatch_vector); +} + +std::pair + py_extract(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int src_nd = src.get_ndim(); + if ((axis_start < 0 || axis_end > src_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, cumsum, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_ortho_dims(true); + std::size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = same_ortho_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis_end; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + ortho_nelems *= src_sh_i; + same_ortho_dims = + same_ortho_dims && (src_sh_i == dst_shape[i - (mask_span_sz - 1)]); + } + + std::size_t masked_src_nelems(1); + std::size_t masked_dst_nelems(dst_shape[axis_start]); + for (auto i = axis_start; i < axis_end; ++i) { + masked_src_nelems *= src_shape[i]; + } + + // masked_dst_nelems is number of set elements in 
the mask, or last element + // in cumsum + if (!same_ortho_dims || + (masked_src_nelems != static_cast(cumsum_sz))) + { + throw py::value_error("Inconsistent array dimensions"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, ortho_nelems * masked_dst_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, cumsum) || overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error("Unexpected data type of cumsum array, expecting " + "'int32' or 'int64'"); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event extract_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == src_nd) { + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src.is_c_contiguous()) { + auto fn = 
+ (use_i32) + ? masked_extract_all_slices_contig_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_contig_i64_impl_dispatch_vector + [src_typeid]; + + extract_ev = + fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, dst_data_p, + dst_shape_vec[0], dst_strides_vec[0], depends); + + // + host_task_events.push_back(extract_ev); + } + else { + // empty orthogonal directions + auto fn = + (use_i32) + ? masked_extract_all_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_all_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_src_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_src_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + extract_ev = fn(exec_q, cumsum_sz, src_data_p, cumsum_data_p, + dst_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + } + else { + // non-empty orthogonal directions + auto fn = + (use_i32) + ? 
masked_extract_some_slices_strided_i32_impl_dispatch_vector + [src_typeid] + : masked_extract_some_slices_strided_i64_impl_dispatch_vector + [src_typeid]; + + int masked_src_nd = mask_span_sz; + int ortho_nd = src_nd - masked_src_nd; + + using shT = std::vector; + + shT ortho_src_shape; + shT masked_src_shape; + shT ortho_src_strides; + shT masked_src_strides; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis_start, axis_end, + ortho_src_shape, + masked_src_shape, // 4 vectors modified + ortho_src_strides, masked_src_strides); + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_start + 1, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + assert(ortho_src_shape.size() == static_cast(ortho_nd)); + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_src_shape.begin(), ortho_src_shape.end(), + ortho_dst_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_src_strides; + std::vector simplified_ortho_dst_strides; + + const py::ssize_t *_shape = ortho_src_shape.data(); + + py::ssize_t ortho_src_offset(0); + py::ssize_t ortho_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, ortho_src_strides, ortho_dst_strides, + // output + simplified_ortho_shape, simplified_ortho_src_strides, + simplified_ortho_dst_strides, ortho_src_offset, ortho_dst_offset); + + assert(masked_dst_shape.size() == 1); + assert(masked_dst_strides.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_src_strides, simplified_ortho_dst_strides, + masked_src_shape, masked_src_strides); + auto 
packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *packed_ortho_src_dst_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_src_shape_strides = + packed_shapes_strides + (3 * ortho_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + // OrthogIndexerT orthog_src_dst_indexer_, MaskedIndexerT + // masked_src_indexer_, MaskedIndexerT masked_dst_indexer_ + extract_ev = fn(exec_q, ortho_nelems, masked_src_nelems, src_data_p, + cumsum_data_p, dst_data_p, + // data to build orthog_src_dst_indexer + ortho_nd, packed_ortho_src_dst_shape_strides, + ortho_src_offset, ortho_dst_offset, + // data to build masked_src_indexer + masked_src_nd, packed_masked_src_shape_strides, + // data to build masked_dst_indexer, + masked_dst_shape[0], masked_dst_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {extract_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, extract_ev); +} + +// Masked placement + +using dpctl::tensor::kernels::indexing:: + masked_place_all_slices_strided_impl_fn_ptr_t; + +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_all_slices_strided_impl_fn_ptr_t + masked_place_all_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +using 
dpctl::tensor::kernels::indexing:: + masked_place_some_slices_strided_impl_fn_ptr_t; + +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i32_impl_dispatch_vector[td_ns::num_types]; +static masked_place_some_slices_strided_impl_fn_ptr_t + masked_place_some_slices_strided_i64_impl_dispatch_vector[td_ns::num_types]; + +void populate_masked_place_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector( + masked_place_all_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceAllSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector( + masked_place_all_slices_strided_i64_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt32; + td_ns::DispatchVectorBuilder + dvb3; + dvb3.populate_dispatch_vector( + masked_place_some_slices_strided_i32_impl_dispatch_vector); + + using dpctl::tensor::kernels::indexing:: + MaskPlaceSomeSlicesStridedFactoryForInt64; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector( + masked_place_some_slices_strided_i64_impl_dispatch_vector); +} + +/* + * @brief Copy dst[i, ortho_id] = rhs[cumsum[i] - 1, ortho_id] if cumsum[i] == + * ((i > 0) ? 
cumsum[i-1] + 1 : 1) + */ +std::pair + py_place(const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector &depends) +{ + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int dst_nd = dst.get_ndim(); + if ((axis_start < 0 || axis_end > dst_nd || axis_start >= axis_end)) { + throw py::value_error("Specified axes_start and axes_end are invalid."); + } + int mask_span_sz = axis_end - axis_start; + + int rhs_nd = rhs.get_ndim(); + if (dst_nd != rhs_nd + (mask_span_sz - 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!cumsum.is_c_contiguous() || cumsum.get_ndim() != 1) { + throw py::value_error("cumsum array must be a C-contiguous vector"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst, cumsum, rhs})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + py::ssize_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *rhs_shape = rhs.get_shape_raw(); + bool same_ortho_dims(true); + std::size_t ortho_nelems(1); // number of orthogonal iterations + + for (auto i = 0; i < axis_start; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = same_ortho_dims && (dst_sh_i == rhs_shape[i]); + } + for (auto i = axis_end; i < dst_nd; ++i) { + auto dst_sh_i = dst_shape[i]; + ortho_nelems *= dst_sh_i; + same_ortho_dims = + same_ortho_dims && (dst_sh_i == rhs_shape[i - (mask_span_sz - 1)]); + } + + std::size_t masked_dst_nelems(1); + for (auto i = axis_start; i < axis_end; ++i) { + masked_dst_nelems *= dst_shape[i]; + } + + if (!same_ortho_dims || + (masked_dst_nelems != static_cast(cumsum_sz))) + { + throw py::value_error("Inconsistent array dimensions"); + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, ortho_nelems * masked_dst_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, not with cumsum. + if (overlap(dst, rhs) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int dst_typenum = dst.get_typenum(); + int rhs_typenum = rhs.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int rhs_typeid = array_types.typenum_to_lookup_id(rhs_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) { + throw py::value_error("Unexpected data type of cumsum array, expecting " + "'int32' or 'int64'"); + } + + const bool use_i32 = (cumsum_typeid == int32_typeid); + + if (dst_typeid != rhs_typeid) { + throw py::value_error( + "Destination array must have the same elemental data types"); + } + + char *dst_data_p = dst.get_data(); + char *rhs_data_p = rhs.get_data(); + char *cumsum_data_p = cumsum.get_data(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto rhs_shape_vec = rhs.get_shape_vector(); + auto rhs_strides_vec = rhs.get_strides_vector(); + + sycl::event place_ev; + std::vector host_task_events{}; + if (axis_start == 0 && axis_end == dst_nd) { + // empty orthogonal directions + auto fn = (use_i32) + ? 
masked_place_all_slices_strided_i32_impl_dispatch_vector + [dst_typeid] + : masked_place_all_slices_strided_i64_impl_dispatch_vector + [dst_typeid]; + + assert(rhs_shape_vec.size() == 1); + assert(rhs_strides_vec.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, dst_shape_vec, dst_strides_vec); + auto packed_dst_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_dst_shape_strides_ev = + std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_dst_shape_strides = + packed_dst_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_dst_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + place_ev = fn(exec_q, cumsum_sz, dst_data_p, cumsum_data_p, rhs_data_p, + dst_nd, packed_dst_shape_strides, rhs_shape_vec[0], + rhs_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_dst_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + auto fn = + (use_i32) + ? 
masked_place_some_slices_strided_i32_impl_dispatch_vector + [dst_typeid] + : masked_place_some_slices_strided_i64_impl_dispatch_vector + [dst_typeid]; + + int masked_dst_nd = mask_span_sz; + int ortho_nd = dst_nd - masked_dst_nd; + + using shT = std::vector; + + shT ortho_dst_shape; + shT masked_dst_shape; + shT ortho_dst_strides; + shT masked_dst_strides; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis_start, axis_end, + ortho_dst_shape, + masked_dst_shape, // 4 vectors modified + ortho_dst_strides, masked_dst_strides); + + shT ortho_rhs_shape; + shT masked_rhs_shape; + shT ortho_rhs_strides; + shT masked_rhs_strides; + dpctl::tensor::py_internal::split_iteration_space( + rhs_shape_vec, rhs_strides_vec, axis_start, axis_start + 1, + ortho_rhs_shape, + masked_rhs_shape, // 4 vectors modified + ortho_rhs_strides, masked_rhs_strides); + + assert(ortho_dst_shape.size() == static_cast(ortho_nd)); + assert(ortho_rhs_shape.size() == static_cast(ortho_nd)); + assert(std::equal(ortho_dst_shape.begin(), ortho_dst_shape.end(), + ortho_rhs_shape.begin())); + + std::vector simplified_ortho_shape; + std::vector simplified_ortho_dst_strides; + std::vector simplified_ortho_rhs_strides; + + const py::ssize_t *_shape = ortho_dst_shape.data(); + + py::ssize_t ortho_dst_offset(0); + py::ssize_t ortho_rhs_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + ortho_nd, _shape, ortho_dst_strides, ortho_rhs_strides, + simplified_ortho_shape, simplified_ortho_dst_strides, + simplified_ortho_rhs_strides, ortho_dst_offset, ortho_rhs_offset); + + assert(masked_rhs_shape.size() == 1); + assert(masked_rhs_strides.size() == 1); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_ortho_shape, + simplified_ortho_dst_strides, simplified_ortho_rhs_strides, + masked_dst_shape, masked_dst_strides); + auto packed_shapes_strides_owner 
= + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *packed_ortho_dst_rhs_shape_strides = + packed_shapes_strides; + const py::ssize_t *packed_masked_dst_shape_strides = + packed_shapes_strides + (3 * ortho_nd); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + place_ev = fn(exec_q, ortho_nelems, masked_dst_nelems, dst_data_p, + cumsum_data_p, rhs_data_p, + // data to build orthog_dst_rhs_indexer + ortho_nd, packed_ortho_dst_rhs_shape_strides, + ortho_dst_offset, ortho_rhs_offset, + // data to build masked_dst_indexer + masked_dst_nd, packed_masked_dst_shape_strides, + // data to build masked_dst_indexer, + masked_rhs_shape[0], masked_rhs_strides[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {place_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {dst, cumsum, rhs}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, place_ev); +} + +// Non-zero + +std::pair + py_nonzero(const dpctl::tensor::usm_ndarray + &cumsum, // int32/int64 input array, 1D, C-contiguous + const dpctl::tensor::usm_ndarray + &indexes, // int32/int64 2D output array, C-contiguous + const std::vector + &mask_shape, // shape of array from which cumsum was computed + sycl::queue &exec_q, + const std::vector &depends) +{ + if (!dpctl::utils::queues_are_compatible(exec_q, {cumsum, indexes})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + 
dpctl::tensor::validation::CheckWritable::throw_if_not_writable(indexes); + + int cumsum_nd = cumsum.get_ndim(); + if (cumsum_nd != 1 || !cumsum.is_c_contiguous()) { + throw py::value_error("Cumsum array must be a C-contiguous vector"); + } + + int indexes_nd = indexes.get_ndim(); + if (indexes_nd != 2 || !indexes.is_c_contiguous()) { + throw py::value_error("Index array must be a C-contiguous matrix"); + } + + std::size_t _ndim = mask_shape.size(); + if (_ndim > std::numeric_limits::max()) { + throw py::value_error("Shape is too large"); + } + int ndim = static_cast(_ndim); + + const py::ssize_t *indexes_shape = indexes.get_shape_raw(); + + if (ndim != indexes_shape[0]) { + throw py::value_error( + "Length of shape must equal width of index matrix"); + } + + auto cumsum_sz = cumsum.get_size(); + py::ssize_t shape_nelems = + std::accumulate(mask_shape.begin(), mask_shape.end(), py::ssize_t(1), + std::multiplies()); + + if (cumsum_sz != shape_nelems) { + throw py::value_error("Shape and cumsum size are not consistent"); + } + + py::ssize_t nz_elems = indexes_shape[1]; + + int indexes_typenum = indexes.get_typenum(); + auto const &array_types = td_ns::usm_ndarray_types(); + int indexes_typeid = array_types.typenum_to_lookup_id(indexes_typenum); + + int cumsum_typenum = cumsum.get_typenum(); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + constexpr int int32_typeid = static_cast(td_ns::typenum_t::INT32); + constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); + + // cumsum must be int32_t or int64_t only + if ((cumsum_typeid != int32_typeid && cumsum_typeid != int64_typeid) || + (indexes_typeid != int32_typeid && indexes_typeid != int64_typeid)) + { + throw py::value_error("Cumulative sum array and index array must have " + "int32 or int64 data-type"); + } + + if (cumsum_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(cumsum, 
indexes)) { + throw py::value_error("Arrays are expected to ave no memory overlap"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + indexes, nz_elems * _ndim); + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto mask_shape_copying_tuple = device_allocate_and_pack( + exec_q, host_task_events, mask_shape); + auto src_shape_device_owner = + std::move(std::get<0>(mask_shape_copying_tuple)); + sycl::event copy_ev = std::get<2>(mask_shape_copying_tuple); + const py::ssize_t *src_shape_device_ptr = src_shape_device_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_ev); + + using dpctl::tensor::kernels::indexing::non_zero_indexes_fn_ptr_t; + using dpctl::tensor::kernels::indexing::non_zero_indexes_impl; + + int fn_index = ((cumsum_typeid == int64_typeid) ? 1 : 0) + + ((indexes_typeid == int64_typeid) ? 
2 : 0); + std::array fn_impls = { + non_zero_indexes_impl, + non_zero_indexes_impl, + non_zero_indexes_impl, + non_zero_indexes_impl}; + auto fn = fn_impls[fn_index]; + + sycl::event non_zero_indexes_ev = + fn(exec_q, cumsum_sz, nz_elems, ndim, cumsum.get_data(), + indexes.get_data(), src_shape_device_ptr, all_deps); + + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {non_zero_indexes_ev}, src_shape_device_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {cumsum, indexes}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, non_zero_indexes_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp new file mode 100644 index 000000000000..71eafc77b00c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/boolean_advanced_indexing.hpp @@ -0,0 +1,81 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_extract(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_masked_extract_dispatch_vectors(void); + +extern std::pair + py_place(const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &cumsum, + int axis_start, // axis_start <= mask_i < axis_end + int axis_end, + const dpctl::tensor::usm_ndarray &rhs, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void populate_masked_place_dispatch_vectors(void); + 
+extern std::pair + py_nonzero(const dpctl::tensor::usm_ndarray + &cumsum, // int32 input array, 1D, C-contiguous + const dpctl::tensor::usm_ndarray + &indexes, // int32 2D output array, C-contiguous + const std::vector + &mask_shape, // shape of array from which cumsum was computed + sycl::queue &exec_q, + const std::vector &depends = {}); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 3c096df8e367..901af48074be 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -44,7 +44,7 @@ #include "dpnp4pybind11.hpp" #include "accumulators.hpp" -// #include "boolean_advanced_indexing.hpp" +#include "boolean_advanced_indexing.hpp" // #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" @@ -112,10 +112,10 @@ using dpctl::tensor::py_internal::usm_ndarray_zeros; using dpctl::tensor::py_internal::usm_ndarray_put; using dpctl::tensor::py_internal::usm_ndarray_take; -// using dpctl::tensor::py_internal::py_extract; +using dpctl::tensor::py_internal::py_extract; using dpctl::tensor::py_internal::py_mask_positions; -// using dpctl::tensor::py_internal::py_nonzero; -// using dpctl::tensor::py_internal::py_place; +using dpctl::tensor::py_internal::py_nonzero; +using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ using dpctl::tensor::py_internal::py_cumsum_1d; @@ -163,8 +163,8 @@ void init_dispatch_vectors(void) // init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); - // populate_masked_extract_dispatch_vectors(); - // populate_masked_place_dispatch_vectors(); + populate_masked_extract_dispatch_vectors(); + populate_masked_place_dispatch_vectors(); populate_mask_positions_dispatch_vectors(); @@ -415,9 +415,9 @@ PYBIND11_MODULE(_tensor_impl, m) m.def("_cumsum_1d", &py_cumsum_1d, "", py::arg("src"), 
py::arg("cumsum"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), - // py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_extract", &py_extract, "", py::arg("src"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); auto overlap = [](const dpctl::tensor::usm_ndarray &x1, const dpctl::tensor::usm_ndarray &x2) -> bool { @@ -438,13 +438,13 @@ PYBIND11_MODULE(_tensor_impl, m) "Determines if the memory regions indexed by each array are the same", py::arg("array1"), py::arg("array2")); - // m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), - // py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"), + py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), - // py::arg("mask_shape"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_nonzero", &py_nonzero, "", py::arg("cumsum"), py::arg("indexes"), + py::arg("mask_shape"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), From 70a0fc6ccf80c8a5490a8bfdaca8731d8aa89c2d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 05:58:22 -0800 Subject: [PATCH 052/145] Move place() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_indexing_functions.py | 68 +++++++++++++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- dpnp/dpnp_iface_manipulation.py | 2 +- 4 files 
changed, 72 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index edb2c096bad1..d4153b340596 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -40,6 +40,7 @@ triu, ) from dpctl_ext.tensor._indexing_functions import ( + place, put, take, ) @@ -54,6 +55,7 @@ "copy", "from_numpy", "full", + "place", "put", "reshape", "roll", diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index df4f3e953042..97aac6caf20f 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -50,6 +50,74 @@ def _get_indexing_mode(name): ) +def place(arr, mask, vals): + """place(arr, mask, vals) + + Change elements of an array based on conditional and input values. + + If ``mask`` is boolean ``dpctl.tensor.place`` is + equivalent to ``arr[condition] = vals``. + + Args: + arr (usm_ndarray): + Array to put data into. + mask (usm_ndarray): + Boolean mask array. Must have the same size as ``arr``. + vals (usm_ndarray, sequence): + Values to put into ``arr``. Only the first N elements are + used, where N is the number of True values in ``mask``. If + ``vals`` is smaller than N, it will be repeated, and if + elements of ``arr`` are to be masked, this sequence must be + non-empty. Array ``vals`` must be one dimensional. 
+ """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if not isinstance(mask, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(mask)}" + ) + if not isinstance(vals, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(vals)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + arr.sycl_queue, + mask.sycl_queue, + vals.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if arr.shape != mask.shape or vals.ndim != 1: + raise ValueError("Array sizes are not as required") + cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + deps_ev = _manager.submitted_events + nz_count = ti.mask_positions( + mask, cumsum, sycl_queue=exec_q, depends=deps_ev + ) + if nz_count == 0: + return + if vals.size == 0: + raise ValueError("Cannot insert from an empty array!") + if vals.dtype == arr.dtype: + rhs = vals + else: + rhs = dpt.astype(vals, arr.dtype) + hev, pl_ev = ti._place( + dst=arr, + cumsum=cumsum, + axis_start=0, + axis_end=mask.ndim, + rhs=rhs, + sycl_queue=exec_q, + ) + _manager.add_event_pair(hev, pl_ev) + + def put(x, indices, vals, /, *, axis=None, mode="wrap"): """put(x, indices, vals, axis=None, mode="wrap") diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index b0769337c38b..a92370a69525 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -1619,7 +1619,7 @@ def place(a, mask, vals): usm_vals, usm_a.dtype, casting="safe", copy=False ) - dpt.place(usm_a, usm_mask, usm_vals) + dpt_ext.place(usm_a, usm_mask, usm_vals) def put(a, ind, v, /, *, axis=None, mode="wrap"): diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 4866c912ab6a..e988bbaa237b 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ 
b/dpnp/dpnp_iface_manipulation.py @@ -415,7 +415,7 @@ def _get_first_nan_index(usm_a): if first_nan is not None: # all NaNs are collapsed, so need to replace the indices with # the index of the first NaN value in result array of unique values - dpt.place( + dpt_ext.place( usm_res.inverse_indices, usm_res.inverse_indices > first_nan, dpt_ext.reshape(first_nan, 1), From afa5411c6c8f5d1caf5cd0a134ad28725128c55a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:10:07 -0800 Subject: [PATCH 053/145] Move extract() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_copy_utils.py | 70 +++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 48 +++++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- 4 files changed, 121 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d4153b340596..8fb164828eb5 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -40,6 +40,7 @@ triu, ) from dpctl_ext.tensor._indexing_functions import ( + extract, place, put, take, @@ -53,6 +54,7 @@ "asnumpy", "astype", "copy", + "extract", "from_numpy", "full", "place", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index c62218893a2c..4be6475ec4a3 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -27,6 +27,7 @@ # ***************************************************************************** import builtins +import operator import dpctl import dpctl.memory as dpm @@ -41,6 +42,8 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti +from ._numpy_helper import normalize_axis_index + __doc__ = ( "Implementation module for copy- and cast- operations on " ":class:`dpctl.tensor.usm_ndarray`." 
@@ -130,6 +133,73 @@ def _copy_from_numpy_into(dst, np_ary): ) +def _extract_impl(ary, ary_mask, axis=0): + """ + Extract elements of ary by applying mask starting from slot + dimension axis + """ + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + if isinstance(ary_mask, dpt.usm_ndarray): + dst_usm_type = dpctl.utils.get_coerced_usm_type( + (ary.usm_type, ary_mask.usm_type) + ) + exec_q = dpctl.utils.get_execution_queue( + (ary.sycl_queue, ary_mask.sycl_queue) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "arrays have different associated queues. " + "Use `y.to_device(x.device)` to migrate." + ) + elif isinstance(ary_mask, np.ndarray): + dst_usm_type = ary.usm_type + exec_q = ary.sycl_queue + ary_mask = dpt.asarray( + ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q + ) + else: + raise TypeError( + "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got " + f"{type(ary_mask)}" + ) + ary_nd = ary.ndim + pp = normalize_axis_index(operator.index(axis), ary_nd) + mask_nd = ary_mask.ndim + if pp < 0 or pp + mask_nd > ary_nd: + raise ValueError( + "Parameter p is inconsistent with input array dimensions" + ) + mask_nelems = ary_mask.size + cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 + cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) + exec_q = cumsum.sycl_queue + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] + dst = dpt.empty( + dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device + ) + if dst.size == 0: + return dst + hev, ev = ti._extract( + src=ary, + cumsum=cumsum, + axis_start=pp, + axis_end=pp + mask_nd, + dst=dst, + sycl_queue=exec_q, + depends=dep_evs, + ) + 
_manager.add_event_pair(hev, ev) + return dst + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 97aac6caf20f..de889f239d1f 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -37,6 +37,9 @@ import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +from ._copy_utils import ( + _extract_impl, +) from ._numpy_helper import normalize_axis_index @@ -50,6 +53,51 @@ def _get_indexing_mode(name): ) +def extract(condition, arr): + """extract(condition, arr) + + Returns the elements of an array that satisfies the condition. + + If ``condition`` is boolean ``dpctl.tensor.extract`` is + equivalent to ``arr[condition]``. + + Note that ``dpctl.tensor.place`` does the opposite of + ``dpctl.tensor.extract``. + + Args: + conditions (usm_ndarray): + An array whose non-zero or ``True`` entries indicate the element + of ``arr`` to extract. + + arr (usm_ndarray): + Input array of the same size as ``condition``. + + Returns: + usm_ndarray: + Rank 1 array of values from ``arr`` where ``condition`` is + ``True``. 
+ """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + exec_q = dpctl.utils.get_execution_queue( + ( + condition.sycl_queue, + arr.sycl_queue, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError + if condition.shape != arr.shape: + raise ValueError("Arrays are not of the same size") + return _extract_impl(arr, condition) + + def place(arr, mask, vals): """place(arr, mask, vals) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index a92370a69525..c462c1d5854a 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -823,7 +823,7 @@ def extract(condition, a): usm_a = dpt_ext.reshape(usm_a, -1) usm_cond = dpt_ext.reshape(usm_cond, -1) - usm_res = dpt.extract(usm_cond, usm_a) + usm_res = dpt_ext.extract(usm_cond, usm_a) return dpnp_array._create_from_usm_ndarray(usm_res) From 7feb4ee30381ef3935e19a25263b1ad3de3895e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:18:09 -0800 Subject: [PATCH 054/145] Move nonzero() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_copy_utils.py | 31 +++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 28 ++++++++++++++++++++++ dpnp/dpnp_iface_indexing.py | 4 ++-- 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 8fb164828eb5..a14d86079b0a 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -41,6 +41,7 @@ ) from dpctl_ext.tensor._indexing_functions import ( extract, + nonzero, place, put, take, @@ -57,6 +58,7 @@ "extract", "from_numpy", "full", + "nonzero", "place", "put", "reshape", diff --git a/dpctl_ext/tensor/_copy_utils.py 
b/dpctl_ext/tensor/_copy_utils.py index 4be6475ec4a3..bef2b5a5cf16 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -200,6 +200,37 @@ def _extract_impl(ary, ary_mask, axis=0): return dst +def _nonzero_impl(ary): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + exec_q = ary.sycl_queue + usm_type = ary.usm_type + mask_nelems = ary.size + cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 + cumsum = dpt.empty( + mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" + ) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + mask_count = ti.mask_positions( + ary, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + indexes_dt = ti.default_device_index_type(exec_q.sycl_device) + indexes = dpt.empty( + (ary.ndim, mask_count), + dtype=indexes_dt, + usm_type=usm_type, + sycl_queue=exec_q, + order="C", + ) + hev, nz_ev = ti._nonzero(cumsum, indexes, ary.shape, exec_q) + res = tuple(indexes[i, :] for i in range(ary.ndim)) + _manager.add_event_pair(hev, nz_ev) + return res + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index de889f239d1f..4feb2f9adbeb 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -39,6 +39,7 @@ from ._copy_utils import ( _extract_impl, + _nonzero_impl, ) from ._numpy_helper import normalize_axis_index @@ -98,6 +99,33 @@ def extract(condition, arr): return _extract_impl(arr, condition) +def nonzero(arr): + """nonzero(arr) + + Return the indices of non-zero elements. + + Returns a tuple of usm_ndarrays, one for each dimension + of ``arr``, containing the indices of the non-zero elements + in that dimension. 
The values of ``arr`` are always tested in + row-major, C-style order. + + Args: + arr (usm_ndarray): + Input array, which has non-zero array rank. + + Returns: + Tuple[usm_ndarray, ...]: + Indices of non-zero array elements. + """ + if not isinstance(arr, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(arr)}" + ) + if arr.ndim == 0: + raise ValueError("Array of positive rank is expected") + return _nonzero_impl(arr) + + def place(arr, mask, vals): """place(arr, mask, vals) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index c462c1d5854a..5e1a6a0ecc3d 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -817,7 +817,7 @@ def extract(condition, a): usm_a = dpt_ext.reshape(usm_a, -1) usm_cond = dpt_ext.reshape(usm_cond, -1) - usm_res = dpt_ext.take(usm_a, dpt.nonzero(usm_cond)[0]) + usm_res = dpt_ext.take(usm_a, dpt_ext.nonzero(usm_cond)[0]) else: if usm_cond.shape != usm_a.shape: usm_a = dpt_ext.reshape(usm_a, -1) @@ -1546,7 +1546,7 @@ def nonzero(a): usm_a = dpnp.get_usm_ndarray(a) return tuple( - dpnp_array._create_from_usm_ndarray(y) for y in dpt.nonzero(usm_a) + dpnp_array._create_from_usm_ndarray(y) for y in dpt_ext.nonzero(usm_a) ) From f63f2f05540ad0dc7d2f04957446f4cf8429dd83 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:32:06 -0800 Subject: [PATCH 055/145] Move put_along_axis to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_copy_utils.py | 153 ++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 87 ++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- 4 files changed, 243 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a14d86079b0a..e3f50236f261 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -44,6 +44,7 @@ nonzero, place, put, + put_along_axis, take, ) from 
dpctl_ext.tensor._manipulation_functions import ( @@ -61,6 +62,7 @@ "nonzero", "place", "put", + "put_along_axis", "reshape", "roll", "take", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index bef2b5a5cf16..95608ace0c3b 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -28,6 +28,7 @@ import builtins import operator +from numbers import Integral import dpctl import dpctl.memory as dpm @@ -40,6 +41,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index @@ -200,6 +202,42 @@ def _extract_impl(ary, ary_mask, axis=0): return dst +def _get_indices_queue_usm_type(inds, queue, usm_type): + """ + Utility for validating indices are NumPy ndarray or usm_ndarray of integral + dtype or Python integers. At least one must be an array. + + For each array, the queue and usm type are appended to `queue_list` and + `usm_type_list`, respectively. 
+ """ + queues = [queue] + usm_types = [usm_type] + any_array = False + for ind in inds: + if isinstance(ind, (np.ndarray, dpt.usm_ndarray)): + any_array = True + if ind.dtype.kind not in "ui": + raise IndexError( + "arrays used as indices must be of integer (or boolean) " + "type" + ) + if isinstance(ind, dpt.usm_ndarray): + queues.append(ind.sycl_queue) + usm_types.append(ind.usm_type) + elif not isinstance(ind, Integral): + raise TypeError( + "all elements of `ind` expected to be usm_ndarrays, " + f"NumPy arrays, or integers, found {type(ind)}" + ) + if not any_array: + raise TypeError( + "at least one element of `inds` expected to be an array" + ) + usm_type = dpctl.utils.get_coerced_usm_type(usm_types) + q = dpctl.utils.get_execution_queue(queues) + return q, usm_type + + def _nonzero_impl(ary): if not isinstance(ary, dpt.usm_ndarray): raise TypeError( @@ -231,6 +269,121 @@ def _nonzero_impl(ary): return res +def _prepare_indices_arrays(inds, q, usm_type): + """ + Utility taking a mix of usm_ndarray and possibly Python int scalar indices, + a queue (assumed to be common to arrays in inds), and a usm type. + + Python scalar integers are promoted to arrays on the provided queue and + with the provided usm type. All arrays are then promoted to a common + integral type (if possible) before being broadcast to a common shape. 
+ """ + # scalar integers -> arrays + inds = tuple( + map( + lambda ind: ( + ind + if isinstance(ind, dpt.usm_ndarray) + else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q) + ), + inds, + ) + ) + + # promote to a common integral type if possible + ind_dt = dpt.result_type(*inds) + if ind_dt.kind not in "ui": + raise ValueError( + "cannot safely promote indices to an integer data type" + ) + inds = tuple( + map( + lambda ind: ( + ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt) + ), + inds, + ) + ) + + # broadcast + inds = dpt.broadcast_arrays(*inds) + + return inds + + +def _put_multi_index(ary, inds, p, vals, mode=0): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + ary_nd = ary.ndim + p = normalize_axis_index(operator.index(p), ary_nd) + mode = operator.index(mode) + if mode not in [0, 1]: + raise ValueError( + "Invalid value for mode keyword, only 0 or 1 is supported" + ) + if not isinstance(inds, (list, tuple)): + inds = (inds,) + + exec_q, coerced_usm_type = _get_indices_queue_usm_type( + inds, ary.sycl_queue, ary.usm_type + ) + + if exec_q is not None: + if not isinstance(vals, dpt.usm_ndarray): + vals = dpt.asarray( + vals, + dtype=ary.dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + else: + exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue)) + coerced_usm_type = dpctl.utils.get_coerced_usm_type( + ( + coerced_usm_type, + vals.usm_type, + ) + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." 
+ ) + + inds = _prepare_indices_arrays(inds, exec_q, coerced_usm_type) + + ind0 = inds[0] + ary_sh = ary.shape + p_end = p + len(inds) + if 0 in ary_sh[p:p_end] and ind0.size != 0: + raise IndexError( + "cannot put into non-empty indices along an empty axis" + ) + expected_vals_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:] + if vals.dtype == ary.dtype: + rhs = vals + else: + rhs = dpt_ext.astype(vals, ary.dtype) + rhs = dpt.broadcast_to(rhs, expected_vals_shape) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + hev, put_ev = ti._put( + dst=ary, + ind=inds, + val=rhs, + axis_start=p, + mode=mode, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(hev, put_ev) + return + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 4feb2f9adbeb..8bc512bfe9d4 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -40,6 +40,7 @@ from ._copy_utils import ( _extract_impl, _nonzero_impl, + _put_multi_index, ) from ._numpy_helper import normalize_axis_index @@ -54,6 +55,12 @@ def _get_indexing_mode(name): ) +def _range(sh_i, i, nd, q, usm_t, dt): + ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q) + ind.shape = tuple(sh_i if i == j else 1 for j in range(nd)) + return ind + + def extract(condition, arr): """extract(condition, arr) @@ -343,6 +350,86 @@ def put_vec_duplicates(vec, ind, vals): _manager.add_event_pair(hev, put_ev) +def put_along_axis(x, indices, vals, /, *, axis=-1, mode="wrap"): + """ + Puts elements into an array at the one-dimensional indices specified by + ``indices`` along a provided ``axis``. + + Args: + x (usm_ndarray): + input array. Must be compatible with ``indices``, except for the + axis (dimension) specified by ``axis``. 
+ indices (usm_ndarray): + array indices. Must have the same rank (i.e., number of dimensions) + as ``x``. + vals (usm_ndarray): + Array of values to be put into ``x``. + Must be broadcastable to the shape of ``indices``. + axis: int + axis along which to select values. If ``axis`` is negative, the + function determines the axis along which to select values by + counting from the last dimension. Default: ``-1``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + .. note:: + + If input array ``indices`` contains duplicates, a race condition + occurs, and the value written into corresponding positions in ``x`` + may vary from run to run. Preserving sequential semantics in handing + the duplicates to achieve deterministic behavior requires additional + work. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}" + ) + x_nd = x.ndim + if x_nd != indices.ndim: + raise ValueError( + "Number of dimensions in the first and the second " + "argument arrays must be equal" + ) + pp = normalize_axis_index(operator.index(axis), x_nd) + if isinstance(vals, dpt.usm_ndarray): + queues_ = [x.sycl_queue, indices.sycl_queue, vals.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type, vals.usm_type] + else: + queues_ = [x.sycl_queue, indices.sycl_queue] + usm_types_ = [x.usm_type, indices.usm_type] + exec_q = dpctl.utils.get_execution_queue(queues_) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. 
" + ) + out_usm_type = dpctl.utils.get_coerced_usm_type(usm_types_) + mode_i = _get_indexing_mode(mode) + indexes_dt = ( + dpt.uint64 + if indices.dtype == dpt.uint64 + else ti.default_device_index_type(exec_q.sycl_device) + ) + _ind = tuple( + ( + indices + if i == pp + else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt) + ) + for i in range(x_nd) + ) + return _put_multi_index(x, _ind, 0, vals, mode=mode_i) + + def take(x, indices, /, *, axis=None, out=None, mode="wrap"): """take(x, indices, axis=None, out=None, mode="wrap") diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 5e1a6a0ecc3d..6ccc655796a4 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -1807,7 +1807,7 @@ def put_along_axis(a, ind, values, axis, mode="wrap"): values, usm_type=a.usm_type, sycl_queue=a.sycl_queue ) - dpt.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode) + dpt_ext.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode) def putmask(x1, mask, values): From 6fecefe2ee88c04bd1e2a8978f4facac316628f8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 06:37:40 -0800 Subject: [PATCH 056/145] Move take_along_axis() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_copy_utils.py | 52 +++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 78 +++++++++++++++++++++++++ dpnp/dpnp_iface_indexing.py | 2 +- 4 files changed, 133 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index e3f50236f261..69aca2b5143d 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -46,6 +46,7 @@ put, put_along_axis, take, + take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( roll, @@ -66,6 +67,7 @@ "reshape", "roll", "take", + "take_along_axis", "to_numpy", "tril", "triu", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 
95608ace0c3b..5d1ac209c86b 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -384,6 +384,58 @@ def _put_multi_index(ary, inds, p, vals, mode=0): return +def _take_multi_index(ary, inds, p, mode=0): + if not isinstance(ary, dpt.usm_ndarray): + raise TypeError( + f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}" + ) + ary_nd = ary.ndim + p = normalize_axis_index(operator.index(p), ary_nd) + mode = operator.index(mode) + if mode not in [0, 1]: + raise ValueError( + "Invalid value for mode keyword, only 0 or 1 is supported" + ) + if not isinstance(inds, (list, tuple)): + inds = (inds,) + + exec_q, res_usm_type = _get_indices_queue_usm_type( + inds, ary.sycl_queue, ary.usm_type + ) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Can not automatically determine where to allocate the " + "result or performance execution. " + "Use `usm_ndarray.to_device` method to migrate data to " + "be associated with the same queue." + ) + + inds = _prepare_indices_arrays(inds, exec_q, res_usm_type) + + ind0 = inds[0] + ary_sh = ary.shape + p_end = p + len(inds) + if 0 in ary_sh[p:p_end] and ind0.size != 0: + raise IndexError("cannot take non-empty indices from an empty axis") + res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:] + res = dpt.empty( + res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + hev, take_ev = ti._take( + src=ary, + ind=inds, + dst=res, + axis_start=p, + mode=mode, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(hev, take_ev) + return res + + def from_numpy(np_ary, /, *, device=None, usm_type="device", sycl_queue=None): """ from_numpy(arg, device=None, usm_type="device", sycl_queue=None) diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 8bc512bfe9d4..6ca327192f73 100644 --- 
a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -41,6 +41,7 @@ _extract_impl, _nonzero_impl, _put_multi_index, + _take_multi_index, ) from ._numpy_helper import normalize_axis_index @@ -561,3 +562,80 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"): out = orig_out return out + + +def take_along_axis(x, indices, /, *, axis=-1, mode="wrap"): + """ + Returns elements from an array at the one-dimensional indices specified + by ``indices`` along a provided ``axis``. + + Args: + x (usm_ndarray): + input array. Must be compatible with ``indices``, except for the + axis (dimension) specified by ``axis``. + indices (usm_ndarray): + array indices. Must have the same rank (i.e., number of dimensions) + as ``x``. + axis: int + axis along which to select values. If ``axis`` is negative, the + function determines the axis along which to select values by + counting from the last dimension. Default: ``-1``. + mode (str, optional): + How out-of-bounds indices will be handled. Possible values + are: + + - ``"wrap"``: clamps indices to (``-n <= i < n``), then wraps + negative indices. + - ``"clip"``: clips indices to (``0 <= i < n``). + + Default: ``"wrap"``. + + Returns: + usm_ndarray: + an array having the same data type as ``x``. The returned array has + the same rank (i.e., number of dimensions) as ``x`` and a shape + determined according to broadcasting rules, except for the axis + (dimension) specified by ``axis`` whose size must equal the size + of the corresponding axis (dimension) in ``indices``. + + Note: + Treatment of the out-of-bound indices in ``indices`` array is controlled + by the value of ``mode`` keyword. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + if not isinstance(indices, dpt.usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(indices)}" + ) + x_nd = x.ndim + if x_nd != indices.ndim: + raise ValueError( + "Number of dimensions in the first and the second " + "argument arrays must be equal" + ) + pp = normalize_axis_index(operator.index(axis), x_nd) + out_usm_type = dpctl.utils.get_coerced_usm_type( + (x.usm_type, indices.usm_type) + ) + exec_q = dpctl.utils.get_execution_queue((x.sycl_queue, indices.sycl_queue)) + if exec_q is None: + raise dpctl.utils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + ) + mode_i = _get_indexing_mode(mode) + indexes_dt = ( + dpt.uint64 + if indices.dtype == dpt.uint64 + else ti.default_device_index_type(exec_q.sycl_device) + ) + _ind = tuple( + ( + indices + if i == pp + else _range(x.shape[i], i, x_nd, exec_q, out_usm_type, indexes_dt) + ) + for i in range(x_nd) + ) + return _take_multi_index(x, _ind, 0, mode=mode_i) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 6ccc655796a4..95998dbdda4b 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -2295,7 +2295,7 @@ def take_along_axis(a, indices, axis=-1, mode="wrap"): usm_a = dpnp.get_usm_ndarray(a) usm_ind = dpnp.get_usm_ndarray(indices) - usm_res = dpt.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode) + usm_res = dpt_ext.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode) return dpnp_array._create_from_usm_ndarray(usm_res) From b60d095ff6bbac9c96f8d139697333b169cc5ae1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 07:06:03 -0800 Subject: [PATCH 057/145] Move ti._eye to() to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/constructors.hpp | 96 +++++++++++- 
.../tensor/libtensor/source/eye_ctor.cpp | 142 ++++++++++++++++++ .../tensor/libtensor/source/eye_ctor.hpp | 57 +++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 24 +-- 5 files changed, 307 insertions(+), 14 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/eye_ctor.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/eye_ctor.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 88507aa2192f..0b166a202735 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -54,7 +54,7 @@ set(_tensor_impl_sources # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 22189ee3129c..26ae46707a6b 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -56,7 +56,8 @@ using dpctl::tensor::ssize_t; template class full_strided_kernel; -// template class eye_kernel; +template +class eye_kernel; using namespace dpctl::tensor::offset_utils; @@ -162,6 +163,99 @@ sycl::event full_strided_impl(sycl::queue &q, return fill_ev; } +/* ================ Eye ================== */ + +typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &, + std::size_t nelems, // num_elements + ssize_t start, + ssize_t end, + ssize_t step, + char *, // dst_data_ptr + const std::vector &); + +template +class EyeFunctor +{ +private: + Ty *p = nullptr; + 
ssize_t start_v; + ssize_t end_v; + ssize_t step_v; + +public: + EyeFunctor(char *dst_p, + const ssize_t v0, + const ssize_t v1, + const ssize_t dv) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + Ty set_v = 0; + ssize_t i = static_cast(wiid.get(0)); + if (i >= start_v and i <= end_v) { + if ((i - start_v) % step_v == 0) { + set_v = 1; + } + } + p[i] = set_v; + } +}; + +/*! + * @brief Function to populate 2D array with eye matrix. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Number of elements to assign. + * @param start Position of the first non-zero value. + * @param end Position of the last non-zero value. + * @param step Number of array elements between non-zeros. + * @param array_data Kernel accessible USM pointer for the destination array. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event eye_impl(sycl::queue &exec_q, + std::size_t nelems, + const ssize_t start, + const ssize_t end, + const ssize_t step, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = eye_kernel; + using Impl = EyeFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start, end, step)); + }); + + return eye_event; +} + +/*! + * @brief Factory to get function pointer of type `fnT` for data type `Ty`. 
+ * @ingroup CtorKernels + */ +template +struct EyeFactory +{ + fnT get() + { + fnT f = eye_impl; + return f; + } +}; + /* =========================== Tril and triu ============================== */ // define function type diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp b/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp new file mode 100644 index 000000000000..025a7d58d06e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/eye_ctor.cpp @@ -0,0 +1,142 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +#include "eye_ctor.hpp" +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::utils::keep_args_alive; + +using dpctl::tensor::kernels::constructors::eye_fn_ptr_t; +static eye_fn_ptr_t eye_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 2D + + if (dst.get_ndim() != 2) { + throw py::value_error( + "usm_ndarray_eye: Expecting 2D array to populate"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error("Execution queue is not compatible with the " + "allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + 
int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + const py::ssize_t nelem = dst.get_size(); + const py::ssize_t rows = dst.get_shape(0); + const py::ssize_t cols = dst.get_shape(1); + if (rows == 0 || cols == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + if (!is_dst_c_contig && !is_dst_f_contig) { + throw py::value_error("USM array is not contiguous"); + } + + py::ssize_t start; + if (is_dst_c_contig) { + start = (k < 0) ? -k * cols : k; + } + else { + start = (k < 0) ? -k : k * rows; + } + + const py::ssize_t *strides = dst.get_strides_raw(); + py::ssize_t step; + if (strides == nullptr) { + step = (is_dst_c_contig) ? cols + 1 : rows + 1; + } + else { + step = strides[0] + strides[1]; + } + + const py::ssize_t length = std::min({rows, cols, rows + k, cols - k}); + const py::ssize_t end = start + step * (length - 1); + + char *dst_data = dst.get_data(); + sycl::event eye_event; + + auto fn = eye_dispatch_vector[dst_typeid]; + + eye_event = fn(exec_q, static_cast(nelem), start, end, step, + dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {eye_event}), + eye_event); +} + +void init_eye_ctor_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::constructors::EyeFactory; + + DispatchVectorBuilder dvb; + dvb.populate_dispatch_vector(eye_dispatch_vector); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp b/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp new file mode 100644 index 000000000000..dda7f2c4813a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/eye_ctor.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + usm_ndarray_eye(py::ssize_t k, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_eye_ctor_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 901af48074be..98ab488e5879 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -52,7 +52,7 @@ #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" -// #include "eye_ctor.hpp" +#include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" @@ -124,7 +124,7 @@ using dpctl::tensor::py_internal::py_cumsum_1d; /* ================ Eye ================== */ -// using dpctl::tensor::py_internal::usm_ndarray_eye; +using dpctl::tensor::py_internal::usm_ndarray_eye; /* =========================== Tril and triu ============================== */ @@ -160,7 +160,7 @@ void init_dispatch_vectors(void) // init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); - // init_eye_ctor_dispatch_vectors(); + init_eye_ctor_dispatch_vectors(); init_triul_ctor_dispatch_vectors(); populate_masked_extract_dispatch_vectors(); @@ -348,15 +348,15 @@ PYBIND11_MODULE(_tensor_impl, m) 
py::arg("mode"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_eye", &usm_ndarray_eye, - // "Fills input 2D contiguous usm_ndarray `dst` with " - // "zeros outside of the diagonal " - // "specified by " - // "the diagonal index `k` " - // "which is filled with ones." - // "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_eye", &usm_ndarray_eye, + "Fills input 2D contiguous usm_ndarray `dst` with " + "zeros outside of the diagonal " + "specified by " + "the diagonal index `k` " + "which is filled with ones." + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("k"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("default_device_fp_type", dpctl::tensor::py_internal::default_device_fp_type, From a1eea4e730f9dde47d7fd2eae0fa338e1e500299 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 09:26:18 -0800 Subject: [PATCH 058/145] Move eye() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 128 +++++++++++++++++++++++++++++++++++ dpnp/dpnp_container.py | 2 +- 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 69aca2b5143d..fa76faccc632 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -35,6 +35,7 @@ to_numpy, ) from dpctl_ext.tensor._ctors import ( + eye, full, tril, triu, @@ -58,6 +59,7 @@ "astype", "copy", "extract", + "eye", "from_numpy", "full", "nonzero", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5a39e9367e9c..5a9e07c73346 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -58,6 +58,38 @@ def _cast_fill_val(fill_val, dt): return fill_val +def _ensure_native_dtype_device_support(dtype, dev) -> None: + """Check that dtype is natively supported by 
device. + + Arg: + dtype: + Elemental data-type + dev (:class:`dpctl.SyclDevice`): + The device about which the query is being made. + Returns: + None + Raise: + ValueError: + if device does not natively support this `dtype`. + """ + if dtype in [dpt.float64, dpt.complex128] and not dev.has_aspect_fp64: + raise ValueError( + f"Device {dev.name} does not provide native support " + "for double-precision floating point type." + ) + if ( + dtype + in [ + dpt.float16, + ] + and not dev.has_aspect_fp16 + ): + raise ValueError( + f"Device {dev.name} does not provide native support " + "for half-precision floating point type." + ) + + def _to_scalar(obj, sc_ty): """A way to convert object to NumPy scalar type. Raises OverflowError if obj can not be represented @@ -67,6 +99,102 @@ def _to_scalar(obj, sc_ty): return zd_arr[()] +def eye( + n_rows, + n_cols=None, + /, + *, + k=0, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + eye(n_rows, n_cols=None, /, *, k=0, dtype=None, \ + device=None, usm_type="device", sycl_queue=None) + + Creates :class:`dpctl.tensor.usm_ndarray` with ones on the `k`-th + diagonal. + + Args: + n_rows (int): + number of rows in the output array. + n_cols (int, optional): + number of columns in the output array. If ``None``, + ``n_cols = n_rows``. Default: ``None`` + k (int): + index of the diagonal, with ``0`` as the main diagonal. + A positive value of ``k`` is a superdiagonal, a negative value + is a subdiagonal. + Raises :exc:`TypeError` if ``k`` is not an integer. + Default: ``0`` + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or + a NumPy scalar type. Default: ``None`` + order ("C" or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. 
``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + A diagonal matrix. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." 
+ ) + order = order[0].upper() + n_rows = operator.index(n_rows) + n_cols = n_rows if n_cols is None else operator.index(n_cols) + k = operator.index(k) + if k >= n_cols or -k >= n_rows: + return dpt.zeros( + (n_rows, n_cols), + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + (n_rows, n_cols), + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + if n_rows != 0 and n_cols != 0: + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + hev, eye_ev = ti._eye(k, dst=res, sycl_queue=sycl_queue) + _manager.add_event_pair(hev, eye_ev) + return res + + def _validate_fill_value(fill_val): """Validates that `fill_val` is a numeric or boolean scalar.""" # TODO: verify if `np.True_` and `np.False_` should be instances of diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index acda579a5f5e..0727b9bfd775 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -196,7 +196,7 @@ def eye( order = "C" """Creates `dpnp_array` with ones on the `k`th diagonal.""" - array_obj = dpt.eye( + array_obj = dpt_ext.eye( N, M, k=k, From 95ccf708ffcce4bcfcf1c1f934bf92d986911cca Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 13:17:17 -0800 Subject: [PATCH 059/145] Move ti._repeat... 
to dpctl_ext/tensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/include/kernels/repeat.hpp | 462 ++++++++++ dpctl_ext/tensor/libtensor/source/repeat.cpp | 820 ++++++++++++++++++ dpctl_ext/tensor/libtensor/source/repeat.hpp | 83 ++ .../tensor/libtensor/source/tensor_ctors.cpp | 84 +- 5 files changed, 1407 insertions(+), 44 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/repeat.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/repeat.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 0b166a202735..797dbd6f4d1b 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -60,7 +60,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp b/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp new file mode 100644 index 000000000000..aab9a709f010 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/repeat.hpp @@ -0,0 +1,462 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor repeating operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/offset_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::repeat +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +template +class repeat_by_sequence_kernel; + +template +class RepeatSequenceFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + const repT *reps = nullptr; + const repT *cumsum = nullptr; + std::size_t src_axis_nelems = 1; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + RepIndexer reps_strider; + +public: + RepeatSequenceFunctor(const T *src_, + T *dst_, + const repT *reps_, + const repT *cumsum_, + std::size_t src_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer &src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_, + const RepIndexer &reps_strider_) + : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), + src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), + src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_), reps_strider(reps_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / src_axis_nelems; + auto i_along = id - (i_orthog * src_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto val = src[src_offset + src_axis_strider(i_along)]; + auto last = cumsum[i_along]; + auto first = last - reps[reps_strider(i_along)]; + for (auto i = first; i < last; ++i) { + dst[dst_offset + dst_axis_strider(i)] = val; + } + } +}; + +typedef sycl::event (*repeat_by_sequence_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const 
char *, + const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event + repeat_by_sequence_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t src_axis_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int orthog_nd, + const ssize_t *orthog_src_dst_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, + orthog_src_dst_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ dst_axis_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = orthog_nelems * src_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, + orthog_indexer, src_axis_indexer, dst_axis_indexer, + reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequenceFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const char *, + 
const char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, + std::size_t src_nelems, + const char *src_cp, + char *dst_cp, + const char *reps_cp, + const char *cumsum_cp, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + ssize_t reps_shape, + ssize_t reps_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + const repT *reps_tp = reinterpret_cast(reps_cp); + const repT *cumsum_tp = reinterpret_cast(cumsum_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; + const Strided1DIndexer dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + // indexer along reps array + const Strided1DIndexer reps_indexer{/* size */ reps_shape, + /* step */ reps_stride}; + + const std::size_t gws = src_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatSequenceFunctor( + src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, + src_indexer, dst_indexer, reps_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatSequence1DFactory +{ + fnT get() + { + fnT fn = repeat_by_sequence_1d_impl; + return fn; + } +}; + +template +class repeat_by_scalar_kernel; + +template +class RepeatScalarFunctor +{ +private: + const T *src = nullptr; + T *dst = nullptr; + ssize_t reps = 1; + std::size_t dst_axis_nelems = 0; + OrthogIndexer orthog_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; + +public: + RepeatScalarFunctor(const T *src_, + T *dst_, + const ssize_t reps_, + std::size_t dst_axis_nelems_, + const OrthogIndexer &orthog_strider_, + const SrcAxisIndexer 
&src_axis_strider_, + const DstAxisIndexer &dst_axis_strider_) + : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), + orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), + dst_axis_strider(dst_axis_strider_) + { + } + + void operator()(sycl::id<1> idx) const + { + std::size_t id = idx[0]; + auto i_orthog = id / dst_axis_nelems; + auto i_along = id - (i_orthog * dst_axis_nelems); + + auto orthog_offsets = orthog_strider(i_orthog); + auto src_offset = orthog_offsets.get_first_offset(); + auto dst_offset = orthog_offsets.get_second_offset(); + + auto dst_axis_offset = dst_axis_strider(i_along); + auto src_axis_offset = src_axis_strider(i_along / reps); + dst[dst_offset + dst_axis_offset] = src[src_offset + src_axis_offset]; + } +}; + +typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_impl(sycl::queue &q, + std::size_t orthog_nelems, + std::size_t dst_axis_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int orthog_nd, + const ssize_t *orthog_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + const TwoOffsets_StridedIndexer orthog_indexer{ + orthog_nd, src_offset, dst_offset, orthog_shape_and_strides}; + // indexers along repeated axis + const Strided1DIndexer src_axis_indexer{/* size */ src_axis_shape, + /* step */ src_axis_stride}; + const Strided1DIndexer dst_axis_indexer{/* size */ dst_axis_shape, + /* step */ 
dst_axis_stride}; + + const std::size_t gws = orthog_nelems * dst_axis_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor( + src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, + src_axis_indexer, dst_axis_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalarFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_impl; + return fn; + } +}; + +typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + char *, + const ssize_t, + int, + const ssize_t *, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, + std::size_t dst_nelems, + const char *src_cp, + char *dst_cp, + const ssize_t reps, + int src_nd, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + const std::vector &depends) +{ + sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const T *src_tp = reinterpret_cast(src_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + // orthog ndim indexer + static constexpr TwoZeroOffsets_Indexer orthog_indexer{}; + // indexers along repeated axis + const StridedIndexer src_indexer(src_nd, 0, src_shape_strides); + const Strided1DIndexer dst_indexer{/* size */ dst_shape, + /* step */ dst_stride}; + + const std::size_t gws = dst_nelems; + + cgh.parallel_for>( + sycl::range<1>(gws), + RepeatScalarFunctor(src_tp, dst_tp, reps, + dst_nelems, orthog_indexer, + src_indexer, dst_indexer)); + }); + + return repeat_ev; +} + +template +struct RepeatScalar1DFactory +{ + fnT get() + { + fnT fn = repeat_by_scalar_1d_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::repeat diff --git a/dpctl_ext/tensor/libtensor/source/repeat.cpp b/dpctl_ext/tensor/libtensor/source/repeat.cpp new file mode 100644 index 000000000000..4bba1d35a08e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/repeat.cpp @@ -0,0 +1,820 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/repeat.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_fn_ptr_t; +static repeat_by_sequence_fn_ptr_t + repeat_by_sequence_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_sequence_1d_fn_ptr_t; +static repeat_by_sequence_1d_fn_ptr_t + repeat_by_sequence_1d_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_fn_ptr_t; +static repeat_by_scalar_fn_ptr_t + repeat_by_scalar_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::repeat::repeat_by_scalar_1d_fn_ptr_t; +static repeat_by_scalar_1d_fn_ptr_t + repeat_by_scalar_1d_dispatch_vector[td_ns::num_types]; + +void init_repeat_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::repeat::RepeatSequenceFactory; + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(repeat_by_sequence_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatSequence1DFactory; + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(repeat_by_sequence_1d_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalarFactory; + td_ns::DispatchVectorBuilder + dvb3; + 
dvb3.populate_dispatch_vector(repeat_by_scalar_dispatch_vector); + + using dpctl::tensor::kernels::repeat::RepeatScalar1DFactory; + td_ns::DispatchVectorBuilder + dvb4; + dvb4.populate_dispatch_vector(repeat_by_scalar_1d_dispatch_vector); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; 
++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the sum of reps + if (!same_orthog_dims || src_axis_nelems != reps_sz || + src_axis_nelems != cumsum_sz) + { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * dst_axis_nelems); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = 
src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = + fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, + cumsum_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], + reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = 
repeat_by_sequence_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape, + axis_src_shape, orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, + axis_dst_shape, orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + dpctl::tensor::py_internal::simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + 
all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, src_axis_nelems, src_data_p, + dst_data_p, reps_data_p, cumsum_data_p, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexers along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], + // data to build indexer for reps array + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + 
dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t reps_sz = reps.get_size(); + std::size_t cumsum_sz = cumsum.get_size(); + + // shape at repeated axis must be equal to the sum of reps + if (src_sz != reps_sz || src_sz != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + dst.get_size()); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, cumsum, or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + static constexpr int int64_typeid = + static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " + "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = 
src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shapes_strides = + packed_src_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn( + exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0], + reps_shape_vec[0], reps_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const 
std::vector &depends) +{ + int src_nd = src.get_ndim(); + if (axis < 0 || (axis + 1 > src_nd && src_nd > 0) || + (axis > 0 && src_nd == 0)) { + throw py::value_error("Specified axis is invalid."); + } + + int dst_nd = dst.get_ndim(); + if ((src_nd != dst_nd && src_nd > 0) || (src_nd == 0 && dst_nd > 1)) { + throw py::value_error("Number of dimensions of source and destination " + "arrays is not consistent"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + bool same_orthog_dims(true); + std::size_t orthog_nelems(1); // number of orthogonal iterations + for (auto i = 0; i < axis; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + for (auto i = axis + 1; i < src_nd; ++i) { + auto src_sh_i = src_shape[i]; + orthog_nelems *= src_sh_i; + same_orthog_dims = same_orthog_dims && (src_sh_i == dst_shape[i]); + } + + std::size_t src_axis_nelems(1); + if (src_nd > 0) { + src_axis_nelems = src_shape[axis]; + } + std::size_t dst_axis_nelems(dst_shape[axis]); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if (!same_orthog_dims || (src_axis_nelems * reps) != dst_axis_nelems) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (orthog_nelems == 0 || src_axis_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, orthog_nelems * (src_axis_nelems * reps)); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps 
with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + sycl::event repeat_ev; + std::vector host_task_events{}; + if (axis == 0 && src_nd < 2) { + // empty orthogonal directions + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + assert(dst_shape_vec.size() == 1); + assert(dst_strides_vec.size() == 1); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, dst_shape_vec[0], + dst_strides_vec[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, 
packed_src_shape_strides_owner); + + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + else { + // non-empty orthogonal directions + + auto fn = repeat_by_scalar_dispatch_vector[src_typeid]; + + int orthog_nd = src_nd - 1; + + using shT = std::vector; + shT orthog_src_shape; + shT orthog_src_strides; + shT axis_src_shape; + shT axis_src_stride; + dpctl::tensor::py_internal::split_iteration_space( + src_shape_vec, src_strides_vec, axis, axis + 1, orthog_src_shape, + axis_src_shape, orthog_src_strides, axis_src_stride); + + shT orthog_dst_shape; + shT orthog_dst_strides; + shT axis_dst_shape; + shT axis_dst_stride; + dpctl::tensor::py_internal::split_iteration_space( + dst_shape_vec, dst_strides_vec, axis, axis + 1, orthog_dst_shape, + axis_dst_shape, orthog_dst_strides, axis_dst_stride); + + assert(orthog_src_shape.size() == static_cast(orthog_nd)); + assert(orthog_dst_shape.size() == static_cast(orthog_nd)); + assert(std::equal(orthog_src_shape.begin(), orthog_src_shape.end(), + orthog_dst_shape.begin())); + + shT simplified_orthog_shape; + shT simplified_orthog_src_strides; + shT simplified_orthog_dst_strides; + + const py::ssize_t *_shape = orthog_src_shape.data(); + + py::ssize_t orthog_src_offset(0); + py::ssize_t orthog_dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space( + orthog_nd, _shape, orthog_src_strides, orthog_dst_strides, + // output + simplified_orthog_shape, simplified_orthog_src_strides, + simplified_orthog_dst_strides, orthog_src_offset, + orthog_dst_offset); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, simplified_orthog_shape, + simplified_orthog_src_strides, simplified_orthog_dst_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_shapes_strides = + 
packed_shapes_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, orthog_nelems, dst_axis_nelems, src_data_p, + dst_data_p, reps, + // data to build orthog indexer + orthog_nd, packed_shapes_strides, orthog_src_offset, + orthog_dst_offset, + // data to build indexer along repeated axis in src + axis_src_shape[0], axis_src_stride[0], + // data to build indexer along repeated axis in dst + axis_dst_shape[0], axis_dst_stride[0], all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_shapes_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + } + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends) +{ + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t src_sz = src.get_size(); + std::size_t dst_sz = dst.get_size(); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if ((src_sz * reps) != dst_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + src_sz * reps); + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + auto packed_src_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple1)); + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + const py::ssize_t *packed_src_shape_strides = + packed_src_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], 
all_deps); + + sycl::event cleanup_tmp_allocations_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {repeat_ev}, packed_src_shape_strides_owner); + host_task_events.push_back(cleanup_tmp_allocations_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/repeat.hpp b/dpctl_ext/tensor/libtensor/source/repeat.hpp new file mode 100644 index 000000000000..5835377fb29c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/repeat.hpp @@ -0,0 +1,83 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_repeat_dispatch_vectors(void); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + int axis, + sycl::queue &exec_q, + const std::vector &depends); + +extern std::pair + py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + 
sycl::queue &exec_q, + const std::vector &depends); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 98ab488e5879..48e65c4deb84 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -57,7 +57,7 @@ #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" // #include "linear_sequences.hpp" -// #include "repeat.hpp" +#include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" @@ -119,8 +119,8 @@ using dpctl::tensor::py_internal::py_place; /* ================= Repeat ====================*/ using dpctl::tensor::py_internal::py_cumsum_1d; -// using dpctl::tensor::py_internal::py_repeat_by_scalar; -// using dpctl::tensor::py_internal::py_repeat_by_sequence; +using dpctl::tensor::py_internal::py_repeat_by_scalar; +using dpctl::tensor::py_internal::py_repeat_by_sequence; /* ================ Eye ================== */ @@ -169,7 +169,7 @@ void init_dispatch_vectors(void) populate_mask_positions_dispatch_vectors(); populate_cumsum_1d_dispatch_vectors(); - // init_repeat_dispatch_vectors(); + init_repeat_dispatch_vectors(); // init_clip_dispatch_vectors(); @@ -450,45 +450,43 @@ PYBIND11_MODULE(_tensor_impl, m) // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), // py::arg("depends") = py::list()); - // auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, - // const dpctl::tensor::usm_ndarray &reps, - // const dpctl::tensor::usm_ndarray &cumsum, - // std::optional axis, sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // if (axis) { - // return py_repeat_by_sequence(src, dst, reps, cumsum, - // axis.value(), - // exec_q, depends); - // } - // else { - // return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, - // 
depends); - // } - // }; - // m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), - // py::arg("dst"), py::arg("reps"), py::arg("cumsum"), - // py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = - // py::list()); - - // auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, - // const dpctl::tensor::usm_ndarray &dst, - // const py::ssize_t reps, std::optional axis, - // sycl::queue &exec_q, - // const std::vector depends) - // -> std::pair { - // if (axis) { - // return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, - // depends); - // } - // else { - // return py_repeat_by_scalar(src, dst, reps, exec_q, depends); - // } - // }; - // m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), - // py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + std::optional axis, sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + if (axis) { + return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(), + exec_q, depends); + } + else { + return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + depends); + } + }; + m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), + py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, std::optional axis, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + if (axis) { + return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + depends); + } + else { + return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + } + }; + m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), 
py::arg("dst"), + py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); // m.def("_clip", &py_clip, // "Clamps elements of array `x` to the range " From 43f97cf6b9382a51acbb327e94242ad6a7c775f1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 13:24:53 -0800 Subject: [PATCH 060/145] Move repeat() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 232 +++++++++++++++++++- dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 234 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index fa76faccc632..033004bb0095 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -50,6 +50,7 @@ take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( + repeat, roll, ) from dpctl_ext.tensor._reshape import reshape @@ -66,6 +67,7 @@ "place", "put", "put_along_axis", + "repeat", "reshape", "roll", "take", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index fa8fc27876b3..85b4f029156b 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -28,6 +28,7 @@ import operator +import dpctl import dpctl.tensor as dpt import dpctl.utils as dputils import numpy as np @@ -36,7 +37,7 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor._tensor_impl as ti -from ._numpy_helper import normalize_axis_tuple +from ._numpy_helper import normalize_axis_index, normalize_axis_tuple __doc__ = ( "Implementation module for array manipulation " @@ -44,6 +45,235 @@ ) +def repeat(x, repeats, /, *, axis=None): + """repeat(x, repeats, axis=None) + + Repeat elements of an array on a per-element basis. + + Args: + x (usm_ndarray): input array + + repeats (Union[int, Sequence[int, ...], usm_ndarray]): + The number of repetitions for each element. 
+ + `repeats` must be broadcast-compatible with `N` where `N` is + `prod(x.shape)` if `axis` is `None` and `x.shape[axis]` + otherwise. + + If `repeats` is an array, it must have an integer data type. + Otherwise, `repeats` must be a Python integer or sequence of + Python integers (i.e., a tuple, list, or range). + + axis (Optional[int]): + The axis along which to repeat values. If `axis` is `None`, the + function repeats elements of the flattened array. Default: `None`. + + Returns: + usm_ndarray: + output array with repeated elements. + + If `axis` is `None`, the returned array is one-dimensional, + otherwise, it has the same shape as `x`, except for the axis along + which elements were repeated. + + The returned array will have the same data type as `x`. + The returned array will be located on the same device as `x` and + have the same USM allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + x_ndim = x.ndim + x_shape = x.shape + if axis is not None: + axis = normalize_axis_index(operator.index(axis), x_ndim) + axis_size = x_shape[axis] + else: + axis_size = x.size + + scalar = False + if isinstance(repeats, int): + if repeats < 0: + raise ValueError("`repeats` must be a positive integer") + usm_type = x.usm_type + exec_q = x.sycl_queue + scalar = True + elif isinstance(repeats, dpt.usm_ndarray): + if repeats.ndim > 1: + raise ValueError( + "`repeats` array must be 0- or 1-dimensional, got " + f"{repeats.ndim}" + ) + exec_q = dpctl.utils.get_execution_queue( + (x.sycl_queue, repeats.sycl_queue) + ) + if exec_q is None: + raise dputils.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + usm_type = dpctl.utils.get_coerced_usm_type( + ( + x.usm_type, + repeats.usm_type, + ) + ) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): + raise TypeError( + f"'repeats' data type {repeats.dtype} cannot be cast to " + "'int64' according to the casting rule ''safe.''" + ) + if repeats.size == 1: + scalar = True + # bring the single element to the host + if repeats.ndim == 0: + repeats = int(repeats) + else: + # Get the single element explicitly + # since non-0D arrays can not be converted to scalars + repeats = int(repeats[0]) + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + else: + if repeats.size != axis_size: + raise ValueError( + "'repeats' array must be broadcastable to the size of " + "the repeated axis" + ) + if not dpt.all(repeats >= 0): + raise ValueError("'repeats' elements must be positive") + + elif isinstance(repeats, (tuple, list, range)): + usm_type = x.usm_type + exec_q = x.sycl_queue + + len_reps = len(repeats) + if len_reps == 1: + repeats = repeats[0] + if repeats < 0: + raise ValueError("`repeats` elements must be positive") + scalar = True + else: + if len_reps != axis_size: + raise ValueError( + "`repeats` sequence must have the same length as the " + "repeated axis" + ) + repeats = dpt.asarray( + repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q + ) + if not dpt.all(repeats >= 0): + raise ValueError("`repeats` elements must be positive") + else: + raise TypeError( + "Expected int, sequence, or `usm_ndarray` for second argument," + f"got {type(repeats)}" + ) + + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if scalar: + res_axis_size = repeats * axis_size + if axis is not None: + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + else: + res_shape = (res_axis_size,) + res = dpt.empty( + res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + ) 
+ if res_axis_size > 0: + ht_rep_ev, rep_ev = ti._repeat_by_scalar( + src=x, + dst=res, + reps=repeats, + axis=axis, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_rep_ev, rep_ev) + else: + if repeats.dtype != dpt.int64: + rep_buf = dpt.empty( + repeats.shape, + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + # _cumsum_1d synchronizes so `depends` ends here safely + res_axis_size = ti._cumsum_1d( + rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] + ) + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) + res = dpt.empty( + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, + ) + if res_axis_size > 0: + ht_rep_ev, rep_ev = ti._repeat_by_sequence( + src=x, + dst=res, + reps=rep_buf, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_rep_ev, rep_ev) + else: + cumsum = dpt.empty( + (axis_size,), + dtype=dpt.int64, + usm_type=usm_type, + sycl_queue=exec_q, + ) + res_axis_size = ti._cumsum_1d( + repeats, cumsum, sycl_queue=exec_q, depends=dep_evs + ) + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) + res = dpt.empty( + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, + ) + if res_axis_size > 0: + ht_rep_ev, rep_ev = ti._repeat_by_sequence( + src=x, + dst=res, + reps=repeats, + cumsum=cumsum, + axis=axis, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_rep_ev, rep_ev) + return res + + def roll(x, /, shift, *, axis=None): """ roll(x, shift, axis) diff --git a/dpnp/dpnp_iface_manipulation.py 
b/dpnp/dpnp_iface_manipulation.py index e988bbaa237b..8aa5b6832b3d 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -2837,7 +2837,7 @@ def repeat(a, repeats, axis=None): a = dpnp.ravel(a) usm_arr = dpnp.get_usm_ndarray(a) - usm_res = dpt.repeat(usm_arr, repeats, axis=axis) + usm_res = dpt_ext.repeat(usm_arr, repeats, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From 9b248f4bff68c5ccb153a1fb51cebcd582fd6cf7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 13:48:50 -0800 Subject: [PATCH 061/145] Move ti._where() to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/include/kernels/where.hpp | 339 ++++++++++++++++++ .../tensor/libtensor/source/tensor_ctors.cpp | 10 +- dpctl_ext/tensor/libtensor/source/where.cpp | 266 ++++++++++++++ dpctl_ext/tensor/libtensor/source/where.hpp | 57 +++ 5 files changed, 668 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/where.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/where.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/where.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 797dbd6f4d1b..cec88aed44c8 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -58,7 +58,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/zeros_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/where.hpp b/dpctl_ext/tensor/libtensor/include/kernels/where.hpp new 
file mode 100644 index 000000000000..b92a3a76c9c1 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/where.hpp @@ -0,0 +1,339 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.where. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::search +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +class where_strided_kernel; +template +class where_contig_kernel; + +template +class WhereContigFunctor +{ +private: + std::size_t nelems = 0; + const condT *cond_p = nullptr; + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + +public: + WhereContigFunctor(std::size_t nelems_, + const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_) + : nelems(nelems_), cond_p(cond_p_), x1_p(x1_p_), x2_p(x2_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (!enable_sg_loadstore || is_complex::value || + is_complex::value) + { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + + const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi; + const std::size_t start = + (gid / sgSize) * 
(nelems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + nelems_per_sg); + for (std::size_t offset = start; offset < end; offset += sgSize) { + using dpctl::tensor::type_utils::convert_impl; + const bool check = convert_impl(cond_p[offset]); + dst_p[offset] = check ? x1_p[offset] : x2_p[offset]; + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + nelems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; + +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t idx = base + it * sgSize; + auto x1_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x1_p[idx]); + auto x2_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x2_p[idx]); + auto cond_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&cond_p[idx]); + auto dst_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + const sycl::vec x1_vec = + sub_group_load(sg, x1_multi_ptr); + const sycl::vec x2_vec = + sub_group_load(sg, x2_multi_ptr); + const sycl::vec cond_vec = + sub_group_load(sg, cond_multi_ptr); +#pragma unroll + for (std::uint8_t k = 0; k < vec_sz; ++k) { + dst_vec[k] = cond_vec[k] ? x1_vec[k] : x2_vec[k]; + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { + dst_p[k] = cond_p[k] ? 
x1_p[k] : x2_p[k]; + } + } + } + } +}; + +typedef sycl::event (*where_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event where_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + std::size_t lws = 64; + static constexpr std::uint8_t vec_sz = 4u; + static constexpr std::uint8_t n_vecs = 2u; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(cond_cp) && + is_aligned(x1_cp) && + is_aligned(x2_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = where_contig_kernel; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = + where_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + WhereContigFunctor(nelems, cond_tp, x1_tp, + x2_tp, dst_tp)); + } + }); + + return where_ev; +} + +template +class WhereStridedFunctor +{ +private: + const T *x1_p = nullptr; + const T *x2_p = nullptr; + T *dst_p = nullptr; + const condT *cond_p = nullptr; + IndexerT indexer; + +public: + WhereStridedFunctor(const condT *cond_p_, + const T *x1_p_, + const T *x2_p_, + T *dst_p_, + const IndexerT &indexer_) + : x1_p(x1_p_), x2_p(x2_p_), 
dst_p(dst_p_), cond_p(cond_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + + using dpctl::tensor::type_utils::convert_impl; + bool check = + convert_impl(cond_p[offsets.get_first_offset()]); + + dst_p[offsets.get_fourth_offset()] = + check ? x1_p[offsets.get_second_offset()] + : x2_p[offsets.get_third_offset()]; + } +}; + +typedef sycl::event (*where_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event where_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *cond_cp, + const char *x1_cp, + const char *x2_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x1_offset, + ssize_t x2_offset, + ssize_t cond_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const condT *cond_tp = reinterpret_cast(cond_cp); + const T *x1_tp = reinterpret_cast(x1_cp); + const T *x2_tp = reinterpret_cast(x2_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event where_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, cond_offset, x1_offset, x2_offset, dst_offset, shape_strides}; + + cgh.parallel_for< + where_strided_kernel>( + sycl::range<1>(nelems), + WhereStridedFunctor( + cond_tp, x1_tp, x2_tp, dst_tp, indexer)); + }); + + return where_ev; +} + +template +struct WhereStridedFactory +{ + fnT get() + { + fnT fn = where_strided_impl; + return fn; + } +}; + +template +struct WhereContigFactory +{ + fnT get() + { + fnT fn = where_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::search diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 48e65c4deb84..1619357aaf64 100644 --- 
a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -62,7 +62,7 @@ #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" -// #include "where.hpp" +#include "where.hpp" #include "zeros_ctor.hpp" namespace py = pybind11; @@ -132,7 +132,7 @@ using dpctl::tensor::py_internal::usm_ndarray_triul; /* =========================== Where ============================== */ -// using dpctl::tensor::py_internal::py_where; +using dpctl::tensor::py_internal::py_where; /* =========================== Clip ============================== */ // using dpctl::tensor::py_internal::py_clip; @@ -446,9 +446,9 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("mask_shape"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), - // py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_where", &py_where, "", py::arg("condition"), py::arg("x1"), + py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, diff --git a/dpctl_ext/tensor/libtensor/source/where.cpp b/dpctl_ext/tensor/libtensor/source/where.cpp new file mode 100644 index 000000000000..1afdbf45c66b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/where.cpp @@ -0,0 +1,266 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.where +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/where.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" +#include "where.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::search::where_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::search::where_strided_impl_fn_ptr_t; + +static where_contig_impl_fn_ptr_t where_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static where_strided_impl_fn_ptr_t + where_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::utils::keep_args_alive; + +std::pair + py_where(const dpctl::tensor::usm_ndarray &condition, + const dpctl::tensor::usm_ndarray &x1, + const dpctl::tensor::usm_ndarray &x2, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, {x1, x2, condition, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int nd = condition.get_ndim(); + int x1_nd = x1.get_ndim(); + int x2_nd = x2.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != x1_nd || nd != x2_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for 
where kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for where kernel."); + } + + const py::ssize_t *x1_shape = x1.get_shape_raw(); + const py::ssize_t *x2_shape = x2.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + const py::ssize_t *cond_shape = condition.get_shape_raw(); + + bool shapes_equal(true); + std::size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (x1_shape[i] == sh_i) && + (x2_shape[i] == sh_i) && (cond_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Axes are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, condition) && !same_logical_tensors(dst, condition)) || + (overlap(dst, x1) && !same_logical_tensors(dst, x1)) || + (overlap(dst, x2) && !same_logical_tensors(dst, x2))) + { + throw py::value_error("Destination array overlaps with input."); + } + + int x1_typenum = x1.get_typenum(); + int x2_typenum = x2.get_typenum(); + int cond_typenum = condition.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int cond_typeid = array_types.typenum_to_lookup_id(cond_typenum); + int x1_typeid = array_types.typenum_to_lookup_id(x1_typenum); + int x2_typeid = array_types.typenum_to_lookup_id(x2_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (x1_typeid != x2_typeid || x1_typeid != dst_typeid) { + throw py::value_error("Value arrays must have the same data type"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems); + + char *cond_data = condition.get_data(); + char *x1_data = x1.get_data(); + char 
*x2_data = x2.get_data(); + char *dst_data = dst.get_data(); + + bool is_x1_c_contig = x1.is_c_contiguous(); + bool is_x1_f_contig = x1.is_f_contiguous(); + + bool is_x2_c_contig = x2.is_c_contiguous(); + bool is_x2_f_contig = x2.is_f_contiguous(); + + bool is_cond_c_contig = condition.is_c_contiguous(); + bool is_cond_f_contig = condition.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_x1_c_contig && is_x2_c_contig && is_cond_c_contig && + is_dst_c_contig); + bool all_f_contig = (is_x1_f_contig && is_x2_f_contig && is_cond_f_contig && + is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto contig_fn = where_contig_dispatch_table[x1_typeid][cond_typeid]; + + auto where_ev = contig_fn(exec_q, nelems, cond_data, x1_data, x2_data, + dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {x1, x2, dst, condition}, {where_ev}); + + return std::make_pair(ht_ev, where_ev); + } + + auto const &cond_strides = condition.get_strides_vector(); + auto const &x1_strides = x1.get_strides_vector(); + auto const &x2_strides = x2.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_cond_strides; + shT simplified_x1_strides; + shT simplified_x2_strides; + shT simplified_dst_strides; + py::ssize_t cond_offset(0); + py::ssize_t x1_offset(0); + py::ssize_t x2_offset(0); + py::ssize_t dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space_4( + nd, x1_shape, cond_strides, x1_strides, x2_strides, dst_strides, + // outputs + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides, cond_offset, x1_offset, + x2_offset, dst_offset); + + auto fn = where_strided_dispatch_table[x1_typeid][cond_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using 
dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_cond_strides, simplified_x1_strides, + simplified_x2_strides, simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event where_ev = fn(exec_q, nelems, nd, cond_data, x1_data, x2_data, + dst_data, packed_shape_strides, cond_offset, + x1_offset, x2_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {where_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {x1, x2, condition, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, where_ev); +} + +void init_where_dispatch_tables(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::search::WhereContigFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(where_contig_dispatch_table); + + using dpctl::tensor::kernels::search::WhereStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(where_strided_dispatch_table); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/where.hpp b/dpctl_ext/tensor/libtensor/source/where.hpp new file mode 100644 index 000000000000..ba81d8b11642 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/where.hpp @@ -0,0 +1,57 @@ 
+//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file declares Python API for implementation functions of +/// dpctl.tensor.where +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_where(const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + const dpctl::tensor::usm_ndarray &, + sycl::queue &, + const std::vector &); + +extern void init_where_dispatch_tables(void); + +} // namespace dpctl::tensor::py_internal From 24425cf5b3991ecf0be082a65ceb9909d75ca0b2 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 19 Feb 2026 14:29:57 -0800 Subject: [PATCH 062/145] Move _type_utils.py to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 7 + dpctl_ext/tensor/_type_utils.py | 998 ++++++++++++++++++++++++++++++++ 2 files changed, 1005 insertions(+) create mode 100644 dpctl_ext/tensor/_type_utils.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 033004bb0095..d7f2a785802c 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -55,20 +55,27 @@ ) from dpctl_ext.tensor._reshape import reshape +from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type + __all__ = [ "asnumpy", "astype", + "can_cast", "copy", "extract", "eye", + "finfo", "from_numpy", "full", + "iinfo", + "isdtype", "nonzero", "place", "put", "put_along_axis", "repeat", "reshape", + "result_type", "roll", "take", "take_along_axis", diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py new file mode 100644 index 000000000000..bcfec499f355 --- /dev/null +++ b/dpctl_ext/tensor/_type_utils.py @@ -0,0 +1,998 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from __future__ import annotations + +import dpctl.tensor as dpt +import numpy as np + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._tensor_impl as ti + + +def _all_data_types(_fp16, _fp64): + _non_fp_types = [ + dpt.bool, + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + ] + if _fp64: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + else: + if _fp16: + return _non_fp_types + [ + dpt.float16, + dpt.float32, + dpt.complex64, + ] + else: + return _non_fp_types + [ + dpt.float32, + dpt.complex64, + ] + + +def _acceptance_fn_default_binary( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + return True + + +def _acceptance_fn_default_unary(arg_dtype, ret_buf_dt, res_dt, sycl_dev): + return True + + +def _acceptance_fn_divide( + arg1_dtype, arg2_dtype, ret_buf1_dt, ret_buf2_dt, res_dt, sycl_dev +): + # both are being promoted, if the kind of result is + # different than the kind of original input dtypes, + # we use default dtype for the resulting kind. + # This covers, e.g. 
(array_dtype_i1 / array_dtype_u1) + # result of which in divide is double (in NumPy), but + # regular type promotion rules peg at float16 + if (ret_buf1_dt.kind != arg1_dtype.kind) and ( + ret_buf2_dt.kind != arg2_dtype.kind + ): + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_negative(arg_dtype, buf_dt, res_dt, sycl_dev): + # negative is not defined for boolean data type + if arg_dtype.char == "?": + raise ValueError( + "The `negative` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `~` operator or the " + "`logical_not` function instead" + ) + else: + return True + + +def _acceptance_fn_reciprocal(arg_dtype, buf_dt, res_dt, sycl_dev): + # if the kind of result is different from the kind of input, we use the + # default floating-point dtype for the resulting kind. This guarantees + # alignment of reciprocal and divide output types. + if buf_dt.kind != arg_dtype.kind: + default_dt = _get_device_default_dtype(res_dt.kind, sycl_dev) + if res_dt == default_dt: + return True + else: + return False + else: + return True + + +def _acceptance_fn_subtract( + arg1_dtype, arg2_dtype, buf1_dt, buf2_dt, res_dt, sycl_dev +): + # subtract is not defined for boolean data type + if arg1_dtype.char == "?" and arg2_dtype.char == "?": + raise ValueError( + "The `subtract` function, the `-` operator, is not supported " + "for inputs of data type bool, use the `^` operator, the " + "`bitwise_xor`, or the `logical_xor` function instead" + ) + else: + return True + + +def _can_cast( + from_: dpt.dtype, to_: dpt.dtype, _fp16: bool, _fp64: bool, casting="safe" +) -> bool: + """ + Can `from_` be cast to `to_` safely on a device with + fp16 and fp64 aspects as given? 
+ """ + if not _dtype_supported_by_device_impl(to_, _fp16, _fp64): + return False + can_cast_v = np.can_cast(from_, to_, casting=casting) # ask NumPy + if _fp16 and _fp64: + return can_cast_v + if not can_cast_v: + if ( + from_.kind in "biu" + and to_.kind in "fc" + and _is_maximal_inexact_type(to_, _fp16, _fp64) + ): + return True + + return can_cast_v + + +def _dtype_supported_by_device_impl( + dt: dpt.dtype, has_fp16: bool, has_fp64: bool +) -> bool: + if has_fp64: + if not has_fp16: + if dt is dpt.float16: + return False + else: + if dt is dpt.float64: + return False + elif dt is dpt.complex128: + return False + if not has_fp16 and dt is dpt.float16: + return False + return True + + +def _find_buf_dtype(arg_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf_dt in all_dts: + if _can_cast(arg_dtype, buf_dt, _fp16, _fp64): + res_dt = query_fn(buf_dt) + if res_dt: + acceptable = acceptance_fn(arg_dtype, buf_dt, res_dt, sycl_dev) + if acceptable: + return buf_dt, res_dt + else: + continue + + return None, None + + +def _find_buf_dtype2(arg1_dtype, arg2_dtype, query_fn, sycl_dev, acceptance_fn): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + all_dts = _all_data_types(_fp16, _fp64) + for buf1_dt in all_dts: + for buf2_dt in all_dts: + if _can_cast(arg1_dtype, buf1_dt, _fp16, _fp64) and _can_cast( + arg2_dtype, buf2_dt, _fp16, _fp64 + ): + res_dt = query_fn(buf1_dt, buf2_dt) + if res_dt: + ret_buf1_dt = None if buf1_dt == arg1_dtype else buf1_dt + ret_buf2_dt = None if buf2_dt == arg2_dtype else buf2_dt + if ret_buf1_dt is None or ret_buf2_dt is None: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + acceptable = acceptance_fn( + arg1_dtype, + arg2_dtype, + ret_buf1_dt, + 
ret_buf2_dt, + res_dt, + sycl_dev, + ) + if acceptable: + return ret_buf1_dt, ret_buf2_dt, res_dt + else: + continue + + return None, None, None + + +def _find_buf_dtype_in_place_op(arg1_dtype, arg2_dtype, query_fn, sycl_dev): + res_dt = query_fn(arg1_dtype, arg2_dtype) + if res_dt: + return None, res_dt + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg2_dtype, arg1_dtype, _fp16, _fp64, casting="same_kind"): + res_dt = query_fn(arg1_dtype, arg1_dtype) + if res_dt: + return arg1_dtype, res_dt + + return None, None + + +def _get_device_default_dtype(dt_kind, sycl_dev): + if dt_kind == "b": + return dpt.dtype(ti.default_device_bool_type(sycl_dev)) + elif dt_kind == "i": + return dpt.dtype(ti.default_device_int_type(sycl_dev)) + elif dt_kind == "u": + return dpt.dtype(ti.default_device_uint_type(sycl_dev)) + elif dt_kind == "f": + return dpt.dtype(ti.default_device_fp_type(sycl_dev)) + elif dt_kind == "c": + return dpt.dtype(ti.default_device_complex_type(sycl_dev)) + raise RuntimeError + + +def _is_maximal_inexact_type(dt: dpt.dtype, _fp16: bool, _fp64: bool): + """ + Return True if data type `dt` is the + maximal size inexact data type + """ + if _fp64: + return dt in [dpt.float64, dpt.complex128] + return dt in [dpt.float32, dpt.complex64] + + +def _to_device_supported_dtype(dt, dev): + has_fp16 = dev.has_aspect_fp16 + has_fp64 = dev.has_aspect_fp64 + + return _to_device_supported_dtype_impl(dt, has_fp16, has_fp64) + + +def _to_device_supported_dtype_impl(dt, has_fp16, has_fp64): + if has_fp64: + if not has_fp16: + if dt is dpt.float16: + return dpt.float32 + else: + if dt is dpt.float64: + return dpt.float32 + elif dt is dpt.complex128: + return dpt.complex64 + if not has_fp16 and dt is dpt.float16: + return dpt.float32 + return dt + + +class WeakBooleanType: + """Python type representing type of Python boolean objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class 
WeakIntegralType: + """Python type representing type of Python integral objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakFloatingType: + """Python type representing type of Python floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +class WeakComplexType: + """Python type representing type of Python complex floating point objects""" + + def __init__(self, o): + self.o_ = o + + def get(self): + return self.o_ + + +def _weak_type_num_kind(o): + _map = {"?": 0, "i": 1, "f": 2, "c": 3} + if isinstance(o, WeakBooleanType): + return _map["?"] + if isinstance(o, WeakIntegralType): + return _map["i"] + if isinstance(o, WeakFloatingType): + return _map["f"] + if isinstance(o, WeakComplexType): + return _map["c"] + raise TypeError( + f"Unexpected type {o} while expecting " + "`WeakBooleanType`, `WeakIntegralType`," + "`WeakFloatingType`, or `WeakComplexType`." + ) + + +def _strong_dtype_num_kind(o): + _map = {"b": 0, "i": 1, "u": 1, "f": 2, "c": 3} + if not isinstance(o, dpt.dtype): + raise TypeError + k = o.kind + if k in _map: + return _map[k] + raise ValueError(f"Unrecognized kind {k} for dtype {o}") + + +def _is_weak_dtype(dtype): + return isinstance( + dtype, + (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType), + ) + + +def _resolve_weak_types(o1_dtype, o2_dtype, dev): + """Resolves weak data type per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return 
_to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): + """ + Resolves weak data type per NEP-0050 for comparisons and + divide, where result type is known and special behavior + is needed to handle mixed integer kinds and Python integers + without overflow + """ + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + raise ValueError + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return _to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + if o1_kind_num == o2_kind_num and isinstance( + o1_dtype, WeakIntegralType + ): + o1_val = o1_dtype.get() + o2_iinfo = dpt.iinfo(o2_dtype) + if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max): + return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if 
o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + if o1_kind_num == o2_kind_num and isinstance( + o2_dtype, WeakIntegralType + ): + o2_val = o2_dtype.get() + o1_iinfo = dpt.iinfo(o1_dtype) + if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max): + return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val)) + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev): + """ + Resolves weak data types per NEP-0050, + where the second and third arguments are + permitted to be weak types + """ + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype1): + if _is_weak_dtype(dtype2): + kind_num1 = _weak_type_num_kind(dtype1) + kind_num2 = _weak_type_num_kind(dtype2) + st_kind_num = _strong_dtype_num_kind(st_dtype) + + if kind_num1 > st_kind_num: + if isinstance(dtype1, WeakIntegralType): + ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype1, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype1 = dpt.complex64 + ret_dtype1 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev) + else: + ret_dtype1 = st_dtype + + if kind_num2 > st_kind_num: + if isinstance(dtype2, WeakIntegralType): + ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev)) + elif isinstance(dtype2, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + ret_dtype2 = dpt.complex64 + ret_dtype2 = _to_device_supported_dtype(dpt.complex128, dev) + else: + ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev) 
+ else: + ret_dtype2 = st_dtype + + return ret_dtype1, ret_dtype2 + + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype2), dtype2), + ] + ) + dt1_kind_num = _weak_type_num_kind(dtype1) + if dt1_kind_num > max_dt_num_kind: + if isinstance(dtype1, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), dtype2 + if isinstance(dtype1, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dpt.complex64, dtype2 + return ( + _to_device_supported_dtype(dpt.complex128, dev), + dtype2, + ) + return _to_device_supported_dtype(dpt.float64, dev), dtype2 + else: + return max_dtype, dtype2 + elif _is_weak_dtype(dtype2): + max_dt_num_kind, max_dtype = max( + [ + (_strong_dtype_num_kind(st_dtype), st_dtype), + (_strong_dtype_num_kind(dtype1), dtype1), + ] + ) + dt2_kind_num = _weak_type_num_kind(dtype2) + if dt2_kind_num > max_dt_num_kind: + if isinstance(dtype2, WeakIntegralType): + return dtype1, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype2, WeakComplexType): + if max_dtype is dpt.float16 or max_dtype is dpt.float32: + return dtype1, dpt.complex64 + return ( + dtype1, + _to_device_supported_dtype(dpt.complex128, dev), + ) + return dtype1, _to_device_supported_dtype(dpt.float64, dev) + else: + return dtype1, max_dtype + else: + # both are strong dtypes + # return unmodified + return dtype1, dtype2 + + +def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev): + """Resolves one weak data type with one strong data type per NEP-0050""" + if _is_weak_dtype(st_dtype): + raise ValueError + if _is_weak_dtype(dtype): + st_kind_num = _strong_dtype_num_kind(st_dtype) + kind_num = _weak_type_num_kind(dtype) + if kind_num > st_kind_num: + if isinstance(dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dtype, WeakComplexType): + if st_dtype is dpt.float16 or st_dtype is dpt.float32: + return dpt.complex64 + 
return _to_device_supported_dtype(dpt.complex128, dev) + return _to_device_supported_dtype(dpt.float64, dev) + else: + return st_dtype + else: + return dtype + + +class finfo_object: + """ + `numpy.finfo` subclass which returns Python floating-point scalars for + `eps`, `max`, `min`, and `smallest_normal` attributes. + """ + + def __init__(self, dtype): + _supported_dtype([dpt.dtype(dtype)]) + self._finfo = np.finfo(dtype) + + @property + def bits(self): + """Number of bits occupied by the real-valued floating-point data type.""" + return int(self._finfo.bits) + + @property + def smallest_normal(self): + """ + Smallest positive real-valued floating-point number with full + precision. + """ + return float(self._finfo.smallest_normal) + + @property + def tiny(self): + """An alias for `smallest_normal`""" + return float(self._finfo.tiny) + + @property + def eps(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number larger than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.eps) + + @property + def epsneg(self): + """ + Difference between 1.0 and the next smallest representable real-valued + floating-point number smaller than 1.0 according to the IEEE-754 + standard. + """ + return float(self._finfo.epsneg) + + @property + def min(self): + """Smallest representable real-valued number.""" + return float(self._finfo.min) + + @property + def max(self): + """Largest representable real-valued number.""" + return float(self._finfo.max) + + @property + def resolution(self): + """The approximate decimal resolution of this type.""" + return float(self._finfo.resolution) + + @property + def precision(self): + """ + The approximate number of decimal digits to which this kind of + floating point type is precise. + """ + return float(self._finfo.precision) + + @property + def dtype(self): + """ + The dtype for which finfo returns information. 
For complex input, the + returned dtype is the associated floating point dtype for its real and + complex components. + """ + return self._finfo.dtype + + def __str__(self): + return self._finfo.__str__() + + def __repr__(self): + return self._finfo.__repr__() + + +def can_cast(from_, to, /, *, casting="safe") -> bool: + """can_cast(from, to, casting="safe") + + Determines if one data type can be cast to another data type according \ + to Type Promotion Rules. + + Args: + from_ (Union[usm_ndarray, dtype]): + source data type. If `from_` is an array, a device-specific type + promotion rules apply. + to (dtype): + target data type + casting (Optional[str]): + controls what kind of data casting may occur. + + * "no" means data types should not be cast at all. + * "safe" means only casts that preserve values are allowed. + * "same_kind" means only safe casts and casts within a kind, + like `float64` to `float32`, are allowed. + * "unsafe" means any data conversion can be done. + + Default: `"safe"`. + + Returns: + bool: + Gives `True` if cast can occur according to the casting rule. + + Device-specific type promotion rules take into account which data type are + and are not supported by a specific device. + """ + if isinstance(to, dpt.usm_ndarray): + raise TypeError(f"Expected `dpt.dtype` type, got {type(to)}.") + + dtype_to = dpt.dtype(to) + _supported_dtype([dtype_to]) + + if isinstance(from_, dpt.usm_ndarray): + dtype_from = from_.dtype + return _can_cast( + dtype_from, + dtype_to, + from_.sycl_device.has_aspect_fp16, + from_.sycl_device.has_aspect_fp64, + casting=casting, + ) + else: + dtype_from = dpt.dtype(from_) + _supported_dtype([dtype_from]) + # query casting as if all dtypes are supported + return _can_cast(dtype_from, dtype_to, True, True, casting=casting) + + +def result_type(*arrays_and_dtypes): + """ + result_type(*arrays_and_dtypes) + + Returns the dtype that results from applying the Type Promotion Rules to \ + the arguments. 
+ + Args: + arrays_and_dtypes (Union[usm_ndarray, dtype]): + An arbitrary length sequence of usm_ndarray objects or dtypes. + + Returns: + dtype: + The dtype resulting from an operation involving the + input arrays and dtypes. + """ + dtypes = [] + devices = [] + weak_dtypes = [] + for arg_i in arrays_and_dtypes: + if isinstance(arg_i, dpt.usm_ndarray): + devices.append(arg_i.sycl_device) + dtypes.append(arg_i.dtype) + elif isinstance(arg_i, int): + weak_dtypes.append(WeakIntegralType(arg_i)) + elif isinstance(arg_i, float): + weak_dtypes.append(WeakFloatingType(arg_i)) + elif isinstance(arg_i, complex): + weak_dtypes.append(WeakComplexType(arg_i)) + elif isinstance(arg_i, bool): + weak_dtypes.append(WeakBooleanType(arg_i)) + else: + dt = dpt.dtype(arg_i) + _supported_dtype([dt]) + dtypes.append(dt) + + has_fp16 = True + has_fp64 = True + target_dev = None + if devices: + inspected = False + for d in devices: + if inspected: + unsame_fp16_support = d.has_aspect_fp16 != has_fp16 + unsame_fp64_support = d.has_aspect_fp64 != has_fp64 + if unsame_fp16_support or unsame_fp64_support: + raise ValueError( + "Input arrays reside on devices " + "with different device supports; " + "unable to determine which " + "device-specific type promotion rules " + "to use." 
+ ) + else: + has_fp16 = d.has_aspect_fp16 + has_fp64 = d.has_aspect_fp64 + target_dev = d + inspected = True + + if not dtypes and weak_dtypes: + dtypes.append(weak_dtypes[0].get()) + + if not (has_fp16 and has_fp64): + for dt in dtypes: + if not _dtype_supported_by_device_impl(dt, has_fp16, has_fp64): + raise ValueError( + f"Argument {dt} is not supported by the device" + ) + res_dt = np.result_type(*dtypes) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + for wdt in weak_dtypes: + pair = _resolve_weak_types(wdt, res_dt, target_dev) + res_dt = np.result_type(*pair) + res_dt = _to_device_supported_dtype_impl(res_dt, has_fp16, has_fp64) + else: + res_dt = np.result_type(*dtypes) + if weak_dtypes: + weak_dt_obj = [wdt.get() for wdt in weak_dtypes] + res_dt = np.result_type(res_dt, *weak_dt_obj) + + return res_dt + + +def iinfo(dtype, /): + """iinfo(dtype) + + Returns machine limits for integer data types. + + Args: + dtype (dtype, usm_ndarray): + integer dtype or + an array with integer dtype. + + Returns: + iinfo_object: + An object with the following attributes: + + * bits: int + number of bits occupied by the data type + * max: int + largest representable number. + * min: int + smallest representable number. + * dtype: dtype + integer data type. + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return np.iinfo(dtype) + + +def finfo(dtype, /): + """finfo(type) + + Returns machine limits for floating-point data types. + + Args: + dtype (dtype, usm_ndarray): floating-point dtype or + an array with floating point data type. + If complex, the information is about its component + data type. + + Returns: + finfo_object: + an object have the following attributes: + + * bits: int + number of bits occupied by dtype. + * eps: float + difference between 1.0 and the next smallest representable + real-valued floating-point number larger than 1.0 according + to the IEEE-754 standard. 
+ * max: float + largest representable real-valued number. + * min: float + smallest representable real-valued number. + * smallest_normal: float + smallest positive real-valued floating-point number with + full precision. + * dtype: dtype + real-valued floating-point data type. + + """ + if isinstance(dtype, dpt.usm_ndarray): + dtype = dtype.dtype + _supported_dtype([dpt.dtype(dtype)]) + return finfo_object(dtype) + + +def _supported_dtype(dtypes): + for dtype in dtypes: + if dtype.char not in "?bBhHiIlLqQefdFD": + raise ValueError(f"Dpctl doesn't support dtype {dtype}.") + return True + + +def isdtype(dtype, kind): + """isdtype(dtype, kind) + + Returns a boolean indicating whether a provided `dtype` is + of a specified data type `kind`. + + See [array API](array_api) for more information. + + [array_api]: https://data-apis.org/array-api/latest/ + """ + + if not isinstance(dtype, np.dtype): + raise TypeError(f"Expected instance of `dpt.dtype`, got {dtype}") + + if isinstance(kind, np.dtype): + return dtype == kind + + elif isinstance(kind, str): + if kind == "bool": + return dtype == np.dtype("bool") + elif kind == "signed integer": + return dtype.kind == "i" + elif kind == "unsigned integer": + return dtype.kind == "u" + elif kind == "integral": + return dtype.kind in "iu" + elif kind == "real floating": + return dtype.kind == "f" + elif kind == "complex floating": + return dtype.kind == "c" + elif kind == "numeric": + return dtype.kind in "iufc" + else: + raise ValueError(f"Unrecognized data type kind: {kind}") + + elif isinstance(kind, tuple): + return any(isdtype(dtype, k) for k in kind) + + else: + raise TypeError(f"Unsupported data type kind: {kind}") + + +def _default_accumulation_dtype(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + """ + inp_kind = inp_dt.kind + if inp_kind in "bi": + res_dt = dpt.dtype(ti.default_device_int_type(q)) + if inp_dt.itemsize > res_dt.itemsize: 
+ res_dt = inp_dt + elif inp_kind in "u": + res_dt = dpt.dtype(ti.default_device_uint_type(q)) + res_ii = dpt.iinfo(res_dt) + inp_ii = dpt.iinfo(inp_dt) + if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: + pass + else: + res_dt = inp_dt + elif inp_kind in "fc": + res_dt = inp_dt + + return res_dt + + +def _default_accumulation_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when accumulation is performed on queue `q` + and the accumulation supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = inp_dt + elif inp_kind in "c": + raise ValueError("function not defined for complex types") + + return res_dt + + +__all__ = [ + "_find_buf_dtype", + "_find_buf_dtype2", + "_to_device_supported_dtype", + "_acceptance_fn_default_unary", + "_acceptance_fn_reciprocal", + "_acceptance_fn_default_binary", + "_acceptance_fn_divide", + "_acceptance_fn_negative", + "_acceptance_fn_subtract", + "_resolve_one_strong_one_weak_types", + "_resolve_one_strong_two_weak_types", + "_resolve_weak_types", + "_resolve_weak_types_all_py_ints", + "_weak_type_num_kind", + "_strong_dtype_num_kind", + "can_cast", + "finfo", + "iinfo", + "isdtype", + "result_type", + "WeakBooleanType", + "WeakIntegralType", + "WeakFloatingType", + "WeakComplexType", + "_default_accumulation_dtype", + "_default_accumulation_dtype_fp_types", + "_find_buf_dtype_in_place_op", +] From c88297c6f57f9b3accb523d409f2272f00489cbb Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:15:56 -0800 Subject: [PATCH 063/145] Use import _type_utils from dpctl_ext --- dpnp/dpnp_array.py | 2 +- dpnp/dpnp_iface_mathematical.py | 8 +++++--- dpnp/dpnp_iface_trigonometric.py | 4 
+++- dpnp/dpnp_utils/dpnp_utils_common.py | 5 +++-- dpnp/tests/test_indexing.py | 7 +++++-- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 0b6d882c53db..564627bacf2f 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -38,12 +38,12 @@ import warnings import dpctl.tensor as dpt -import dpctl.tensor._type_utils as dtu from dpctl.tensor._numpy_helper import AxisError # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._type_utils as dtu import dpnp from . import memory as dpm diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index e339c24d384c..f2779b2fdcd1 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -40,6 +40,7 @@ """ # pylint: disable=protected-access +# pylint: disable=duplicate-code # pylint: disable=no-name-in-module @@ -48,15 +49,16 @@ import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( normalize_axis_index, normalize_axis_tuple, ) -from dpctl.tensor._type_utils import _acceptance_fn_divide +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -1561,7 +1563,7 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): mkl_fn_to_call="_mkl_div_to_call", mkl_impl_fn="_div", binary_inplace_fn=ti._divide_inplace, - acceptance_fn=_acceptance_fn_divide, + acceptance_fn=dtu._acceptance_fn_divide, ) diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index a46f06c10e08..9894bd304701 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -45,8 +45,10 @@ import dpctl.tensor as dpt import 
dpctl.tensor._tensor_elementwise_impl as ti -import dpctl.tensor._type_utils as dtu +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi diff --git a/dpnp/dpnp_utils/dpnp_utils_common.py b/dpnp/dpnp_utils/dpnp_utils_common.py index e4bde2e1ec86..aa294fefe275 100644 --- a/dpnp/dpnp_utils/dpnp_utils_common.py +++ b/dpnp/dpnp_utils/dpnp_utils_common.py @@ -29,8 +29,9 @@ from collections.abc import Iterable -import dpctl.tensor._type_utils as dtu - +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor._type_utils as dtu import dpnp from dpnp.dpnp_utils import map_dtype_to_device diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py index 9a55efe138b7..79c41a2f45f7 100644 --- a/dpnp/tests/test_indexing.py +++ b/dpnp/tests/test_indexing.py @@ -4,8 +4,6 @@ import dpctl.tensor as dpt import numpy import pytest -from dpctl.tensor._numpy_helper import AxisError -from dpctl.tensor._type_utils import _to_device_supported_dtype from dpctl.utils import ExecutionPlacementError from numpy.testing import ( assert_, @@ -16,6 +14,11 @@ ) import dpnp + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +from dpctl_ext.tensor._numpy_helper import AxisError +from dpctl_ext.tensor._type_utils import _to_device_supported_dtype from dpnp.dpnp_array import dpnp_array from .helper import ( From 9207fc57be52aa8aa724290efe0c3da194906894 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:19:22 -0800 Subject: [PATCH 064/145] Move _broadcast_shape_impl to dpctl_ext/tensor/_manipulation_functions.py --- dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 40 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/_copy_utils.py 
b/dpctl_ext/tensor/_copy_utils.py index 5d1ac209c86b..30a25dd1e2f6 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -37,12 +37,12 @@ import numpy as np from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device -from dpctl.tensor._type_utils import _dtype_supported_by_device_impl # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._type_utils import _dtype_supported_by_device_impl from ._numpy_helper import normalize_axis_index diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 85b4f029156b..572b2a99f872 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -26,6 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import itertools import operator import dpctl @@ -45,6 +46,45 @@ ) +def _broadcast_shape_impl(shapes): + if len(set(shapes)) == 1: + return shapes[0] + mutable_shapes = False + nds = [len(s) for s in shapes] + biggest = max(nds) + sh_len = len(shapes) + for i in range(sh_len): + diff = biggest - nds[i] + if diff > 0: + ty = type(shapes[i]) + shapes[i] = ty( + itertools.chain(itertools.repeat(1, diff), shapes[i]) + ) + common_shape = [] + for axis in range(biggest): + lengths = [s[axis] for s in shapes] + unique = set(lengths + [1]) + if len(unique) > 2: + raise ValueError( + "Shape mismatch: two or more arrays have " + f"incompatible dimensions on axis ({axis},)" + ) + elif len(unique) == 2: + unique.remove(1) + new_length = unique.pop() + common_shape.append(new_length) + for i in range(sh_len): + if shapes[i][axis] == 1: + if not mutable_shapes: + shapes = [list(s) for s in shapes] + mutable_shapes = True + shapes[i][axis] = new_length + else: + 
common_shape.append(1) + + return tuple(common_shape) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) From ebbce3483b2a337398ae1ccdbb94722635ab7cc5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:35:21 -0800 Subject: [PATCH 065/145] Reuse can_cast() from dpctl_ext/tensor --- dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 3 ++- dpctl_ext/tensor/_type_utils.py | 3 ++- dpnp/dpnp_iface_manipulation.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 30a25dd1e2f6..e14d35a0e485 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -1013,7 +1013,7 @@ def astype( else: target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue) - if not dpt.can_cast(ary_dtype, target_dtype, casting=casting): + if not dpt_ext.can_cast(ary_dtype, target_dtype, casting=casting): raise TypeError( f"Can not cast from {ary_dtype} to {newdtype} " f"according to rule {casting}." 
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 572b2a99f872..f1b8b46dbcbc 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -36,6 +36,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index, normalize_axis_tuple @@ -162,7 +163,7 @@ def repeat(x, repeats, /, *, axis=None): ) ) dpctl.utils.validate_usm_type(usm_type, allow_none=False) - if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): + if not dpt_ext.can_cast(repeats.dtype, dpt.int64, casting="same_kind"): raise TypeError( f"'repeats' data type {repeats.dtype} cannot be cast to " "'int64' according to the casting rule ''safe.''" diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py index bcfec499f355..9715aa306e1f 100644 --- a/dpctl_ext/tensor/_type_utils.py +++ b/dpctl_ext/tensor/_type_utils.py @@ -33,6 +33,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti @@ -955,7 +956,7 @@ def _default_accumulation_dtype_fp_types(inp_dt, q): inp_kind = inp_dt.kind if inp_kind in "biu": res_dt = dpt.dtype(ti.default_device_fp_type(q)) - can_cast_v = dpt.can_cast(inp_dt, res_dt) + can_cast_v = dpt_ext.can_cast(inp_dt, res_dt) if not can_cast_v: _fp64 = q.sycl_device.has_aspect_fp64 res_dt = dpt.float64 if _fp64 else dpt.float32 diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 8aa5b6832b3d..66cf9f6f844c 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1270,7 +1270,7 @@ def can_cast(from_, to, casting="safe"): if dpnp.is_supported_array_type(from_) else dpnp.dtype(from_) ) - return dpt.can_cast(dtype_from, to, casting=casting) 
+ return dpt_ext.can_cast(dtype_from, to, casting=casting) def column_stack(tup): From 6a133337d7cf47c5ac4406e689a49c0b452f0d64 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:44:41 -0800 Subject: [PATCH 066/145] Reuse finfo(), iinfo(), isdtype() from dpctl_ext/tensor --- dpctl_ext/tensor/_type_utils.py | 8 ++++---- dpnp/dpnp_iface_types.py | 9 ++++++--- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py index 9715aa306e1f..1e386e15dfa3 100644 --- a/dpctl_ext/tensor/_type_utils.py +++ b/dpctl_ext/tensor/_type_utils.py @@ -450,7 +450,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): o1_dtype, WeakIntegralType ): o1_val = o1_dtype.get() - o2_iinfo = dpt.iinfo(o2_dtype) + o2_iinfo = dpt_ext.iinfo(o2_dtype) if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max): return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype return o2_dtype, o2_dtype @@ -473,7 +473,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev): o2_dtype, WeakIntegralType ): o2_val = o2_dtype.get() - o1_iinfo = dpt.iinfo(o1_dtype) + o1_iinfo = dpt_ext.iinfo(o1_dtype) if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max): return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val)) return o1_dtype, o1_dtype @@ -936,8 +936,8 @@ def _default_accumulation_dtype(inp_dt, q): res_dt = inp_dt elif inp_kind in "u": res_dt = dpt.dtype(ti.default_device_uint_type(q)) - res_ii = dpt.iinfo(res_dt) - inp_ii = dpt.iinfo(inp_dt) + res_ii = dpt_ext.iinfo(res_dt) + inp_ii = dpt_ext.iinfo(inp_dt) if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max: pass else: diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py index 8fdb9e1d3d38..f133333d6b83 100644 --- a/dpnp/dpnp_iface_types.py +++ b/dpnp/dpnp_iface_types.py @@ -40,6 +40,9 @@ import dpctl.tensor as dpt import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor 
as dpt_ext import dpnp from .dpnp_array import dpnp_array @@ -211,7 +214,7 @@ def finfo(dtype): """ if isinstance(dtype, dpnp_array): dtype = dtype.dtype - return dpt.finfo(dtype) + return dpt_ext.finfo(dtype) # pylint: disable=redefined-outer-name @@ -244,7 +247,7 @@ def iinfo(dtype): if isinstance(dtype, dpnp_array): dtype = dtype.dtype - return dpt.iinfo(dtype) + return dpt_ext.iinfo(dtype) def isdtype(dtype, kind): @@ -298,7 +301,7 @@ def isdtype(dtype, kind): elif isinstance(kind, tuple): kind = tuple(dpt.dtype(k) if isinstance(k, type) else k for k in kind) - return dpt.isdtype(dtype, kind) + return dpt_ext.isdtype(dtype, kind) def issubdtype(arg1, arg2): From de2c429f7c384757c0c871301c441cdb1a089e92 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:47:36 -0800 Subject: [PATCH 067/145] Reuse result_type() from dpctl_ext/tensor --- dpctl_ext/tensor/_copy_utils.py | 2 +- dpnp/dpnp_iface_indexing.py | 2 +- dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index e14d35a0e485..af72544a8b0d 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -291,7 +291,7 @@ def _prepare_indices_arrays(inds, q, usm_type): ) # promote to a common integral type if possible - ind_dt = dpt.result_type(*inds) + ind_dt = dpt_ext.result_type(*inds) if ind_dt.kind not in "ui": raise ValueError( "cannot safely promote indices to an integer data type" diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 95998dbdda4b..35739df9a12b 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -250,7 +250,7 @@ def choose(a, choices, out=None, mode="wrap"): res_usm_type, exec_q = get_usm_allocations(choices + [inds]) # apply type promotion to input choices - res_dt = dpt.result_type(*choices) + res_dt = dpt_ext.result_type(*choices) if len(choices) > 1: choices = tuple( map( 
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 66cf9f6f844c..08fd55c58ace 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3195,7 +3195,7 @@ def result_type(*arrays_and_dtypes): ) for X in arrays_and_dtypes ] - return dpt.result_type(*usm_arrays_and_dtypes) + return dpt_ext.result_type(*usm_arrays_and_dtypes) def roll(x, shift, axis=None): From 9a970021f27db0d340c8a1907dfc6470de1361a7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:50:50 -0800 Subject: [PATCH 068/145] Move _scalar_utils.py to dpctl_ext/tensor --- dpctl_ext/tensor/_scalar_utils.py | 122 ++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 dpctl_ext/tensor/_scalar_utils.py diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py new file mode 100644 index 000000000000..86787baea8cc --- /dev/null +++ b/dpctl_ext/tensor/_scalar_utils.py @@ -0,0 +1,122 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import numbers + +import dpctl.memory as dpm +import dpctl.tensor as dpt +import numpy as np +from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer + +from ._type_utils import ( + WeakBooleanType, + WeakComplexType, + WeakFloatingType, + WeakIntegralType, + _to_device_supported_dtype, +) + + +def _get_queue_usm_type(o): + """Return SYCL device where object `o` allocated memory, or None.""" + if isinstance(o, dpt.usm_ndarray): + return o.sycl_queue, o.usm_type + elif hasattr(o, "__sycl_usm_array_interface__"): + try: + m = dpm.as_usm_memory(o) + return m.sycl_queue, m.get_usm_type() + except Exception: + return None, None + return None, None + + +def _get_dtype(o, dev): + if isinstance(o, dpt.usm_ndarray): + return o.dtype + if hasattr(o, "__sycl_usm_array_interface__"): + return dpt.asarray(o).dtype + if _is_buffer(o): + host_dt = np.array(o).dtype + dev_dt = _to_device_supported_dtype(host_dt, dev) + return dev_dt + if hasattr(o, "dtype"): + dev_dt = _to_device_supported_dtype(o.dtype, dev) + return dev_dt + if isinstance(o, bool): + return WeakBooleanType(o) + if isinstance(o, 
int): + return WeakIntegralType(o) + if isinstance(o, float): + return WeakFloatingType(o) + if isinstance(o, complex): + return WeakComplexType(o) + return np.object_ + + +def _validate_dtype(dt) -> bool: + return isinstance( + dt, + (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType), + ) or ( + isinstance(dt, dpt.dtype) + and dt + in [ + dpt.bool, + dpt.int8, + dpt.uint8, + dpt.int16, + dpt.uint16, + dpt.int32, + dpt.uint32, + dpt.int64, + dpt.uint64, + dpt.float16, + dpt.float32, + dpt.float64, + dpt.complex64, + dpt.complex128, + ] + ) + + +def _get_shape(o): + if isinstance(o, dpt.usm_ndarray): + return o.shape + if _is_buffer(o): + return memoryview(o).shape + if isinstance(o, numbers.Number): + return () + return getattr(o, "shape", tuple()) + + +__all__ = [ + "_get_dtype", + "_get_queue_usm_type", + "_get_shape", + "_validate_dtype", +] From 15c467ddb5dcdc4edbf7d901fac63999fb23f512 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 02:57:06 -0800 Subject: [PATCH 069/145] Move where() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 + dpctl_ext/tensor/_search_functions.py | 419 ++++++++++++++++++++++++++ dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/dpnp_iface_searching.py | 5 +- 4 files changed, 427 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_search_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d7f2a785802c..2ca989bc8c14 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -27,6 +27,8 @@ # ***************************************************************************** +from dpctl.tensor._search_functions import where + from dpctl_ext.tensor._copy_utils import ( asnumpy, astype, @@ -82,4 +84,5 @@ "to_numpy", "tril", "triu", + "where", ] diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py new file mode 100644 index 000000000000..053c68e1857d --- 
/dev/null +++ b/dpctl_ext/tensor/_search_functions.py @@ -0,0 +1,419 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._manipulation_functions import _broadcast_shape_impl + +from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + WeakBooleanType, + WeakComplexType, + WeakFloatingType, + WeakIntegralType, + _all_data_types, + _can_cast, + _is_weak_dtype, + _strong_dtype_num_kind, + _to_device_supported_dtype, + _weak_type_num_kind, +) + + +def _default_dtype_from_weak_type(dt, dev): + if isinstance(dt, WeakBooleanType): + return dpt.bool + if isinstance(dt, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(dt, WeakFloatingType): + return dpt.dtype(ti.default_device_fp_type(dev)) + if isinstance(dt, WeakComplexType): + return dpt.dtype(ti.default_device_complex_type(dev)) + + +def _resolve_two_weak_types(o1_dtype, o2_dtype, dev): + """Resolves two weak data types per NEP-0050""" + if _is_weak_dtype(o1_dtype): + if _is_weak_dtype(o2_dtype): + return _default_dtype_from_weak_type( + o1_dtype, dev + ), _default_dtype_from_weak_type(o2_dtype, dev) + o1_kind_num = _weak_type_num_kind(o1_dtype) + o2_kind_num = _strong_dtype_num_kind(o2_dtype) + if o1_kind_num > o2_kind_num: + if isinstance(o1_dtype, WeakIntegralType): + return dpt.dtype(ti.default_device_int_type(dev)), o2_dtype + if isinstance(o1_dtype, WeakComplexType): + if o2_dtype is dpt.float16 or o2_dtype is dpt.float32: + return dpt.complex64, o2_dtype + return ( + _to_device_supported_dtype(dpt.complex128, dev), + o2_dtype, + ) + return 
_to_device_supported_dtype(dpt.float64, dev), o2_dtype + else: + return o2_dtype, o2_dtype + elif _is_weak_dtype(o2_dtype): + o1_kind_num = _strong_dtype_num_kind(o1_dtype) + o2_kind_num = _weak_type_num_kind(o2_dtype) + if o2_kind_num > o1_kind_num: + if isinstance(o2_dtype, WeakIntegralType): + return o1_dtype, dpt.dtype(ti.default_device_int_type(dev)) + if isinstance(o2_dtype, WeakComplexType): + if o1_dtype is dpt.float16 or o1_dtype is dpt.float32: + return o1_dtype, dpt.complex64 + return o1_dtype, _to_device_supported_dtype(dpt.complex128, dev) + return ( + o1_dtype, + _to_device_supported_dtype(dpt.float64, dev), + ) + else: + return o1_dtype, o1_dtype + else: + return o1_dtype, o2_dtype + + +def _where_result_type(dt1, dt2, dev): + res_dtype = dpt_ext.result_type(dt1, dt2) + fp16 = dev.has_aspect_fp16 + fp64 = dev.has_aspect_fp64 + + all_dts = _all_data_types(fp16, fp64) + if res_dtype in all_dts: + return res_dtype + else: + for res_dtype_ in all_dts: + if _can_cast(dt1, res_dtype_, fp16, fp64) and _can_cast( + dt2, res_dtype_, fp16, fp64 + ): + return res_dtype_ + return None + + +def where(condition, x1, x2, /, *, order="K", out=None): + """ + Returns :class:`dpctl.tensor.usm_ndarray` with elements chosen + from ``x1`` or ``x2`` depending on ``condition``. + + Args: + condition (usm_ndarray): When ``True`` yields from ``x1``, + and otherwise yields from ``x2``. + Must be compatible with ``x1`` and ``x2`` according + to broadcasting rules. + x1 (Union[usm_ndarray, bool, int, float, complex]): + Array from which values are chosen when ``condition`` is ``True``. + Must be compatible with ``condition`` and ``x2`` according + to broadcasting rules. + x2 (Union[usm_ndarray, bool, int, float, complex]): + Array from which values are chosen when ``condition`` is not + ``True``. + Must be compatible with ``condition`` and ``x2`` according + to broadcasting rules. 
+ order (``"K"``, ``"C"``, ``"F"``, ``"A"``, optional): + Memory layout of the new output array, + if parameter ``out`` is ``None``. + Default: ``"K"``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + An array with elements from ``x1`` where ``condition`` is ``True``, + and elements from ``x2`` elsewhere. + + The data type of the returned array is determined by applying + the Type Promotion Rules to ``x1`` and ``x2``. + """ + if not isinstance(condition, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(condition)}" + ) + if order not in ["K", "C", "F", "A"]: + order = "K" + q1, condition_usm_type = condition.sycl_queue, condition.usm_type + q2, x1_usm_type = _get_queue_usm_type(x1) + q3, x2_usm_type = _get_queue_usm_type(x2) + if q2 is None and q3 is None: + exec_q = q1 + out_usm_type = condition_usm_type + elif q3 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + out_usm_type = dpctl.utils.get_coerced_usm_type( + ( + condition_usm_type, + x1_usm_type, + ) + ) + elif q2 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + out_usm_type = dpctl.utils.get_coerced_usm_type( + ( + condition_usm_type, + x2_usm_type, + ) + ) + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + out_usm_type = dpctl.utils.get_coerced_usm_type( + ( + condition_usm_type, + x1_usm_type, + x2_usm_type, + ) + ) + dpctl.utils.validate_usm_type(out_usm_type, allow_none=False) + condition_shape = condition.shape + x1_shape = _get_shape(x1) + x2_shape = _get_shape(x2) + if not all( + isinstance(s, (tuple, list)) + for s in ( + x1_shape, + x2_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. " + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + condition_shape, + x1_shape, + x2_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{condition_shape}, {x1_shape}, and {x2_shape}" + ) + sycl_dev = exec_q.sycl_device + x1_dtype = _get_dtype(x1, sycl_dev) + x2_dtype = _get_dtype(x2, sycl_dev) + if not all(_validate_dtype(o) for o in (x1_dtype, x2_dtype)): + raise ValueError("Operands have unsupported data types") + x1_dtype, x2_dtype = _resolve_two_weak_types(x1_dtype, x2_dtype, sycl_dev) + out_dtype = _where_result_type(x1_dtype, x2_dtype, sycl_dev) + if out_dtype is None: + raise TypeError( + "function 'where' does not support input " + f"types ({x1_dtype}, {x2_dtype}), " + "and the inputs could not be safely coerced " + "to any supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if out_dtype != out.dtype: + raise ValueError( + f"Output array of type {out_dtype} is needed, " + f"got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(condition, out) and not ti._same_logical_tensors( + condition, out + ): + out = dpt.empty_like(out) + + if isinstance(x1, dpt.usm_ndarray): + if ( + ti._array_overlap(x1, out) + and not ti._same_logical_tensors(x1, out) + and x1_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if isinstance(x2, dpt.usm_ndarray): + if ( + ti._array_overlap(x2, out) + and not ti._same_logical_tensors(x2, out) + and x2_dtype == out_dtype + ): + out = dpt.empty_like(out) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + condition, + x1, + x2, + ) + ) + else "C" + ) + if not isinstance(x1, dpt.usm_ndarray): + x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) + if not isinstance(x2, dpt.usm_ndarray): + x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) + + if condition.size == 0: + if out is not None: + return out + else: + if order == "K": + return _empty_like_triple_orderK( + condition, + x1, + x2, + out_dtype, + res_shape, + out_usm_type, + exec_q, + ) + else: + return dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if x1_dtype != out_dtype: + if order == "K": + _x1 = _empty_like_orderK(x1, out_dtype) + else: + _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs + ) + x1 = _x1 + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + + if x2_dtype != out_dtype: + if order == "K": + _x2 = 
_empty_like_orderK(x2, out_dtype) + else: + _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs + ) + x2 = _x2 + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=out_dtype, + order=order, + usm_type=out_usm_type, + sycl_queue=exec_q, + ) + + if condition_shape != res_shape: + condition = dpt.broadcast_to(condition, res_shape) + if x1_shape != res_shape: + x1 = dpt.broadcast_to(x1, res_shape) + if x2_shape != res_shape: + x2 = dpt.broadcast_to(x2, res_shape) + + dep_evs = _manager.submitted_events + hev, where_ev = ti._where( + condition=condition, + x1=x1, + x2=x2, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, where_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[where_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + + return out diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 47edf63a68b4..f3dd18153563 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -243,7 +243,7 @@ def dpnp_linspace( # Needed a special handling for denormal numbers (when step == 0), # see numpy#5437 for more details. # Note, dpt.where() is used to avoid a synchronization branch. 
- usm_res = dpt.where( + usm_res = dpt_ext.where( step == 0, (usm_res / step_num) * delta, usm_res * step ) else: diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 16ab633d506b..a2389978d506 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -44,6 +44,7 @@ # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as dti import dpnp @@ -473,5 +474,7 @@ def where(condition, x=None, y=None, /, *, order="K", out=None): usm_condition = dpnp.get_usm_ndarray(condition) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt.where(usm_condition, usm_x, usm_y, order=order, out=usm_out) + usm_res = dpt_ext.where( + usm_condition, usm_x, usm_y, order=order, out=usm_out + ) return dpnp.get_result_array(usm_res, out) From 134bb35006aaf2db54c474427b50e330e8470e1d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 03:18:31 -0800 Subject: [PATCH 070/145] Update imports in dpnp/dpnp_iface_indexing.py --- dpnp/dpnp_iface_indexing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 95998dbdda4b..f305b106221f 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -47,9 +47,6 @@ import dpctl.tensor as dpt import dpctl.utils as dpu import numpy -from dpctl.tensor._copy_utils import _nonzero_impl -from dpctl.tensor._indexing_functions import _get_indexing_mode -from dpctl.tensor._numpy_helper import normalize_axis_index # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` @@ -60,6 +57,9 @@ # pylint: disable=no-name-in-module import dpnp.backend.extensions.indexing._indexing_impl as indexing_ext +from dpctl_ext.tensor._copy_utils import _nonzero_impl +from dpctl_ext.tensor._indexing_functions import _get_indexing_mode 
+from dpctl_ext.tensor._numpy_helper import normalize_axis_index # pylint: disable=no-name-in-module from .dpnp_algo import ( From c58b531481fc9b83b995fced56510d7e5b17bbc8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 03:44:23 -0800 Subject: [PATCH 071/145] Enable -fno-fast-math compiler flag --- dpctl_ext/tensor/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cec88aed44c8..06efd2aa7b8b 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -92,10 +92,10 @@ endif() set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) #list( #APPEND _no_fast_math_sources From 4f00632551ecbcfd9a467e7fdf63d2ff239d5ed5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 03:59:54 -0800 Subject: [PATCH 072/145] Move _clip to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../tensor/libtensor/include/kernels/clip.hpp | 357 ++++++++++++++++++ dpctl_ext/tensor/libtensor/source/clip.cpp | 267 +++++++++++++ dpctl_ext/tensor/libtensor/source/clip.hpp | 57 +++ .../tensor/libtensor/source/tensor_ctors.cpp | 20 +- 5 files changed, 693 insertions(+), 12 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/clip.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/clip.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/clip.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 06efd2aa7b8b..6f823a818ce7 100644 
--- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -61,7 +61,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) set(_static_lib_trgt simplify_iteration_space) @@ -94,7 +94,7 @@ set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) #list( diff --git a/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp b/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp new file mode 100644 index 000000000000..7ce50b2b62bc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/clip.hpp @@ -0,0 +1,357 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for dpctl.tensor.clip. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "kernels/alignment.hpp" +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels::clip +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::offset_utils; + +using dpctl::tensor::kernels::alignment_utils:: + disabled_sg_loadstore_wrapper_krn; +using dpctl::tensor::kernels::alignment_utils::is_aligned; +using dpctl::tensor::kernels::alignment_utils::required_alignment; + +using dpctl::tensor::sycl_utils::sub_group_load; +using dpctl::tensor::sycl_utils::sub_group_store; + +template +T clip(const T &x, const T &min, const T &max) +{ + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::max_complex; + using dpctl::tensor::math_utils::min_complex; + return min_complex(max_complex(x, min), max); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + auto tmp = (std::isnan(x) || x > min) ? x : min; + return (std::isnan(tmp) || tmp < max) ? tmp : max; + } + else if constexpr (std::is_same_v) { + return (x || min) && max; + } + else { + auto tmp = (x > min) ? x : min; + return (tmp < max) ? 
tmp : max; + } +} + +template +class ClipContigFunctor +{ +private: + std::size_t nelems = 0; + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + +public: + ClipContigFunctor(std::size_t nelems_, + const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_) + : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_), + dst_p(dst_p_) + { + } + + void operator()(sycl::nd_item<1> ndit) const + { + static constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz; + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value || !enable_sg_loadstore) { + const std::uint16_t sgSize = + ndit.get_sub_group().get_local_range()[0]; + const std::size_t gid = ndit.get_global_linear_id(); + const std::uint16_t nelems_per_sg = sgSize * nelems_per_wi; + + const std::size_t start = + (gid / sgSize) * (nelems_per_sg - sgSize) + gid; + const std::size_t end = std::min(nelems, start + nelems_per_sg); + + for (std::size_t offset = start; offset < end; offset += sgSize) { + dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]); + } + } + else { + auto sg = ndit.get_sub_group(); + const std::uint16_t sgSize = sg.get_max_local_range()[0]; + + const std::size_t base = + nelems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) + + sg.get_group_id()[0] * sgSize); + + if (base + nelems_per_wi * sgSize < nelems) { + sycl::vec dst_vec; +#pragma unroll + for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) { + const std::size_t idx = base + it * sgSize; + auto x_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&x_p[idx]); + auto min_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&min_p[idx]); + auto max_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&max_p[idx]); + auto dst_multi_ptr = 
sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&dst_p[idx]); + + const sycl::vec x_vec = + sub_group_load(sg, x_multi_ptr); + const sycl::vec min_vec = + sub_group_load(sg, min_multi_ptr); + const sycl::vec max_vec = + sub_group_load(sg, max_multi_ptr); +#pragma unroll + for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) { + dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id], + max_vec[vec_id]); + } + sub_group_store(sg, dst_vec, dst_multi_ptr); + } + } + else { + const std::size_t lane_id = sg.get_local_id()[0]; + for (std::size_t k = base + lane_id; k < nelems; k += sgSize) { + dst_p[k] = clip(x_p[k], min_p[k], max_p[k]); + } + } + } + } +}; + +template +class clip_contig_kernel; + +typedef sycl::event (*clip_contig_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + const char *, + const char *, + const char *, + char *, + const std::vector &); + +template +sycl::event clip_contig_impl(sycl::queue &q, + std::size_t nelems, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + std::size_t lws = 64; + static constexpr std::uint8_t vec_sz = 4; + static constexpr std::uint8_t n_vecs = 2; + const std::size_t n_groups = + ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz)); + const auto gws_range = sycl::range<1>(n_groups * lws); + const auto lws_range = sycl::range<1>(lws); + + if (is_aligned(x_cp) && + is_aligned(min_cp) && + is_aligned(max_cp) && + is_aligned(dst_cp)) + { + static constexpr bool enable_sg_loadstore = true; + using KernelName = clip_contig_kernel; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + 
Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + else { + static constexpr bool disable_sg_loadstore = false; + using InnerKernelName = clip_contig_kernel; + using KernelName = + disabled_sg_loadstore_wrapper_krn; + using Impl = + ClipContigFunctor; + + cgh.parallel_for( + sycl::nd_range<1>(gws_range, lws_range), + Impl(nelems, x_tp, min_tp, max_tp, dst_tp)); + } + }); + + return clip_ev; +} + +template +class ClipStridedFunctor +{ +private: + const T *x_p = nullptr; + const T *min_p = nullptr; + const T *max_p = nullptr; + T *dst_p = nullptr; + IndexerT indexer; + +public: + ClipStridedFunctor(const T *x_p_, + const T *min_p_, + const T *max_p_, + T *dst_p_, + const IndexerT &indexer_) + : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_), + indexer(indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + std::size_t gid = id[0]; + auto offsets = indexer(static_cast(gid)); + dst_p[offsets.get_fourth_offset()] = clip( + x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()], + max_p[offsets.get_third_offset()]); + } +}; + +template +class clip_strided_kernel; + +typedef sycl::event (*clip_strided_impl_fn_ptr_t)( + sycl::queue &, + std::size_t, + int, + const char *, + const char *, + const char *, + char *, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event clip_strided_impl(sycl::queue &q, + std::size_t nelems, + int nd, + const char *x_cp, + const char *min_cp, + const char *max_cp, + char *dst_cp, + const ssize_t *shape_strides, + ssize_t x_offset, + ssize_t min_offset, + ssize_t max_offset, + ssize_t dst_offset, + const std::vector &depends) +{ + const T *x_tp = reinterpret_cast(x_cp); + const T *min_tp = reinterpret_cast(min_cp); + const T *max_tp = reinterpret_cast(max_cp); + T *dst_tp = reinterpret_cast(dst_cp); + + sycl::event clip_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const FourOffsets_StridedIndexer indexer{ + nd, x_offset, 
min_offset, max_offset, dst_offset, shape_strides}; + + using KernelName = clip_strided_kernel; + using Impl = ClipStridedFunctor; + + cgh.parallel_for( + sycl::range<1>(nelems), + Impl(x_tp, min_tp, max_tp, dst_tp, indexer)); + }); + + return clip_ev; +} + +template +struct ClipStridedFactory +{ + fnT get() + { + fnT fn = clip_strided_impl; + return fn; + } +}; + +template +struct ClipContigFactory +{ + fnT get() + { + + fnT fn = clip_contig_impl; + return fn; + } +}; + +} // namespace dpctl::tensor::kernels::clip diff --git a/dpctl_ext/tensor/libtensor/source/clip.cpp b/dpctl_ext/tensor/libtensor/source/clip.cpp new file mode 100644 index 000000000000..1414689bc4b7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/clip.cpp @@ -0,0 +1,267 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "clip.hpp" +#include "kernels/clip.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t; +using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t; + +static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types]; +static clip_strided_impl_fn_ptr_t + clip_strided_dispatch_vector[td_ns::num_types]; + +void init_clip_dispatch_vectors(void) +{ + using namespace td_ns; + using dpctl::tensor::kernels::clip::ClipContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(clip_contig_dispatch_vector); + + using dpctl::tensor::kernels::clip::ClipStridedFactory; 
+ DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(clip_strided_dispatch_vector); +} + +using dpctl::utils::keep_args_alive; + +std::pair + py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + int nd = src.get_ndim(); + int min_nd = min.get_ndim(); + int max_nd = max.get_ndim(); + int dst_nd = dst.get_ndim(); + + if (nd != min_nd || nd != max_nd) { + throw py::value_error( + "Input arrays are not of appropriate dimension for clip kernel."); + } + + if (nd != dst_nd) { + throw py::value_error( + "Destination is not of appropriate dimension for clip kernel."); + } + + const py::ssize_t *src_shape = src.get_shape_raw(); + const py::ssize_t *min_shape = min.get_shape_raw(); + const py::ssize_t *max_shape = max.get_shape_raw(); + const py::ssize_t *dst_shape = dst.get_shape_raw(); + + bool shapes_equal(true); + std::size_t nelems(1); + for (int i = 0; i < nd; ++i) { + const auto &sh_i = dst_shape[i]; + nelems *= static_cast(sh_i); + shapes_equal = shapes_equal && (min_shape[i] == sh_i) && + (max_shape[i] == sh_i) && (src_shape[i] == sh_i); + } + + if (!shapes_equal) { + throw py::value_error("Arrays are not of matching shapes."); + } + + if (nelems == 0) { + return std::make_pair(sycl::event{}, sycl::event{}); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + auto const &same_logical_tensors = + dpctl::tensor::overlap::SameLogicalTensors(); + if ((overlap(dst, src) && !same_logical_tensors(dst, src)) || + (overlap(dst, min) && !same_logical_tensors(dst, min)) || + (overlap(dst, max) && !same_logical_tensors(dst, max))) + 
{ + throw py::value_error("Destination array overlaps with input."); + } + + int min_typenum = min.get_typenum(); + int max_typenum = max.get_typenum(); + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int min_typeid = array_types.typenum_to_lookup_id(min_typenum); + int max_typeid = array_types.typenum_to_lookup_id(max_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid || src_typeid != min_typeid || + src_typeid != max_typeid) + { + throw py::value_error("Input, min, max, and destination arrays must " + "have the same data type"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, nelems); + + char *src_data = src.get_data(); + char *min_data = min.get_data(); + char *max_data = max.get_data(); + char *dst_data = dst.get_data(); + + bool is_min_c_contig = min.is_c_contiguous(); + bool is_min_f_contig = min.is_f_contiguous(); + + bool is_max_c_contig = max.is_c_contiguous(); + bool is_max_f_contig = max.is_f_contiguous(); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_dst_f_contig = dst.is_f_contiguous(); + + bool all_c_contig = (is_min_c_contig && is_max_c_contig && + is_src_c_contig && is_dst_c_contig); + bool all_f_contig = (is_min_f_contig && is_max_f_contig && + is_src_f_contig && is_dst_f_contig); + + if (all_c_contig || all_f_contig) { + auto fn = clip_contig_dispatch_vector[src_typeid]; + + sycl::event clip_ev = + fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends); + sycl::event ht_ev = + keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev}); + + return std::make_pair(ht_ev, clip_ev); + } + + auto const &src_strides = src.get_strides_vector(); + auto const &min_strides = min.get_strides_vector(); + 
auto const &max_strides = max.get_strides_vector(); + auto const &dst_strides = dst.get_strides_vector(); + + using shT = std::vector; + shT simplified_shape; + shT simplified_src_strides; + shT simplified_min_strides; + shT simplified_max_strides; + shT simplified_dst_strides; + py::ssize_t src_offset(0); + py::ssize_t min_offset(0); + py::ssize_t max_offset(0); + py::ssize_t dst_offset(0); + + dpctl::tensor::py_internal::simplify_iteration_space_4( + nd, src_shape, src_strides, min_strides, max_strides, dst_strides, + // outputs + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides, src_offset, min_offset, + max_offset, dst_offset); + + auto fn = clip_strided_dispatch_vector[src_typeid]; + + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // common shape and strides + simplified_shape, simplified_src_strides, simplified_min_strides, + simplified_max_strides, simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data, + dst_data, packed_shape_strides, src_offset, + min_offset, max_offset, dst_offset, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {clip_ev}, packed_shape_strides_owner); + host_task_events.push_back(temporaries_cleanup_ev); + + 
sycl::event arg_cleanup_ev = + keep_args_alive(exec_q, {src, min, max, dst}, host_task_events); + + return std::make_pair(arg_cleanup_ev, clip_ev); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/clip.hpp b/dpctl_ext/tensor/libtensor/source/clip.hpp new file mode 100644 index 000000000000..de8f0e559b6e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/clip.hpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines Python API for implementation functions of +/// dpctl.tensor.clip +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" + +namespace dpctl::tensor::py_internal +{ + +extern std::pair + py_clip(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &min, + const dpctl::tensor::usm_ndarray &max, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends); + +extern void init_clip_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 1619357aaf64..0a3e111c8003 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -45,7 +45,7 @@ #include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" -// #include "clip.hpp" +#include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_as_contig.hpp" #include "copy_for_reshape.hpp" @@ -135,7 +135,7 @@ using dpctl::tensor::py_internal::usm_ndarray_triul; using dpctl::tensor::py_internal::py_where; /* 
=========================== Clip ============================== */ -// using dpctl::tensor::py_internal::py_clip; +using dpctl::tensor::py_internal::py_clip; // populate dispatch tables void init_dispatch_tables(void) @@ -171,7 +171,7 @@ void init_dispatch_vectors(void) populate_cumsum_1d_dispatch_vectors(); init_repeat_dispatch_vectors(); - // init_clip_dispatch_vectors(); + init_clip_dispatch_vectors(); return; } @@ -488,11 +488,11 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_clip", &py_clip, - // "Clamps elements of array `x` to the range " - // "[`min`, `max] and writes the result to the " - // "array `dst` for each element of `x`, `min`, and `max`." - // "Returns a tuple of events: (hev, ev)", - // py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); + m.def("_clip", &py_clip, + "Clamps elements of array `x` to the range " + "[`min`, `max] and writes the result to the " + "array `dst` for each element of `x`, `min`, and `max`." 
+ "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); } From 5d45d85548bb53915e4536bd92579c794d8dd575 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 04:09:16 -0800 Subject: [PATCH 073/145] Move clip() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 781 ++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_mathematical.py | 3 +- 3 files changed, 785 insertions(+), 1 deletion(-) create mode 100644 dpctl_ext/tensor/_clip.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 2ca989bc8c14..8cd8a1896b28 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -57,6 +57,7 @@ ) from dpctl_ext.tensor._reshape import reshape +from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -64,6 +65,7 @@ "astype", "can_cast", "copy", + "clip", "extract", "eye", "finfo", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py new file mode 100644 index 000000000000..50d3ecd568e3 --- /dev/null +++ b/dpctl_ext/tensor/_clip.py @@ -0,0 +1,781 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._copy_utils import ( + _empty_like_orderK, + _empty_like_pair_orderK, + _empty_like_triple_orderK, +) +from dpctl_ext.tensor._manipulation_functions import _broadcast_shape_impl +from dpctl_ext.tensor._type_utils import _can_cast + +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) + + +def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev): + """ + Checks if both types `arg1_dtype` and 
`arg2_dtype` can be + cast to `res_dtype` according to the rule `safe` + """ + if arg1_dtype == res_dtype and arg2_dtype == res_dtype: + return None, None, res_dtype + + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast( + arg2_dtype, res_dtype, _fp16, _fp64 + ): + # prevent unnecessary casting + ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype + ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype + return ret_buf1_dt, ret_buf2_dt, res_dtype + else: + return None, None, None + + +def _clip_none(x, val, out, order, _binary_fn): + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, val_usm_type = _get_queue_usm_type(val) + if q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + val_usm_type, + ) + ) + dpctl.utils.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + val_shape = _get_shape(val) + if not isinstance(val_shape, (tuple, list)): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + val_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape} and {val_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + val_dtype = _get_dtype(val, sycl_dev) + if not _validate_dtype(val_dtype): + raise ValueError("Operands have unsupported data types") + + val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev) + + res_dt = x.dtype + _fp16 = sycl_dev.has_aspect_fp16 + _fp64 = sycl_dev.has_aspect_fp64 + if not _can_cast(val_dtype, res_dt, _fp16, _fp64): + raise ValueError( + f"function 'clip' does not support input types " + f"({x_dtype}, {val_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, copy_ev) + out = orig_out + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + buf = 
dpt.empty_like(val_ary, dtype=res_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + +def clip(x, /, min=None, max=None, out=None, order="K"): + """clip(x, min=None, max=None, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. + min ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing minimum values. + Must be compatible with `x` and `max` according + to broadcasting rules. + max ({None, Union[usm_ndarray, bool, int, float, complex]}, optional): + Array containing maximum values. + Must be compatible with `x` and `min` according + to broadcasting rules. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array must have the correct shape and the expected data type. 
+ order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is + `None`. + Default: "K". + + Returns: + usm_ndarray: + An array with elements clipped to the range [`min`, `max`]. + The returned array has the same data type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expected `x` to be of dpctl.tensor.usm_ndarray type, got " + f"{type(x)}" + ) + if order not in ["K", "C", "F", "A"]: + order = "K" + if x.dtype.kind in "iu": + if isinstance(min, int) and min <= dpt_ext.iinfo(x.dtype).min: + min = None + if isinstance(max, int) and max >= dpt_ext.iinfo(x.dtype).max: + max = None + if min is None and max is None: + exec_q = x.sycl_queue + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != x.shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {x.shape}, " + f"got {out.shape}" + ) + + if x.dtype != out.dtype: + raise ValueError( + f"Output array of type {x.dtype} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + else: + return out + else: + if order == "K": + out = _empty_like_orderK(x, x.dtype) + else: + out = dpt.empty_like(x, order=order) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_copy_ev, cpy_ev) + out = orig_out + return out + elif max is None: + return _clip_none(x, min, out, order, tei._maximum) + elif min is None: + return _clip_none(x, max, out, order, tei._minimum) + else: + q1, x_usm_type = x.sycl_queue, x.usm_type + q2, min_usm_type = _get_queue_usm_type(min) + q3, max_usm_type = _get_queue_usm_type(max) + if q2 is None and q3 is None: + exec_q = q1 + res_usm_type = x_usm_type + elif q3 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q2)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + ) + ) + elif q2 is None: + exec_q = dpctl.utils.get_execution_queue((q1, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + max_usm_type, + ) + ) + else: + exec_q = dpctl.utils.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = dpctl.utils.get_coerced_usm_type( + ( + x_usm_type, + min_usm_type, + max_usm_type, + ) + ) + dpctl.utils.validate_usm_type(res_usm_type, allow_none=False) + x_shape = x.shape + min_shape = _get_shape(min) + max_shape = _get_shape(max) + if not all( + isinstance(s, (tuple, list)) + for s in ( + min_shape, + max_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. 
" + "Arguments are expected to be " + "lists, tuples, or both" + ) + try: + res_shape = _broadcast_shape_impl( + [ + x_shape, + min_shape, + max_shape, + ] + ) + except ValueError: + raise ValueError( + "operands could not be broadcast together with shapes " + f"{x_shape}, {min_shape}, and {max_shape}" + ) + sycl_dev = exec_q.sycl_device + x_dtype = x.dtype + min_dtype = _get_dtype(min, sycl_dev) + max_dtype = _get_dtype(max, sycl_dev) + if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)): + raise ValueError("Operands have unsupported data types") + + min_dtype, max_dtype = _resolve_one_strong_two_weak_types( + x_dtype, min_dtype, max_dtype, sycl_dev + ) + + buf1_dt, buf2_dt, res_dt = _check_clip_dtypes( + x_dtype, + min_dtype, + max_dtype, + sycl_dev, + ) + + if res_dt is None: + raise ValueError( + f"function '{clip}' does not support input types " + f"({x_dtype}, {min_dtype}, {max_dtype}), " + "and the inputs could not be safely coerced to any " + "supported types according to the casting rule ''safe''." + ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + "output array must be of usm_ndarray type, got " + f"{type(out)}" + ) + + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are " + f"inconsistent. 
Expected output shape is {res_shape}, " + f"got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " + f"got {out.dtype}" + ) + + if ( + dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) + is None + ): + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + if ( + ti._array_overlap(min, out) + and not ti._same_logical_tensors(min, out) + and buf1_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(max, dpt.usm_ndarray): + if ( + ti._array_overlap(max, out) + and not ti._same_logical_tensors(max, out) + and buf2_dt is None + ): + out = dpt.empty_like(out) + + if isinstance(min, dpt.usm_ndarray): + a_min = min + else: + a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + if isinstance(max, dpt.usm_ndarray): + a_max = max + else: + a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + a_max, + ) + ) + else "C" + ) + if buf1_dt is None and buf2_dt is None: + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=dep_ev, + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + 
if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf1_dt is None: + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + _manager = SequentialOrderManager[exec_q] + dep_ev = _manager.submitted_events + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, 
sycl_queue=exec_q, depends=dep_ev + ) + _manager.add_event_pair(ht_copy_ev, copy_ev) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_binary_ev, binary_ev) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + _manager.add_event_pair(ht_copy_out_ev, cpy_ev) + out = orig_out + return out + + if order == "K": + if ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + elif ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy1_ev, copy1_ev) + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_copy2_ev, copy2_ev) + if out is None: + if order == 
"K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_, clip_ev = ti._clip( + src=x, + min=buf1, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy1_ev, copy2_ev], + ) + _manager.add_event_pair(ht_, clip_ev) + return out diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index f2779b2fdcd1..3dbf07be080a 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -58,6 +58,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -729,7 +730,7 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", **kwargs): usm_max = None if max is None else dpnp.get_usm_ndarray_or_scalar(max) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) + usm_res = dpt_ext.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) if out is not None and isinstance(out, dpnp_array): return out return dpnp_array._create_from_usm_ndarray(usm_res) From e70425b9f66f9edc3faaede735384e31fe918621 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 20 Feb 2026 04:51:33 -0800 Subject: [PATCH 074/145] Populate dispatch tables with where --- dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index 0a3e111c8003..e6bc3b5dfb61 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ 
b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -145,7 +145,7 @@ void init_dispatch_tables(void) init_copy_and_cast_usm_to_usm_dispatch_tables(); init_copy_numpy_ndarray_into_usm_ndarray_dispatch_tables(); init_advanced_indexing_dispatch_tables(); - // init_where_dispatch_tables(); + init_where_dispatch_tables(); return; } From 36919c14ecd1b6e0d34d5ca51a7524221231277d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 02:06:38 -0800 Subject: [PATCH 075/145] Move _linspace_step/_linspace_affine to dpctl_ext/tensor/libtensor --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../include/kernels/constructors.hpp | 177 ++++++++++ .../libtensor/source/linear_sequences.cpp | 308 ++++++++++++++++++ .../libtensor/source/linear_sequences.hpp | 66 ++++ .../tensor/libtensor/source/tensor_ctors.cpp | 38 ++- 5 files changed, 573 insertions(+), 20 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/linear_sequences.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 6f823a818ce7..864e34ddaba4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -51,7 +51,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_roll.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/integer_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_advanced_indexing.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/eye_ctor.cpp @@ -93,7 +93,7 @@ endif() set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp - # ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp index 26ae46707a6b..27074cd2d246 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/constructors.hpp @@ -54,6 +54,10 @@ using dpctl::tensor::ssize_t; @defgroup CtorKernels */ +template +class linear_sequence_step_kernel; +template +class linear_sequence_affine_kernel; template class full_strided_kernel; template @@ -61,6 +65,179 @@ class eye_kernel; using namespace dpctl::tensor::offset_utils; +template +class LinearSequenceStepFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty step_v; + +public: + LinearSequenceStepFunctor(char *dst_p, Ty v0, Ty dv) + : p(reinterpret_cast(dst_p)), start_v(v0), step_v(dv) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + p[i] = Ty{start_v.real() + i * step_v.real(), + start_v.imag() + i * step_v.imag()}; + } + else { + p[i] = start_v + i * step_v; + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting value and + * increment. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start_v Typed starting value of the sequence + * @param step_v Typed increment of the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. 
+ * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty step_v, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + sycl::event lin_space_step_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + cgh.parallel_for>( + sycl::range<1>{nelems}, + LinearSequenceStepFunctor(array_data, start_v, step_v)); + }); + + return lin_space_step_event; +} + +// Constructor to populate tensor with linear sequence defined by +// start and and data + +template +class LinearSequenceAffineFunctor +{ +private: + Ty *p = nullptr; + Ty start_v; + Ty end_v; + std::size_t n; + +public: + LinearSequenceAffineFunctor(char *dst_p, Ty v0, Ty v1, std::size_t den) + : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), + n((den == 0) ? 
1 : den) + { + } + + void operator()(sycl::id<1> wiid) const + { + auto i = wiid.get(0); + wTy wc = wTy(i) / n; + wTy w = wTy(n - i) / n; + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using reT = typename Ty::value_type; + auto _w = static_cast(w); + auto _wc = static_cast(wc); + auto re_comb = sycl::fma(start_v.real(), _w, reT(0)); + re_comb = + sycl::fma(end_v.real(), _wc, + re_comb); // start_v.real() * _w + end_v.real() * _wc; + auto im_comb = + sycl::fma(start_v.imag(), _w, + reT(0)); // start_v.imag() * _w + end_v.imag() * _wc; + im_comb = sycl::fma(end_v.imag(), _wc, im_comb); + Ty affine_comb = Ty{re_comb, im_comb}; + p[i] = affine_comb; + } + else if constexpr (std::is_floating_point::value) { + Ty _w = static_cast(w); + Ty _wc = static_cast(wc); + auto affine_comb = + sycl::fma(start_v, _w, Ty(0)); // start_v * w + end_v * wc; + affine_comb = sycl::fma(end_v, _wc, affine_comb); + p[i] = affine_comb; + } + else { + using dpctl::tensor::type_utils::convert_impl; + auto affine_comb = start_v * w + end_v * wc; + p[i] = convert_impl(affine_comb); + } + } +}; + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by typed starting and end values. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence. + * @param start_v Stating value of the sequence. + * @param end_v End-value of the sequence. + * @param include_endpoint Whether the end-value is included in the sequence. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + Ty start_v, + Ty end_v, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + dpctl::tensor::type_utils::validate_type_for_device(exec_q); + + const bool device_supports_doubles = + exec_q.get_device().has(sycl::aspect::fp64); + const std::size_t den = (include_endpoint) ? nelems - 1 : nelems; + + sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + if (device_supports_doubles) { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + else { + using KernelName = linear_sequence_affine_kernel; + using Impl = LinearSequenceAffineFunctor; + + cgh.parallel_for(sycl::range<1>{nelems}, + Impl(array_data, start_v, end_v, den)); + } + }); + + return lin_space_affine_event; +} + /* ================ Full ================== */ /*! diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp new file mode 100644 index 000000000000..5204f24b3724 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.cpp @@ -0,0 +1,308 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/constructors.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "linear_sequences.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event (*lin_space_step_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty step_v = py::cast(step); + + using dpctl::tensor::kernels::constructors::lin_space_step_impl; + + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + std::size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Stating value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + std::size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v = py::cast(start); + Ty end_v = py::cast(end); + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); + + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + +static lin_space_affine_fn_ptr_t + lin_space_affine_dispatch_vector[td_ns::num_types]; + +std::pair + usm_ndarray_linear_sequence_step(const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue is not compatible with the allocation queue"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_step_event; + + auto fn = lin_space_step_dispatch_vector[dst_typeid]; + + linspace_step_event = + fn(exec_q, 
static_cast(len), start, dt, dst_data, depends); + + return std::make_pair(keep_args_alive(exec_q, {dst}, {linspace_step_event}), + linspace_step_event); +} + +std::pair + usm_ndarray_linear_sequence_affine(const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends) +{ + // dst must be 1D and C-contiguous + // start, end should be coercible into data type of dst + + if (dst.get_ndim() != 1) { + throw py::value_error( + "usm_ndarray_linspace: Expecting 1D array to populate"); + } + + if (!dst.is_c_contiguous()) { + throw py::value_error( + "usm_ndarray_linspace: Non-contiguous arrays are not supported"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {dst})) { + throw py::value_error( + "Execution queue context is not the same as allocation context"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + auto array_types = td_ns::usm_ndarray_types(); + int dst_typenum = dst.get_typenum(); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + py::ssize_t len = dst.get_shape(0); + if (len == 0) { + // nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + char *dst_data = dst.get_data(); + sycl::event linspace_affine_event; + + auto fn = lin_space_affine_dispatch_vector[dst_typeid]; + + linspace_affine_event = fn(exec_q, static_cast(len), start, + end, include_endpoint, dst_data, depends); + + return std::make_pair( + keep_args_alive(exec_q, {dst}, {linspace_affine_event}), + linspace_affine_event); +} + +/*! + * @brief Factor to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template +struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. 
+ */ +template +struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + +void init_linear_sequences_dispatch_vectors(void) +{ + using namespace td_ns; + + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(lin_space_step_dispatch_vector); + + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(lin_space_affine_dispatch_vector); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp new file mode 100644 index 000000000000..45cf45153462 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/linear_sequences.hpp @@ -0,0 +1,66 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern std::pair usm_ndarray_linear_sequence_step( + const py::object &start, + const py::object &dt, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern std::pair usm_ndarray_linear_sequence_affine( + const py::object &start, + const py::object &end, + const dpctl::tensor::usm_ndarray &dst, + bool include_endpoint, + sycl::queue &exec_q, + const std::vector &depends = {}); + +extern void init_linear_sequences_dispatch_vectors(void); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp index e6bc3b5dfb61..7bed4df01d29 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_ctors.cpp @@ -56,7 +56,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "kernels/dpctl_tensor_types.hpp" -// #include 
"linear_sequences.hpp" +#include "linear_sequences.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" @@ -97,8 +97,8 @@ using dpctl::tensor::py_internal::copy_numpy_ndarray_into_usm_ndarray; /* ============= linear-sequence ==================== */ -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; -// using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_affine; +using dpctl::tensor::py_internal::usm_ndarray_linear_sequence_step; /* ================ Full ================== */ @@ -157,7 +157,7 @@ void init_dispatch_vectors(void) init_copy_as_contig_dispatch_vectors(); init_copy_for_reshape_dispatch_vectors(); init_copy_for_roll_dispatch_vectors(); - // init_linear_sequences_dispatch_vectors(); + init_linear_sequences_dispatch_vectors(); init_full_ctor_dispatch_vectors(); init_zeros_ctor_dispatch_vectors(); init_eye_ctor_dispatch_vectors(); @@ -300,20 +300,22 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("src"), py::arg("dst"), py::arg("shifts"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - // m.def("_linspace_step", &usm_ndarray_linear_sequence_step, - // "Fills input 1D contiguous usm_ndarray `dst` with linear - // sequence " "specified by " "starting point `start` and step - // `dt`. " "Returns a tuple of events: (ht_event, comp_event)", - // py::arg("start"), py::arg("dt"), py::arg("dst"), - // py::arg("sycl_queue"), py::arg("depends") = py::list()); - - // m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, - // "Fills input 1D contiguous usm_ndarray `dst` with linear - // sequence " "specified by " "starting point `start` and end - // point `end`. 
" "Returns a tuple of events: (ht_event, - // comp_event)", py::arg("start"), py::arg("end"), py::arg("dst"), - // py::arg("include_endpoint"), py::arg("sycl_queue"), - // py::arg("depends") = py::list()); + m.def("_linspace_step", &usm_ndarray_linear_sequence_step, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and step `dt`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("dt"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_linspace_affine", &usm_ndarray_linear_sequence_affine, + "Fills input 1D contiguous usm_ndarray `dst` with linear sequence " + "specified by " + "starting point `start` and end point `end`. " + "Returns a tuple of events: (ht_event, comp_event)", + py::arg("start"), py::arg("end"), py::arg("dst"), + py::arg("include_endpoint"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); m.def("_copy_numpy_ndarray_into_usm_ndarray", ©_numpy_ndarray_into_usm_ndarray, From 95ac4f7876cb006946f2d878a8a90767f74b17b0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 02:40:07 -0800 Subject: [PATCH 076/145] Move ti.linspace() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 206 +++++++++++++++++++++++++++ dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/fft/dpnp_utils_fft.py | 12 +- 4 files changed, 212 insertions(+), 10 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 8cd8a1896b28..78ba47b635e0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -39,6 +39,7 @@ from dpctl_ext.tensor._ctors import ( eye, full, + linspace, tril, triu, ) @@ -73,6 +74,7 @@ "full", "iinfo", "isdtype", + "linspace", "nonzero", "place", "put", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 5a9e07c73346..ed97b260dd7d 100644 --- a/dpctl_ext/tensor/_ctors.py +++ 
b/dpctl_ext/tensor/_ctors.py @@ -30,17 +30,84 @@ from numbers import Number import dpctl +import dpctl.memory as dpm import dpctl.tensor as dpt import dpctl.utils import numpy as np from dpctl.tensor._data_types import _get_dtype from dpctl.tensor._device import normalize_queue_device +from dpctl.tensor._usmarray import _is_object_with_buffer_protocol # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" + +_empty_tuple = () +_host_set = frozenset([None]) + + +def _array_info_dispatch(obj): + if isinstance(obj, dpt.usm_ndarray): + return obj.shape, obj.dtype, frozenset([obj.sycl_queue]) + if isinstance(obj, np.ndarray): + return obj.shape, obj.dtype, _host_set + if isinstance(obj, range): + return (len(obj),), int, _host_set + if isinstance(obj, bool): + return _empty_tuple, bool, _host_set + if isinstance(obj, float): + return _empty_tuple, float, _host_set + if isinstance(obj, int): + return _empty_tuple, int, _host_set + if isinstance(obj, complex): + return _empty_tuple, complex, _host_set + if isinstance( + obj, + ( + list, + tuple, + ), + ): + return _array_info_sequence(obj) + if _is_object_with_buffer_protocol(obj): + np_obj = np.array(obj) + return np_obj.shape, np_obj.dtype, _host_set + if hasattr(obj, "__usm_ndarray__"): + usm_ar = obj.__usm_ndarray__ + if isinstance(usm_ar, dpt.usm_ndarray): + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + if hasattr(obj, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(obj) + return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue]) + + +def _array_info_sequence(li): + if not isinstance(li, (list, tuple, range)): + raise TypeError(f"Expected list, tuple, or range, got {type(li)}") + n = len(li) + dim = None + dt = None + device = frozenset() + for el in li: + el_dim, el_dt, el_dev = 
_array_info_dispatch(el) + if dim is None: + dim = el_dim + dt = np.promote_types(el_dt, el_dt) + device = device.union(el_dev) + elif el_dim == dim: + dt = np.promote_types(dt, el_dt) + device = device.union(el_dev) + else: + raise ValueError(f"Inconsistent dimensions, {dim} and {el_dim}") + if dim is None: + dim = () + dt = float + device = _host_set + return (n,) + dim, dt, device + def _cast_fill_val(fill_val, dt): """ @@ -58,6 +125,23 @@ def _cast_fill_val(fill_val, dt): return fill_val +def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False): + """Deduce arange type from sequence spec""" + nd, seq_dt, d = _array_info_sequence(args) + if d != _host_set or nd != (len(args),): + raise ValueError(err_msg) + dt = _get_dtype(dt, sycl_queue, ref_type=seq_dt) + if np.issubdtype(dt, np.integer): + return tuple(int(v) for v in args), dt + if np.issubdtype(dt, np.floating): + return tuple(float(v) for v in args), dt + if np.issubdtype(dt, np.complexfloating): + return tuple(complex(v) for v in args), dt + if allow_bool and dt.char == "?": + return tuple(bool(v) for v in args), dt + raise ValueError(f"Data type {dt} is not supported") + + def _ensure_native_dtype_device_support(dtype, dev) -> None: """Check that dtype is natively supported by device. 
@@ -99,6 +183,25 @@ def _to_scalar(obj, sc_ty): return zd_arr[()] +def _usm_ndarray_from_suai(obj): + sua_iface = obj.__sycl_usm_array_interface__ + membuf = dpm.as_usm_memory(obj) + ary = dpt.usm_ndarray( + sua_iface["shape"], + dtype=sua_iface["typestr"], + buffer=membuf, + strides=sua_iface.get("strides", None), + ) + _data_field = sua_iface["data"] + if isinstance(_data_field, tuple) and len(_data_field) > 1: + ro_field = _data_field[1] + else: + ro_field = False + if ro_field: + ary.flags["W"] = False + return ary + + def eye( n_rows, n_cols=None, @@ -301,6 +404,109 @@ def full( return res +def linspace( + start, + stop, + /, + num, + *, + dtype=None, + device=None, + endpoint=True, + sycl_queue=None, + usm_type="device", +): + """ + linspace(start, stop, num, dtype=None, device=None, endpoint=True, \ + sycl_queue=None, usm_type="device") + + Returns :class:`dpctl.tensor.usm_ndarray` array populated with + evenly spaced numbers of specified interval. + + Args: + start: + the start of the interval. + stop: + the end of the interval. If the ``endpoint`` is ``False``, the + function generates ``num+1`` evenly spaced points starting + with ``start`` and ending with ``stop`` and exclude the + ``stop`` from the returned array such that the returned array + consists of evenly spaced numbers over the half-open interval + ``[start, stop)``. If ``endpoint`` is ``True``, the output + array consists of evenly spaced numbers over the closed + interval ``[start, stop]``. Default: ``True`` + num (int): + number of samples. Must be a non-negative integer; otherwise, + the function raises ``ValueError`` exception. + dtype: + output array data type. Should be a floating data type. + If ``dtype`` is ``None``, the output array must be the default + floating point data type for target device. + Default: ``None`` + device (optional): + array API concept of device where the output array + is created. 
``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + endpoint: boolean indicating whether to include ``stop`` in the + interval. Default: ``True`` + + Returns: + usm_ndarray: + Array populated with evenly spaced numbers in the requested + interval. 
+ """ + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if endpoint not in [True, False]: + raise TypeError("endpoint keyword argument must be of boolean type") + + num = operator.index(num) + if num < 0: + raise ValueError("Number of points must be non-negative") + + _, dt = _coerce_and_infer_dt( + start, + stop, + dt=dtype, + sycl_queue=sycl_queue, + err_msg="start and stop must be Python scalars.", + allow_bool=True, + ) + + int_dt = None + if np.issubdtype(dt, np.integer): + if dtype is not None: + int_dt = dt + dt = ti.default_device_fp_type(sycl_queue) + dt = dpt.dtype(dt) + start = float(start) + stop = float(stop) + + res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + hev, la_ev = ti._linspace_affine( + start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue + ) + _manager.add_event_pair(hev, la_ev) + + return res if int_dt is None else dpt.astype(res, int_dt) + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index f3dd18153563..4e05d57c6872 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -196,7 +196,7 @@ def dpnp_linspace( if dpnp.isscalar(start) and dpnp.isscalar(stop): # Call linspace() function for scalars. 
- usm_res = dpt.linspace( + usm_res = dpt_ext.linspace( start, stop, num, diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 534b9404254f..e3f35a0201ec 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -42,11 +42,6 @@ from collections.abc import Sequence import dpctl - -# pylint: disable=no-name-in-module -# TODO: remove it when ti.__linspace_step -# is migrated to dpctl_ext/tensor -import dpctl.tensor._tensor_impl as ti import dpctl.utils as dpu import numpy from dpctl.tensor._numpy_helper import ( @@ -55,10 +50,9 @@ ) from dpctl.utils import ExecutionPlacementError -# pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor._tensor_impl as ti_ext +import dpctl_ext.tensor._tensor_impl as ti import dpnp import dpnp.backend.extensions.fft._fft_impl as fi @@ -205,7 +199,7 @@ def _compute_result(dsc, a, out, forward, c2c, out_strides): if ( out is not None and out.strides == tuple(out_strides) - and not ti_ext._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) + and not ti._array_overlap(a_usm, dpnp.get_usm_ndarray(out)) ): res_usm = out_usm result = out @@ -538,7 +532,7 @@ def _truncate_or_pad(a, shape, axes): ) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events - ht_copy_ev, copy_ev = ti_ext._copy_usm_ndarray_into_usm_ndarray( + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=dpnp.get_usm_ndarray(a), dst=z.get_array()[tuple(index)], sycl_queue=exec_q, From 1054e2d4780c585857f32a0c64b2e4b0ec6241e0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 02:56:11 -0800 Subject: [PATCH 077/145] Move ti.empty() and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 12 ++-- dpctl_ext/tensor/_copy_utils.py | 36 +++++----- dpctl_ext/tensor/_ctors.py | 75 +++++++++++++++++++-- dpctl_ext/tensor/_indexing_functions.py | 6 +- 
dpctl_ext/tensor/_manipulation_functions.py | 16 ++--- dpctl_ext/tensor/_search_functions.py | 14 ++-- 7 files changed, 117 insertions(+), 44 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 78ba47b635e0..e57bb35ce993 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -37,6 +37,7 @@ to_numpy, ) from dpctl_ext.tensor._ctors import ( + empty, eye, full, linspace, @@ -67,6 +68,7 @@ "can_cast", "copy", "clip", + "empty", "extract", "eye", "finfo", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index 50d3ecd568e3..1cd360b753ad 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -197,7 +197,7 @@ def _clip_none(x, val, out, order, _binary_fn): x, val_ary, res_dt, res_shape, res_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -242,7 +242,7 @@ def _clip_none(x, val, out, order, _binary_fn): x, buf, res_dt, res_shape, res_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -572,7 +572,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): exec_q, ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -631,7 +631,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): exec_q, ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -687,7 +687,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): exec_q, ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, @@ -758,7 +758,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 
af72544a8b0d..a1c14cf0f69b 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -91,7 +91,7 @@ def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None): ) else: Xusm_dtype = dt - Xusm = dpt.empty( + Xusm = dpt_ext.empty( Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue ) _copy_from_numpy_into(Xusm, Xnp) @@ -176,7 +176,7 @@ def _extract_impl(ary, ary_mask, axis=0): ) mask_nelems = ary_mask.size cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 - cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) + cumsum = dpt_ext.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device) exec_q = cumsum.sycl_queue _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -184,7 +184,7 @@ def _extract_impl(ary, ary_mask, axis=0): ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs ) dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :] - dst = dpt.empty( + dst = dpt_ext.empty( dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device ) if dst.size == 0: @@ -247,7 +247,7 @@ def _nonzero_impl(ary): usm_type = ary.usm_type mask_nelems = ary.size cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64 - cumsum = dpt.empty( + cumsum = dpt_ext.empty( mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C" ) _manager = dpctl.utils.SequentialOrderManager[exec_q] @@ -256,7 +256,7 @@ def _nonzero_impl(ary): ary, cumsum, sycl_queue=exec_q, depends=dep_evs ) indexes_dt = ti.default_device_index_type(exec_q.sycl_device) - indexes = dpt.empty( + indexes = dpt_ext.empty( (ary.ndim, mask_count), dtype=indexes_dt, usm_type=usm_type, @@ -418,7 +418,7 @@ def _take_multi_index(ary, inds, p, mode=0): if 0 in ary_sh[p:p_end] and ind0.size != 0: raise IndexError("cannot take non-empty indices from an empty axis") res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:] - res = dpt.empty( + res = dpt_ext.empty( res_shape, 
dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q ) _manager = dpctl.utils.SequentialOrderManager[exec_q] @@ -681,7 +681,9 @@ def _make_empty_like_orderK(x, dt, usm_type, dev): inv_perm = sorted(range(x.ndim), key=lambda i: perm[i]) sh = x.shape sh_sorted = tuple(sh[i] for i in perm) - R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + R = dpt_ext.empty( + sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) if min(st) < 0: st_sorted = [st[i] for i in perm] sl = tuple( @@ -734,11 +736,11 @@ def _from_numpy_empty_like_orderK(x, dt, usm_type, dev): raise TypeError(f"Expected numpy.ndarray, got {type(x)}") fl = x.flags if fl["C"] or x.size <= 1: - return dpt.empty( + return dpt_ext.empty( x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C" ) elif fl["F"]: - return dpt.empty( + return dpt_ext.empty( x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F" ) return _make_empty_like_orderK(x, dt, usm_type, dev) @@ -758,11 +760,11 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): fl1 = X1.flags fl2 = X2.flags if fl1["C"] or fl2["C"]: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" ) if fl1["F"] and fl2["F"]: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" ) st1 = list(X1.strides) @@ -785,7 +787,9 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): st2_sorted = [st2[i] for i in perm] sh = res_shape sh_sorted = tuple(sh[i] for i in perm) - R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + R = dpt_ext.empty( + sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) if max(min(st1_sorted), min(st2_sorted)) < 0: sl = tuple( ( @@ -823,11 +827,11 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): fl2 = X2.flags fl3 = X3.flags if fl1["C"] or fl2["C"] or fl3["C"]: - return dpt.empty( + return dpt_ext.empty( 
res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C" ) if fl1["F"] and fl2["F"] and fl3["F"]: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F" ) st1 = list(X1.strides) @@ -855,7 +859,9 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): st3_sorted = [st3[i] for i in perm] sh = res_shape sh_sorted = tuple(sh[i] for i in perm) - R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C") + R = dpt_ext.empty( + sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C" + ) if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0: sl = tuple( ( diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index ed97b260dd7d..dc7da78a9975 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -202,6 +202,71 @@ def _usm_ndarray_from_suai(obj): return ary +def empty( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Creates :class:`dpctl.tensor.usm_ndarray` from uninitialized + USM allocation. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. The ``None`` value creates an + array of floating point data type. Default: ``None`` + order (``"C"``, or ``F"``): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. 
+ Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + def eye( n_rows, n_cols=None, @@ -497,7 +562,7 @@ def linspace( start = float(start) stop = float(stop) - res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) + res = dpt_ext.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue) _manager = dpctl.utils.SequentialOrderManager[sycl_queue] hev, la_ev = ti._linspace_affine( start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue @@ -546,7 +611,7 @@ def tril(x, /, *, k=0): q = x.sycl_queue if k >= shape[nd - 1] - 1: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, @@ -568,7 +633,7 @@ def tril(x, /, *, k=0): sycl_queue=q, ) else: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, @@ -632,7 
+697,7 @@ def triu(x, /, *, k=0): sycl_queue=q, ) elif k <= -shape[nd - 2] + 1: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, @@ -646,7 +711,7 @@ def triu(x, /, *, k=0): ) _manager.add_event_pair(hev, cpy_ev) else: - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, order=order, diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 6ca327192f73..17a4aefbd4ca 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -177,7 +177,7 @@ def place(arr, mask, vals): raise dpctl.utils.ExecutionPlacementError if arr.shape != mask.shape or vals.ndim != 1: raise ValueError("Array sizes are not as required") - cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q) + cumsum = dpt_ext.empty(mask.size, dtype="i8", sycl_queue=exec_q) _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events nz_count = ti.mask_positions( @@ -540,9 +540,9 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"): "Input and output allocation queues are not compatible" ) if ti._array_overlap(x, out): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q ) diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index f1b8b46dbcbc..470bf3ef3876 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -223,7 +223,7 @@ def repeat(x, repeats, /, *, axis=None): res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] else: res_shape = (res_axis_size,) - res = dpt.empty( + res = dpt_ext.empty( res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q ) if res_axis_size > 0: @@ -238,7 +238,7 @@ def repeat(x, repeats, /, *, axis=None): _manager.add_event_pair(ht_rep_ev, rep_ev) else: if repeats.dtype != 
dpt.int64: - rep_buf = dpt.empty( + rep_buf = dpt_ext.empty( repeats.shape, dtype=dpt.int64, usm_type=usm_type, @@ -248,7 +248,7 @@ def repeat(x, repeats, /, *, axis=None): src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs ) _manager.add_event_pair(ht_copy_ev, copy_ev) - cumsum = dpt.empty( + cumsum = dpt_ext.empty( (axis_size,), dtype=dpt.int64, usm_type=usm_type, @@ -264,7 +264,7 @@ def repeat(x, repeats, /, *, axis=None): ) else: res_shape = (res_axis_size,) - res = dpt.empty( + res = dpt_ext.empty( res_shape, dtype=x.dtype, usm_type=usm_type, @@ -281,7 +281,7 @@ def repeat(x, repeats, /, *, axis=None): ) _manager.add_event_pair(ht_rep_ev, rep_ev) else: - cumsum = dpt.empty( + cumsum = dpt_ext.empty( (axis_size,), dtype=dpt.int64, usm_type=usm_type, @@ -296,7 +296,7 @@ def repeat(x, repeats, /, *, axis=None): ) else: res_shape = (res_axis_size,) - res = dpt.empty( + res = dpt_ext.empty( res_shape, dtype=x.dtype, usm_type=usm_type, @@ -353,7 +353,7 @@ def roll(x, /, shift, *, axis=None): _manager = dputils.SequentialOrderManager[exec_q] if axis is None: shift = operator.index(shift) - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) sz = operator.index(x.size) @@ -380,7 +380,7 @@ def roll(x, /, shift, *, axis=None): n_i = operator.index(shape[ax]) shifted = shifts[ax] + operator.index(sh) shifts[ax] = (shifted % n_i) if n_i > 0 else 0 - res = dpt.empty( + res = dpt_ext.empty( x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q ) dep_evs = _manager.submitted_events diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py index 053c68e1857d..e7eb33a5f3fe 100644 --- a/dpctl_ext/tensor/_search_functions.py +++ b/dpctl_ext/tensor/_search_functions.py @@ -291,7 +291,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): if ti._array_overlap(condition, out) and not ti._same_logical_tensors( condition, out ): - out = dpt.empty_like(out) + out = 
dpt_ext.empty_like(out) if isinstance(x1, dpt.usm_ndarray): if ( @@ -299,7 +299,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): and not ti._same_logical_tensors(x1, out) and x1_dtype == out_dtype ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(x2, dpt.usm_ndarray): if ( @@ -307,7 +307,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): and not ti._same_logical_tensors(x2, out) and x2_dtype == out_dtype ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if order == "A": order = ( @@ -342,7 +342,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): exec_q, ) else: - return dpt.empty( + return dpt_ext.empty( res_shape, dtype=out_dtype, order=order, @@ -356,7 +356,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): if order == "K": _x1 = _empty_like_orderK(x1, out_dtype) else: - _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order) + _x1 = dpt_ext.empty_like(x1, dtype=out_dtype, order=order) ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs ) @@ -367,7 +367,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): if order == "K": _x2 = _empty_like_orderK(x2, out_dtype) else: - _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order) + _x2 = dpt_ext.empty_like(x2, dtype=out_dtype, order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs ) @@ -380,7 +380,7 @@ def where(condition, x1, x2, /, *, order="K", out=None): condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q ) else: - out = dpt.empty( + out = dpt_ext.empty( res_shape, dtype=out_dtype, order=order, From d493454439edbc2eee87a379f34d02424106fbad Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:16:23 -0800 Subject: [PATCH 078/145] Reuse dpctl_ext.tensor.empty() in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 2 +- dpnp/dpnp_container.py | 2 +- 
dpnp/dpnp_iface_indexing.py | 6 ++++-- dpnp/tests/test_fft.py | 9 ++++++--- dpnp/tests/test_mathematical.py | 5 ++++- dpnp/tests/test_nanfunctions.py | 9 ++++++--- dpnp/tests/test_search.py | 5 ++++- dpnp/tests/test_statistics.py | 5 ++++- 8 files changed, 30 insertions(+), 13 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 55d74e8c1803..a7b083f2d734 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -1146,7 +1146,7 @@ def __call__( x1, x2, res_dt, res_shape, res_usm_type, exec_q ) else: - out[i] = dpt.empty( + out[i] = dpt_ext.empty( res_shape, dtype=res_dt, order=order, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 0727b9bfd775..9a2afe77f3cb 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -165,7 +165,7 @@ def empty( order = "C" """Creates `dpnp_array` from uninitialized USM allocation.""" - array_obj = dpt.empty( + array_obj = dpt_ext.empty( shape, dtype=dtype, order=order, diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index bc190db70c4e..9f9b16d2ad9b 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -143,7 +143,7 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0): # Allocate a temporary buffer to avoid memory overlapping. out = dpt.empty_like(out) else: - out = dpt.empty( + out = dpt_ext.empty( inds.shape, dtype=chcs[0].dtype, usm_type=usm_type, sycl_queue=q ) @@ -303,7 +303,9 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): # Allocate a temporary buffer to avoid memory overlapping. 
out = dpt.empty_like(out) else: - out = dpt.empty(res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q) + out = dpt_ext.empty( + res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q + ) _manager = dpu.SequentialOrderManager[q] dep_evs = _manager.submitted_events diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index 226420057748..cf6da5830f10 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -5,6 +5,9 @@ from dpctl.utils import ExecutionPlacementError from numpy.testing import assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_utils import map_dtype_to_device @@ -80,7 +83,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.fft(a, n=n) - out = dpt.empty(expected.shape, dtype=a_usm.dtype) + out = dpt_ext.empty(expected.shape, dtype=a_usm.dtype) result = dpnp.fft.fft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -639,7 +642,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.irfft(a, n=n) - out = dpt.empty(expected.shape, dtype=a_usm.real.dtype) + out = dpt_ext.empty(expected.shape, dtype=a_usm.real.dtype) result = dpnp.fft.irfft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -727,7 +730,7 @@ def test_usm_ndarray(self, n): expected = numpy.fft.rfft(a, n=n) out_dt = map_dtype_to_device(dpnp.complex128, a_usm.sycl_device) - out = dpt.empty(expected.shape, dtype=out_dt) + out = dpt_ext.empty(expected.shape, dtype=out_dt) result = dpnp.fft.rfft(a_usm, n=n, out=out) assert out is result.get_array() diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py index 760c1a0ceb2e..7b6d39875d67 100644 --- a/dpnp/tests/test_mathematical.py +++ b/dpnp/tests/test_mathematical.py @@ -15,6 +15,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# 
when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from dpnp.dpnp_array import dpnp_array from dpnp.dpnp_utils import map_dtype_to_device @@ -1575,7 +1578,7 @@ def test_out(self): assert_allclose(result, expected) # output is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = dpnp.prod(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py index d92cee045a72..2d3dbd864a2e 100644 --- a/dpnp/tests/test_nanfunctions.py +++ b/dpnp/tests/test_nanfunctions.py @@ -12,6 +12,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -55,7 +58,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -236,7 +239,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -545,7 +548,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_search.py 
b/dpnp/tests/test_search.py index 64c4eb75f906..05bc56b11d0b 100644 --- a/dpnp/tests/test_search.py +++ b/dpnp/tests/test_search.py @@ -3,6 +3,9 @@ import pytest from numpy.testing import assert_array_equal, assert_equal, assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -36,7 +39,7 @@ def test_out(self, func): assert_array_equal(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_array_equal(result, expected) diff --git a/dpnp/tests/test_statistics.py b/dpnp/tests/test_statistics.py index cf436087b607..6944d6bd5bdd 100644 --- a/dpnp/tests/test_statistics.py +++ b/dpnp/tests/test_statistics.py @@ -9,6 +9,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -812,7 +815,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) From 186ae3ce7e84752b8fc1096cb4570664816abe58 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:22:45 -0800 Subject: [PATCH 079/145] Move ti.empty_like() and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 24 ++++----- dpctl_ext/tensor/_copy_utils.py | 4 +- dpctl_ext/tensor/_ctors.py | 94 +++++++++++++++++++++++++++++++++ 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py 
b/dpctl_ext/tensor/__init__.py index e57bb35ce993..7ef7f132158e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -38,6 +38,7 @@ ) from dpctl_ext.tensor._ctors import ( empty, + empty_like, eye, full, linspace, @@ -69,6 +70,7 @@ "copy", "clip", "empty", + "empty_like", "extract", "eye", "finfo", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index 1cd360b753ad..8b53552b034b 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -163,7 +163,7 @@ def _clip_none(x, val, out, order, _binary_fn): if ti._array_overlap(x, out): if not ti._same_logical_tensors(x, out): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(val, dpt.usm_ndarray): if ( @@ -171,7 +171,7 @@ def _clip_none(x, val, out, order, _binary_fn): and not ti._same_logical_tensors(val, out) and val_dtype == res_dt ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(val, dpt.usm_ndarray): val_ary = val @@ -229,7 +229,7 @@ def _clip_none(x, val, out, order, _binary_fn): if order == "K": buf = _empty_like_orderK(val_ary, res_dt) else: - buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + buf = dpt_ext.empty_like(val_ary, dtype=res_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( @@ -353,14 +353,14 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if ti._array_overlap(x, out): if not ti._same_logical_tensors(x, out): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: return out else: if order == "K": out = _empty_like_orderK(x, x.dtype) else: - out = dpt.empty_like(x, order=order) + out = dpt_ext.empty_like(x, order=order) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -519,7 +519,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if ti._array_overlap(x, out): if not ti._same_logical_tensors(x, out): - out = 
dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(min, dpt.usm_ndarray): if ( @@ -527,7 +527,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): and not ti._same_logical_tensors(min, out) and buf1_dt is None ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(max, dpt.usm_ndarray): if ( @@ -535,7 +535,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): and not ti._same_logical_tensors(max, out) and buf2_dt is None ): - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) if isinstance(min, dpt.usm_ndarray): a_min = min @@ -612,7 +612,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf2 = _empty_like_orderK(a_max, buf2_dt) else: - buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( @@ -668,7 +668,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf1 = _empty_like_orderK(a_min, buf1_dt) else: - buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( @@ -736,7 +736,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf1 = _empty_like_orderK(a_min, buf1_dt) else: - buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -747,7 +747,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if order == "K": buf2 = _empty_like_orderK(a_max, buf2_dt) else: - buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, 
order=order) ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs ) diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index a1c14cf0f69b..1b54587ee393 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -714,11 +714,11 @@ def _empty_like_orderK(x, dt, usm_type=None, dev=None): dev = x.device fl = x.flags if fl["C"] or x.size <= 1: - return dpt.empty_like( + return dpt_ext.empty_like( x, dtype=dt, usm_type=usm_type, device=dev, order="C" ) elif fl["F"]: - return dpt.empty_like( + return dpt_ext.empty_like( x, dtype=dt, usm_type=usm_type, device=dev, order="F" ) return _make_empty_like_orderK(x, dt, usm_type, dev) diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index dc7da78a9975..84e95f178e13 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -42,6 +42,9 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._copy_utils import ( + _empty_like_orderK, +) __doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" @@ -174,6 +177,21 @@ def _ensure_native_dtype_device_support(dtype, dev) -> None: ) +def _normalize_order(order, arr): + """ + Utility function for processing the `order` keyword of array-like + constructors, which support `"K"` and `"A"` orders. + """ + arr_flags = arr.flags + f_contig = arr_flags["F"] + c_contig = arr_flags["C"] + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and (f_contig or c_contig): + order = "C" if c_contig else "F" + return order + + def _to_scalar(obj, sc_ty): """A way to convert object to NumPy scalar type. 
Raises OverflowError if obj can not be represented @@ -267,6 +285,82 @@ def empty( return res +def empty_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns an uninitialized :class:`dpctl.tensor.usm_ndarray` with the + same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape. + dtype (optional): + data type of the array. Can be a typestring, + a :class:`numpy.dtype` object, NumPy char string, + or a NumPy scalar type. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation. Default: ``None`` + + Returns: + usm_ndarray: + Created empty array with uninitialized memory. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + return _empty_like_orderK(x, dtype, usm_type, sycl_queue) + else: + shape = x.shape + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + return res + + def eye( n_rows, n_cols=None, From 5503b9a2e8e0d007a0e99d16b3bdd8bb9d3bec03 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:25:22 -0800 Subject: [PATCH 080/145] Reuse dpctl_ext.tensor.empty_like() in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 14 +++++++------- dpnp/dpnp_iface_indexing.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index a7b083f2d734..2e280b58c1d2 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -467,7 +467,7 @@ def __call__( ) # Allocate a temporary buffer with the required dtype - out[i] = dpt.empty_like(res, dtype=res_dt) + out[i] = dpt_ext.empty_like(res, dtype=res_dt) elif ( buf_dt is None and dti._array_overlap(x, res) @@ -476,7 +476,7 @@ def __call__( # Allocate a temporary 
buffer to avoid memory overlapping. # Note if `buf_dt` is not None, a temporary copy of `x` will be # created, so the array overlap check isn't needed. - out[i] = dpt.empty_like(res) + out[i] = dpt_ext.empty_like(res) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events @@ -486,7 +486,7 @@ def __call__( if order == "K": buf = dtc._empty_like_orderK(x, buf_dt) else: - buf = dpt.empty_like(x, dtype=buf_dt, order=order) + buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order) ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs @@ -503,7 +503,7 @@ def __call__( if order == "K": out[i] = dtc._empty_like_orderK(x, res_dt) else: - out[i] = dpt.empty_like(x, dtype=res_dt, order=order) + out[i] = dpt_ext.empty_like(x, dtype=res_dt, order=order) # Call the unary function with input and output arrays ht_unary_ev, unary_ev = self.get_implementation_function()( @@ -1078,7 +1078,7 @@ def __call__( ) # Allocate a temporary buffer with the required dtype - out[i] = dpt.empty_like(res, dtype=res_dt) + out[i] = dpt_ext.empty_like(res, dtype=res_dt) else: # If `dt` is not None, a temporary copy of `x` will be created, # so the array overlap check isn't needed. 
@@ -1094,7 +1094,7 @@ def __call__( for x in x_to_check ): # allocate a temporary buffer to avoid memory overlapping - out[i] = dpt.empty_like(res) + out[i] = dpt_ext.empty_like(res) x1 = dpnp.as_usm_ndarray(x1, dtype=x1_dt, sycl_queue=exec_q) x2 = dpnp.as_usm_ndarray(x2, dtype=x2_dt, sycl_queue=exec_q) @@ -1127,7 +1127,7 @@ def __call__( if order == "K": buf = dtc._empty_like_orderK(x, buf_dt) else: - buf = dpt.empty_like(x, dtype=buf_dt, order=order) + buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order) ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray( src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 9f9b16d2ad9b..503c66bfec58 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -141,7 +141,7 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0): ti._array_overlap(out, chc) for chc in chcs ): # Allocate a temporary buffer to avoid memory overlapping. - out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: out = dpt_ext.empty( inds.shape, dtype=chcs[0].dtype, usm_type=usm_type, sycl_queue=q @@ -301,7 +301,7 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0): if ti._array_overlap(x, out): # Allocate a temporary buffer to avoid memory overlapping. 
- out = dpt.empty_like(out) + out = dpt_ext.empty_like(out) else: out = dpt_ext.empty( res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q From 748c5b5e15df4d4f0e618deced77b62deb38604c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:32:38 -0800 Subject: [PATCH 081/145] Move ti.arange() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 132 ++++++++++++++++++++++++ dpctl_ext/tensor/_indexing_functions.py | 2 +- dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/dpnp_container.py | 2 +- dpnp/dpnp_iface_arraycreation.py | 2 +- dpnp/tests/test_arraycreation.py | 2 +- 7 files changed, 139 insertions(+), 5 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 7ef7f132158e..084d4db3bd26 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -37,6 +37,7 @@ to_numpy, ) from dpctl_ext.tensor._ctors import ( + arange, empty, empty_like, eye, @@ -64,6 +65,7 @@ from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ + "arange", "asnumpy", "astype", "can_cast", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 84e95f178e13..9b80c3bc685a 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -177,6 +177,20 @@ def _ensure_native_dtype_device_support(dtype, dev) -> None: ) +def _get_arange_length(start, stop, step): + """Compute length of arange sequence""" + span = stop - start + if hasattr(step, "__float__") and hasattr(span, "__float__"): + return _round_for_arange(span / step) + tmp = span / step + if hasattr(tmp, "__complex__"): + tmp = complex(tmp) + tmp = tmp.real + else: + tmp = float(tmp) + return _round_for_arange(tmp) + + def _normalize_order(order, arr): """ Utility function for processing the `order` keyword of array-like @@ -192,6 +206,13 @@ def _normalize_order(order, arr): return order +def _round_for_arange(tmp): + k = int(tmp) + if k >= 0 
and float(k) < tmp: + tmp = tmp + 1 + return tmp + + def _to_scalar(obj, sc_ty): """A way to convert object to NumPy scalar type. Raises OverflowError if obj can not be represented @@ -220,6 +241,117 @@ def _usm_ndarray_from_suai(obj): return ary +def arange( + start, + /, + stop=None, + step=1, + *, + dtype=None, + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns evenly spaced values within the half-open interval [start, stop) + as a one-dimensional array. + + Args: + start: + Starting point of the interval + stop: + Ending point of the interval. Default: ``None`` + step: Increment of the returned sequence. Default: ``1`` + dtype: Output array data type. Default: ``None`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array populated with evenly spaced values. 
+ """ + if stop is None: + stop = start + start = 0 + if step is None: + step = 1 + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + is_bool = False + if dtype: + is_bool = (dtype is bool) or (dpt.dtype(dtype) == dpt.bool) + _, dt = _coerce_and_infer_dt( + start, + stop, + step, + dt=dpt.int8 if is_bool else dtype, + sycl_queue=sycl_queue, + err_msg="start, stop, and step must be Python scalars", + allow_bool=False, + ) + try: + tmp = _get_arange_length(start, stop, step) + sh = max(int(tmp), 0) + except TypeError: + sh = 0 + if is_bool and sh > 2: + raise ValueError("no fill-function for boolean data type") + res = dpt.usm_ndarray( + (sh,), + dtype=dt, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + sc_ty = dt.type + _first = _to_scalar(start, sc_ty) + if sh > 1: + _second = _to_scalar(start + step, sc_ty) + if dt in [dpt.uint8, dpt.uint16, dpt.uint32, dpt.uint64]: + int64_ty = dpt.int64.type + _step = int64_ty(_second) - int64_ty(_first) + else: + _step = _second - _first + _step = sc_ty(_step) + else: + _step = sc_ty(1) + _start = _first + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating newly allocated array, no task dependencies + hev, lin_ev = ti._linspace_step(_start, _step, res, sycl_queue) + _manager.add_event_pair(hev, lin_ev) + if is_bool: + res_out = dpt.usm_ndarray( + (sh,), + dtype=dpt.bool, + buffer=usm_type, + order="C", + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + hev_cpy, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=res, dst=res_out, sycl_queue=sycl_queue, depends=[lin_ev] + ) + _manager.add_event_pair(hev_cpy, cpy_ev) + return res_out + return res + + def empty( shape, *, diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 17a4aefbd4ca..0dcc6440a275 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ 
b/dpctl_ext/tensor/_indexing_functions.py @@ -57,7 +57,7 @@ def _get_indexing_mode(name): def _range(sh_i, i, nd, q, usm_t, dt): - ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q) + ind = dpt_ext.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q) ind.shape = tuple(sh_i if i == j else 1 for j in range(nd)) return ind diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 4e05d57c6872..6230054645a8 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -225,7 +225,7 @@ def dpnp_linspace( delta = usm_stop - usm_start - usm_res = dpt.arange( + usm_res = dpt_ext.arange( 0, stop=num, step=1, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 9a2afe77f3cb..74ddda10ba02 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -75,7 +75,7 @@ def arange( sycl_queue=sycl_queue, device=device ) - array_obj = dpt.arange( + array_obj = dpt_ext.arange( start, stop=stop, step=step, diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 52fc4b7f6448..290cc7aae7e6 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3935,7 +3935,7 @@ def vander( tmp = m[:, ::-1] if not increasing else m dpnp.power( dpt_ext.reshape(usm_x, (-1, 1)), - dpt.arange( + dpt_ext.arange( N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue ), out=tmp, diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index 88e6aacb997d..382606ec377a 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -972,7 +972,7 @@ def test_ones_like(array, dtype, order): ], ) def test_dpctl_tensor_input(func, args): - x0 = dpt_ext.reshape(dpt.arange(9), (3, 3)) + x0 = dpt_ext.reshape(dpt_ext.arange(9), (3, 3)) new_args = [eval(val, {"x0": x0}) for val in args] X = getattr(dpt, func)(*new_args) Y = getattr(dpnp, func)(*new_args) From a7370a802b51e93c78eaf1908a81fb916cba9ad4 
Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:47:12 -0800 Subject: [PATCH 082/145] Move ti.asarray() and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 6 +- dpctl_ext/tensor/_copy_utils.py | 6 +- dpctl_ext/tensor/_ctors.py | 530 +++++++++++++++++++- dpctl_ext/tensor/_indexing_functions.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 2 +- dpctl_ext/tensor/_scalar_utils.py | 6 +- dpctl_ext/tensor/_search_functions.py | 4 +- 8 files changed, 546 insertions(+), 12 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 084d4db3bd26..1c4009e15fd8 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -38,6 +38,7 @@ ) from dpctl_ext.tensor._ctors import ( arange, + asarray, empty, empty_like, eye, @@ -66,6 +67,7 @@ __all__ = [ "arange", + "asarray", "asnumpy", "astype", "can_cast", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index 8b53552b034b..bab2271c6453 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -176,7 +176,7 @@ def _clip_none(x, val, out, order, _binary_fn): if isinstance(val, dpt.usm_ndarray): val_ary = val else: - val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + val_ary = dpt_ext.asarray(val, dtype=val_dtype, sycl_queue=exec_q) if order == "A": order = ( @@ -540,11 +540,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"): if isinstance(min, dpt.usm_ndarray): a_min = min else: - a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q) + a_min = dpt_ext.asarray(min, dtype=min_dtype, sycl_queue=exec_q) if isinstance(max, dpt.usm_ndarray): a_max = max else: - a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q) + a_max = dpt_ext.asarray(max, dtype=max_dtype, sycl_queue=exec_q) if order == "A": order = ( diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 1b54587ee393..bdcd5d123072 100644 --- 
a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -159,7 +159,7 @@ def _extract_impl(ary, ary_mask, axis=0): elif isinstance(ary_mask, np.ndarray): dst_usm_type = ary.usm_type exec_q = ary.sycl_queue - ary_mask = dpt.asarray( + ary_mask = dpt_ext.asarray( ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q ) else: @@ -284,7 +284,7 @@ def _prepare_indices_arrays(inds, q, usm_type): lambda ind: ( ind if isinstance(ind, dpt.usm_ndarray) - else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q) + else dpt_ext.asarray(ind, usm_type=usm_type, sycl_queue=q) ), inds, ) @@ -332,7 +332,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0): if exec_q is not None: if not isinstance(vals, dpt.usm_ndarray): - vals = dpt.asarray( + vals = dpt_ext.asarray( vals, dtype=ary.dtype, usm_type=coerced_usm_type, diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 9b80c3bc685a..fb30b89c684b 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -44,6 +44,7 @@ import dpctl_ext.tensor._tensor_impl as ti from dpctl_ext.tensor._copy_utils import ( _empty_like_orderK, + _from_numpy_empty_like_orderK, ) __doc__ = "Implementation of creation functions in :module:`dpctl.tensor`" @@ -112,6 +113,209 @@ def _array_info_sequence(li): return (n,) + dim, dt, device +def _asarray_from_numpy_ndarray( + ary, dtype=None, usm_type=None, sycl_queue=None, order="K" +): + if not isinstance(ary, np.ndarray): + raise TypeError(f"Expected numpy.ndarray, got {type(ary)}") + if usm_type is None: + usm_type = "device" + copy_q = normalize_queue_device(sycl_queue=None, device=sycl_queue) + if ary.dtype.char not in "?bBhHiIlLqQefdFD": + raise TypeError( + f"Numpy array of data type {ary.dtype} is not supported. " + "Please convert the input to an array with numeric data type." 
+ ) + if dtype is None: + # deduce device-representable output data type + dtype = _map_to_device_dtype(ary.dtype, copy_q) + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + f_contig = ary.flags["F"] + c_contig = ary.flags["C"] + fc_contig = f_contig or c_contig + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + # new USM allocation + res = _from_numpy_empty_like_orderK(ary, dtype, usm_type, copy_q) + else: + res = dpt.usm_ndarray( + ary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + res[...] = ary + return res + + +def _asarray_from_seq( + seq_obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=None, + usm_type=None, + order="C", +): + """`seq_obj` is a sequence""" + if usm_type is None: + usm_types_in_seq = [] + _usm_types_walker(seq_obj, usm_types_in_seq) + usm_type = dpctl.utils.get_coerced_usm_type(usm_types_in_seq) + dpctl.utils.validate_usm_type(usm_type) + if dtype is None: + dtype = _map_to_device_dtype(seq_dt, alloc_q) + else: + _mapped_dt = _map_to_device_dtype(dtype, alloc_q) + if _mapped_dt != dtype: + raise ValueError( + f"Device {alloc_q.sycl_device} " + f"does not support {dtype} natively." 
+ ) + dtype = _mapped_dt + if order in "KA": + order = "C" + if isinstance(exec_q, dpctl.SyclQueue): + res = dpt_ext.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _manager = dpctl.utils.SequentialOrderManager[exec_q] + _device_copy_walker(seq_obj, res, _manager) + return res + else: + res = dpt_ext.empty( + seq_shape, + dtype=dtype, + usm_type=usm_type, + sycl_queue=alloc_q, + order=order, + ) + _copy_through_host_walker(seq_obj, res) + return res + + +def _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=None, + usm_type=None, + sycl_queue=None, + order="C", +): + if sycl_queue is None: + exec_q = seq_dev + alloc_q = seq_dev + else: + exec_q = dpctl.utils.get_execution_queue( + ( + sycl_queue, + seq_dev, + ) + ) + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + exec_q, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + + +def _asarray_from_usm_ndarray( + usm_ndary, + dtype=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + if not isinstance(usm_ndary, dpt.usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(usm_ndary)}" + ) + if usm_type is None: + usm_type = usm_ndary.usm_type + if sycl_queue is not None: + exec_q = dpctl.utils.get_execution_queue( + [usm_ndary.sycl_queue, sycl_queue] + ) + copy_q = normalize_queue_device(sycl_queue=sycl_queue, device=exec_q) + else: + copy_q = usm_ndary.sycl_queue + if dtype is None: + dtype = _map_to_device_dtype(usm_ndary.dtype, copy_q) + # Conditions for zero copy: + can_zero_copy = copy is not True + # dtype is unchanged + can_zero_copy = can_zero_copy and dtype == usm_ndary.dtype + # USM allocation type is unchanged + can_zero_copy = can_zero_copy and usm_type == usm_ndary.usm_type + # sycl_queue is unchanged + can_zero_copy = can_zero_copy and copy_q is usm_ndary.sycl_queue + # order is unchanged + c_contig = usm_ndary.flags.c_contiguous + 
f_contig = usm_ndary.flags.f_contiguous + fc_contig = usm_ndary.flags.forc + if can_zero_copy: + if order == "C" and c_contig: + pass + elif order == "F" and f_contig: + pass + elif order == "A" and fc_contig: + pass + elif order == "K": + pass + else: + can_zero_copy = False + if copy is False and can_zero_copy is False: + raise ValueError("asarray(..., copy=False) is not possible") + if can_zero_copy: + return usm_ndary + if order == "A": + order = "F" if f_contig and not c_contig else "C" + if order == "K" and fc_contig: + order = "C" if c_contig else "F" + if order == "K": + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = _empty_like_orderK(usm_ndary, dtype, usm_type, copy_q) + else: + _ensure_native_dtype_device_support(dtype, copy_q.sycl_device) + res = dpt.usm_ndarray( + usm_ndary.shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": copy_q}, + ) + eq = dpctl.utils.get_execution_queue([usm_ndary.sycl_queue, copy_q]) + if eq is not None: + _manager = dpctl.utils.SequentialOrderManager[eq] + dep_evs = _manager.submitted_events + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ndary, dst=res, sycl_queue=eq, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + else: + tmp = dpt_ext.asnumpy(usm_ndary) + res[...] = tmp + return res + + def _cast_fill_val(fill_val, dt): """ Casts the Python scalar `fill_val` to another Python type coercible to the @@ -145,6 +349,82 @@ def _coerce_and_infer_dt(*args, dt, sycl_queue, err_msg, allow_bool=False): raise ValueError(f"Data type {dt} is not supported") +def _copy_through_host_walker(seq_o, usm_res): + if isinstance(seq_o, dpt.usm_ndarray): + if ( + dpctl.utils.get_execution_queue( + ( + usm_res.sycl_queue, + seq_o.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt.asnumpy(seq_o).copy() + return + else: + usm_res[...] 
= seq_o + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _copy_through_host_walker(usm_arr, usm_res) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + if ( + dpctl.utils.get_execution_queue( + ( + usm_res.sycl_queue, + usm_ar.sycl_queue, + ) + ) + is None + ): + usm_res[...] = dpt_ext.asnumpy(usm_ar).copy() + else: + usm_res[...] = usm_ar + return + if _is_object_with_buffer_protocol(seq_o): + np_ar = np.asarray(seq_o) + usm_res[...] = np_ar + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _copy_through_host_walker(el, usm_res[i]) + return + usm_res[...] = np.asarray(seq_o) + + +def _device_copy_walker(seq_o, res, _manager): + if isinstance(seq_o, dpt.usm_ndarray): + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=seq_o, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if hasattr(seq_o, "__usm_ndarray__"): + usm_arr = seq_o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + _device_copy_walker(usm_arr, res, _manager) + return + if hasattr(seq_o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(seq_o) + exec_q = res.sycl_queue + deps = _manager.submitted_events + ht_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=usm_ar, dst=res, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_ev, cpy_ev) + return + if isinstance(seq_o, (list, tuple)): + for i, el in enumerate(seq_o): + _device_copy_walker(el, res[i], _manager) + return + raise TypeError + + def _ensure_native_dtype_device_support(dtype, dev) -> None: """Check that dtype is natively supported by device. @@ -191,6 +471,28 @@ def _get_arange_length(start, stop, step): return _round_for_arange(tmp) +def _map_to_device_dtype(dt, q): + dtc = dt.char + if dtc == "?" 
or np.issubdtype(dt, np.integer): + return dt + d = q.sycl_device + if np.issubdtype(dt, np.floating): + if dtc == "f": + return dt + if dtc == "d" and d.has_aspect_fp64: + return dt + if dtc == "e" and d.has_aspect_fp16: + return dt + return dpt.dtype("f4") + if np.issubdtype(dt, np.complexfloating): + if dtc == "F": + return dt + if dtc == "D" and d.has_aspect_fp64: + return dt + return dpt.dtype("c8") + raise RuntimeError(f"Unrecognized data type '{dt}' encountered.") + + def _normalize_order(order, arr): """ Utility function for processing the `order` keyword of array-like @@ -241,6 +543,30 @@ def _usm_ndarray_from_suai(obj): return ary +def _usm_types_walker(o, usm_types_list): + if isinstance(o, dpt.usm_ndarray): + usm_types_list.append(o.usm_type) + return + if hasattr(o, "__usm_ndarray__"): + usm_arr = o.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + usm_types_list.append(usm_arr.usm_type) + return + if hasattr(o, "__sycl_usm_array_interface__"): + usm_ar = _usm_ndarray_from_suai(o) + usm_types_list.append(usm_ar.usm_type) + return + if _is_object_with_buffer_protocol(o): + return + if isinstance(o, (int, bool, float, complex)): + return + if isinstance(o, (list, tuple, range)): + for el in o: + _usm_types_walker(el, usm_types_list) + return + raise TypeError + + def arange( start, /, @@ -352,6 +678,208 @@ def arange( return res +def asarray( + obj, + /, + *, + dtype=None, + device=None, + copy=None, + usm_type=None, + sycl_queue=None, + order="K", +): + """ + Converts input object to :class:`dpctl.tensor.usm_ndarray`. + + Args: + obj: Python object to convert. Can be an instance of + :class:`dpctl.tensor.usm_ndarray`, + an object representing SYCL USM allocation and implementing + ``__sycl_usm_array_interface__`` protocol, an instance + of :class:`numpy.ndarray`, an object supporting Python buffer + protocol, a Python scalar, or a (possibly nested) sequence of + Python scalars. + dtype (data type, optional): + output array data type. 
If ``dtype`` is + ``None``, the output array data type is inferred from data types in + ``obj``. Default: ``None`` + copy (`bool`, optional): + boolean indicating whether or not to copy the + input. If ``True``, always creates a copy. If ``False``, the + need to copy raises :exc:`ValueError`. If ``None``, tries to reuse + existing memory allocations if possible, but allows to perform + a copy otherwise. Default: ``None`` + order (``"C"``, ``"F"``, ``"A"``, ``"K"``, optional): + memory layout of the output array. Default: ``"K"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Array created from input object. + """ + # 1. Check that copy is a valid keyword + if copy not in [None, True, False]: + raise TypeError( + "Recognized copy keyword values should be True, False, or None" + ) + # 2. Check that dtype is None, or a valid dtype + if dtype is not None: + dtype = dpt.dtype(dtype) + # 3. 
Validate order + if not isinstance(order, str): + raise TypeError( + f"Expected order keyword to be of type str, got {type(order)}" + ) + if len(order) == 0 or order[0] not in "KkAaCcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'K', 'A', 'F', or 'C'." + ) + order = order[0].upper() + # 4. Check that usm_type is None, or a valid value + dpctl.utils.validate_usm_type(usm_type, allow_none=True) + # 5. Normalize device/sycl_queue [keep it None if was None] + if device is not None or sycl_queue is not None: + sycl_queue = normalize_queue_device( + sycl_queue=sycl_queue, device=device + ) + + # handle instance(obj, usm_ndarray) + if isinstance(obj, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + obj, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__usm_ndarray__"): + usm_arr = obj.__usm_ndarray__ + if isinstance(usm_arr, dpt.usm_ndarray): + return _asarray_from_usm_ndarray( + usm_arr, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if hasattr(obj, "__sycl_usm_array_interface__"): + ary = _usm_ndarray_from_suai(obj) + return _asarray_from_usm_ndarray( + ary, + dtype=dtype, + copy=copy, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, np.ndarray): + if copy is False: + raise ValueError( + "Converting numpy.ndarray to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + obj, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if _is_object_with_buffer_protocol(obj): + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + return _asarray_from_numpy_ndarray( + np.array(obj), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + if isinstance(obj, (list, tuple, range)): + if copy is False: + raise ValueError( + "Converting Python sequence to usm_ndarray requires a copy" + 
) + seq_shape, seq_dt, devs = _array_info_sequence(obj) + if devs == _host_set: + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype, order=order), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) == 1: + seq_dev = list(devs)[0] + return _asarray_from_seq_single_device( + obj, + seq_shape, + seq_dt, + seq_dev, + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order=order, + ) + elif len(devs) > 1: + devs = [dev for dev in devs if dev is not None] + if sycl_queue is None: + if len(devs) == 1: + alloc_q = devs[0] + else: + raise dpctl.utils.ExecutionPlacementError( + "Please specify `device` or `sycl_queue` keyword " + "argument to determine where to allocate the " + "resulting array." + ) + else: + alloc_q = sycl_queue + return _asarray_from_seq( + obj, + seq_shape, + seq_dt, + alloc_q, + # force copying via host + None, + dtype=dtype, + usm_type=usm_type, + order=order, + ) + if copy is False: + raise ValueError( + f"Converting {type(obj)} to usm_ndarray requires a copy" + ) + # obj is a scalar, create 0d array + return _asarray_from_numpy_ndarray( + np.asarray(obj, dtype=dtype), + dtype=dtype, + usm_type=usm_type, + sycl_queue=sycl_queue, + order="C", + ) + + def empty( shape, *, @@ -665,7 +1193,7 @@ def full( sycl_queue = normalize_queue_device( sycl_queue=sycl_queue, device=device ) - X = dpt.asarray( + X = dpt_ext.asarray( fill_value, dtype=dtype, order=order, diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 0dcc6440a275..369bd6a49022 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -329,7 +329,7 @@ def put_vec_duplicates(vec, ind, vals): val_shape = indices.shape if not isinstance(vals, dpt.usm_ndarray): - vals = dpt.asarray( + vals = dpt_ext.asarray( vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q ) # choose to throw here for consistency with `place` diff --git 
a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 470bf3ef3876..37cba6828818 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -204,7 +204,7 @@ def repeat(x, repeats, /, *, axis=None): "`repeats` sequence must have the same length as the " "repeated axis" ) - repeats = dpt.asarray( + repeats = dpt_ext.asarray( repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q ) if not dpt.all(repeats >= 0): diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py index 86787baea8cc..3ab92b42ad00 100644 --- a/dpctl_ext/tensor/_scalar_utils.py +++ b/dpctl_ext/tensor/_scalar_utils.py @@ -33,6 +33,10 @@ import numpy as np from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext + from ._type_utils import ( WeakBooleanType, WeakComplexType, @@ -59,7 +63,7 @@ def _get_dtype(o, dev): if isinstance(o, dpt.usm_ndarray): return o.dtype if hasattr(o, "__sycl_usm_array_interface__"): - return dpt.asarray(o).dtype + return dpt_ext.asarray(o).dtype if _is_buffer(o): host_dt = np.array(o).dtype dev_dt = _to_device_supported_dtype(host_dt, dev) diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py index e7eb33a5f3fe..cc76bfe8b543 100644 --- a/dpctl_ext/tensor/_search_functions.py +++ b/dpctl_ext/tensor/_search_functions.py @@ -323,9 +323,9 @@ def where(condition, x1, x2, /, *, order="K", out=None): else "C" ) if not isinstance(x1, dpt.usm_ndarray): - x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) + x1 = dpt_ext.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q) if not isinstance(x2, dpt.usm_ndarray): - x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) + x2 = dpt_ext.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q) if condition.size == 0: if out is not None: From 
effcbe8cab3a74dc04daf491292e16c1640f31c4 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 03:58:49 -0800 Subject: [PATCH 083/145] Reuse dpctl_ext.tensor.asarray() in dpnp --- dpnp/dpnp_algo/dpnp_arraycreation.py | 8 ++++---- dpnp/dpnp_algo/dpnp_elementwise_common.py | 4 ++-- dpnp/dpnp_container.py | 4 ++-- dpnp/dpnp_iface.py | 2 +- dpnp/dpnp_iface_arraycreation.py | 2 +- dpnp/dpnp_iface_indexing.py | 4 ++-- dpnp/dpnp_iface_manipulation.py | 2 +- dpnp/dpnp_iface_searching.py | 2 +- dpnp/tests/test_arraycreation.py | 8 ++++---- dpnp/tests/test_arraymanipulation.py | 6 +++--- dpnp/tests/test_fft.py | 9 ++++----- dpnp/tests/test_indexing.py | 5 ++--- dpnp/tests/test_linalg.py | 4 +++- dpnp/tests/test_manipulation.py | 4 +++- dpnp/tests/test_mathematical.py | 8 ++++---- dpnp/tests/test_nanfunctions.py | 9 ++++----- dpnp/tests/test_ndarray.py | 5 ++++- dpnp/tests/test_statistics.py | 5 ++--- dpnp/tests/test_utils.py | 4 +++- 19 files changed, 50 insertions(+), 45 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 6230054645a8..86b9642ecd5d 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -53,7 +53,7 @@ def _as_usm_ndarray(a, usm_type, sycl_queue): if isinstance(a, dpnp_array): a = a.get_array() - return dpt.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue) + return dpt_ext.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue) def _check_has_zero_val(a): @@ -213,13 +213,13 @@ def dpnp_linspace( else: step = dpnp.nan else: - usm_start = dpt.asarray( + usm_start = dpt_ext.asarray( start, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized, ) - usm_stop = dpt.asarray( + usm_stop = dpt_ext.asarray( stop, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized ) @@ -266,7 +266,7 @@ def dpnp_linspace( if retstep is True: if dpnp.isscalar(step): - step = dpt.asarray( + step = dpt_ext.asarray( step, usm_type=res.usm_type, 
sycl_queue=res.sycl_queue ) return res, dpnp_array._create_from_usm_ndarray(step) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 2e280b58c1d2..85dc9473206c 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -713,7 +713,7 @@ def __call__( if dtype is not None: if dpnp.isscalar(x1): - x1_usm = dpt.asarray( + x1_usm = dpt_ext.asarray( x1, dtype=dtype, sycl_queue=x2.sycl_queue, @@ -722,7 +722,7 @@ def __call__( x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False) elif dpnp.isscalar(x2): x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False) - x2_usm = dpt.asarray( + x2_usm = dpt_ext.asarray( x2, dtype=dtype, sycl_queue=x1.sycl_queue, diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index 74ddda10ba02..e2c5070d807e 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -103,7 +103,7 @@ def asarray( """Converts incoming 'x1' object to 'dpnp_array'.""" if isinstance(x1, (list, tuple, range)): - array_obj = dpt.asarray( + array_obj = dpt_ext.asarray( x1, dtype=dtype, copy=copy, @@ -122,7 +122,7 @@ def asarray( x1_obj, device=device, sycl_queue=sycl_queue ) - array_obj = dpt.asarray( + array_obj = dpt_ext.asarray( x1_obj, dtype=dtype, copy=copy, diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py index 6c050a208981..9fca083a6413 100644 --- a/dpnp/dpnp_iface.py +++ b/dpnp/dpnp_iface.py @@ -191,7 +191,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None): if is_supported_array_type(a): return get_usm_ndarray(a) - return dpt.asarray( + return dpt_ext.asarray( a, dtype=dtype, device=device, usm_type=usm_type, sycl_queue=sycl_queue ) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 290cc7aae7e6..64c34ed80fce 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3912,7 +3912,7 @@ def vander( if dpnp.is_supported_array_type(x): x = 
dpnp.get_usm_ndarray(x) - usm_x = dpt.asarray( + usm_x = dpt_ext.asarray( x, device=device, usm_type=usm_type, sycl_queue=sycl_queue ) diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 503c66bfec58..58c8f495c440 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -1805,7 +1805,7 @@ def put_along_axis(a, ind, values, axis, mode="wrap"): if dpnp.is_supported_array_type(values): usm_vals = dpnp.get_usm_ndarray(values) else: - usm_vals = dpt.asarray( + usm_vals = dpt_ext.asarray( values, usm_type=a.usm_type, sycl_queue=a.sycl_queue ) @@ -2153,7 +2153,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"): usm_a = dpnp.get_usm_ndarray(a) if not dpnp.is_supported_array_type(indices): - usm_ind = dpt.asarray( + usm_ind = dpt_ext.asarray( indices, usm_type=a.usm_type, sycl_queue=a.sycl_queue ) else: diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 08fd55c58ace..e4a12346bc6c 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -375,7 +375,7 @@ def _get_first_nan_index(usm_a): ): if dpnp.issubdtype(usm_a.dtype, dpnp.complexfloating): # for complex all NaNs are considered equivalent - true_val = dpt.asarray( + true_val = dpt_ext.asarray( True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type ) return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left") diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index a2389978d506..15f52338ec7e 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -376,7 +376,7 @@ def searchsorted(a, v, side="left", sorter=None): usm_a = dpnp.get_usm_ndarray(a) if dpnp.isscalar(v): - usm_v = dpt.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) + usm_v = dpt_ext.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) else: usm_v = dpnp.get_usm_ndarray(v) diff --git a/dpnp/tests/test_arraycreation.py b/dpnp/tests/test_arraycreation.py index 
382606ec377a..698e22b9f873 100644 --- a/dpnp/tests/test_arraycreation.py +++ b/dpnp/tests/test_arraycreation.py @@ -668,7 +668,7 @@ def test_tri_default_dtype(): 5, numpy.array(1), dpnp.array(2), - dpt.asarray(3), + dpt_ext.asarray(3), ], ids=[ "-3", @@ -682,7 +682,7 @@ def test_tri_default_dtype(): "5", "np.array(1)", "dpnp.array(2)", - "dpt.asarray(3)", + "dpt_ext.asarray(3)", ], ) @pytest.mark.parametrize( @@ -725,7 +725,7 @@ def test_tril(m, k, dtype): 5, numpy.array(1), dpnp.array(2), - dpt.asarray(3), + dpt_ext.asarray(3), ], ids=[ "-3", @@ -739,7 +739,7 @@ def test_tril(m, k, dtype): "5", "np.array(1)", "dpnp.array(2)", - "dpt.asarray(3)", + "dpt_ext.asarray(3)", ], ) @pytest.mark.parametrize( diff --git a/dpnp/tests/test_arraymanipulation.py b/dpnp/tests/test_arraymanipulation.py index ba83ee94d8b0..7d5d2efeebb1 100644 --- a/dpnp/tests/test_arraymanipulation.py +++ b/dpnp/tests/test_arraymanipulation.py @@ -1,11 +1,11 @@ -import warnings - -import dpctl.tensor as dpt import numpy import pytest from dpctl.tensor._numpy_helper import AxisError from numpy.testing import assert_array_equal, assert_equal, assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp from .helper import get_all_dtypes, get_float_complex_dtypes diff --git a/dpnp/tests/test_fft.py b/dpnp/tests/test_fft.py index cf6da5830f10..3a19a2cf3668 100644 --- a/dpnp/tests/test_fft.py +++ b/dpnp/tests/test_fft.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.utils import ExecutionPlacementError @@ -7,7 +6,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from dpnp.dpnp_utils import map_dtype_to_device @@ -83,7 +82,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.fft(a, n=n) - out = dpt_ext.empty(expected.shape, 
dtype=a_usm.dtype) + out = dpt.empty(expected.shape, dtype=a_usm.dtype) result = dpnp.fft.fft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -642,7 +641,7 @@ def test_usm_ndarray(self, n): a_usm = dpt.asarray(a) expected = numpy.fft.irfft(a, n=n) - out = dpt_ext.empty(expected.shape, dtype=a_usm.real.dtype) + out = dpt.empty(expected.shape, dtype=a_usm.real.dtype) result = dpnp.fft.irfft(a_usm, n=n, out=out) assert out is result.get_array() assert_dtype_allclose(result, expected) @@ -730,7 +729,7 @@ def test_usm_ndarray(self, n): expected = numpy.fft.rfft(a, n=n) out_dt = map_dtype_to_device(dpnp.complex128, a_usm.sycl_device) - out = dpt_ext.empty(expected.shape, dtype=out_dt) + out = dpt.empty(expected.shape, dtype=out_dt) result = dpnp.fft.rfft(a_usm, n=n, out=out) assert out is result.get_array() diff --git a/dpnp/tests/test_indexing.py b/dpnp/tests/test_indexing.py index 79c41a2f45f7..d8822d77080b 100644 --- a/dpnp/tests/test_indexing.py +++ b/dpnp/tests/test_indexing.py @@ -1,7 +1,6 @@ import functools import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.utils import ExecutionPlacementError @@ -13,10 +12,10 @@ assert_raises_regex, ) -import dpnp - # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt +import dpnp from dpctl_ext.tensor._numpy_helper import AxisError from dpctl_ext.tensor._type_utils import _to_device_supported_dtype from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py index 31d99d71ce49..b9673d21a161 100644 --- a/dpnp/tests/test_linalg.py +++ b/dpnp/tests/test_linalg.py @@ -1,7 +1,6 @@ import warnings import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.tensor._numpy_helper import AxisError @@ -14,6 +13,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import 
dpctl_ext.tensor as dpt import dpnp from .helper import ( diff --git a/dpnp/tests/test_manipulation.py b/dpnp/tests/test_manipulation.py index 8ddba08dbb92..8095a0daa858 100644 --- a/dpnp/tests/test_manipulation.py +++ b/dpnp/tests/test_manipulation.py @@ -1,6 +1,5 @@ import itertools -import dpctl.tensor as dpt import numpy import pytest from dpctl.tensor._numpy_helper import AxisError @@ -10,6 +9,9 @@ assert_raises, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp from .helper import ( diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py index 7b6d39875d67..841494bde1ed 100644 --- a/dpnp/tests/test_mathematical.py +++ b/dpnp/tests/test_mathematical.py @@ -669,15 +669,15 @@ def test_to_begin_to_end(self, to_begin, to_end): "to_begin, to_end", [ (-20, 20), - (dpt.asarray([-20, -30]), dpt.asarray([20, 15])), - (dpt.asarray([[-20, -30]]), dpt.asarray([[20, 15]])), + (dpt_ext.asarray([-20, -30]), dpt_ext.asarray([20, 15])), + (dpt_ext.asarray([[-20, -30]]), dpt_ext.asarray([[20, 15]])), ([1, 2], [3, 4]), ((1, 2), (3, 4)), ], ) def test_usm_ndarray(self, to_begin, to_end): a = numpy.array([[1, 2, 0]]) - dpt_a = dpt.asarray(a) + dpt_a = dpt_ext.asarray(a) if isinstance(to_begin, dpt.usm_ndarray): np_to_begin = dpt.asnumpy(to_begin) @@ -2631,7 +2631,7 @@ def test_out_float16(self, func): def test_out_usm_ndarray(self, func, dt): a = generate_random_numpy_array(10, dt) out = numpy.empty(a.shape, dtype=dt) - ia, usm_out = dpnp.array(a), dpt.asarray(out) + ia, usm_out = dpnp.array(a), dpt_ext.asarray(out) expected = getattr(numpy, func)(a, out=out) result = getattr(dpnp, func)(ia, out=usm_out) diff --git a/dpnp/tests/test_nanfunctions.py b/dpnp/tests/test_nanfunctions.py index 2d3dbd864a2e..2cb70df5954a 100644 --- a/dpnp/tests/test_nanfunctions.py +++ b/dpnp/tests/test_nanfunctions.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest 
from dpctl.utils import ExecutionPlacementError @@ -14,7 +13,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from .helper import ( @@ -58,7 +57,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -239,7 +238,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) @@ -548,7 +547,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py index 4e4e42bbc85e..a27f0fe6aa14 100644 --- a/dpnp/tests/test_ndarray.py +++ b/dpnp/tests/test_ndarray.py @@ -9,6 +9,9 @@ assert_raises_regex, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp from .helper import ( @@ -407,7 +410,7 @@ def test_error(self): class TestUsmNdarrayProtocol: def test_basic(self): a = dpnp.arange(256, dtype=dpnp.int64) - usm_a = dpt.asarray(a) + usm_a = dpt_ext.asarray(a) assert a.sycl_queue == usm_a.sycl_queue assert a.usm_type == usm_a.usm_type diff --git a/dpnp/tests/test_statistics.py 
b/dpnp/tests/test_statistics.py index 6944d6bd5bdd..fe8848b6c858 100644 --- a/dpnp/tests/test_statistics.py +++ b/dpnp/tests/test_statistics.py @@ -1,5 +1,4 @@ import dpctl -import dpctl.tensor as dpt import numpy import pytest from numpy.testing import ( @@ -11,7 +10,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from .helper import ( @@ -815,7 +814,7 @@ def test_out(self, func): assert_allclose(result, expected) # out is usm_ndarray - dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype) + dpt_out = dpt.empty(expected.shape, dtype=expected.dtype) result = getattr(dpnp, func)(ia, axis=0, out=dpt_out) assert dpt_out is result.get_array() assert_allclose(result, expected) diff --git a/dpnp/tests/test_utils.py b/dpnp/tests/test_utils.py index eef9132e5b55..ddbd267c2108 100644 --- a/dpnp/tests/test_utils.py +++ b/dpnp/tests/test_utils.py @@ -1,7 +1,9 @@ -import dpctl.tensor as dpt import numpy import pytest +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp From f9f547ad05e4549090b7e8f5b043a6d24ca482ef Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:04:46 -0800 Subject: [PATCH 084/145] Move ti.full_like() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 117 +++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 1c4009e15fd8..bbf1f2107347 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -43,6 +43,7 @@ empty_like, eye, full, + full_like, linspace, tril, triu, @@ -80,6 +81,7 @@ "finfo", "from_numpy", "full", + "full_like", "iinfo", "isdtype", "linspace", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index fb30b89c684b..664c238cbdbf 100644 --- 
a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1223,6 +1223,123 @@ def full( return res +def full_like( + x, + /, + fill_value, + *, + dtype=None, + order="K", + device=None, + usm_type=None, + sycl_queue=None, +): + """full_like(x, fill_value, dtype=None, order="K", \ + device=None, usm_type=None, sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with `fill_value` + and having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): Input array from which to derive the output array + shape. + fill_value: the value to fill output array with + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or a + NumPy scalar type. If ``dtype`` is ``None``, the output array data + type is inferred from ``x``. Default: ``None`` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"K"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with given value. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + sh = x.shape + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)): + X = dpt_ext.asarray( + fill_value, + dtype=dtype, + order=order, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + X = dpt.broadcast_to(X, sh) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # order copy after tasks populating X + dep_evs = _manager.submitted_events + hev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=X, dst=res, sycl_queue=sycl_queue, depends=dep_evs + ) + _manager.add_event_pair(hev, copy_ev) + return res + else: + _validate_fill_value(fill_value) + + dtype = _get_dtype(dtype, sycl_queue, ref_type=type(fill_value)) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + fill_value = _cast_fill_val(fill_value, dtype) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(fill_value, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + return full( + sh, + fill_value, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + + def linspace( start, stop, 
From 0d84d7b8bd9ae7b395a9b0c6ef2954cb495e3835 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:08:07 -0800 Subject: [PATCH 085/145] Move ti.meshgrid() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 75 ++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index bbf1f2107347..782170bcd07f 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -45,6 +45,7 @@ full, full_like, linspace, + meshgrid, tril, triu, ) @@ -85,6 +86,7 @@ "iinfo", "isdtype", "linspace", + "meshgrid", "nonzero", "place", "put", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 664c238cbdbf..57f4769b6616 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1443,6 +1443,81 @@ def linspace( return res if int_dt is None else dpt.astype(res, int_dt) +def meshgrid(*arrays, indexing="xy"): + """ + Creates list of :class:`dpctl.tensor.usm_ndarray` coordinate matrices + from vectors. + + Args: + arrays (usm_ndarray): + an arbitrary number of one-dimensional arrays + representing grid coordinates. Each array should have the same + numeric data type. + indexing (``"xy"``, or ``"ij"``): + Cartesian (``"xy"``) or matrix (``"ij"``) indexing of output. + If provided zero or one one-dimensional vector(s) (i.e., the + zero- and one-dimensional cases, respectively), the ``indexing`` + keyword has no effect and should be ignored. Default: ``"xy"`` + + Returns: + List[array]: + list of ``N`` arrays, where ``N`` is the number of + provided one-dimensional input arrays. Each returned array must + have rank ``N``. + For a set of ``n`` vectors with lengths ``N0``, ``N1``, ``N2``, ... + The cartesian indexing results in arrays of shape + ``(N1, N0, N2, ...)``, while the + matrix indexing results in arrays of shape + ``(N0, N1, N2, ...)``. + Default: ``"xy"``. 
+ + Raises: + ValueError: If vectors are not of the same data type, or are not + one-dimensional. + + """ + ref_dt = None + ref_unset = True + for array in arrays: + if not isinstance(array, dpt.usm_ndarray): + raise TypeError( + f"Expected instance of dpt.usm_ndarray, got {type(array)}." + ) + if array.ndim != 1: + raise ValueError("All arrays must be one-dimensional.") + if ref_unset: + ref_unset = False + ref_dt = array.dtype + else: + if not ref_dt == array.dtype: + raise ValueError( + "All arrays must be of the same numeric data type." + ) + if indexing not in ["xy", "ij"]: + raise ValueError( + "Unrecognized indexing keyword value, expecting 'xy' or 'ij.'" + ) + n = len(arrays) + if n == 0: + return [] + + sh = (-1,) + (1,) * (n - 1) + + res = [] + if n > 1 and indexing == "xy": + res.append(dpt_ext.reshape(arrays[0], (1, -1) + sh[2:], copy=True)) + res.append(dpt_ext.reshape(arrays[1], sh, copy=True)) + arrays, sh = arrays[2:], sh[-2:] + sh[:-2] + + for array in arrays: + res.append(dpt_ext.reshape(array, sh, copy=True)) + sh = sh[-1:] + sh[:-1] + + output = dpt.broadcast_arrays(*res) + + return output + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) From 8c15ddb7b5c4bfe555b79432cb3281f9c3654890 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:23:19 -0800 Subject: [PATCH 086/145] Move ti.ones() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 68 ++++++++++++++++++++++++++++++++ dpnp/dpnp_container.py | 2 +- dpnp/dpnp_iface_arraycreation.py | 2 +- dpnp/tests/test_memory.py | 5 ++- dpnp/tests/test_sycl_queue.py | 4 +- dpnp/tests/test_usm_type.py | 4 +- 7 files changed, 82 insertions(+), 5 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 782170bcd07f..faa2cf3a07d8 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -46,6 +46,7 @@ full_like, 
linspace, meshgrid, + ones, tril, triu, ) @@ -88,6 +89,7 @@ "linspace", "meshgrid", "nonzero", + "ones", "place", "put", "put_along_axis", diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 57f4769b6616..ede3a9e207a1 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1518,6 +1518,74 @@ def meshgrid(*arrays, indexing="xy"): return output +def ones( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ones(shape, dtype=None, order="C", \ + device=None, usm_type="device", sycl_queue=None) + + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with ones. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. 
Default: ``None`` + + Returns: + usm_ndarray: + Created array initialized with ones. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index e2c5070d807e..cb2e775a9653 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -260,7 +260,7 @@ def ones( order = "C" """Creates `dpnp_array` of ones with the given shape, dtype, and order.""" - array_obj = dpt.ones( + array_obj = dpt_ext.ones( shape, dtype=dtype, order=order, diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 64c34ed80fce..078ac7a10f4b 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3696,7 +3696,7 @@ def tri( if usm_type is None: usm_type = "device" - m = dpt.ones( + m = dpt_ext.ones( (N, M), dtype=_dtype, device=device, diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py index 1bc0da8c1535..94aeda33f505 100644 --- a/dpnp/tests/test_memory.py +++ b/dpnp/tests/test_memory.py @@ -2,6 +2,9 @@ import numpy import pytest +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp import dpnp.memory as dpm @@ -21,7 
+24,7 @@ def test_wrong_input_type(self, x): dpm.create_data(x) def test_wrong_usm_data(self): - a = dpt.ones(10) + a = dpt_ext.ones(10) d = IntUsmData(a.shape, buffer=a) with pytest.raises(TypeError): diff --git a/dpnp/tests/test_sycl_queue.py b/dpnp/tests/test_sycl_queue.py index d1853579036a..a9c076a7c476 100644 --- a/dpnp/tests/test_sycl_queue.py +++ b/dpnp/tests/test_sycl_queue.py @@ -2,12 +2,14 @@ import tempfile import dpctl -import dpctl.tensor as dpt import numpy import pytest from dpctl.utils import ExecutionPlacementError from numpy.testing import assert_array_equal, assert_raises +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp import dpnp.linalg from dpnp.dpnp_array import dpnp_array diff --git a/dpnp/tests/test_usm_type.py b/dpnp/tests/test_usm_type.py index 4fc0f2b958fa..8f8efd1cdd10 100644 --- a/dpnp/tests/test_usm_type.py +++ b/dpnp/tests/test_usm_type.py @@ -2,11 +2,13 @@ import tempfile from math import prod -import dpctl.tensor as dpt import dpctl.utils as du import numpy import pytest +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp from dpnp.dpnp_utils import get_usm_allocations From 23d2229b2f643cd0583d26e640b93bb4f779d86b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:34:06 -0800 Subject: [PATCH 087/145] Move ti.ones_like() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 81 ++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index faa2cf3a07d8..b87d5f435782 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -47,6 +47,7 @@ linspace, meshgrid, ones, + ones_like, tril, triu, ) @@ -90,6 +91,7 @@ "meshgrid", "nonzero", "ones", + "ones_like", "place", "put", "put_along_axis", diff --git 
a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index ede3a9e207a1..ced78b9d2dcc 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1586,6 +1586,87 @@ def ones( return res +def ones_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` filled with ones and + having the same `shape` as the input array `x`. + + Args: + x (usm_ndarray): + Input array from which to derive the output array shape + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: `None` + order ("C", "F", "A", or "K"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with ones. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(1, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + sh = x.shape + return ones( + sh, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) + + def tril(x, /, *, k=0): """ Returns the lower triangular part of a matrix (or a stack of matrices) From 97dc7e1e67da1aa83b83c7b99c65bfa8865bca11 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:39:19 -0800 Subject: [PATCH 088/145] Move ti.zeros() to dpctl_ext/tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 75 ++++++++++++++++++++++++++++++++++-- dpnp/dpnp_container.py | 23 ++++++----- 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index b87d5f435782..7da3d622bb02 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -50,6 +50,7 @@ ones_like, tril, triu, + zeros, ) from dpctl_ext.tensor._indexing_functions 
import ( extract, @@ -105,4 +106,5 @@ "tril", "triu", "where", + "zeros", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index ced78b9d2dcc..da1c3dc19017 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1091,7 +1091,7 @@ def eye( n_cols = n_rows if n_cols is None else operator.index(n_cols) k = operator.index(k) if k >= n_cols or -k >= n_rows: - return dpt.zeros( + return dpt_ext.zeros( (n_rows, n_cols), dtype=dtype, order=order, @@ -1720,7 +1720,7 @@ def tril(x, /, *, k=0): ) _manager.add_event_pair(hev, cpy_ev) elif k < -shape[nd - 2]: - res = dpt.zeros( + res = dpt_ext.zeros( x.shape, dtype=x.dtype, order=order, @@ -1784,7 +1784,7 @@ def triu(x, /, *, k=0): q = x.sycl_queue if k > shape[nd - 1]: - res = dpt.zeros( + res = dpt_ext.zeros( x.shape, dtype=x.dtype, order=order, @@ -1821,3 +1821,72 @@ def triu(x, /, *, k=0): _manager.add_event_pair(hev, triu_ev) return res + + +def zeros( + shape, + *, + dtype=None, + order="C", + device=None, + usm_type="device", + sycl_queue=None, +): + """ + Returns a new :class:`dpctl.tensor.usm_ndarray` having a specified + shape and filled with zeros. + + Args: + shape (Tuple[int], int): + Dimensions of the array to be created. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, + or a NumPy scalar type. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. + Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. 
+ Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + Constructed array initialized with zeros. + """ + if not isinstance(order, str) or len(order) == 0 or order[0] not in "CcFf": + raise ValueError( + "Unrecognized order keyword value, expecting 'F' or 'C'." + ) + order = order[0].upper() + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = _get_dtype(dtype, sycl_queue) + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = dpt.usm_ndarray( + shape, + dtype=dtype, + buffer=usm_type, + order=order, + buffer_ctor_kwargs={"queue": sycl_queue}, + ) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, zeros_ev = ti._zeros_usm_ndarray(res, sycl_queue) + _manager.add_event_pair(hev, zeros_ev) + + return res diff --git a/dpnp/dpnp_container.py b/dpnp/dpnp_container.py index cb2e775a9653..9fe955746593 100644 --- a/dpnp/dpnp_container.py +++ b/dpnp/dpnp_container.py @@ -35,12 +35,11 @@ """ -import dpctl.tensor as dpt import dpctl.utils as dpu # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from dpnp.dpnp_array import dpnp_array @@ -75,7 +74,7 @@ def arange( sycl_queue=sycl_queue, device=device ) - array_obj = dpt_ext.arange( + array_obj = dpt.arange( start, stop=stop, step=step, @@ -103,7 +102,7 @@ def asarray( """Converts incoming 'x1' 
object to 'dpnp_array'.""" if isinstance(x1, (list, tuple, range)): - array_obj = dpt_ext.asarray( + array_obj = dpt.asarray( x1, dtype=dtype, copy=copy, @@ -122,7 +121,7 @@ def asarray( x1_obj, device=device, sycl_queue=sycl_queue ) - array_obj = dpt_ext.asarray( + array_obj = dpt.asarray( x1_obj, dtype=dtype, copy=copy, @@ -143,7 +142,7 @@ def copy(x1, /, *, order="K"): if order is None: order = "K" - array_obj = dpt_ext.copy(dpnp.get_usm_ndarray(x1), order=order) + array_obj = dpt.copy(dpnp.get_usm_ndarray(x1), order=order) return dpnp_array._create_from_usm_ndarray(array_obj) @@ -165,7 +164,7 @@ def empty( order = "C" """Creates `dpnp_array` from uninitialized USM allocation.""" - array_obj = dpt_ext.empty( + array_obj = dpt.empty( shape, dtype=dtype, order=order, @@ -196,7 +195,7 @@ def eye( order = "C" """Creates `dpnp_array` with ones on the `k`th diagonal.""" - array_obj = dpt_ext.eye( + array_obj = dpt.eye( N, M, k=k, @@ -231,7 +230,7 @@ def full( fill_value = fill_value.get_array() """Creates `dpnp_array` having a specified shape, filled with fill_value.""" - array_obj = dpt_ext.full( + array_obj = dpt.full( shape, fill_value, dtype=dtype, @@ -260,7 +259,7 @@ def ones( order = "C" """Creates `dpnp_array` of ones with the given shape, dtype, and order.""" - array_obj = dpt_ext.ones( + array_obj = dpt.ones( shape, dtype=dtype, order=order, @@ -272,13 +271,13 @@ def ones( def tril(x1, /, *, k=0): """Creates `dpnp_array` as lower triangular part of an input array.""" - array_obj = dpt_ext.tril(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt.tril(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) def triu(x1, /, *, k=0): """Creates `dpnp_array` as upper triangular part of an input array.""" - array_obj = dpt_ext.triu(dpnp.get_usm_ndarray(x1), k=k) + array_obj = dpt.triu(dpnp.get_usm_ndarray(x1), k=k) return dpnp_array._create_from_usm_ndarray(array_obj) From a6c397ea61c8013dad9a0c4c6a14ffc155cdc2d4 Mon Sep 17 00:00:00 2001 
From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 04:41:49 -0800 Subject: [PATCH 089/145] Move ti.zeros_like() to dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_ctors.py | 84 ++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 7da3d622bb02..a6c169d3fad9 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -51,6 +51,7 @@ tril, triu, zeros, + zeros_like, ) from dpctl_ext.tensor._indexing_functions import ( extract, @@ -107,4 +108,5 @@ "triu", "where", "zeros", + "zeros_like", ] diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index da1c3dc19017..2648290f36b0 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1890,3 +1890,87 @@ def zeros( _manager.add_event_pair(hev, zeros_ev) return res + + +def zeros_like( + x, /, *, dtype=None, order="K", device=None, usm_type=None, sycl_queue=None +): + """ + Creates :class:`dpctl.tensor.usm_ndarray` from USM allocation + initialized with zeros. + + Args: + x (usm_ndarray): + Input array from which to derive the shape of the + output array. + dtype (optional): + data type of the array. Can be typestring, + a :class:`numpy.dtype` object, :mod:`numpy` char string, or a + NumPy scalar type. If `None`, output array has the same data + type as the input array. Default: ``None`` + order ("C", or "F"): + memory layout for the array. Default: ``"C"`` + device (optional): + array API concept of device where the output array + is created. ``device`` can be ``None``, a oneAPI filter selector + string, an instance of :class:`dpctl.SyclDevice` corresponding to + a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device` object + returned by :attr:`dpctl.tensor.usm_ndarray.device`. 
+ Default: ``None`` + usm_type (``"device"``, ``"shared"``, ``"host"``, optional): + The type of SYCL USM allocation for the output array. + Default: ``"device"`` + sycl_queue (:class:`dpctl.SyclQueue`, optional): + The SYCL queue to use + for output array allocation and copying. ``sycl_queue`` and + ``device`` are complementary arguments, i.e. use one or another. + If both are specified, a :exc:`TypeError` is raised unless both + imply the same underlying SYCL queue to be used. If both are + ``None``, a cached queue targeting default-selected device is + used for allocation and population. Default: ``None`` + + Returns: + usm_ndarray: + New array initialized with zeros. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected instance of dpt.usm_ndarray, got {type(x)}.") + if ( + not isinstance(order, str) + or len(order) == 0 + or order[0] not in "CcFfAaKk" + ): + raise ValueError( + "Unrecognized order keyword value, expecting 'C', 'F', 'A', or 'K'." + ) + order = order[0].upper() + if dtype is None: + dtype = x.dtype + if usm_type is None: + usm_type = x.usm_type + dpctl.utils.validate_usm_type(usm_type, allow_none=False) + if device is None and sycl_queue is None: + device = x.device + sycl_queue = normalize_queue_device(sycl_queue=sycl_queue, device=device) + dtype = dpt.dtype(dtype) + order = _normalize_order(order, x) + if order == "K": + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) + _manager = dpctl.utils.SequentialOrderManager[sycl_queue] + # populating new allocation, no dependent events + hev, full_ev = ti._full_usm_ndarray(0, res, sycl_queue) + _manager.add_event_pair(hev, full_ev) + return res + else: + _ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device) + sh = x.shape + return zeros( + sh, + dtype=dtype, + order=order, + device=device, + usm_type=usm_type, + sycl_queue=sycl_queue, + ) From b8c5390868d14280c8c621c826fe110fa59bfc6b Mon 
Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 05:50:22 -0800 Subject: [PATCH 090/145] Move ti.broadcast_to() to dpctl_ext/tensor and reuse it in dpctl_ext/tensor --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_clip.py | 32 ++++++------ dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_ctors.py | 4 +- dpctl_ext/tensor/_indexing_functions.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 54 +++++++++++++++++++++ dpctl_ext/tensor/_search_functions.py | 6 +-- 7 files changed, 79 insertions(+), 23 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a6c169d3fad9..240323cf45fa 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -63,6 +63,7 @@ take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( + broadcast_to, repeat, roll, ) @@ -76,6 +77,7 @@ "asarray", "asnumpy", "astype", + "broadcast_to", "can_cast", "copy", "clip", diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py index bab2271c6453..9fc42abc0d8b 100644 --- a/dpctl_ext/tensor/_clip.py +++ b/dpctl_ext/tensor/_clip.py @@ -205,9 +205,9 @@ def _clip_none(x, val, out, order, _binary_fn): order=order, ) if x_shape != res_shape: - x = dpt.broadcast_to(x, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) if val_ary.shape != res_shape: - val_ary = dpt.broadcast_to(val_ary, res_shape) + val_ary = dpt_ext.broadcast_to(val_ary, res_shape) _manager = SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events ht_binary_ev, binary_ev = _binary_fn( @@ -251,8 +251,8 @@ def _clip_none(x, val, out, order, _binary_fn): ) if x_shape != res_shape: - x = dpt.broadcast_to(x, res_shape) - buf = dpt.broadcast_to(buf, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) + buf = dpt_ext.broadcast_to(buf, res_shape) ht_binary_ev, binary_ev = _binary_fn( src1=x, src2=buf, @@ -580,11 +580,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) if x_shape != 
res_shape: - x = dpt.broadcast_to(x, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) if a_min.shape != res_shape: - a_min = dpt.broadcast_to(a_min, res_shape) + a_min = dpt_ext.broadcast_to(a_min, res_shape) if a_max.shape != res_shape: - a_max = dpt.broadcast_to(a_max, res_shape) + a_max = dpt_ext.broadcast_to(a_max, res_shape) _manager = SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events ht_binary_ev, binary_ev = ti._clip( @@ -639,10 +639,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) - x = dpt.broadcast_to(x, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) if a_min.shape != res_shape: - a_min = dpt.broadcast_to(a_min, res_shape) - buf2 = dpt.broadcast_to(buf2, res_shape) + a_min = dpt_ext.broadcast_to(a_min, res_shape) + buf2 = dpt_ext.broadcast_to(buf2, res_shape) ht_binary_ev, binary_ev = ti._clip( src=x, min=a_min, @@ -695,10 +695,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) - x = dpt.broadcast_to(x, res_shape) - buf1 = dpt.broadcast_to(buf1, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) + buf1 = dpt_ext.broadcast_to(buf1, res_shape) if a_max.shape != res_shape: - a_max = dpt.broadcast_to(a_max, res_shape) + a_max = dpt_ext.broadcast_to(a_max, res_shape) ht_binary_ev, binary_ev = ti._clip( src=x, min=buf1, @@ -766,9 +766,9 @@ def clip(x, /, min=None, max=None, out=None, order="K"): order=order, ) - x = dpt.broadcast_to(x, res_shape) - buf1 = dpt.broadcast_to(buf1, res_shape) - buf2 = dpt.broadcast_to(buf2, res_shape) + x = dpt_ext.broadcast_to(x, res_shape) + buf1 = dpt_ext.broadcast_to(buf1, res_shape) + buf2 = dpt_ext.broadcast_to(buf2, res_shape) ht_, clip_ev = ti._clip( src=x, min=buf1, diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index bdcd5d123072..f2544e06718a 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -368,7 +368,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0): rhs = vals else: 
rhs = dpt_ext.astype(vals, ary.dtype) - rhs = dpt.broadcast_to(rhs, expected_vals_shape) + rhs = dpt_ext.broadcast_to(rhs, expected_vals_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] dep_ev = _manager.submitted_events hev, put_ev = ti._put( diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 2648290f36b0..03f97c7cd8f0 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1200,7 +1200,7 @@ def full( usm_type=usm_type, sycl_queue=sycl_queue, ) - return dpt_ext.copy(dpt.broadcast_to(X, shape), order=order) + return dpt_ext.copy(dpt_ext.broadcast_to(X, shape), order=order) else: _validate_fill_value(fill_value) @@ -1307,7 +1307,7 @@ def full_like( usm_type=usm_type, sycl_queue=sycl_queue, ) - X = dpt.broadcast_to(X, sh) + X = dpt_ext.broadcast_to(X, sh) res = _empty_like_orderK(x, dtype, usm_type, sycl_queue) _manager = dpctl.utils.SequentialOrderManager[sycl_queue] # order copy after tasks populating X diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py index 369bd6a49022..91ffc759a920 100644 --- a/dpctl_ext/tensor/_indexing_functions.py +++ b/dpctl_ext/tensor/_indexing_functions.py @@ -341,7 +341,7 @@ def put_vec_duplicates(vec, ind, vals): rhs = vals else: rhs = dpt_ext.astype(vals, x.dtype) - rhs = dpt.broadcast_to(rhs, val_shape) + rhs = dpt_ext.broadcast_to(rhs, val_shape) _manager = dpctl.utils.SequentialOrderManager[exec_q] deps_ev = _manager.submitted_events diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 37cba6828818..9eeca08df8e3 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -86,6 +86,60 @@ def _broadcast_shape_impl(shapes): return tuple(common_shape) +def _broadcast_strides(X_shape, X_strides, res_ndim): + """ + Broadcasts strides to match the given dimensions; + returns tuple type strides. 
+ """ + out_strides = [0] * res_ndim + X_shape_len = len(X_shape) + str_dim = -X_shape_len + for i in range(X_shape_len): + shape_value = X_shape[i] + if not shape_value == 1: + out_strides[str_dim] = X_strides[i] + str_dim += 1 + + return tuple(out_strides) + + +def broadcast_to(X, /, shape): + """broadcast_to(x, shape) + + Broadcast an array to a new `shape`; returns the broadcasted + :class:`dpctl.tensor.usm_ndarray` as a view. + + Args: + x (usm_ndarray): input array + shape (Tuple[int,...]): array shape. The `shape` must be + compatible with `x` according to broadcasting rules. + + Returns: + usm_ndarray: + An array with the specified `shape`. + The output array is a view of the input array, and + hence has the same data type, USM allocation type and + device attributes. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + # Use numpy.broadcast_to to check the validity of the input + # parameter 'shape'. Raise ValueError if 'X' is not compatible + # with 'shape' according to NumPy's broadcasting rules. 
+ new_array = np.broadcast_to( + np.broadcast_to(np.empty(tuple(), dtype="u1"), X.shape), shape + ) + new_sts = _broadcast_strides(X.shape, X.strides, new_array.ndim) + return dpt.usm_ndarray( + shape=new_array.shape, + dtype=X.dtype, + buffer=X, + strides=new_sts, + offset=X._element_offset, + ) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py index cc76bfe8b543..26100b0479f7 100644 --- a/dpctl_ext/tensor/_search_functions.py +++ b/dpctl_ext/tensor/_search_functions.py @@ -389,11 +389,11 @@ def where(condition, x1, x2, /, *, order="K", out=None): ) if condition_shape != res_shape: - condition = dpt.broadcast_to(condition, res_shape) + condition = dpt_ext.broadcast_to(condition, res_shape) if x1_shape != res_shape: - x1 = dpt.broadcast_to(x1, res_shape) + x1 = dpt_ext.broadcast_to(x1, res_shape) if x2_shape != res_shape: - x2 = dpt.broadcast_to(x2, res_shape) + x2 = dpt_ext.broadcast_to(x2, res_shape) dep_evs = _manager.submitted_events hev, where_ev = ti._where( From a7cbfdc150e0854e0d7b4b659ea893e39da9a70b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 05:52:18 -0800 Subject: [PATCH 091/145] Reuse dpctl_ext.tensor.broadcast_to() in dpnp --- dpnp/dpnp_algo/dpnp_elementwise_common.py | 4 ++-- dpnp/dpnp_algo/dpnp_fill.py | 5 ++--- dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py index 85dc9473206c..b3e0c74c228e 100644 --- a/dpnp/dpnp_algo/dpnp_elementwise_common.py +++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py @@ -1156,9 +1156,9 @@ def __call__( # Broadcast shapes of input arrays if x1.shape != res_shape: - x1 = dpt.broadcast_to(x1, res_shape) + x1 = dpt_ext.broadcast_to(x1, res_shape) if x2.shape != res_shape: - x2 = dpt.broadcast_to(x2, res_shape) + x2 = dpt_ext.broadcast_to(x2, 
res_shape) # Call the binary function with input and output arrays ht_binary_ev, binary_ev = self.get_implementation_function()( diff --git a/dpnp/dpnp_algo/dpnp_fill.py b/dpnp/dpnp_algo/dpnp_fill.py index ddba9f634cb1..7e0a70f25ff9 100644 --- a/dpnp/dpnp_algo/dpnp_fill.py +++ b/dpnp/dpnp_algo/dpnp_fill.py @@ -28,13 +28,12 @@ from numbers import Number -import dpctl.tensor as dpt import dpctl.utils as dpu from dpctl.tensor._ctors import _cast_fill_val # TODO: revert to `from dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp from dpctl_ext.tensor._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, @@ -56,7 +55,7 @@ def dpnp_fill(arr, val): raise dpu.ExecutionPlacementError( "Input arrays have incompatible queues." ) - a_val = dpt_ext.astype(val, arr.dtype) + a_val = dpt.astype(val, arr.dtype) a_val = dpt.broadcast_to(a_val, arr.shape) _manager = dpu.SequentialOrderManager[exec_q] dep_evs = _manager.submitted_events diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index e4a12346bc6c..5727d082e163 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1178,7 +1178,7 @@ def broadcast_to(array, /, shape, subok=False): raise NotImplementedError(f"subok={subok} is currently not supported") usm_array = dpnp.get_usm_ndarray(array) - new_array = dpt.broadcast_to(usm_array, shape) + new_array = dpt_ext.broadcast_to(usm_array, shape) return dpnp_array._create_from_usm_ndarray(new_array) From bd265dabf9c6275255babae2f624ff94b2fe08d9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:02:49 -0800 Subject: [PATCH 092/145] Move ti.broadcast_arrays() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_copy_utils.py | 2 +- dpctl_ext/tensor/_ctors.py | 2 +- dpctl_ext/tensor/_manipulation_functions.py | 40 +++++++++++++++++++++ dpnp/dpnp_iface_arraycreation.py | 2 
+- dpnp/dpnp_iface_indexing.py | 2 +- 6 files changed, 46 insertions(+), 4 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 240323cf45fa..756ab4c3a422 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -63,6 +63,7 @@ take_along_axis, ) from dpctl_ext.tensor._manipulation_functions import ( + broadcast_arrays, broadcast_to, repeat, roll, @@ -77,6 +78,7 @@ "asarray", "asnumpy", "astype", + "broadcast_arrays", "broadcast_to", "can_cast", "copy", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index f2544e06718a..78dbb5be17e8 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -306,7 +306,7 @@ def _prepare_indices_arrays(inds, q, usm_type): ) # broadcast - inds = dpt.broadcast_arrays(*inds) + inds = dpt_ext.broadcast_arrays(*inds) return inds diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py index 03f97c7cd8f0..532802c0c519 100644 --- a/dpctl_ext/tensor/_ctors.py +++ b/dpctl_ext/tensor/_ctors.py @@ -1513,7 +1513,7 @@ def meshgrid(*arrays, indexing="xy"): res.append(dpt_ext.reshape(array, sh, copy=True)) sh = sh[-1:] + sh[:-1] - output = dpt.broadcast_arrays(*res) + output = dpt_ext.broadcast_arrays(*res) return output diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 9eeca08df8e3..7a2b5b6de910 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -47,6 +47,15 @@ ) +def _broadcast_shapes(*args): + """ + Broadcast the input shapes into a single shape; + returns tuple broadcasted shape. 
+ """ + array_shapes = [array.shape for array in args] + return _broadcast_shape_impl(array_shapes) + + def _broadcast_shape_impl(shapes): if len(set(shapes)) == 1: return shapes[0] @@ -103,6 +112,37 @@ def _broadcast_strides(X_shape, X_strides, res_ndim): return tuple(out_strides) +def broadcast_arrays(*args): + """broadcast_arrays(*arrays) + + Broadcasts one or more :class:`dpctl.tensor.usm_ndarrays` against + one another. + + Args: + arrays (usm_ndarray): an arbitrary number of arrays to be + broadcasted. + + Returns: + List[usm_ndarray]: + A list of broadcasted arrays. Each array + must have the same shape. Each array must have the same `dtype`, + `device` and `usm_type` attributes as its corresponding input + array. + """ + if len(args) == 0: + raise ValueError("`broadcast_arrays` requires at least one argument") + for X in args: + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + shape = _broadcast_shapes(*args) + + if all(X.shape == shape for X in args): + return args + + return [broadcast_to(X, shape) for X in args] + + def broadcast_to(X, /, shape): """broadcast_to(x, shape) diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py index 078ac7a10f4b..d09cc17bde79 100644 --- a/dpnp/dpnp_iface_arraycreation.py +++ b/dpnp/dpnp_iface_arraycreation.py @@ -3131,7 +3131,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"): output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:]) if not sparse: - output = dpt.broadcast_arrays(*output) + output = dpt_ext.broadcast_arrays(*output) if copy: output = [dpt_ext.copy(x) for x in output] diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py index 58c8f495c440..a52196e9e4db 100644 --- a/dpnp/dpnp_iface_indexing.py +++ b/dpnp/dpnp_iface_indexing.py @@ -260,7 +260,7 @@ def choose(a, choices, out=None, mode="wrap"): choices, ) ) - arrs_broadcast = dpt.broadcast_arrays(inds, *choices) + arrs_broadcast = 
dpt_ext.broadcast_arrays(inds, *choices) inds = arrs_broadcast[0] choices = tuple(arrs_broadcast[1:]) From 080c7d801c29436b951dec9b7448c714107b30fc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:10:11 -0800 Subject: [PATCH 093/145] Move ti.concat() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 179 ++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 182 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 756ab4c3a422..84ae89b9a269 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -65,6 +65,7 @@ from dpctl_ext.tensor._manipulation_functions import ( broadcast_arrays, broadcast_to, + concat, repeat, roll, ) @@ -81,6 +82,7 @@ "broadcast_arrays", "broadcast_to", "can_cast", + "concat", "copy", "clip", "empty", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 7a2b5b6de910..2bb92c2616d6 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -40,6 +40,7 @@ import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index, normalize_axis_tuple +from ._type_utils import _supported_dtype, _to_device_supported_dtype __doc__ = ( "Implementation module for array manipulation " @@ -47,6 +48,46 @@ ) +def _arrays_validation(arrays, check_ndim=True): + n = len(arrays) + if n == 0: + raise TypeError("Missing 1 required positional argument: 'arrays'.") + + if not isinstance(arrays, (list, tuple)): + raise TypeError(f"Expected tuple or list type, got {type(arrays)}.") + + for X in arrays: + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + exec_q = dputils.get_execution_queue([X.sycl_queue for X in arrays]) + if exec_q is None: + raise ValueError("All the input arrays must have 
same sycl queue.") + + res_usm_type = dputils.get_coerced_usm_type([X.usm_type for X in arrays]) + if res_usm_type is None: + raise ValueError("All the input arrays must have usm_type.") + + X0 = arrays[0] + _supported_dtype(Xi.dtype for Xi in arrays) + + res_dtype = X0.dtype + dev = exec_q.sycl_device + for i in range(1, n): + res_dtype = np.promote_types(res_dtype, arrays[i]) + res_dtype = _to_device_supported_dtype(res_dtype, dev) + + if check_ndim: + for i in range(1, n): + if X0.ndim != arrays[i].ndim: + raise ValueError( + "All the input arrays must have same number of dimensions, " + f"but the array at index 0 has {X0.ndim} dimension(s) and " + f"the array at index {i} has {arrays[i].ndim} dimension(s)." + ) + return res_dtype, res_usm_type, exec_q + + def _broadcast_shapes(*args): """ Broadcast the input shapes into a single shape; @@ -112,6 +153,74 @@ def _broadcast_strides(X_shape, X_strides, res_ndim): return tuple(out_strides) +def _check_same_shapes(X0_shape, axis, n, arrays): + for i in range(1, n): + Xi_shape = arrays[i].shape + for j, X0j in enumerate(X0_shape): + if X0j != Xi_shape[j] and j != axis: + raise ValueError( + "All the input array dimensions for the concatenation " + f"axis must match exactly, but along dimension {j}, the " + f"array at index 0 has size {X0j} and the array " + f"at index {i} has size {Xi_shape[j]}." 
+ ) + + +def _concat_axis_None(arrays): + """Implementation of concat(arrays, axis=None).""" + res_dtype, res_usm_type, exec_q = _arrays_validation( + arrays, check_ndim=False + ) + res_shape = 0 + for array in arrays: + res_shape += array.size + res = dpt_ext.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + fill_start = 0 + _manager = dputils.SequentialOrderManager[exec_q] + deps = _manager.submitted_events + for array in arrays: + fill_end = fill_start + array.size + if array.flags.c_contiguous: + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=dpt_ext.reshape(array, -1), + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + else: + src_ = array + # _copy_usm_ndarray_for_reshape requires src and dst to have + # the same data type + if not array.dtype == res_dtype: + src2_ = dpt_ext.empty_like(src_, dtype=res_dtype) + ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=src_, dst=src2_, sycl_queue=exec_q, depends=deps + ) + _manager.add_event_pair(ht_copy_ev, cpy_ev) + hev, reshape_copy_ev = ti._copy_usm_ndarray_for_reshape( + src=src2_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=[cpy_ev], + ) + _manager.add_event_pair(hev, reshape_copy_ev) + else: + hev, cpy_ev = ti._copy_usm_ndarray_for_reshape( + src=src_, + dst=res[fill_start:fill_end], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + def broadcast_arrays(*args): """broadcast_arrays(*arrays) @@ -180,6 +289,76 @@ def broadcast_to(X, /, shape): ) +def concat(arrays, /, *, axis=0): + """concat(arrays, axis) + + Joins a sequence of arrays along an existing axis. + + Args: + arrays (Union[List[usm_ndarray, Tuple[usm_ndarray,...]]]): + input arrays to join. The arrays must have the same shape, + except in the dimension specified by `axis`. 
+ axis (Optional[int]): axis along which the arrays will be joined. + If `axis` is `None`, arrays must be flattened before + concatenation. If `axis` is negative, it is understood as + being counted from the last dimension. Default: `0`. + + Returns: + usm_ndarray: + An output array containing the concatenated + values. The output array data type is determined by Type + Promotion Rules of array API. + + All input arrays must have the same device attribute. The output array + is allocated on that same device, and data movement operations are + scheduled on a queue underlying the device. The USM allocation type + of the output array is determined by USM allocation type promotion + rules. + """ + if axis is None: + return _concat_axis_None(arrays) + + res_dtype, res_usm_type, exec_q = _arrays_validation(arrays) + n = len(arrays) + X0 = arrays[0] + + axis = normalize_axis_index(axis, X0.ndim) + X0_shape = X0.shape + _check_same_shapes(X0_shape, axis, n, arrays) + + res_shape_axis = 0 + for X in arrays: + res_shape_axis = res_shape_axis + X.shape[axis] + + res_shape = tuple( + X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim) + ) + + res = dpt_ext.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dputils.SequentialOrderManager[exec_q] + deps = _manager.submitted_events + fill_start = 0 + for i in range(n): + fill_end = fill_start + arrays[i].shape[axis] + c_shapes_copy = tuple( + np.s_[fill_start:fill_end] if j == axis else np.s_[:] + for j in range(X0.ndim) + ) + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], + dst=res[c_shapes_copy], + sycl_queue=exec_q, + depends=deps, + ) + _manager.add_event_pair(hev, cpy_ev) + fill_start = fill_end + + return res + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 5727d082e163..e90993da7b8a 100644 --- a/dpnp/dpnp_iface_manipulation.py 
+++ b/dpnp/dpnp_iface_manipulation.py @@ -1416,7 +1416,7 @@ def concatenate( ) usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays] - usm_res = dpt.concat(usm_arrays, axis=axis) + usm_res = dpt_ext.concat(usm_arrays, axis=axis) res = dpnp_array._create_from_usm_ndarray(usm_res) if dtype is not None: From ea7ea3c8d0b392a6612f1cc175de2d4562bc75c6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:13:46 -0800 Subject: [PATCH 094/145] Move ti.expand_dims() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 46 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 84ae89b9a269..10030b6c2107 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -66,6 +66,7 @@ broadcast_arrays, broadcast_to, concat, + expand_dims, repeat, roll, ) @@ -88,6 +89,7 @@ "empty", "empty_like", "extract", + "expand_dims", "eye", "finfo", "from_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 2bb92c2616d6..969f5653f68c 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -359,6 +359,52 @@ def concat(arrays, /, *, axis=0): return res +def expand_dims(X, /, *, axis=0): + """expand_dims(x, axis) + + Expands the shape of an array by inserting a new axis (dimension) + of size one at the position specified by axis. + + Args: + x (usm_ndarray): + input array + axis (Union[int, Tuple[int]]): + axis position in the expanded axes (zero-based). If `x` has rank + (i.e, number of dimensions) `N`, a valid `axis` must reside + in the closed-interval `[-N-1, N]`. If provided a negative + `axis`, the `axis` position at which to insert a singleton + dimension is computed as `N + axis + 1`. 
Hence, if + provided `-1`, the resolved axis position is `N` (i.e., + a singleton dimension must be appended to the input array `x`). + If provided `-N-1`, the resolved axis position is `0` (i.e., a + singleton dimension is prepended to the input array `x`). + + Returns: + usm_ndarray: + Returns a view, if possible, and a copy otherwise with the number + of dimensions increased. + The expanded array has the same data type as the input array `x`. + The expanded array is located on the same device as the input + array, and has the same USM allocation type. + + Raises: + IndexError: if `axis` value is invalid. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + if type(axis) not in (tuple, list): + axis = (axis,) + + out_ndim = len(axis) + X.ndim + axis = normalize_axis_tuple(axis, out_ndim) + + shape_it = iter(X.shape) + shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim)) + + return dpt_ext.reshape(X, shape) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index e90993da7b8a..95897843821d 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1849,7 +1849,7 @@ def expand_dims(a, axis): """ usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.expand_dims(usm_a, axis=axis) + usm_res = dpt_ext.expand_dims(usm_a, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From d2e9279806cdb47075c9d4d2b00c496164c65027 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:16:20 -0800 Subject: [PATCH 095/145] Move ti.flip() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_manipulation_functions.py | 32 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py 
index 10030b6c2107..6c9a6b17cdde 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -67,6 +67,7 @@ broadcast_to, concat, expand_dims, + flip, repeat, roll, ) @@ -92,6 +93,7 @@ "expand_dims", "eye", "finfo", + "flip", "from_numpy", "full", "full_like", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 969f5653f68c..5b06b6815162 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -405,6 +405,38 @@ def expand_dims(X, /, *, axis=0): return dpt_ext.reshape(X, shape) +def flip(X, /, *, axis=None): + """flip(x, axis) + + Reverses the order of elements in an array `x` along the given `axis`. + The shape of the array is preserved, but the elements are reordered. + + Args: + x (usm_ndarray): input array. + axis (Optional[Union[int, Tuple[int,...]]]): axis (or axes) along + which to flip. + If `axis` is `None`, all input array axes are flipped. + If `axis` is negative, the flipped axis is counted from the + last dimension. If provided more than one axis, only the specified + axes are flipped. Default: `None`. + + Returns: + usm_ndarray: + A view of `x` with the entries of `axis` reversed. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + X_ndim = X.ndim + if axis is None: + indexer = (np.s_[::-1],) * X_ndim + else: + axis = normalize_axis_tuple(axis, X_ndim) + indexer = tuple( + np.s_[::-1] if i in axis else np.s_[:] for i in range(X.ndim) + ) + return X[indexer] + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 95897843821d..a46c117a8ce2 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1920,7 +1920,7 @@ def flip(m, axis=None): """ m_usm = dpnp.get_usm_ndarray(m) - return dpnp_array._create_from_usm_ndarray(dpt.flip(m_usm, axis=axis)) + return dpnp_array._create_from_usm_ndarray(dpt_ext.flip(m_usm, axis=axis)) def fliplr(m): From 4e63cca92daf956d097cfe3c2971ff5d4c9e9ea5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:20:48 -0800 Subject: [PATCH 096/145] Move ti.permute_dims() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_copy_utils.py | 6 ++-- dpctl_ext/tensor/_manipulation_functions.py | 37 +++++++++++++++++++++ dpctl_ext/tensor/_reshape.py | 6 +++- dpnp/dpnp_array.py | 2 +- 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 6c9a6b17cdde..6c237b2e26c6 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -68,6 +68,7 @@ concat, expand_dims, flip, + permute_dims, repeat, roll, ) @@ -101,6 +102,7 @@ "isdtype", "linspace", "meshgrid", + "permute_dims", "nonzero", "ones", "ones_like", diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py index 78dbb5be17e8..878dabc581d2 100644 --- a/dpctl_ext/tensor/_copy_utils.py +++ b/dpctl_ext/tensor/_copy_utils.py @@ -695,7 +695,7 @@ def _make_empty_like_orderK(x, dt, usm_type, dev): for i in 
range(x.ndim) ) R = R[sl] - return dpt.permute_dims(R, inv_perm) + return dpt_ext.permute_dims(R, inv_perm) def _empty_like_orderK(x, dt, usm_type=None, dev=None): @@ -800,7 +800,7 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev): for i in range(nd1) ) R = R[sl] - return dpt.permute_dims(R, inv_perm) + return dpt_ext.permute_dims(R, inv_perm) def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): @@ -876,7 +876,7 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev): for i in range(nd1) ) R = R[sl] - return dpt.permute_dims(R, inv_perm) + return dpt_ext.permute_dims(R, inv_perm) def copy(usm_ary, /, *, order="K"): diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 5b06b6815162..4dbd524423fd 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -437,6 +437,43 @@ def flip(X, /, *, axis=None): return X[indexer] +def permute_dims(X, /, axes): + """permute_dims(x, axes) + + Permute the axes (dimensions) of an array; returns the permuted + array as a view. + + Args: + x (usm_ndarray): input array. + axes (Tuple[int, ...]): tuple containing permutation of + `(0,1,...,N-1)` where `N` is the number of axes (dimensions) + of `x`. + Returns: + usm_ndarray: + An array with permuted axes. + The returned array must has the same data type as `x`, + is created on the same device as `x` and has the same USM allocation + type as `x`. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + axes = normalize_axis_tuple(axes, X.ndim, "axes") + if not X.ndim == len(axes): + raise ValueError( + "The length of the passed axes does not match " + "to the number of usm_ndarray dimensions." 
+ ) + newstrides = tuple(X.strides[i] for i in axes) + newshape = tuple(X.shape[i] for i in axes) + return dpt.usm_ndarray( + shape=newshape, + dtype=X.dtype, + buffer=X, + strides=newstrides, + offset=X._element_offset, + ) + + def repeat(x, repeats, /, *, axis=None): """repeat(x, repeats, axis=None) diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py index 6afa1dc245c3..b7b6b068bfd0 100644 --- a/dpctl_ext/tensor/_reshape.py +++ b/dpctl_ext/tensor/_reshape.py @@ -37,6 +37,10 @@ _unravel_index, ) +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext + __doc__ = "Implementation module for :func:`dpctl.tensor.reshape`." @@ -184,7 +188,7 @@ def reshape(X, /, shape, *, order="C", copy=None): src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) else: - X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1)) + X_t = dpt_ext.permute_dims(X, range(X.ndim - 1, -1, -1)) hev, r_e = _copy_usm_ndarray_for_reshape( src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs ) diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py index 564627bacf2f..0d05ef8d49d5 100644 --- a/dpnp/dpnp_array.py +++ b/dpnp/dpnp_array.py @@ -2283,7 +2283,7 @@ def transpose(self, *axes): # self.transpose(None).shape == self.shape[::-1] axes = tuple((ndim - x - 1) for x in range(ndim)) - usm_res = dpt.permute_dims(self._array_obj, axes) + usm_res = dpt_ext.permute_dims(self._array_obj, axes) return dpnp_array._create_from_usm_ndarray(usm_res) def var( From 9c88edbcfc45d2912fb8584fd9a567bed94bc920 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:23:20 -0800 Subject: [PATCH 097/145] Move ti.moveaxis() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 51 +++++++++++++++++++++ dpnp/dpnp_algo/dpnp_arraycreation.py | 2 +- dpnp/dpnp_iface_manipulation.py | 2 +- 4 files changed, 55 insertions(+), 2 deletions(-) diff --git 
a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 6c237b2e26c6..2a6b79b715d6 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -68,6 +68,7 @@ concat, expand_dims, flip, + moveaxis, permute_dims, repeat, roll, @@ -102,6 +103,7 @@ "isdtype", "linspace", "meshgrid", + "moveaxis", "permute_dims", "nonzero", "ones", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 4dbd524423fd..97fc344b07c9 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -437,6 +437,57 @@ def flip(X, /, *, axis=None): return X[indexer] +def moveaxis(X, source, destination, /): + """moveaxis(x, source, destination) + + Moves axes of an array to new positions. + + Args: + x (usm_ndarray): input array + + source (int or a sequence of int): + Original positions of the axes to move. + These must be unique. If `x` has rank (i.e., number of + dimensions) `N`, a valid `axis` must be in the + half-open interval `[-N, N)`. + + destination (int or a sequence of int): + Destination positions for each of the original axes. + These must also be unique. If `x` has rank + (i.e., number of dimensions) `N`, a valid `axis` must be + in the half-open interval `[-N, N)`. + + Returns: + usm_ndarray: + Array with moved axes. + The returned array must has the same data type as `x`, + is created on the same device as `x` and has the same + USM allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. + ValueError: if `src` and `dst` have not equal number of elements. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + source = normalize_axis_tuple(source, X.ndim, "source") + destination = normalize_axis_tuple(destination, X.ndim, "destination") + + if len(source) != len(destination): + raise ValueError( + "`source` and `destination` arguments must have " + "the same number of elements" + ) + + ind = [n for n in range(X.ndim) if n not in source] + + for src, dst in sorted(zip(destination, source)): + ind.insert(src, dst) + + return dpt_ext.permute_dims(X, tuple(ind)) + + def permute_dims(X, /, axes): """permute_dims(x, axes) diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py index 86b9642ecd5d..4e2ee8531a18 100644 --- a/dpnp/dpnp_algo/dpnp_arraycreation.py +++ b/dpnp/dpnp_algo/dpnp_arraycreation.py @@ -256,7 +256,7 @@ def dpnp_linspace( usm_res[-1, ...] = usm_stop if axis != 0: - usm_res = dpt.moveaxis(usm_res, 0, axis) + usm_res = dpt_ext.moveaxis(usm_res, 0, axis) if dpnp.issubdtype(dtype, dpnp.integer): dpt.floor(usm_res, out=usm_res) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index a46c117a8ce2..625da379b0d5 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -2408,7 +2408,7 @@ def moveaxis(a, source, destination): usm_array = dpnp.get_usm_ndarray(a) return dpnp_array._create_from_usm_ndarray( - dpt.moveaxis(usm_array, source, destination) + dpt_ext.moveaxis(usm_array, source, destination) ) From ba5163674a38f720259143435443af23a89ff73f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:25:54 -0800 Subject: [PATCH 098/145] Move ti.squeeze() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 45 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py 
b/dpctl_ext/tensor/__init__.py index 2a6b79b715d6..d971c2b862cd 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -72,6 +72,7 @@ permute_dims, repeat, roll, + squeeze, ) from dpctl_ext.tensor._reshape import reshape @@ -115,6 +116,7 @@ "reshape", "result_type", "roll", + "squeeze", "take", "take_along_axis", "to_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 97fc344b07c9..22cda939c2c3 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -828,3 +828,48 @@ def roll(x, /, shift, *, axis=None): ) _manager.add_event_pair(ht_e, roll_ev) return res + + +def squeeze(X, /, axis=None): + """squeeze(x, axis) + + Removes singleton dimensions (axes) from array `x`. + + Args: + x (usm_ndarray): input array + axis (Union[int, Tuple[int,...]]): axis (or axes) to squeeze. + + Returns: + usm_ndarray: + Output array is a view, if possible, + and a copy otherwise, but with all or a subset of the + dimensions of length 1 removed. Output has the same data + type as the input, is allocated on the same device as the + input and has the same USM allocation type as the input + array `x`. + + Raises: + ValueError: if the specified axis has a size greater than one. + """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + X_shape = X.shape + if axis is not None: + axis = normalize_axis_tuple(axis, X.ndim if X.ndim != 0 else X.ndim + 1) + new_shape = [] + for i, x in enumerate(X_shape): + if i not in axis: + new_shape.append(x) + else: + if x != 1: + raise ValueError( + "Cannot select an axis to squeeze out " + "which has size not equal to one." 
+ ) + new_shape = tuple(new_shape) + else: + new_shape = tuple(axis for axis in X_shape if axis != 1) + if new_shape == X.shape: + return X + else: + return dpt_ext.reshape(X, new_shape) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 625da379b0d5..c495b8866578 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3663,7 +3663,7 @@ def squeeze(a, /, axis=None): """ usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.squeeze(usm_a, axis=axis) + usm_res = dpt_ext.squeeze(usm_a, axis=axis) return dpnp_array._create_from_usm_ndarray(usm_res) From bb16c19b6dbecfed2276a8c4a2d06df6d69f0f9c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:27:53 -0800 Subject: [PATCH 099/145] Move ti.stack() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 61 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d971c2b862cd..626e2063a43e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -73,6 +73,7 @@ repeat, roll, squeeze, + stack, ) from dpctl_ext.tensor._reshape import reshape @@ -117,6 +118,7 @@ "result_type", "roll", "squeeze", + "stack", "take", "take_along_axis", "to_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 22cda939c2c3..0fe05ef70998 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -873,3 +873,64 @@ def squeeze(X, /, axis=None): return X else: return dpt_ext.reshape(X, new_shape) + + +def stack(arrays, /, *, axis=0): + """ + stack(arrays, axis) + + Joins a sequence of arrays along a new axis. + + Args: + arrays (Union[List[usm_ndarray], Tuple[usm_ndarray,...]]): + input arrays to join. Each array must have the same shape. 
+ axis (int): axis along which the arrays will be joined. Providing + an `axis` specified the index of the new axis in the dimensions + of the output array. A valid axis must be on the interval + `[-N, N)`, where `N` is the rank (number of dimensions) of `x`. + Default: `0`. + + Returns: + usm_ndarray: + An output array having rank `N+1`, where `N` is + the rank (number of dimensions) of `x`. If the input arrays have + different data types, array API Type Promotion Rules apply. + + Raises: + ValueError: if not all input arrays have the same shape + IndexError: if provided an `axis` outside of the required interval. + """ + res_dtype, res_usm_type, exec_q = _arrays_validation(arrays) + + n = len(arrays) + X0 = arrays[0] + res_ndim = X0.ndim + 1 + axis = normalize_axis_index(axis, res_ndim) + X0_shape = X0.shape + + for i in range(1, n): + if X0_shape != arrays[i].shape: + raise ValueError("All input arrays must have the same shape") + + res_shape = tuple( + X0_shape[i - 1 * (i >= axis)] if i != axis else n + for i in range(res_ndim) + ) + + res = dpt_ext.empty( + res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + for i in range(n): + c_shapes_copy = tuple( + i if j == axis else np.s_[:] for j in range(res_ndim) + ) + _dst = res[c_shapes_copy] + hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arrays[i], dst=_dst, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cpy_ev) + + return res diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index c495b8866578..a511c31a8177 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3751,7 +3751,7 @@ def stack(arrays, /, *, axis=0, out=None, dtype=None, casting="same_kind"): ) usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays] - usm_res = dpt.stack(usm_arrays, axis=axis) + usm_res = dpt_ext.stack(usm_arrays, axis=axis) 
res = dpnp_array._create_from_usm_ndarray(usm_res) if dtype is not None: From ccad5f06e0d345e5bf4f53ac0fd0b67b784c294b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:30:21 -0800 Subject: [PATCH 100/145] Move ti.swapaxes() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_manipulation_functions.py | 38 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 626e2063a43e..793ff8c7f31f 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -74,6 +74,7 @@ roll, squeeze, stack, + swapaxes, ) from dpctl_ext.tensor._reshape import reshape @@ -119,6 +120,7 @@ "roll", "squeeze", "stack", + "swapaxes", "take", "take_along_axis", "to_numpy", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 0fe05ef70998..e07b198f5566 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -934,3 +934,41 @@ def stack(arrays, /, *, axis=0): _manager.add_event_pair(hev, cpy_ev) return res + + +def swapaxes(X, axis1, axis2): + """swapaxes(x, axis1, axis2) + + Interchanges two axes of an array. + + Args: + x (usm_ndarray): input array + + axis1 (int): First axis. + If `x` has rank (i.e., number of dimensions) `N`, + a valid `axis` must be in the half-open interval `[-N, N)`. + + axis2 (int): Second axis. + If `x` has rank (i.e., number of dimensions) `N`, + a valid `axis` must be in the half-open interval `[-N, N)`. + + Returns: + usm_ndarray: + Array with swapped axes. + The returned array must has the same data type as `x`, + is created on the same device as `x` and has the same USM + allocation type as `x`. + + Raises: + AxisError: if `axis` value is invalid. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis1 = normalize_axis_index(axis1, X.ndim, "axis1") + axis2 = normalize_axis_index(axis2, X.ndim, "axis2") + + ind = list(range(0, X.ndim)) + ind[axis1] = axis2 + ind[axis2] = axis1 + return dpt_ext.permute_dims(X, tuple(ind)) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index a511c31a8177..4f0531f8b27f 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3812,7 +3812,7 @@ def swapaxes(a, axis1, axis2): """ usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.swapaxes(usm_a, axis1=axis1, axis2=axis2) + usm_res = dpt_ext.swapaxes(usm_a, axis1=axis1, axis2=axis2) return dpnp_array._create_from_usm_ndarray(usm_res) From b5e3541e6ea0943df885bb2c76089be76d18a40d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:33:25 -0800 Subject: [PATCH 101/145] Move ti.tile() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 97 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 2 +- 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 793ff8c7f31f..ceefa3572ea1 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -75,6 +75,7 @@ squeeze, stack, swapaxes, + tile, ) from dpctl_ext.tensor._reshape import reshape @@ -123,6 +124,7 @@ "swapaxes", "take", "take_along_axis", + "tile", "to_numpy", "tril", "triu", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index e07b198f5566..6e26ddb88bcb 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -972,3 +972,100 @@ def swapaxes(X, axis1, axis2): ind[axis1] = axis2 ind[axis2] = axis1 return dpt_ext.permute_dims(X, tuple(ind)) + + +def tile(x, repetitions, 
/): + """tile(x, repetitions) + + Repeat an input array `x` along each axis a number of times given by + `repetitions`. + + For `N` = len(`repetitions`) and `M` = len(`x.shape`): + + * If `M < N`, `x` will have `N - M` new axes prepended to its shape + * If `M > N`, `repetitions` will have `M - N` ones prepended to it + + Args: + x (usm_ndarray): input array + + repetitions (Union[int, Tuple[int, ...]]): + The number of repetitions along each dimension of `x`. + + Returns: + usm_ndarray: + tiled output array. + + The returned array will have rank `max(M, N)`. If `S` is the + shape of `x` after prepending dimensions and `R` is + `repetitions` after prepending ones, then the shape of the + result will be `S[i] * R[i]` for each dimension `i`. + + The returned array will have the same data type as `x`. + The returned array will be located on the same device as `x` and + have the same USM allocation type as `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") + + if not isinstance(repetitions, tuple): + if isinstance(repetitions, int): + repetitions = (repetitions,) + else: + raise TypeError( + f"Expected tuple or integer type, got {type(repetitions)}." 
+ ) + + rep_dims = len(repetitions) + x_dims = x.ndim + if rep_dims < x_dims: + repetitions = (x_dims - rep_dims) * (1,) + repetitions + elif x_dims < rep_dims: + x = dpt_ext.reshape(x, (rep_dims - x_dims) * (1,) + x.shape) + res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions)) + # case of empty input + if x.size == 0: + return dpt_ext.empty( + res_shape, + dtype=x.dtype, + usm_type=x.usm_type, + sycl_queue=x.sycl_queue, + ) + in_sh = x.shape + if res_shape == in_sh: + return dpt_ext.copy(x) + expanded_sh = [] + broadcast_sh = [] + out_sz = 1 + for i in range(len(res_shape)): + out_sz *= res_shape[i] + reps, sh = repetitions[i], in_sh[i] + if reps == 1: + # dimension will be unchanged + broadcast_sh.append(sh) + expanded_sh.append(sh) + elif sh == 1: + # dimension will be broadcast + broadcast_sh.append(reps) + expanded_sh.append(sh) + else: + broadcast_sh.extend([reps, sh]) + expanded_sh.extend([1, sh]) + exec_q = x.sycl_queue + xdt = x.dtype + xut = x.usm_type + res = dpt_ext.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q) + # no need to copy data for empty output + if out_sz > 0: + x = dpt_ext.broadcast_to( + # this reshape should never copy + dpt_ext.reshape(x, expanded_sh), + broadcast_sh, + ) + # copy broadcast input into flat array + _manager = dputils.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + hev, cp_ev = ti._copy_usm_ndarray_for_reshape( + src=x, dst=res, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(hev, cp_ev) + return dpt_ext.reshape(res, res_shape) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 4f0531f8b27f..d11d67c2de0b 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -3892,7 +3892,7 @@ def tile(A, reps): """ usm_a = dpnp.get_usm_ndarray(A) - usm_res = dpt.tile(usm_a, reps) + usm_res = dpt_ext.tile(usm_a, reps) return dpnp_array._create_from_usm_ndarray(usm_res) From 
325729bfbad021a18671e0815c20c18deeb869b8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 23 Feb 2026 07:37:38 -0800 Subject: [PATCH 102/145] Move ti.unstack() to dpctl_ext.tensor and reuse it --- dpctl_ext/tensor/__init__.py | 2 ++ dpctl_ext/tensor/_manipulation_functions.py | 29 +++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 8 +++--- 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index ceefa3572ea1..d46cf4a4a65f 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -76,6 +76,7 @@ stack, swapaxes, tile, + unstack, ) from dpctl_ext.tensor._reshape import reshape @@ -128,6 +129,7 @@ "to_numpy", "tril", "triu", + "unstack", "where", "zeros", "zeros_like", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 6e26ddb88bcb..08459dcaea76 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -974,6 +974,35 @@ def swapaxes(X, axis1, axis2): return dpt_ext.permute_dims(X, tuple(ind)) +def unstack(X, /, *, axis=0): + """unstack(x, axis=0) + + Splits an array in a sequence of arrays along the given axis. + + Args: + x (usm_ndarray): input array + + axis (int, optional): axis along which `x` is unstacked. + If `x` has rank (i.e, number of dimensions) `N`, + a valid `axis` must reside in the half-open interval `[-N, N)`. + Default: `0`. + + Returns: + Tuple[usm_ndarray,...]: + Output sequence of arrays which are views into the input array. + + Raises: + AxisError: if the `axis` value is invalid. 
+ """ + if not isinstance(X, dpt.usm_ndarray): + raise TypeError(f"Expected usm_ndarray type, got {type(X)}.") + + axis = normalize_axis_index(axis, X.ndim) + Y = dpt_ext.moveaxis(X, axis, 0) + + return tuple(Y[i] for i in range(Y.shape[0])) + + def tile(x, repetitions, /): """tile(x, repetitions) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index d11d67c2de0b..c034d1c43793 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -1093,7 +1093,9 @@ def broadcast_arrays(*args, subok=False): if len(args) == 0: return [] - usm_arrays = dpt.broadcast_arrays(*[dpnp.get_usm_ndarray(a) for a in args]) + usm_arrays = dpt_ext.broadcast_arrays( + *[dpnp.get_usm_ndarray(a) for a in args] + ) return [dpnp_array._create_from_usm_ndarray(a) for a in usm_arrays] @@ -1521,7 +1523,7 @@ def copyto(dst, src, casting="same_kind", where=True): f"but got {where.dtype}" ) - dst_usm, src_usm, mask_usm = dpt.broadcast_arrays( + dst_usm, src_usm, mask_usm = dpt_ext.broadcast_arrays( dpnp.get_usm_ndarray(dst), dpnp.get_usm_ndarray(src), dpnp.get_usm_ndarray(where), @@ -4522,7 +4524,7 @@ def unstack(x, /, *, axis=0): if usm_x.ndim == 0: raise ValueError("Input array must be at least 1-d.") - res = dpt.unstack(usm_x, axis=axis) + res = dpt_ext.unstack(usm_x, axis=axis) return tuple(dpnp_array._create_from_usm_ndarray(a) for a in res) From 4552e78359b8fb67094614b106c99dd9fb8acea9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 27 Feb 2026 08:57:58 -0800 Subject: [PATCH 103/145] Initialize _tensor_accumulation_impl extension and move _cumsum_over_axis --- dpctl_ext/tensor/CMakeLists.txt | 32 +- .../accumulators/accumulate_over_axis.hpp | 459 ++++++++++++++++++ .../accumulators/accumulators_common.cpp | 55 +++ .../accumulators/accumulators_common.hpp | 46 ++ .../source/accumulators/cumulative_sum.cpp | 357 ++++++++++++++ .../source/accumulators/cumulative_sum.hpp | 46 ++ .../libtensor/source/tensor_accumulation.cpp | 
43 ++ 7 files changed, 1030 insertions(+), 8 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 864e34ddaba4..7cd60cda6edd 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -63,6 +63,16 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) +set(_accumulator_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp +) +set(_tensor_accumulation_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp + ${_accumulator_sources} +) set(_static_lib_trgt simplify_iteration_space) @@ -85,6 +95,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_accumulation_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_accumulation_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) 
+list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if(WIN32) set(_clang_prefix "/clang:") @@ -97,14 +113,14 @@ set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ) -#list( -#APPEND _no_fast_math_sources -# ${_elementwise_sources} -# ${_reduction_sources} -# ${_sorting_sources} -# ${_linalg_sources} -# ${_accumulator_sources} -#) +list( + APPEND _no_fast_math_sources + # ${_elementwise_sources} + # ${_reduction_sources} + # ${_sorting_sources} + # ${_linalg_sources} + ${_accumulator_sources} +) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp new file mode 100644 index 000000000000..712ab180ef37 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -0,0 +1,459 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/accumulators.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template +std::pair + py_accumulate_over_axis(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_accumulate, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) 
+{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iter_nd = src_nd - trailing_dims_to_accumulate; + if (trailing_dims_to_accumulate <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_accumulate must be positive, but no " + "greater than rank of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int 
src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + int acc_nd = trailing_dims_to_accumulate; + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, 
iter_dst_offset); + } + + // Strided implementation + auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (strided_fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, + acc_src_strides, acc_dst_strides); + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); + + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), copy_shapes_strides_ev); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + + sycl::event acc_ev = strided_fn( + exec_q, iter_nelems, acc_nelems, src_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, + acc_shapes_and_strides, dst_data, host_task_events, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); + host_task_events.push_back(temp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events), + acc_ev); +} + +template +std::pair py_accumulate_final_axis_include_initial( + const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + std::vector const &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + + if 
(src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + + static constexpr int acc_nd = 1; + + int iter_nd = src_nd - acc_nd; + if (iter_nd < 0) { + throw py::value_error("accumulation axis must not be greater than rank " + "of the input array"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t acc_nelems(1); + for (int i = iter_nd; same_shapes && (i < src_nd); ++i) { + auto dst_shape_i = dst_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_ptr[i] + 1 == dst_shape_i); + acc_nelems *= static_cast(dst_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (acc_nelems == 0)) { + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, acc_nelems * iter_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = 
array_types.typenum_to_lookup_id(dst_typenum); + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + std::vector host_task_events; + + if ((is_src_c_contig && is_dst_c_contig) && iter_nd == 0) { + auto fn = contig_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + sycl::event acc_ev = fn(exec_q, acc_nelems, src_data, dst_data, + host_task_events, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {acc_ev}), + acc_ev); + } + + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + using shT = std::vector; + shT acc_shape(std::begin(src_shape_vec) + iter_nd, std::end(src_shape_vec)); + + shT acc_src_strides(std::begin(src_strides_vec) + iter_nd, + std::end(src_strides_vec)); + + shT acc_dst_strides(std::begin(dst_strides_vec) + iter_nd, + std::end(dst_strides_vec)); + + shT iter_shape(std::begin(src_shape_vec), + std::begin(src_shape_vec) + iter_nd); + + shT iter_src_strides(std::begin(src_strides_vec), + std::begin(src_strides_vec) + iter_nd); + + shT iter_dst_strides(std::begin(dst_strides_vec), + std::begin(dst_strides_vec) + iter_nd); + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); + } + + // Strided implementation + auto strided_fn = strided_dispatch_table[src_typeid][dst_typeid]; + if 
(strided_fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, acc_shape, + acc_src_strides, acc_dst_strides); + auto packed_shapes_and_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const auto ©_shapes_strides_ev = std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_and_strides_owner.get(); + + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *acc_shapes_and_strides = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), copy_shapes_strides_ev); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + + sycl::event acc_ev = strided_fn( + exec_q, iter_nelems, acc_nelems, src_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, acc_nd, + acc_shapes_and_strides, dst_data, host_task_events, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {acc_ev}, packed_shapes_and_strides_owner); + host_task_events.push_back(temp_cleanup_ev); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events), + acc_ev); +} + +/*! 
@brief Template implementing Python API for querying accumulation + * type support */ +template +bool py_accumulate_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + fn = dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp new file mode 100644 index 000000000000..c6cbce0f6ced --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -0,0 +1,55 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include + +// #include "cumulative_logsumexp.hpp" +// #include "cumulative_prod.hpp" +#include "cumulative_sum.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +/*! 
@brief Add accumulators to Python module */ +void init_accumulator_functions(py::module_ m) +{ + // init_cumulative_logsumexp(m); + // init_cumulative_prod(m); + init_cumulative_sum(m); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp new file mode 100644 index 000000000000..c33a040a7fa7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_accumulator_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp new file mode 100644 index 000000000000..0087122ce1ac --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -0,0 +1,357 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumsum_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumsum_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumsum_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForSumAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumSumScanOpT = std:: + conditional_t, sycl::logical_or, sycl::plus>; + +template +struct CumSum1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSum1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = 
dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumSumIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForSumAccumulation::is_defined) + { + using ScanOpT = CumSumScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumsum_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumsum_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumsum_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumsum_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + 
dtb4.populate_dispatch_table(cumsum_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumsum_dispatch_tables; + populate_cumsum_dispatch_tables(); + + using impl::cumsum_1d_contig_dispatch_table; + using impl::cumsum_strided_dispatch_table; + auto cumsum_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); + }; + m.def("_cumsum_over_axis", cumsum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumsum_1d_include_initial_contig_dispatch_table; + using impl::cumsum_include_initial_strided_dispatch_table; + auto cumsum_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumsum_include_initial_strided_dispatch_table, + cumsum_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumsum_final_axis_include_initial", cumsum_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumsum_strided_dispatch_table); + }; + m.def("_cumsum_dtype_supported", 
cumsum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp new file mode 100644 index 000000000000..5e06b222a3bc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cumulative_sum(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp b/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp new file mode 100644 index 000000000000..faa3fc8b52c6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_accumulation.cpp @@ -0,0 +1,43 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include + +#include "accumulators/accumulators_common.hpp" + +PYBIND11_MODULE(_tensor_accumulation_impl, m) +{ + dpctl::tensor::py_internal::init_accumulator_functions(m); +} From 6b81e7aa6e9caf365bb5a7f7d3bf5bf71a7491d6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 27 Feb 2026 08:59:16 -0800 Subject: [PATCH 104/145] Move ti.cumulative_sum() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_accumulation.py | 311 ++++++++++++++++++++++++++++++ dpnp/dpnp_iface_mathematical.py | 4 +- 3 files changed, 315 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_accumulation.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index d46cf4a4a65f..ce4a70d202f2 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -80,6 +80,7 @@ ) from dpctl_ext.tensor._reshape import reshape +from ._accumulation import cumulative_sum from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type @@ -94,6 +95,7 @@ "concat", "copy", "clip", + "cumulative_sum", "empty", "empty_like", "extract", diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py new file mode 100644 index 000000000000..252b892fe481 --- /dev/null +++ b/dpctl_ext/tensor/_accumulation.py @@ -0,0 +1,311 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._type_utils import ( # _default_accumulation_dtype_fp_types, + _default_accumulation_dtype, + _to_device_supported_dtype, +) +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_accumulation_impl as tai +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index + + +def _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + _accumulate_fn, + _accumulate_include_initial_fn, + _dtype_supported, + _default_accumulation_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + appended_axis = False + if x.ndim == 0: + x = x[dpt.newaxis] + appended_axis = True + nd = x.ndim + if axis is None: + if nd > 1: + raise ValueError( + "`axis` cannot be `None` for array of dimension `{}`".format(nd) + ) + axis = 0 + else: + axis = normalize_axis_index(axis, nd, "axis") + sh = x.shape + res_sh = ( + sh[:axis] + (sh[axis] + 1,) + sh[axis + 1 :] if include_initial else sh + ) + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + q = x.sycl_queue + inp_dt = x.dtype + res_usm_type = x.usm_type + if dtype is None: + res_dt = _default_accumulation_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + # checking now avoids unnecessary allocations + implemented_types = _dtype_supported(inp_dt, res_dt) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined accumulation data type does not " + "have direct implementation" + ) + 
orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + out_sh = out.shape + # append an axis to `out` if scalar + if appended_axis and not include_initial: + out = out[dpt.newaxis, ...] + orig_out = out + final_res_sh = res_sh[1:] + else: + final_res_sh = res_sh + if not out_sh == final_res_sh: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {final_res_sh}, got {out_sh}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, " f"got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + # permute out array dims if necessary + if a1 != nd: + out = dpt_ext.permute_dims(out, perm) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + out = dpt_ext.permute_dims(out, perm) + + _manager = SequentialOrderManager[q] + depends = _manager.submitted_events + if implemented_types: + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=arr, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=depends, + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=arr, dst=out, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e, acc_ev) + if not (orig_out is None or out is orig_out): + # Copy the out data from temporary buffer to original memory + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, 
res_dt): + tmp = dpt_ext.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + else: + buf_dt = _default_accumulation_type_fn(inp_dt, q) + tmp = dpt_ext.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=depends + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt_ext.empty( + res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + if a1 != nd: + tmp_res = dpt_ext.permute_dims(tmp_res, perm) + if not include_initial: + ht_e, acc_ev = _accumulate_fn( + src=tmp, + trailing_dims_to_accumulate=1, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + else: + ht_e, acc_ev = _accumulate_include_initial_fn( + src=tmp, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e, acc_ev) + ht_e_cpy2, cpy_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[acc_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy_e2) + + if appended_axis: + out = dpt_ext.squeeze(out) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(out, inv_perm) + + return out + + +def cumulative_sum( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_sum(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative sum of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. 
+ axis (Optional[int]): + axis along which cumulative sum must be computed. + If `None`, the sum is computed over the entire array. + If `x` is a one-dimensional array, providing an `axis` is optional; + however, if `x` has more than one dimension, providing an `axis` + is required. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative sum. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + additive identity, zero) as the first value along the provided axis + in the output. Default: `False`. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result or (if provided) `dtype`. + If `None` then a new array is returned. Default: `None`. + + Returns: + usm_ndarray: + an array containing cumulative sums. The returned array has the data + type as described in the `dtype` parameter description above. 
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative sum is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative sums are computed + along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumsum_over_axis, + tai._cumsum_final_axis_include_initial, + tai._cumsum_dtype_supported, + _default_accumulation_dtype, + ) diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index 3dbf07be080a..a6ff69c08ca7 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -1218,7 +1218,7 @@ def cumsum(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt.cumulative_sum, + dpt_ext.cumulative_sum, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1403,7 +1403,7 @@ def cumulative_sum( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt.cumulative_sum, + dpt_ext.cumulative_sum, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, From 5af94c87b47ccf73d55c42bc5584f3d583b0dddc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:17:34 -0800 Subject: [PATCH 105/145] Move _cumprod_over_axis to dpctl_ext.tensor._tensor_accumulation_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../accumulators/accumulators_common.cpp | 4 +- .../source/accumulators/cumulative_prod.cpp | 359 ++++++++++++++++++ .../source/accumulators/cumulative_prod.hpp | 46 +++ 4 files changed, 408 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt 
index 7cd60cda6edd..a64a47f8e7e0 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -66,7 +66,7 @@ set(_tensor_impl_sources set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) set(_tensor_accumulation_impl_sources diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp index c6cbce0f6ced..96c1b6d12cbc 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -36,7 +36,7 @@ #include // #include "cumulative_logsumexp.hpp" -// #include "cumulative_prod.hpp" +#include "cumulative_prod.hpp" #include "cumulative_sum.hpp" namespace py = pybind11; @@ -48,7 +48,7 @@ namespace dpctl::tensor::py_internal void init_accumulator_functions(py::module_ m) { // init_cumulative_logsumexp(m); - // init_cumulative_prod(m); + init_cumulative_prod(m); init_cumulative_sum(m); } diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp new file mode 100644 index 000000000000..ca92db16cb05 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -0,0 +1,359 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumprod_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumprod_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumprod_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForProdAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +using CumProdScanOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + +template +struct CumProd1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProd1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = 
dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumProdIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForProdAccumulation::is_defined) + { + using ScanOpT = CumProdScanOpT; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumprod_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumprod_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumprod_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + cumprod_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + 
cumprod_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumprod_dispatch_tables; + populate_cumprod_dispatch_tables(); + + using impl::cumprod_1d_contig_dispatch_table; + using impl::cumprod_strided_dispatch_table; + auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis( + src, trailing_dims_to_accumulate, dst, exec_q, depends, + cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); + }; + m.def("_cumprod_over_axis", cumprod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumprod_1d_include_initial_contig_dispatch_table; + using impl::cumprod_include_initial_strided_dispatch_table; + auto cumprod_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumprod_include_initial_strided_dispatch_table, + cumprod_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumprod_final_axis_include_initial", cumprod_include_initial_pyapi, + "", py::arg("src"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported(input_dtype, output_dtype, + cumprod_strided_dispatch_table); + }; + m.def("_cumprod_dtype_supported", 
cumprod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp new file mode 100644 index 000000000000..e14bb2c44361 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cumulative_prod(py::module_ m); + +} // namespace dpctl::tensor::py_internal From 91547cc90a4fb507b5c0a9617ce20df70e322a03 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:33:58 -0800 Subject: [PATCH 106/145] Add missing include --- .../tensor/libtensor/source/accumulators/cumulative_prod.cpp | 1 + .../tensor/libtensor/source/accumulators/cumulative_sum.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp index ca92db16cb05..07c802c8cffa 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -43,6 +43,7 @@ #include "dpnp4pybind11.hpp" #include #include +#include #include "accumulate_over_axis.hpp" #include "kernels/accumulators.hpp" diff --git 
a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 0087122ce1ac..3c422bfd0998 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -43,6 +43,7 @@ #include "dpnp4pybind11.hpp" #include #include +#include #include "accumulate_over_axis.hpp" #include "kernels/accumulators.hpp" From 668f3fb3beec3633261644d7eba8f3de5c4e1f12 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:34:41 -0800 Subject: [PATCH 107/145] Move ti.cumulative_prod() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_accumulation.py | 80 +++++++++++++++++++++++++++++++ dpnp/dpnp_iface_mathematical.py | 4 +- 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index ce4a70d202f2..ce489fe8a36d 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -80,7 +80,7 @@ ) from dpctl_ext.tensor._reshape import reshape -from ._accumulation import cumulative_sum +from ._accumulation import cumulative_prod, cumulative_sum from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type @@ -95,6 +95,7 @@ "concat", "copy", "clip", + "cumulative_prod", "cumulative_sum", "empty", "empty_like", diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py index 252b892fe481..6ee5c2cfaa28 100644 --- a/dpctl_ext/tensor/_accumulation.py +++ b/dpctl_ext/tensor/_accumulation.py @@ -309,3 +309,83 @@ def cumulative_sum( tai._cumsum_dtype_supported, _default_accumulation_dtype, ) + + +def cumulative_prod( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_prod(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative product of elements in the input array `x`. 
+ + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which cumulative product must be computed. + If `None`, the product is computed over the entire array. + If `x` is a one-dimensional array, providing an `axis` is optional; + however, if `x` has more than one dimension, providing an `axis` + is required. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative product. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + multiplicative identity, one) as the first value along the provided + axis in the output. Default: `False`. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result or (if provided) `dtype`. + If `None` then a new array is returned. Default: `None`. + + Returns: + usm_ndarray: + an array containing cumulative products. The returned array has + the data type as described in the `dtype` parameter description + above.
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative product is calculated, which will have size `N+1` + + where `N` is the size of the axis the cumulative products are + computed along. + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumprod_over_axis, + tai._cumprod_final_axis_include_initial, + tai._cumprod_dtype_supported, + _default_accumulation_dtype, + ) diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index a6ff69c08ca7..e5e379f8dc00 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -1126,7 +1126,7 @@ def cumprod(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt.cumulative_prod, + dpt_ext.cumulative_prod, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1307,7 +1307,7 @@ def cumulative_prod( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt.cumulative_prod, + dpt_ext.cumulative_prod, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, From 72d2109ddb27537973094555210116c0d0069e31 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:47:00 -0800 Subject: [PATCH 108/145] Move _cumlogsumexp_over_axis to dpctl_ext.tensor._tensor_accumulation_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../accumulators/accumulators_common.cpp | 4 +- .../accumulators/cumulative_logsumexp.cpp | 351 ++++++++++++++++++ .../accumulators/cumulative_logsumexp.hpp | 46 +++ 4 files changed, 400 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt 
b/dpctl_ext/tensor/CMakeLists.txt index a64a47f8e7e0..eff5e7552648 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -65,7 +65,7 @@ set(_tensor_impl_sources ) set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/accumulators_common.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp index 96c1b6d12cbc..5e07e81b7ad5 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulators_common.cpp @@ -35,7 +35,7 @@ #include -// #include "cumulative_logsumexp.hpp" +#include "cumulative_logsumexp.hpp" #include "cumulative_prod.hpp" #include "cumulative_sum.hpp" @@ -47,7 +47,7 @@ namespace dpctl::tensor::py_internal /*! @brief Add accumulators to Python module */ void init_accumulator_functions(py::module_ m) { - // init_cumulative_logsumexp(m); + init_cumulative_logsumexp(m); init_cumulative_prod(m); init_cumulative_sum(m); } diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp new file mode 100644 index 000000000000..572c93c0066e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -0,0 +1,351 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "accumulate_over_axis.hpp" +#include "kernels/accumulators.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +namespace su_ns = dpctl::tensor::sycl_utils; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::accumulators::accumulate_1d_contig_impl_fn_ptr_t; +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; + +using dpctl::tensor::kernels::accumulators::accumulate_strided_impl_fn_ptr_t; +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static accumulate_1d_contig_impl_fn_ptr_t + cumlogsumexp_1d_include_initial_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +static accumulate_strided_impl_fn_ptr_t + cumlogsumexp_include_initial_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpAccumulation +{ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct CumLogSumExp1DContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExp1DIncludeInitialContigFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using 
dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_1d_contig_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = false; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +template +struct CumLogSumExpIncludeInitialStridedFactory +{ + fnT get() + { + if constexpr (TypePairSupportDataForLogSumExpAccumulation< + srcTy, dstTy>::is_defined) + { + using ScanOpT = su_ns::LogSumExp; + static constexpr bool include_initial = true; + if constexpr (std::is_same_v) { + using dpctl::tensor::kernels::accumulators::NoOpTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, ScanOpT, + include_initial>; + return fn; + } + else { + using dpctl::tensor::kernels::accumulators::CastTransformer; + fnT fn = dpctl::tensor::kernels::accumulators:: + accumulate_strided_impl, + ScanOpT, include_initial>; + return fn; + } + } + else { + return nullptr; + } + } +}; + +void populate_cumlogsumexp_dispatch_tables(void) +{ + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(cumlogsumexp_1d_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(cumlogsumexp_strided_dispatch_table); + + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + 
cumlogsumexp_1d_include_initial_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table( + cumlogsumexp_include_initial_strided_dispatch_table); + + return; +} + +} // namespace impl + +void init_cumulative_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + using impl::populate_cumlogsumexp_dispatch_tables; + populate_cumlogsumexp_dispatch_tables(); + + using impl::cumlogsumexp_1d_contig_dispatch_table; + using impl::cumlogsumexp_strided_dispatch_table; + auto cumlogsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_accumulate, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_accumulate_over_axis; + return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, + exec_q, depends, + cumlogsumexp_strided_dispatch_table, + cumlogsumexp_1d_contig_dispatch_table); + }; + m.def("_cumlogsumexp_over_axis", cumlogsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_accumulate"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + using impl::cumlogsumexp_1d_include_initial_contig_dispatch_table; + using impl::cumlogsumexp_include_initial_strided_dispatch_table; + auto cumlogsumexp_include_initial_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal:: + py_accumulate_final_axis_include_initial; + return py_accumulate_final_axis_include_initial( + src, dst, exec_q, depends, + cumlogsumexp_include_initial_strided_dispatch_table, + cumlogsumexp_1d_include_initial_contig_dispatch_table); + }; + m.def("_cumlogsumexp_final_axis_include_initial", + cumlogsumexp_include_initial_pyapi, "", py::arg("src"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + + auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype 
&output_dtype) { + using dpctl::tensor::py_internal::py_accumulate_dtype_supported; + return py_accumulate_dtype_supported( + input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); + }; + m.def("_cumlogsumexp_dtype_supported", cumlogsumexp_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp new file mode 100644 index 000000000000..f1292320bd0d --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_accumulation_impl +// extensions +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_cumulative_logsumexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal From d8c3680b6e19f15779652f41dee6172970903a19 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 02:47:29 -0800 Subject: [PATCH 109/145] Move ti.cumulative_logsumexp() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_accumulation.py | 87 +++++++++++++++++++++++++++++-- dpnp/dpnp_iface_trigonometric.py | 3 +- 3 files changed, 87 insertions(+), 6 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index ce489fe8a36d..95c5e64c9322 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -80,7 +80,7 @@ ) from dpctl_ext.tensor._reshape import reshape -from ._accumulation import cumulative_prod, cumulative_sum +from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type 
@@ -95,6 +95,7 @@ "concat", "copy", "clip", + "cumulative_logsumexp", "cumulative_prod", "cumulative_sum", "empty", diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py index 6ee5c2cfaa28..17596e5647fc 100644 --- a/dpctl_ext/tensor/_accumulation.py +++ b/dpctl_ext/tensor/_accumulation.py @@ -28,10 +28,6 @@ import dpctl import dpctl.tensor as dpt -from dpctl.tensor._type_utils import ( # _default_accumulation_dtype_fp_types, - _default_accumulation_dtype, - _to_device_supported_dtype, -) from dpctl.utils import ExecutionPlacementError, SequentialOrderManager # TODO: revert to `import dpctl.tensor...` @@ -39,6 +35,11 @@ import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_accumulation_impl as tai import dpctl_ext.tensor._tensor_impl as ti +from dpctl_ext.tensor._type_utils import ( + _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, + _to_device_supported_dtype, +) from ._numpy_helper import normalize_axis_index @@ -389,3 +390,81 @@ def cumulative_prod( tai._cumprod_dtype_supported, _default_accumulation_dtype, ) + + +def cumulative_logsumexp( + x, /, *, axis=None, dtype=None, include_initial=False, out=None +): + """ + cumulative_logsumexp(x, /, *, axis=None, dtype=None, include_initial=False, + out=None) + + Calculates the cumulative logsmumexp of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which cumulative logsumexp must be computed. + If `None`, the logsumexp is computed over the entire array. + If `x` is a one-dimensional array, providing an `axis` is optional; + however, if `x` has more than one dimension, providing an `axis` + is required. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. 
+ + * If `x` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + `x`. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the cumulative logsumexp. + Default: `None`. + include_initial (bool): + boolean indicating whether to include the initial value (i.e., the + additive identity, zero) as the first value along the provided axis + in the output. Default: `False`. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of `out` must match the expected shape and the + expected data type of the result or (if provided) `dtype`. + If `None` then a new array is returned. Default: `None`. + + Returns: + usm_ndarray: + an array containing cumulative logsumexp results. The returned + array has the data type as described in the `dtype` parameter + description above. 
+ + The returned array shape is determined as follows: + + * If `include_initial` is `False`, the returned array will + have the same shape as `x` + * If `include_initial` is `True`, the returned array will + have the same shape as `x` except the axis along which the + cumulative logsumexp is calculated, which will have size + `N+1` + """ + return _accumulate_common( + x, + axis, + dtype, + include_initial, + out, + tai._cumlogsumexp_over_axis, + tai._cumlogsumexp_final_axis_include_initial, + tai._cumlogsumexp_dtype_supported, + _default_accumulation_dtype_fp_types, + ) diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index 9894bd304701..460a0dc80f0f 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -48,6 +48,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -934,7 +935,7 @@ def cumlogsumexp( return dpnp_wrap_reduction_call( usm_x, out, - dpt.cumulative_logsumexp, + dpt_ext.cumulative_logsumexp, _get_accumulation_res_dt(x, dtype), axis=axis, dtype=dtype, From a1d9b0a737a7ef1048de4f5c6723eeff5a5ccc3a Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:06:49 -0800 Subject: [PATCH 110/145] Extend ignore-words-list codespell --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 67fb75cb5f54..73844d1b0c17 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT" 
quiet-level = 3 [tool.coverage.report] From 8a87fea7458a803be34f81139b3ac27f741eed1d Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:07:00 -0800 Subject: [PATCH 111/145] Move utils rich_comparisons.hpp file --- .../include/utils/rich_comparisons.hpp | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp diff --git a/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp b/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp new file mode 100644 index 000000000000..87cdfbfbd54f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/utils/rich_comparisons.hpp @@ -0,0 +1,149 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include "sycl/sycl.hpp" + +namespace dpctl::tensor::rich_comparisons +{ + +namespace detail +{ +template +struct ExtendedRealFPLess +{ + /* [R, nan] */ + bool operator()(const fpT v1, const fpT v2) const + { + return (!std::isnan(v1) && (std::isnan(v2) || (v1 < v2))); + } +}; + +template +struct ExtendedRealFPGreater +{ + bool operator()(const fpT v1, const fpT v2) const + { + return (!std::isnan(v2) && (std::isnan(v1) || (v2 < v1))); + } +}; + +template +struct ExtendedComplexFPLess +{ + /* [(R, R), (R, nan), (nan, R), (nan, nan)] */ + + bool operator()(const cT &v1, const cT &v2) const + { + using realT = typename cT::value_type; + + const realT real1 = std::real(v1); + const realT real2 = std::real(v2); + + const bool r1_nan = std::isnan(real1); + const bool r2_nan = std::isnan(real2); + + const realT imag1 = std::imag(v1); + const realT imag2 = std::imag(v2); + + const bool i1_nan = std::isnan(imag1); + const bool i2_nan = std::isnan(imag2); + + const int idx1 = ((r1_nan) ? 2 : 0) + ((i1_nan) ? 1 : 0); + const int idx2 = ((r2_nan) ? 2 : 0) + ((i2_nan) ? 
1 : 0); + + const bool res = + !(r1_nan && i1_nan) && + ((idx1 < idx2) || + ((idx1 == idx2) && + ((r1_nan && !i1_nan && (imag1 < imag2)) || + (!r1_nan && i1_nan && (real1 < real2)) || + (!r1_nan && !i1_nan && + ((real1 < real2) || (!(real2 < real1) && (imag1 < imag2))))))); + + return res; + } +}; + +template +struct ExtendedComplexFPGreater +{ + bool operator()(const cT &v1, const cT &v2) const + { + auto less_ = ExtendedComplexFPLess{}; + return less_(v2, v1); + } +}; + +template +inline constexpr bool is_fp_v = (std::is_same_v || + std::is_same_v || + std::is_same_v); + +} // namespace detail + +template +struct AscendingSorter +{ + using type = std::conditional_t, + detail::ExtendedRealFPLess, + std::less>; +}; + +template +struct AscendingSorter> +{ + using type = detail::ExtendedComplexFPLess>; +}; + +template +struct DescendingSorter +{ + using type = std::conditional_t, + detail::ExtendedRealFPGreater, + std::greater>; +}; + +template +struct DescendingSorter> +{ + using type = detail::ExtendedComplexFPGreater>; +}; + +} // namespace dpctl::tensor::rich_comparisons From 43a7a0474168c29c55a9c07a572cb138e3117c4e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:49:51 -0800 Subject: [PATCH 112/145] Initialize _tensor_sorting_impl extension and move _radix_sort and py_sort --- dpctl_ext/tensor/CMakeLists.txt | 21 +- .../include/kernels/sorting/radix_sort.hpp | 1920 +++++++++++++++++ .../kernels/sorting/sort_impl_fn_ptr_t.hpp | 61 + .../include/kernels/sorting/sort_utils.hpp | 148 ++ .../source/sorting/py_sort_common.hpp | 179 ++ .../libtensor/source/sorting/radix_sort.cpp | 192 ++ .../libtensor/source/sorting/radix_sort.hpp | 47 + .../source/sorting/radix_sort_support.hpp | 78 + .../libtensor/source/tensor_sorting.cpp | 57 + pyproject.toml | 2 +- 10 files changed, 2703 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index eff5e7552648..cf0a78efdd59 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -69,10 +69,23 @@ set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) +set(_sorting_sources + #{CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp +) set(_tensor_accumulation_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp ${_accumulator_sources} ) +set(_tensor_sorting_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp + ${_sorting_sources} +) set(_static_lib_trgt simplify_iteration_space) @@ -101,6 +114,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_i target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) list(APPEND _py_trgts ${python_module_name}) 
+set(python_module_name _tensor_sorting_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if(WIN32) set(_clang_prefix "/clang:") @@ -117,7 +136,7 @@ list( APPEND _no_fast_math_sources # ${_elementwise_sources} # ${_reduction_sources} - # ${_sorting_sources} + ${_sorting_sources} # ${_linalg_sources} ${_accumulator_sources} ) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp new file mode 100644 index 000000000000..545444101fc6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -0,0 +1,1920 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace radix_sort_details +{ + +template +class radix_sort_count_kernel; + +template +class radix_sort_scan_kernel; + +template +class radix_sort_reorder_peer_kernel; + +template +class radix_sort_reorder_kernel; + +/*! 
@brief Computes smallest exponent such that `n <= (1 << exponent)` */ +template && + sizeof(SizeT) == sizeof(std::uint64_t), + int> = 0> +std::uint32_t ceil_log2(SizeT n) +{ + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1) + // ceil_log2(n) == 1 + floor_log2(n-1) + if (n <= 1) + return std::uint32_t{1}; + + std::uint32_t exp{1}; + --n; + if (n >= (SizeT{1} << 32)) { + n >>= 32; + exp += 32; + } + if (n >= (SizeT{1} << 16)) { + n >>= 16; + exp += 16; + } + if (n >= (SizeT{1} << 8)) { + n >>= 8; + exp += 8; + } + if (n >= (SizeT{1} << 4)) { + n >>= 4; + exp += 4; + } + if (n >= (SizeT{1} << 2)) { + n >>= 2; + exp += 2; + } + if (n >= (SizeT{1} << 1)) { + n >>= 1; + ++exp; + } + return exp; +} + +//---------------------------------------------------------- +// bitwise order-preserving conversions to unsigned integers +//---------------------------------------------------------- + +template +bool order_preserving_cast(bool val) +{ + if constexpr (is_ascending) + return val; + else + return !val; +} + +template , int> = 0> +UIntT order_preserving_cast(UIntT val) +{ + if constexpr (is_ascending) { + return val; + } + else { + // bitwise invert + return (~val); + } +} + +template && std::is_signed_v, + int> = 0> +std::make_unsigned_t order_preserving_cast(IntT val) +{ + using UIntT = std::make_unsigned_t; + const UIntT uint_val = sycl::bit_cast(val); + + if constexpr (is_ascending) { + // ascending_mask: 100..0 + static constexpr UIntT ascending_mask = + (UIntT(1) << std::numeric_limits::digits); + return (uint_val ^ ascending_mask); + } + else { + // descending_mask: 011..1 + static constexpr UIntT descending_mask = + (std::numeric_limits::max() >> 1); + return (uint_val ^ descending_mask); + } +} + +template +std::uint16_t order_preserving_cast(sycl::half val) +{ + using UIntT = std::uint16_t; + + const UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? 
std::numeric_limits::quiet_NaN() + : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 15)); + + static constexpr UIntT zero_mask = UIntT(0x8000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFu); + + static constexpr UIntT inv_zero_mask = static_cast(~zero_mask); + static constexpr UIntT inv_nonzero_mask = static_cast(~nonzero_mask); + + if constexpr (is_ascending) { + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + } + else { + mask = (zero_fp_sign_bit) ? (inv_zero_mask) : (inv_nonzero_mask); + } + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint32_t), + int> = 0> +std::uint32_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint32_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 31)); + + static constexpr UIntT zero_mask = UIntT(0x80000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? (~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +template && + sizeof(FloatT) == sizeof(std::uint64_t), + int> = 0> +std::uint64_t order_preserving_cast(FloatT val) +{ + using UIntT = std::uint64_t; + + UIntT uint_val = sycl::bit_cast( + (sycl::isnan(val)) ? std::numeric_limits::quiet_NaN() : val); + UIntT mask; + + // test the sign bit of the original value + const bool zero_fp_sign_bit = (UIntT(0) == (uint_val >> 63)); + + static constexpr UIntT zero_mask = UIntT(0x8000000000000000u); + static constexpr UIntT nonzero_mask = UIntT(0xFFFFFFFFFFFFFFFFu); + + if constexpr (is_ascending) + mask = (zero_fp_sign_bit) ? zero_mask : nonzero_mask; + else + mask = (zero_fp_sign_bit) ? 
(~zero_mask) : (~nonzero_mask); + + return (uint_val ^ mask); +} + +//----------------- +// bucket functions +//----------------- + +template +constexpr std::size_t number_of_bits_in_type() +{ + constexpr std::size_t type_bits = + (sizeof(T) * std::numeric_limits::digits); + return type_bits; +} + +// the number of buckets (size of radix bits) in T +template +constexpr std::uint32_t number_of_buckets_in_type(std::uint32_t radix_bits) +{ + constexpr std::size_t type_bits = number_of_bits_in_type(); + return (type_bits + radix_bits - 1) / radix_bits; +} + +// get bits value (bucket) in a certain radix position +template +std::uint32_t get_bucket_id(T val, std::uint32_t radix_offset) +{ + static_assert(std::is_unsigned_v); + + return (val >> radix_offset) & T(radix_mask); +} + +//-------------------------------- +// count kernel (single iteration) +//-------------------------------- + +template +sycl::event + radix_sort_count_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::size_t wg_size, + std::uint32_t radix_offset, + std::size_t n_values, + ValueT *vals_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const Proj &proj_op, + const bool is_ascending, + const std::vector &dependency_events) +{ + // bin_count = radix_states used for an array storing bucket state counters + static constexpr std::uint32_t radix_states = + (std::uint32_t(1) << radix_bits); + static constexpr std::uint32_t radix_mask = radix_states - 1; + + // iteration space info + const std::size_t n = n_values; + // each segment is processed by a work-group + const std::size_t elems_per_segment = (n + n_segments - 1) / n_segments; + const std::size_t no_op_flag_id = n_counts - 1; + + assert(n_counts == (n_segments + 1) * radix_states + 1); + + sycl::event local_count_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + + sycl::local_accessor counts_lacc(wg_size * radix_states, + cgh); + + sycl::nd_range<1> ndRange(n_iters * 
n_segments * wg_size, wg_size); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + // 0 <= lid < wg_size + const std::size_t lid = ndit.get_local_id(0); + // 0 <= group_id < n_segments * n_iters + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t val_iter_offset = iter_id * n; + // 0 <= wgr_id < n_segments + const std::size_t wgr_id = group_id - iter_id * n_segments; + + const std::size_t seg_start = elems_per_segment * wgr_id; + + // count per work-item: create a private array for storing count + // values here bin_count = radix_states + std::array counts_arr = {CountT{0}}; + + // count per work-item: count values and write result to private + // count array + const std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n); + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + val_id])); + const std::uint32_t bucket_id = + get_bucket_id(val, radix_offset); + + // increment counter for this bit bucket + ++counts_arr[bucket_id]; + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += wg_size) { + // get the bucket for the bit-ordered input value, + // applying the offset and mask for radix bits + const auto val = + order_preserving_cast( + proj_op(vals_ptr[val_iter_offset + val_id])); + const std::uint32_t bucket_id = + get_bucket_id(val, radix_offset); + + // increment counter for this bit bucket + ++counts_arr[bucket_id]; + } + } + + // count per work-item: write private count array to local count + // array counts_lacc is concatenation of private count arrays from + // each work-item in the order of their local ids + const std::uint32_t count_start_id = radix_states * lid; + for (std::uint32_t 
radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + counts_lacc[count_start_id + radix_state_id] = + counts_arr[radix_state_id]; + } + + sycl::group_barrier(ndit.get_group()); + + // count per work-group: reduce till count_lacc[] size > wg_size + // all work-items in the work-group do the work. + for (std::uint32_t i = 1; i < radix_states; ++i) { + // Since we interested in computing total count over work-group + // for each radix state, the correct result is only assured if + // wg_size >= radix_states + counts_lacc[lid] += counts_lacc[wg_size * i + lid]; + } + + sycl::group_barrier(ndit.get_group()); + + // count per work-group: reduce until count_lacc[] size > + // radix_states (n_witems /= 2 per iteration) + for (std::uint32_t n_witems = (wg_size >> 1); + n_witems >= radix_states; n_witems >>= 1) + { + if (lid < n_witems) + counts_lacc[lid] += counts_lacc[n_witems + lid]; + + sycl::group_barrier(ndit.get_group()); + } + + const std::size_t iter_counter_offset = iter_id * n_counts; + + // count per work-group: write local count array to global count + // array + if (lid < radix_states) { + // move buckets with the same id to adjacent positions, + // thus splitting count array into radix_states regions + counts_ptr[iter_counter_offset + (n_segments + 1) * lid + + wgr_id] = counts_lacc[lid]; + } + + // side work: reset 'no-operation-flag', signaling to skip re-order + // phase + if (wgr_id == 0 && lid == 0) { + CountT &no_op_flag = + counts_ptr[iter_counter_offset + no_op_flag_id]; + no_op_flag = 0; + } + }); + }); + + return local_count_ev; +} + +//----------------------------------------------------------------------- +// radix sort: scan kernel (single iteration) +//----------------------------------------------------------------------- + +template +sycl::event radix_sort_scan_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::size_t wg_size, + std::size_t n_values, + std::size_t n_counts, + CountT 
*counts_ptr, + const std::vector depends) +{ + const std::size_t no_op_flag_id = n_counts - 1; + + // Scan produces local offsets using count values. + // There are no local offsets for the first segment, but the rest segments + // should be scanned with respect to the count value in the first segment + // what requires n + 1 positions + const std::size_t scan_size = n_segments + 1; + wg_size = std::min(scan_size, wg_size); + + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + // compilation of the kernel prevents out of resources issue, which may + // occur due to usage of collective algorithms such as joint_exclusive_scan + // even if local memory is not explicitly requested + sycl::event scan_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::nd_range<1> ndRange(n_iters * radix_states * wg_size, wg_size); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / radix_states; + const std::size_t wgr_id = group_id - iter_id * radix_states; + // find borders of a region with a specific bucket id + auto begin_ptr = + counts_ptr + scan_size * wgr_id + iter_id * n_counts; + + sycl::joint_exclusive_scan(ndit.get_group(), begin_ptr, + begin_ptr + scan_size, begin_ptr, + CountT(0), sycl::plus{}); + + const auto lid = ndit.get_local_linear_id(); + + // NB: No race condition here, because the condition may ever be + // true for only on one WG, one WI. 
+ if ((lid == wg_size - 1) && (begin_ptr[scan_size - 1] == n_values)) + { + // set flag, since all the values got into one + // this is optimization, may happy often for + // higher radix offsets (all zeros) + auto &no_op_flag = + counts_ptr[iter_id * n_counts + no_op_flag_id]; + no_op_flag = 1; + } + }); + }); + + return scan_ev; +} + +//----------------------------------------------------------------------- +// radix sort: group level reorder algorithms +//----------------------------------------------------------------------- + +struct empty_storage +{ + template + empty_storage(T &&...) + { + } +}; + +// Number with `n` least significant bits of uint32_t +inline std::uint32_t n_ls_bits_set(std::uint32_t n) noexcept +{ + static constexpr std::uint32_t zero{}; + static constexpr std::uint32_t all_bits_set = ~zero; + + return ~(all_bits_set << n); +} + +enum class peer_prefix_algo +{ + subgroup_ballot, + atomic_fetch_or, + scan_then_broadcast +}; + +template +struct peer_prefix_helper; + +template +auto get_accessor_pointer(const AccT &acc) +{ + return acc.template get_multi_ptr().get(); +} + +template +struct peer_prefix_helper +{ + using AtomicT = sycl::atomic_ref; + using TempStorageT = sycl::local_accessor; + +private: + sycl::sub_group sgroup; + std::uint32_t lid; + std::uint32_t item_mask; + AtomicT atomic_peer_mask; + +public: + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT lacc) + : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()), + item_mask(n_ls_bits_set(lid)), atomic_peer_mask(lacc[0]) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + // reset mask for each radix state + if (lid == 0) + atomic_peer_mask.store(std::uint32_t{0}); + sycl::group_barrier(sgroup); + + const std::uint32_t uint_contrib{wi_bit_set ? 
std::uint32_t{1} + : std::uint32_t{0}}; + + // set local id's bit to 1 if the bucket value matches the radix state + atomic_peer_mask.fetch_or(uint_contrib << lid); + sycl::group_barrier(sgroup); + std::uint32_t peer_mask_bits = atomic_peer_mask.load(); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask_bits &= item_mask; + new_offset_id |= wi_bit_set + ? (offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT{0}; + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ + using TempStorageT = empty_storage; + using ItemType = sycl::nd_item<1>; + using SubGroupType = sycl::sub_group; + +private: + SubGroupType sgroup; + std::uint32_t sg_size; + +public: + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), sg_size(sgroup.get_local_range()[0]) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + const std::uint32_t contrib{wi_bit_set ? std::uint32_t{1} + : std::uint32_t{0}}; + + std::uint32_t sg_item_offset = sycl::exclusive_scan_over_group( + sgroup, contrib, sycl::plus{}); + + new_offset_id |= + (wi_bit_set ? 
(offset_prefix + sg_item_offset) : OffsetT(0)); + + // the last scanned value does not contain number of all copies, thus + // adding contribution + std::uint32_t sg_total_offset = sycl::group_broadcast( + sgroup, sg_item_offset + contrib, sg_size - 1); + + return sg_total_offset; + } +}; + +template +struct peer_prefix_helper +{ +private: + sycl::sub_group sgroup; + std::uint32_t lid; + sycl::ext::oneapi::sub_group_mask item_sg_mask; + + sycl::ext::oneapi::sub_group_mask mask_builder(std::uint32_t mask, + std::uint32_t sg_size) + { + return sycl::detail::Builder::createSubGroupMask< + sycl::ext::oneapi::sub_group_mask>(mask, sg_size); + } + +public: + using TempStorageT = empty_storage; + + peer_prefix_helper(sycl::nd_item<1> ndit, TempStorageT) + : sgroup(ndit.get_sub_group()), lid(ndit.get_local_linear_id()), + item_sg_mask( + mask_builder(n_ls_bits_set(lid), sgroup.get_local_linear_range())) + { + } + + std::uint32_t peer_contribution(OffsetT &new_offset_id, + OffsetT offset_prefix, + bool wi_bit_set) const + { + // set local id's bit to 1 if the bucket value matches the radix state + auto peer_mask = sycl::ext::oneapi::group_ballot(sgroup, wi_bit_set); + std::uint32_t peer_mask_bits{}; + + peer_mask.extract_bits(peer_mask_bits); + std::uint32_t sg_total_offset = sycl::popcount(peer_mask_bits); + + // get the local offset index from the bits set in the peer mask with + // index less than the work item ID + peer_mask &= item_sg_mask; + peer_mask.extract_bits(peer_mask_bits); + + new_offset_id |= wi_bit_set + ? 
(offset_prefix + sycl::popcount(peer_mask_bits)) + : OffsetT(0); + + return sg_total_offset; + } +}; + +template +void copy_func_for_radix_sort(const std::size_t n_segments, + const std::size_t elems_per_segment, + const std::size_t sg_size, + const std::uint32_t lid, + const std::size_t wgr_id, + const InputT *input_ptr, + const std::size_t n_values, + OutputT *output_ptr) +{ + // item info + const std::size_t seg_start = elems_per_segment * wgr_id; + + std::size_t seg_end = sycl::min(seg_start + elems_per_segment, n_values); + + // ensure that each work item in a subgroup does the same number of loop + // iterations + const std::uint16_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + // find offsets for the same values within a segment and fill the resulting + // buffer + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + output_ptr[val_id] = std::move(input_ptr[val_id]); + } + + if (tail_size > 0 && lid < tail_size) { + const std::size_t val_id = seg_end + lid; + output_ptr[val_id] = std::move(input_ptr[val_id]); + } +} + +//----------------------------------------------------------------------- +// radix sort: reorder kernel (per iteration) +//----------------------------------------------------------------------- +template +sycl::event + radix_sort_reorder_submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_offset, + std::size_t n_values, + const InputT *input_ptr, + OutputT *output_ptr, + std::size_t n_offsets, + OffsetT *offset_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector dependency_events) +{ + using ValueT = InputT; + using PeerHelper = peer_prefix_helper; + + static constexpr std::uint32_t radix_states = std::uint32_t{1} + << radix_bits; + static constexpr std::uint32_t radix_mask = radix_states - 1; + const std::size_t elems_per_segment = + (n_values + n_segments - 1) / n_segments; + + const std::size_t no_op_flag_id 
= n_offsets - 1; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + sycl::event reorder_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependency_events); + cgh.use_kernel_bundle(kb); + + using StorageT = typename PeerHelper::TempStorageT; + + StorageT peer_temp(1, cgh); + + sycl::range<1> lRange{sg_size}; + sycl::range<1> gRange{n_iters * n_segments * sg_size}; + + sycl::nd_range<1> ndRange{gRange, lRange}; + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> ndit) { + const std::size_t group_id = ndit.get_group(0); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + + auto b_offset_ptr = offset_ptr + iter_id * n_offsets; + auto b_input_ptr = input_ptr + iter_id * n_values; + auto b_output_ptr = output_ptr + iter_id * n_values; + + const std::uint32_t lid = ndit.get_local_id(0); + + auto &no_op_flag = b_offset_ptr[no_op_flag_id]; + if (no_op_flag) { + // no reordering necessary, simply copy + copy_func_for_radix_sort( + n_segments, elems_per_segment, sg_size, lid, segment_id, + b_input_ptr, n_values, b_output_ptr); + return; + } + + // create a private array for storing offset values + // and add total offset and offset for compute unit + // for a certain radix state + std::array offset_arr{}; + const std::size_t scan_size = n_segments + 1; + + OffsetT scanned_bin = 0; + + /* find cumulative offset */ + static constexpr std::uint32_t zero_radix_state_id = 0; + offset_arr[zero_radix_state_id] = b_offset_ptr[segment_id]; + + for (std::uint32_t radix_state_id = 1; + radix_state_id < radix_states; ++radix_state_id) + { + const std::uint32_t local_offset_id = + segment_id + 
scan_size * radix_state_id; + + // scan bins serially + const std::size_t last_segment_bucket_id = + radix_state_id * scan_size - 1; + scanned_bin += b_offset_ptr[last_segment_bucket_id]; + + offset_arr[radix_state_id] = + scanned_bin + b_offset_ptr[local_offset_id]; + } + + const std::size_t seg_start = elems_per_segment * segment_id; + std::size_t seg_end = + sycl::min(seg_start + elems_per_segment, n_values); + // ensure that each work item in a subgroup does the same number of + // loop iterations + const std::uint32_t tail_size = (seg_end - seg_start) % sg_size; + seg_end -= tail_size; + + const PeerHelper peer_prefix_hlp(ndit, peer_temp); + + // find offsets for the same values within a segment and fill the + // resulting buffer + if (is_ascending) { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + else { + for (std::size_t val_id = seg_start + lid; val_id < seg_end; + val_id += sg_size) { + ValueT in_val = std::move(b_input_ptr[val_id]); + + // get the bucket for the bit-ordered input value, applying + // the offset and mask for radix bits + const auto mapped_val = + order_preserving_cast( + proj_op(in_val)); + std::uint32_t 
bucket_id = + get_bucket_id(mapped_val, radix_offset); + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + /* modified by reference */ new_offset_id, + offset_arr[radix_state_id], + /* bit contribution from this work-item */ + is_current_bucket); + offset_arr[radix_state_id] += sg_total_offset; + } + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + if (tail_size > 0) { + ValueT in_val; + + // default: is greater than any actual radix state + std::uint32_t bucket_id = radix_states; + if (lid < tail_size) { + in_val = std::move(b_input_ptr[seg_end + lid]); + + const auto proj_val = proj_op(in_val); + const auto mapped_val = + (is_ascending) + ? order_preserving_cast( + proj_val) + : order_preserving_cast( + proj_val); + bucket_id = + get_bucket_id(mapped_val, radix_offset); + } + + OffsetT new_offset_id = 0; + for (std::uint32_t radix_state_id = 0; + radix_state_id < radix_states; ++radix_state_id) + { + bool is_current_bucket = (bucket_id == radix_state_id); + std::uint32_t sg_total_offset = + peer_prefix_hlp.peer_contribution( + new_offset_id, offset_arr[radix_state_id], + is_current_bucket); + + offset_arr[radix_state_id] += sg_total_offset; + } + + if (lid < tail_size) { + b_output_ptr[new_offset_id] = std::move(in_val); + } + } + }); + }); + + return reorder_ev; +} + +template +sizeT _slm_adjusted_work_group_size(sycl::queue &exec_q, + sizeT required_slm_bytes_per_wg, + sizeT wg_size) +{ + const auto &dev = exec_q.get_device(); + + if (wg_size == 0) + wg_size = + dev.template get_info(); + + const auto local_mem_sz = + dev.template get_info(); + + return sycl::min(local_mem_sz / required_slm_bytes_per_wg, wg_size); +} + +//----------------------------------------------------------------------- +// radix sort: one iteration 
+//----------------------------------------------------------------------- + +template +struct parallel_radix_sort_iteration_step +{ + template + using count_phase = radix_sort_count_kernel; + template + using local_scan_phase = radix_sort_scan_kernel; + template + using reorder_peer_phase = + radix_sort_reorder_peer_kernel; + template + using reorder_phase = radix_sort_reorder_kernel; + + template + static sycl::event submit(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_segments, + std::uint32_t radix_iter, + std::size_t n_values, + const InputT *in_ptr, + OutputT *out_ptr, + std::size_t n_counts, + CountT *counts_ptr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &dependency_events) + { + using _RadixCountKernel = count_phase; + using _RadixLocalScanKernel = + local_scan_phase; + using _RadixReorderPeerKernel = + reorder_peer_phase; + using _RadixReorderKernel = + reorder_phase; + + const auto &supported_sub_group_sizes = + exec_q.get_device() + .template get_info(); + const std::size_t max_sg_size = + (supported_sub_group_sizes.empty() + ? 0 + : supported_sub_group_sizes.back()); + const std::size_t reorder_sg_size = max_sg_size; + const std::size_t scan_wg_size = + exec_q.get_device() + .template get_info(); + + static constexpr std::size_t two_mils = (std::size_t(1) << 21); + std::size_t count_wg_size = + ((max_sg_size > 0) && (n_values > two_mils) ? 
128 : max_sg_size); + + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + // correct count_wg_size according to local memory limit in count phase + const auto max_count_wg_size = _slm_adjusted_work_group_size( + exec_q, sizeof(CountT) * radix_states, count_wg_size); + count_wg_size = + static_cast<::std::size_t>((max_count_wg_size / radix_states)) * + radix_states; + + // work-group size must be a power of 2 and not less than the number of + // states, for scanning to work correctly + + const std::size_t rounded_down_count_wg_size = + std::size_t{1} << (number_of_bits_in_type() - + sycl::clz(count_wg_size) - 1); + count_wg_size = + sycl::max(rounded_down_count_wg_size, std::size_t(radix_states)); + + // Compute the radix position for the given iteration + std::uint32_t radix_offset = radix_iter * radix_bits; + + // 1. Count Phase + sycl::event count_ev = + radix_sort_count_submit<_RadixCountKernel, radix_bits>( + exec_q, n_iters, n_segments, count_wg_size, radix_offset, + n_values, in_ptr, n_counts, counts_ptr, proj_op, is_ascending, + dependency_events); + + // 2. Scan Phase + sycl::event scan_ev = + radix_sort_scan_submit<_RadixLocalScanKernel, radix_bits>( + exec_q, n_iters, n_segments, scan_wg_size, n_values, n_counts, + counts_ptr, {count_ev}); + + // 3. 
Reorder Phase + sycl::event reorder_ev{}; + // subgroup_ballot-based peer algo uses extract_bits to populate + // uint32_t mask and hence relies on sub-group to be 32 or narrower + static constexpr std::size_t sg32_v = 32u; + static constexpr std::size_t sg16_v = 16u; + static constexpr std::size_t sg08_v = 8u; + if (sg32_v == reorder_sg_size || sg16_v == reorder_sg_size || + sg08_v == reorder_sg_size) + { + static constexpr auto peer_algorithm = + peer_prefix_algo::subgroup_ballot; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderPeerKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + else { + static constexpr auto peer_algorithm = + peer_prefix_algo::scan_then_broadcast; + + reorder_ev = radix_sort_reorder_submit<_RadixReorderKernel, + radix_bits, peer_algorithm>( + exec_q, n_iters, n_segments, radix_offset, n_values, in_ptr, + out_ptr, n_counts, counts_ptr, proj_op, is_ascending, + {scan_ev}); + } + + return reorder_ev; + } +}; // struct parallel_radix_sort_iteration + +template +class radix_sort_one_wg_krn; + +template +struct subgroup_radix_sort +{ +private: + class use_slm_tag + { + }; + class use_global_mem_tag + { + }; + +public: + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + ValueT *input_ptr, + OutputT *output_ptr, + ProjT proj_op, + const bool is_ascending, + const std::vector &depends) + { + static_assert(std::is_same_v, OutputT>); + + using _SortKernelLoc = + radix_sort_one_wg_krn; + using _SortKernelPartGlob = + radix_sort_one_wg_krn; + using _SortKernelGlob = + radix_sort_one_wg_krn; + + static constexpr std::size_t max_concurrent_work_groups = 128U; + + // Choose this to occupy the entire accelerator + const std::size_t n_work_groups = + std::min(n_iters, max_concurrent_work_groups); + + // determine which temporary allocation can be accommodated in SLM + 
const auto &SLM_availability = + check_slm_size(exec_q, n_to_sort); + + const std::size_t n_batch_size = n_work_groups; + + switch (SLM_availability) { + case temp_allocations::both_in_slm: + { + static constexpr auto storage_for_values = use_slm_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelLoc>()( + exec_q, n_iters, n_iters, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + case temp_allocations::counters_in_slm: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_slm_tag{}; + + return one_group_submitter<_SortKernelPartGlob>()( + exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + default: + { + static constexpr auto storage_for_values = use_global_mem_tag{}; + static constexpr auto storage_for_counters = use_global_mem_tag{}; + + return one_group_submitter<_SortKernelGlob>()( + exec_q, n_iters, n_batch_size, n_to_sort, input_ptr, output_ptr, + proj_op, is_ascending, storage_for_values, storage_for_counters, + depends); + } + } + } + +private: + template + class TempBuf; + + template + class TempBuf + { + std::size_t buf_size; + + public: + TempBuf(std::size_t, std::size_t n) : buf_size(n) {} + auto get_acc(sycl::handler &cgh) + { + return sycl::local_accessor(buf_size, cgh); + } + + std::size_t get_iter_stride() const + { + return std::size_t{0}; + } + }; + + template + class TempBuf + { + sycl::buffer buf; + std::size_t iter_stride; + + public: + TempBuf(std::size_t n_iters, std::size_t n) + : buf(n_iters * n), iter_stride(n) + { + } + auto get_acc(sycl::handler &cgh) + { + return sycl::accessor(buf, cgh, sycl::read_write, sycl::no_init); + } + std::size_t get_iter_stride() const + { + return iter_stride; + } + }; + + static_assert(wg_size <= 1024); + static 
constexpr std::uint16_t bin_count = (1 << radix); + static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1; + + enum class temp_allocations + { + both_in_slm, + counters_in_slm, + both_in_global_mem + }; + + template + temp_allocations check_slm_size(const sycl::queue &exec_q, SizeT n) + { + // the kernel is designed for data size <= 64K + assert(n <= (SizeT(1) << 16)); + + static constexpr auto req_slm_size_counters = + counter_buf_sz * sizeof(std::uint16_t); + + const auto &dev = exec_q.get_device(); + + // Pessimistically only use half of the memory to take into account + // a SYCL group algorithm might use a portion of SLM + const std::size_t max_slm_size = + dev.template get_info() / 2; + + const auto n_uniform = 1 << ceil_log2(n); + const auto req_slm_size_val = sizeof(T) * n_uniform; + + return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size) + ? + // the values and the counters are placed in SLM + temp_allocations::both_in_slm + : (req_slm_size_counters <= max_slm_size) + ? 
+ // the counters are placed in SLM, the values - in the + // global memory + temp_allocations::counters_in_slm + : + // the values and the counters are placed in the global + // memory + temp_allocations::both_in_global_mem; + } + + template + struct one_group_submitter + { + template + sycl::event operator()(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_batch_size, + std::size_t n_values, + InputT *input_arr, + OutputT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + SLM_value_tag, + SLM_counter_tag, + const std::vector &depends) + { + assert(!(n_values >> 16)); + + assert(n_values <= static_cast(block_size) * + static_cast(wg_size)); + + const std::uint16_t n = static_cast(n_values); + static_assert(std::is_same_v, OutputT>); + + using ValueT = OutputT; + + using KeyT = std::invoke_result_t; + + TempBuf buf_val( + n_batch_size, static_cast(block_size * wg_size)); + TempBuf buf_count( + n_batch_size, static_cast(counter_buf_sz)); + + sycl::range<1> lRange{wg_size}; + + sycl::event sort_ev; + std::vector deps{depends}; + + const std::size_t n_batches = + (n_iters + n_batch_size - 1) / n_batch_size; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + const auto &krn = kb.get_kernel(kernel_id); + + const std::uint32_t krn_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // due to a bug in CPU device implementation, an additional + // synchronization is necessary for short sub-group sizes + const bool work_around_needed = + exec_q.get_device().has(sycl::aspect::cpu) && + (krn_sg_size < 16); + + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { + + const std::size_t block_start = batch_id * n_batch_size; + + // input_arr/output_arr each has shape (n_iters, n) + InputT *this_input_arr = input_arr + block_start * 
n_values; + OutputT *this_output_arr = output_arr + block_start * n_values; + + const std::size_t block_end = + std::min(block_start + n_batch_size, n_iters); + + sycl::range<1> gRange{(block_end - block_start) * wg_size}; + sycl::nd_range ndRange{gRange, lRange}; + + sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(deps); + cgh.use_kernel_bundle(kb); + + // allocation to use for value exchanges + auto exchange_acc = buf_val.get_acc(cgh); + const std::size_t exchange_acc_iter_stride = + buf_val.get_iter_stride(); + + // allocation for counters + auto counter_acc = buf_count.get_acc(cgh); + const std::size_t counter_acc_iter_stride = + buf_count.get_iter_stride(); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> + ndit) { + ValueT values[block_size]; + + const std::size_t iter_id = ndit.get_group(0); + const std::size_t iter_val_offset = + iter_id * static_cast(n); + const std::size_t iter_counter_offset = + iter_id * counter_acc_iter_stride; + const std::size_t iter_exchange_offset = + iter_id * exchange_acc_iter_stride; + + std::uint16_t wi = ndit.get_local_linear_id(); + std::uint16_t begin_bit = 0; + + static constexpr std::uint16_t end_bit = + number_of_bits_in_type(); + + // copy from input array into values +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + values[i] = + (id < n) ? 
this_input_arr[iter_val_offset + id] + : ValueT{}; + } + + while (true) { + // indices for indirect access in the "re-order" + // phase + std::uint16_t indices[block_size]; + { + // pointers to bucket's counters + std::uint16_t *counters[block_size]; + + // counting phase + auto pcounter = + get_accessor_pointer(counter_acc) + + (wi + iter_counter_offset); + + // initialize counters +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; ++i) + pcounter[i * wg_size] = std::uint16_t{0}; + + sycl::group_barrier(ndit.get_group()); + + if (is_ascending) { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? get_bucket_id( + order_preserving_cast< + /* is_ascending */ + true>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + else { +#pragma unroll + for (std::uint16_t i = 0; i < block_size; + ++i) { + const std::uint16_t id = + wi * block_size + i; + static constexpr std::uint16_t + bin_mask = bin_count - 1; + + // points to the padded element, i.e. id + // is in-range + static constexpr std::uint16_t + default_out_of_range_bin_id = + bin_mask; + + const std::uint16_t bin = + (id < n) + ? 
get_bucket_id( + order_preserving_cast< + /* is_ascending */ + false>( + proj_op(values[i])), + begin_bit) + : default_out_of_range_bin_id; + + // counting and local offset calculation + counters[i] = &pcounter[bin * wg_size]; + indices[i] = *counters[i]; + *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } + } + } + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan phase + { + + // scan contiguous numbers + std::uint16_t bin_sum[bin_count]; + const std::size_t counter_offset0 = + iter_counter_offset + wi * bin_count; + bin_sum[0] = counter_acc[counter_offset0]; + +#pragma unroll + for (std::uint16_t i = 1; i < bin_count; + ++i) + bin_sum[i] = + bin_sum[i - 1] + + counter_acc[counter_offset0 + i]; + + sycl::group_barrier(ndit.get_group()); + + // exclusive scan local sum + std::uint16_t sum_scan = + sycl::exclusive_scan_over_group( + ndit.get_group(), + bin_sum[bin_count - 1], + sycl::plus()); + +// add to local sum, generate exclusive scan result +#pragma unroll + for (std::uint16_t i = 0; i < bin_count; + ++i) + counter_acc[counter_offset0 + i + 1] = + sum_scan + bin_sum[i]; + + if (wi == 0) + counter_acc[iter_counter_offset + 0] = + std::uint32_t{0}; + + sycl::group_barrier(ndit.get_group()); + } + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + // a global index is a local offset plus a + // global base index + indices[i] += *counters[i]; + } + + sycl::group_barrier(ndit.get_group()); + } + + begin_bit += radix; + + // "re-order" phase + sycl::group_barrier(ndit.get_group()); + if (begin_bit >= end_bit) { + // the last iteration - writing out the result +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; + if (r < n) { + this_output_arr[iter_val_offset + r] = + values[i]; + } + } + + return; + } + + // data exchange +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; 
+ if (r < n) + exchange_acc[iter_exchange_offset + r] = + values[i]; + } + + sycl::group_barrier(ndit.get_group()); + +#pragma unroll + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + if (id < n) + values[i] = + exchange_acc[iter_exchange_offset + id]; + } + + sycl::group_barrier(ndit.get_group()); + } + }); + }); + + deps = {sort_ev}; + } + + return sort_ev; + } + }; +}; + +template +struct OneWorkGroupRadixSortKernel; + +//----------------------------------------------------------------------- +// radix sort: main function +//----------------------------------------------------------------------- +template +sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, + std::size_t n_iters, + std::size_t n_to_sort, + const ValueT *input_arr, + ValueT *output_arr, + const ProjT &proj_op, + const bool is_ascending, + const std::vector &depends) +{ + assert(n_to_sort > 1); + + using KeyT = std::remove_cv_t< + std::remove_reference_t>>; + + // radix bits represent number of processed bits in each value during one + // iteration + static constexpr std::uint32_t radix_bits = 4; + + sycl::event sort_ev{}; + + const auto &dev = exec_q.get_device(); + const auto max_wg_size = + dev.template get_info(); + + static constexpr std::uint16_t ref_wg_size = 64; + if (n_to_sort <= 16384 && ref_wg_size * 8 <= max_wg_size) { + using _RadixSortKernel = OneWorkGroupRadixSortKernel; + + if (n_to_sort <= 64 && ref_wg_size <= max_wg_size) { + // wg_size * block_size == 64 * 1 * 1 == 64 + static constexpr std::uint16_t wg_size = ref_wg_size; + static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 128 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 1 == 128 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + 
static constexpr std::uint16_t block_size = 1; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 256 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 2 == 256 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 2; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 512 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 4 == 512 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 4; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 1024 && ref_wg_size * 2 <= max_wg_size) { + // wg_size * block_size == 64 * 2 * 8 == 1024 + static constexpr std::uint16_t wg_size = ref_wg_size * 2; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 2048 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 8 == 2048 + static constexpr std::uint16_t wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 8; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 4096 && ref_wg_size * 4 <= max_wg_size) { + // wg_size * block_size == 64 * 4 * 16 == 4096 + static constexpr std::uint16_t 
wg_size = ref_wg_size * 4; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else if (n_to_sort <= 8192 && ref_wg_size * 8 <= max_wg_size) { + // wg_size * block_size == 64 * 8 * 16 == 8192 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 16; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + else { + // wg_size * block_size == 64 * 8 * 32 == 16384 + static constexpr std::uint16_t wg_size = ref_wg_size * 8; + static constexpr std::uint16_t block_size = 32; + + sort_ev = subgroup_radix_sort<_RadixSortKernel, wg_size, block_size, + radix_bits>{}( + exec_q, n_iters, n_to_sort, input_arr, output_arr, proj_op, + is_ascending, depends); + } + } + else { + static constexpr std::uint32_t radix_iters = + number_of_buckets_in_type(radix_bits); + static constexpr std::uint32_t radix_states = std::uint32_t(1) + << radix_bits; + + static constexpr std::size_t bound_512k = (std::size_t(1) << 19); + static constexpr std::size_t bound_2m = (std::size_t(1) << 21); + + const auto wg_sz_k = (n_to_sort < bound_512k) ? 8 + : (n_to_sort <= bound_2m) ? 
4 + : 1; + const std::size_t wg_size = max_wg_size / wg_sz_k; + + const std::size_t n_segments = (n_to_sort + wg_size - 1) / wg_size; + + // Additional radix_states elements are used for getting local offsets + // from count values + no_op flag; 'No operation' flag specifies whether + // to skip re-order phase if the all keys are the same (lie in one bin) + const std::size_t n_counts = + (n_segments + 1) * radix_states + 1 /*no_op flag*/; + + using CountT = std::uint32_t; + + // memory for storing count and offset values + auto count_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_counts, exec_q); + + CountT *count_ptr = count_owner.get(); + + static constexpr std::uint32_t zero_radix_iter{0}; + + if constexpr (std::is_same_v) { + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, output_arr, + n_counts, count_ptr, proj_op, + is_ascending, depends); + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, count_owner); + + return sort_ev; + } + + auto tmp_arr_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_to_sort, exec_q); + + ValueT *tmp_arr = tmp_arr_owner.get(); + + // iterations per each bucket + assert("Number of iterations must be even" && radix_iters % 2 == 0); + assert(radix_iters > 0); + + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, /*even=*/true>::submit(exec_q, n_iters, n_segments, + zero_radix_iter, n_to_sort, + input_arr, tmp_arr, n_counts, + count_ptr, proj_op, is_ascending, + depends); + + for (std::uint32_t radix_iter = 1; radix_iter < radix_iters; + ++radix_iter) { + if (radix_iter % 2 == 0) { + sort_ev = parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/true>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, output_arr, + tmp_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + else { + sort_ev = 
parallel_radix_sort_iteration_step< + radix_bits, + /*even=*/false>::submit(exec_q, n_iters, n_segments, + radix_iter, n_to_sort, tmp_arr, + output_arr, n_counts, count_ptr, + proj_op, is_ascending, {sort_ev}); + } + } + + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, tmp_arr_owner, count_owner); + } + + return sort_ev; +} + +struct IdentityProj +{ + constexpr IdentityProj() {} + + template + constexpr T operator()(T val) const + { + return val; + } +}; + +template +struct ValueProj +{ + constexpr ValueProj() {} + + constexpr ValueT operator()(const std::pair &pair) const + { + return pair.first; + } +}; + +template +struct IndexedProj +{ + IndexedProj(const ValueT *arg_ptr) : ptr(arg_ptr), value_projector{} {} + + IndexedProj(const ValueT *arg_ptr, const ProjT &proj_op) + : ptr(arg_ptr), value_projector(proj_op) + { + } + + auto operator()(IndexT i) const + { + return value_projector(ptr[i]); + } + +private: + const ValueT *ptr; + ProjT value_projector; +}; + +} // namespace radix_sort_details + +using dpctl::tensor::ssize_t; + +template +sycl::event + radix_sort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of rows, + // i.e. 
number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + using Proj = radix_sort_details::IdentityProj; + static constexpr Proj proj_op{}; + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, proj_op, + sort_ascending, depends); + + return radix_sort_ev; +} + +template +class radix_argsort_index_write_out_krn; + +template +class radix_argsort_iota_krn; + +template +sycl::event + radix_argsort_axis1_contig_impl(sycl::queue &exec_q, + const bool sort_ascending, + // number of sub-arrays to sort (num. of + // rows in a matrix when sorting over rows) + std::size_t iter_nelems, + // size of each array to sort (length of + // rows, i.e. 
number of columns) + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(total_nelems, + exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = radix_argsort_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op, + sort_ascending, {iota_ev}); + + using MapBackKernelName = radix_argsort_index_write_out_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event dep = radix_sort_ev; + + // no need to perform map_back ( id % sort_nelems) + // if total_nelems == sort_nelems + if (iter_nelems > 1u) { + dep = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {dep}); + } + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {dep}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp new file mode 100644 index 000000000000..7b48f310a445 --- /dev/null +++ 
b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_impl_fn_ptr_t.hpp @@ -0,0 +1,61 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp new file mode 100644 index 000000000000..8a6b1a8131ca --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace dpctl::tensor::kernels::sort_utils_detail +{ + +namespace syclexp = sycl::ext::oneapi::experimental; + +template +sycl::event iota_impl(sycl::queue &exec_q, + T *data, + std::size_t nelems, + const std::vector &dependent_events) +{ + static constexpr std::uint32_t lws = 256; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws); + + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t offset = (gid - lane_id) * n_wi; + const std::uint32_t max_sgSize = sg.get_max_local_range()[0]; + + std::array stripe{}; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + stripe[i] = T(offset + lane_id + i * max_sgSize); + } + + if (offset + n_wi * max_sgSize < nelems) { + static constexpr auto group_ls_props = syclexp::properties{ + syclexp::data_placement_striped + // , syclexp::full_group + }; + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&data[offset]); + + syclexp::group_store(sg, sycl::span{&stripe[0], n_wi}, + out_multi_ptr, group_ls_props); + } + else { + for (std::size_t idx = offset + lane_id; idx < nelems; + idx += max_sgSize) { + data[idx] = T(idx); + } + } + }); + }); + + return e; +} + +template +sycl::event map_back_impl(sycl::queue &exec_q, + std::size_t nelems, + const IndexTy *flat_index_data, + IndexTy *reduced_index_data, + std::size_t row_size, + const std::vector 
&dependent_events) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const IndexTy linear_index = flat_index_data[data_id]; + reduced_index_data[data_id] = (linear_index % row_size); + } + } + }); + }); + + return map_back_ev; +} + +} // namespace dpctl::tensor::kernels::sort_utils_detail diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp new file mode 100644 index 000000000000..0fdf7bec80fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp @@ -0,0 +1,179 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +template +std::pair + py_sort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with 
allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, sort_nelems * iter_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error("Both input arrays must have " + "the same value data type"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + if (sort_nelems > 1) { + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = sort_contig_fns[src_typeid]; + + if (nullptr == fn) { + throw py::value_error( + "Not implemented for the dtype of input arrays"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), + dst.get_data(), zero_offset, zero_offset, zero_offset, + zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + else { + assert(dst.get_size() == iter_nelems); + int src_elemsize = src.get_elemsize(); + + sycl::event copy_ev = + exec_q.copy(src.get_data(), dst.get_data(), + src_elemsize * iter_nelems, depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {copy_ev}), + copy_ev); + } + } + + throw py::value_error( + "Both source and 
destination arrays must be C-contiguous"); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp new file mode 100644 index 000000000000..fb3cab487df8 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp @@ -0,0 +1,192 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/type_dispatch.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "py_sort_common.hpp" +#include "radix_sort.hpp" +#include "radix_sort_support.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_radix_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_radix_sort_contig_dispatch_vector[td_ns::num_types]; + +namespace +{ + +template +sycl::event sort_axis1_contig_caller(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + 
const std::vector &depends) +{ + using dpctl::tensor::kernels::radix_sort_axis1_contig_impl; + + return radix_sort_axis1_contig_impl( + q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp, + iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset, + depends); +} + +} // end of anonymous namespace + +template +struct AscendingRadixSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined) { + return sort_axis1_contig_caller; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingRadixSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined) { + return sort_axis1_contig_caller; + } + else { + return nullptr; + } + } +}; + +void init_radix_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder< + sort_contig_fn_ptr_t, AscendingRadixSortContigFactory, td_ns::num_types> + dtv1; + dtv1.populate_dispatch_vector(ascending_radix_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_radix_sort_contig_dispatch_vector); +} + +bool py_radix_sort_defined(int typenum) +{ + const auto &array_types = td_ns::usm_ndarray_types(); + + try { + int type_id = array_types.typenum_to_lookup_id(typenum); + return (nullptr != + ascending_radix_sort_contig_dispatch_vector[type_id]); + } catch (const std::exception &e) { + return false; + } +} + +void init_radix_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_radix_sort_dispatch_vectors(); + + auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_radix_sort_contig_dispatch_vector); + }; + 
m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_radix_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_radix_sort_contig_dispatch_vector); + }; + m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + m.def("_radix_sort_dtype_supported", py_radix_sort_defined); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp new file mode 100644 index 000000000000..5f3c771b464b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_radix_sort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp new file mode 100644 index 000000000000..8d7e55a5cd28 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort_support.hpp @@ -0,0 +1,78 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +namespace dpctl::tensor::py_internal +{ + +template +struct TypeDefinedEntry : std::bool_constant> +{ + static constexpr bool is_defined = true; +}; + +struct NotDefinedEntry : std::true_type +{ + static constexpr bool is_defined = false; +}; + +template +struct RadixSortSupportVector +{ + using resolver_t = + typename std::disjunction, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + TypeDefinedEntry, + NotDefinedEntry>; + + static constexpr bool is_defined = resolver_t::is_defined; +}; + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp new file mode 100644 index 000000000000..d9ab1feca46a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -0,0 +1,57 @@ +//***************************************************************************** +// Copyright 
(c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include + +// #include "sorting/isin.hpp" +// #include "sorting/merge_argsort.hpp" +#include "sorting/merge_sort.hpp" +// #include "sorting/radix_argsort.hpp" +#include "sorting/radix_sort.hpp" +// #include "sorting/searchsorted.hpp" +// #include "sorting/topk.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_sorting_impl, m) +{ + // dpctl::tensor::py_internal::init_isin_functions(m); + dpctl::tensor::py_internal::init_merge_sort_functions(m); + // dpctl::tensor::py_internal::init_merge_argsort_functions(m); + // dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_radix_sort_functions(m); + // dpctl::tensor::py_internal::init_radix_argsort_functions(m); + // dpctl::tensor::py_internal::init_topk_functions(m); +} diff --git a/pyproject.toml b/pyproject.toml index 73844d1b0c17..6f5c83e542ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT" quiet-level = 3 [tool.coverage.report] From 1f018eca273573d66bf35f64497b815682642b0b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:52:48 -0800 Subject: [PATCH 113/145] Move _sort_ascending/descending to dpctl_ext.tensor._tensor_sorting_impl --- .../include/kernels/sorting/merge_sort.hpp | 857 ++++++++++++++++++ .../kernels/sorting/search_sorted_detail.hpp | 125 +++ .../libtensor/source/sorting/merge_sort.cpp | 138 +++ .../libtensor/source/sorting/merge_sort.hpp | 47 + pyproject.toml | 2 +- 5 files changed, 1168 insertions(+), 1 deletion(-) create mode 100644 
dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp new file mode 100644 index 000000000000..e14425605fe2 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -0,0 +1,857 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace merge_sort_detail +{ + +using dpctl::tensor::ssize_t; +using namespace dpctl::tensor::kernels::search_sorted_detail; + +/*! 
@brief Merge two contiguous sorted segments */ +template +void merge_impl(const std::size_t offset, + const InAcc in_acc, + OutAcc out_acc, + const std::size_t start_1, + const std::size_t end_1, + const std::size_t end_2, + const std::size_t start_out, + Compare comp, + const std::size_t chunk) +{ + const std::size_t start_2 = end_1; + // Borders of the sequences to merge within this call + const std::size_t local_start_1 = sycl::min(offset + start_1, end_1); + const std::size_t local_end_1 = sycl::min(local_start_1 + chunk, end_1); + const std::size_t local_start_2 = sycl::min(offset + start_2, end_2); + const std::size_t local_end_2 = sycl::min(local_start_2 + chunk, end_2); + + const std::size_t local_size_1 = local_end_1 - local_start_1; + const std::size_t local_size_2 = local_end_2 - local_start_2; + + const auto r_item_1 = in_acc[end_1 - 1]; + const auto l_item_2 = (start_2 < end_2) ? in_acc[start_2] : r_item_1; + + // Copy if the sequences are sorted with respect to each other or merge + // otherwise + if (!comp(l_item_2, r_item_1)) { + const std::size_t out_shift_1 = start_out + local_start_1 - start_1; + const std::size_t out_shift_2 = + start_out + end_1 - start_1 + local_start_2 - start_2; + + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + else if (comp(r_item_1, l_item_2)) { + const std::size_t out_shift_1 = + start_out + end_2 - start_2 + local_start_1 - start_1; + const std::size_t out_shift_2 = start_out + local_start_2 - start_2; + for (std::size_t i = 0; i < local_size_1; ++i) { + out_acc[out_shift_1 + i] = in_acc[local_start_1 + i]; + } + for (std::size_t i = 0; i < local_size_2; ++i) { + out_acc[out_shift_2 + i] = in_acc[local_start_2 + i]; + } + } + // Perform merging + else { + + // Process 1st sequence + if (local_start_1 < local_end_1) { + // Reduce the range for 
searching within the 2nd sequence and handle + // bound items find left border in 2nd sequence + const auto local_l_item_1 = in_acc[local_start_1]; + std::size_t l_search_bound_2 = + lower_bound_impl(in_acc, start_2, end_2, local_l_item_1, comp); + const std::size_t l_shift_1 = local_start_1 - start_1; + const std::size_t l_shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_1; + + std::size_t r_search_bound_2{}; + // find right border in 2nd sequence + if (local_size_1 > 1) { + const auto local_r_item_1 = in_acc[local_end_1 - 1]; + r_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, end_2, local_r_item_1, comp); + const auto r_shift_1 = local_end_1 - 1 - start_1; + const auto r_shift_2 = r_search_bound_2 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_1; + } + + // Handle intermediate items + if (r_search_bound_2 == l_search_bound_2) { + const std::size_t shift_2 = l_search_bound_2 - start_2; + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + const std::size_t shift_1 = idx - start_1; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + else { + for (std::size_t idx = local_start_1 + 1; idx < local_end_1 - 1; + ++idx) { + const auto intermediate_item_1 = in_acc[idx]; + // we shouldn't seek in whole 2nd sequence. 
Just for the + // part where the 1st sequence should be + l_search_bound_2 = lower_bound_impl( + in_acc, l_search_bound_2, r_search_bound_2, + intermediate_item_1, comp); + const std::size_t shift_1 = idx - start_1; + const std::size_t shift_2 = l_search_bound_2 - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_1; + } + } + } + // Process 2nd sequence + if (local_start_2 < local_end_2) { + // Reduce the range for searching within the 1st sequence and handle + // bound items find left border in 1st sequence + const auto local_l_item_2 = in_acc[local_start_2]; + std::size_t l_search_bound_1 = + upper_bound_impl(in_acc, start_1, end_1, local_l_item_2, comp); + const std::size_t l_shift_1 = l_search_bound_1 - start_1; + const std::size_t l_shift_2 = local_start_2 - start_2; + + out_acc[start_out + l_shift_1 + l_shift_2] = local_l_item_2; + + std::size_t r_search_bound_1{}; + // find right border in 1st sequence + if (local_size_2 > 1) { + const auto local_r_item_2 = in_acc[local_end_2 - 1]; + r_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, end_1, local_r_item_2, comp); + const std::size_t r_shift_1 = r_search_bound_1 - start_1; + const std::size_t r_shift_2 = local_end_2 - 1 - start_2; + + out_acc[start_out + r_shift_1 + r_shift_2] = local_r_item_2; + } + + // Handle intermediate items + if (l_search_bound_1 == r_search_bound_1) { + const std::size_t shift_1 = l_search_bound_1 - start_1; + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + const std::size_t shift_2 = idx - start_2; + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + else { + for (auto idx = local_start_2 + 1; idx < local_end_2 - 1; ++idx) + { + const auto intermediate_item_2 = in_acc[idx]; + // we shouldn't seek in whole 1st sequence. 
Just for the + // part where the 2nd sequence should be + l_search_bound_1 = upper_bound_impl( + in_acc, l_search_bound_1, r_search_bound_1, + intermediate_item_2, comp); + const std::size_t shift_1 = l_search_bound_1 - start_1; + const std::size_t shift_2 = idx - start_2; + + out_acc[start_out + shift_1 + shift_2] = + intermediate_item_2; + } + } + } + } +} + +template +void insertion_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + for (std::size_t i = begin + 1; i < end; ++i) { + const auto val_i = first[i]; + std::size_t j = i - 1; + while ((j + 1 > begin) && (comp(val_i, first[j]))) { + first[j + 1] = first[j]; + --j; + } + if (j + 1 < i) { + first[j + 1] = val_i; + } + } +} + +template +void leaf_sort_impl(Iter &&first, + std::size_t begin, + std::size_t end, + Compare &&comp) +{ + return insertion_sort_impl(std::forward(first), + std::move(begin), std::move(end), + std::forward(comp)); +} + +template +struct GetValueType +{ + using value_type = typename std::iterator_traits::value_type; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetValueType< + sycl::accessor> +{ + using value_type = ElementType; +}; + +template +struct GetValueType> +{ + using value_type = ElementType; +}; + +template +struct GetReadOnlyAccess +{ + Iter operator()(const Iter &it, sycl::handler &) + { + return it; + } +}; + +template +struct GetReadOnlyAccess> +{ + auto operator()(const sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_only); + return acc; + } +}; + +template +struct GetWriteDiscardAccess +{ + Iter operator()(Iter it, sycl::handler &) + { + return it; + } +}; + +template +struct GetWriteDiscardAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init); + return acc; + } +}; + +template +struct GetReadWriteAccess +{ + Iter operator()(Iter &it, sycl::handler &) + { + 
return it; + } +}; + +template +struct GetReadWriteAccess> +{ + auto operator()(sycl::buffer &buf, + sycl::handler &cgh) + { + sycl::accessor acc(buf, cgh, sycl::read_write); + return acc; + } +}; + +template +class sort_base_step_contig_krn; + +template +sycl::event + sort_base_step_contig_impl(sycl::queue &q, + const std::size_t iter_nelems, + const std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + const std::size_t conseq_nelems_sorted, + const std::vector &depends = {}) +{ + + using inpT = typename GetValueType::value_type; + using outT = typename GetValueType::value_type; + using KernelName = sort_base_step_contig_krn; + + const std::size_t n_segments = + quotient_ceil(sort_nelems, conseq_nelems_sorted); + + sycl::event base_sort = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + const sycl::range<1> gRange{iter_nelems * n_segments}; + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.parallel_for(gRange, [=](sycl::id<1> id) { + const std::size_t iter_id = id[0] / n_segments; + const std::size_t segment_id = id[0] - iter_id * n_segments; + + const std::size_t iter_offset = iter_id * sort_nelems; + const std::size_t beg_id = + iter_offset + segment_id * conseq_nelems_sorted; + const std::size_t end_id = + iter_offset + + std::min((segment_id + 1) * conseq_nelems_sorted, sort_nelems); + for (std::size_t i = beg_id; i < end_id; ++i) { + output_acc[i] = input_acc[i]; + } + + leaf_sort_impl(output_acc, beg_id, end_id, comp); + }); + }); + + return base_sort; +} + +template +class sort_over_work_group_contig_krn; + +template +sycl::event sort_over_work_group_contig_impl( + sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const InpAcc input, + OutAcc output, + const Comp &comp, + std::size_t &nelems_wg_sorts, + const std::vector &depends = {}) +{ + using inpT = typename GetValueType::value_type; + using T = typename 
GetValueType::value_type; + using KernelName = sort_over_work_group_contig_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = q.get_context(); + auto const &dev = q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(T)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + const std::size_t lws = sub_groups_per_work_group * max_sg_size; + + nelems_wg_sorts = elems_per_wi * lws; + + if (nelems_wg_sorts > nelems_per_slm) { + nelems_wg_sorts = (q.get_device().has(sycl::aspect::cpu) ? 
16 : 4); + + return sort_base_step_contig_impl( + q, iter_nelems, sort_nelems, input, output, comp, nelems_wg_sorts, + depends); + } + + // This assumption permits doing away with using a loop + assert(nelems_wg_sorts % lws == 0); + + const std::size_t n_segments = quotient_ceil(sort_nelems, nelems_wg_sorts); + + sycl::event base_sort_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{nelems_wg_sorts}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + auto input_acc = GetReadOnlyAccess{}(input, cgh); + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = segment_id * nelems_wg_sorts; + const std::size_t segment_end_idx = + std::min(segment_start_idx + nelems_wg_sorts, sort_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + T v = (array_id < sort_nelems) + ? 
input_acc[iter_id * sort_nelems + array_id] + : T{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = quotient_ceil(nelems_wg_sorts, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + leaf_sort_impl(work_space, chunk_start_idx, chunk_end_idx, comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) + { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = + sycl::min(2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = + sycl::min(start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = + sycl::min(end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_impl(offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + else { + merge_impl(offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + const auto &out_src = (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + if (array_id < sort_nelems) { + output_acc[iter_id * sort_nelems + array_id] = + out_src[array_id - segment_start_idx]; + } + } + }); + }); + + return base_sort_ev; +} + +class vacuous_krn; + +inline sycl::event tie_events(sycl::queue &q, + const std::vector depends) +{ + if (depends.empty()) + return sycl::event(); + if (depends.size() == 1) + return depends[0]; + + sycl::event e = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + using KernelName = vacuous_krn; + cgh.single_task([]() {}); + }); + + return e; +} + +template +class merge_adjacent_blocks_to_temp_krn; + +template +class merge_adjacent_blocks_from_temp_krn; + +template +sycl::event + merge_sorted_block_contig_impl(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + Acc output, + const Comp comp, + std::size_t sorted_block_size, + const std::vector &depends = {}) +{ + + if (sorted_block_size >= sort_nelems) + return tie_events(q, depends); + + // experimentally determined value + // size of segments worked upon by each work-item during merging + const sycl::device &dev = q.get_device(); + const std::size_t segment_size = (dev.has(sycl::aspect::cpu)) ? 32 : 4; + + const std::size_t chunk_size = + (sorted_block_size < segment_size) ? 
sorted_block_size : segment_size; + + assert(sorted_block_size % chunk_size == 0); + + using T = typename GetValueType::value_type; + + sycl::buffer temp_buf(sycl::range<1>{iter_nelems * sort_nelems}); + // T *allocated_mem = sycl::malloc_device(iter_nelems * sort_nelems, q); + + bool needs_copy = true; + bool used_depends = false; + + sycl::event dep_ev; + std::size_t chunks_merged = sorted_block_size / chunk_size; + + assert(!(chunks_merged & (chunks_merged - 1))); + + using ToTempKernelName = class merge_adjacent_blocks_to_temp_krn; + using FromTempKernelName = + class merge_adjacent_blocks_from_temp_krn; + + while (chunks_merged * chunk_size < sort_nelems) { + sycl::event local_dep = dep_ev; + + sycl::event merge_ev = q.submit([&](sycl::handler &cgh) { + if (used_depends) { + cgh.depends_on(local_dep); + } + else { + cgh.depends_on(depends); + used_depends = true; + } + + const std::size_t n_chunks = quotient_ceil(sort_nelems, chunk_size); + + if (needs_copy) { + sycl::accessor temp_acc{temp_buf, cgh, sycl::write_only, + sycl::no_init}; + auto output_acc = GetReadOnlyAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, output_acc, temp_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + else { + sycl::accessor 
temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + cgh.parallel_for( + {iter_nelems * n_chunks}, [=](sycl::id<1> wid) { + auto flat_idx = wid[0]; + auto iter_idx = flat_idx / n_chunks; + auto idx = flat_idx - n_chunks * iter_idx; + + const std::size_t idx_mult = + (idx / chunks_merged) * chunks_merged; + const std::size_t idx_rem = (idx - idx_mult); + const std::size_t start_1 = + sycl::min(2 * idx_mult * chunk_size, sort_nelems); + const std::size_t end_1 = sycl::min( + start_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t end_2 = sycl::min( + end_1 + chunks_merged * chunk_size, sort_nelems); + const std::size_t offset = chunk_size * idx_rem; + + const std::size_t iter_offset = iter_idx * sort_nelems; + + merge_impl(offset, temp_acc, output_acc, + iter_offset + start_1, iter_offset + end_1, + iter_offset + end_2, iter_offset + start_1, + comp, chunk_size); + }); + } + }); + + chunks_merged *= 2; + dep_ev = merge_ev; + + if (chunks_merged * chunk_size < sort_nelems) { + needs_copy = !needs_copy; + } + } + + if (needs_copy) { + sycl::event copy_ev = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dep_ev); + + sycl::accessor temp_acc{temp_buf, cgh, sycl::read_only}; + auto output_acc = GetWriteDiscardAccess{}(output, cgh); + + cgh.copy(temp_acc, output_acc); + }); + dep_ev = copy_ev; + } + + return dep_ev; +} + +} // namespace merge_sort_detail + +template > +sycl::event stable_sort_axis1_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + argTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + auto comp = Comp{}; + + // constant chosen experimentally to ensure monotonicity of + // sorting performance, as measured on GPU Max, and Iris Xe + constexpr std::size_t sequential_sorting_threshold = 16; + + if (sort_nelems < sequential_sorting_threshold) { + // equal work-item sorts entire row + sycl::event sequential_sorting_ev = + merge_sort_detail::sort_base_step_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sort_nelems, depends); + + return sequential_sorting_ev; + } + else { + std::size_t sorted_block_size{}; + + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, arg_tp, res_tp, comp, + sorted_block_size, // modified in place with size of sorted + // block size + depends); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, comp, + sorted_block_size, {base_sort_ev}); + + return merges_ev; + } +} + +template +class populate_index_data_krn; + +template +class index_map_to_rows_krn; + +template +struct IndexComp +{ + IndexComp(const ValueT *data, const ValueComp &comp_op) + : ptr(data), value_comp(comp_op) + { + } + + bool operator()(const IndexT &i1, const IndexT &i2) const + { + return value_comp(ptr[i1], ptr[i2]); + } + +private: + const ValueT *ptr; + ValueComp value_comp; +}; + +template > +sycl::event stable_argsort_axis1_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. 
of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + sort_arg_offset; + IndexTy *res_tp = + reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; + + const IndexComp index_comp{arg_tp, ValueComp{}}; + + static constexpr std::size_t determine_automatically = 0; + std::size_t sorted_block_size = determine_automatically; + + const std::size_t total_nelems = iter_nelems * sort_nelems; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + using IotaKernelName = populate_index_data_krn; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, res_tp, total_nelems, depends); + + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, res_tp, index_comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, + {base_sort_ev}); + + // no need to map back if iter_nelems == 1 + if (iter_nelems == 1u) { + return merges_ev; + } + + using MapBackKernelName = index_map_to_rows_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; + + sycl::event write_out_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev}); + + return write_out_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp 
b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp new file mode 100644 index 000000000000..c5b2e28411df --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -0,0 +1,125 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace dpctl::tensor::kernels +{ + +namespace search_sorted_detail +{ + +template +T quotient_ceil(T n, T m) +{ + return (n + m - 1) / m; +} + +template +std::size_t lower_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + const Compare &comp) +{ + std::size_t n = last - first; + std::size_t cur = n, start = first; + std::size_t it; + while (n > 0) { + it = start; + cur = n / 2; + it += cur; + if (comp(acc[it], value)) { + n -= cur + 1, start = ++it; + } + else + n = cur; + } + return start; +} + +template +std::size_t upper_bound_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + const Compare &comp) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_impl(acc, first, last, value, op_comp); +} + +template +std::size_t lower_bound_indexed_impl(const Acc acc, + std::size_t first, + std::size_t last, + const Value &value, + const Compare &comp, + const IndexerT &acc_indexer) +{ + std::size_t n = last - first; + std::size_t cur = n, start = first; + std::size_t it; + while (n > 0) { + it = start; + cur = n / 2; + it += cur; + if (comp(acc[acc_indexer(it)], value)) { + n -= cur + 1, start = ++it; + } + else + n = cur; + } + return start; +} + +template +std::size_t upper_bound_indexed_impl(const Acc acc, + const std::size_t first, + const std::size_t last, + const Value &value, + const Compare &comp, + const IndexerT &acc_indexer) +{ + const auto &op_comp = [comp](auto x, auto y) { return !comp(y, x); }; + return lower_bound_indexed_impl(acc, first, 
last, value, op_comp, + acc_indexer); +} + +} // namespace search_sorted_detail + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp new file mode 100644 index 000000000000..7bb0e37c6aed --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp @@ -0,0 +1,138 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_sort.hpp" +#include "py_sort_common.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_sort_contig_dispatch_vector[td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_sort_contig_dispatch_vector[td_ns::num_types]; + +template +struct AscendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +template +struct DescendingSortContigFactory +{ + fnT get() + { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using 
dpctl::tensor::kernels::stable_sort_axis1_contig_impl; + return stable_sort_axis1_contig_impl; + } +}; + +void init_merge_sort_dispatch_vectors(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchVectorBuilder + dtv1; + dtv1.populate_dispatch_vector(ascending_sort_contig_dispatch_vector); + + td_ns::DispatchVectorBuilder + dtv2; + dtv2.populate_dispatch_vector(descending_sort_contig_dispatch_vector); +} + +void init_merge_sort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_merge_sort_dispatch_vectors(); + + auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + }; + m.def("_sort_ascending", py_sort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_sort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_sort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + }; + m.def("_sort_descending", py_sort_descending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp new file mode 100644 index 000000000000..a6bdd0a4efe9 --- /dev/null +++ 
b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_merge_sort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/pyproject.toml b/pyproject.toml index 6f5c83e542ba..09253467b8dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -108,7 +108,7 @@ target-version = ['py310', 'py311', 'py312', 'py313', 'py314'] [tool.codespell] builtin = "clear,rare,informal,names" check-filenames = true -ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT" +ignore-words-list = "amin,arange,elemt,fro,hist,ith,mone,nd,nin,sinc,vart,GroupT,AccessorT,IndexT,fpT,OffsetT,inpT" quiet-level = 3 [tool.coverage.report] From b62d836496299b18391bd336522c5c77b5f20327 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 05:55:25 -0800 Subject: [PATCH 114/145] Move ti.sort() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_sorting.py | 163 +++++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_sorting.py | 2 +- 3 files changed, 166 insertions(+), 1 deletion(-) create mode 100644 dpctl_ext/tensor/_sorting.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 95c5e64c9322..8fa57c6d5028 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -82,6 +82,7 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._sorting import sort from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -124,6 +125,7 @@ "reshape", 
"result_type", "roll", + "sort", "squeeze", "stack", "swapaxes", diff --git a/dpctl_ext/tensor/_sorting.py b/dpctl_ext/tensor/_sorting.py new file mode 100644 index 000000000000..5c16cbf9f17a --- /dev/null +++ b/dpctl_ext/tensor/_sorting.py @@ -0,0 +1,163 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +# import operator +# from typing import NamedTuple + +import dpctl.tensor as dpt +import dpctl.utils as du + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti + +from ._numpy_helper import normalize_axis_index +from ._tensor_sorting_impl import ( # _argsort_ascending,; _argsort_descending,; _radix_argsort_ascending,; _radix_argsort_descending,; _topk, + _radix_sort_ascending, + _radix_sort_descending, + _radix_sort_dtype_supported, + _sort_ascending, + _sort_descending, +) + +__all__ = ["sort"] + + +def _get_mergesort_impl_fn(descending): + return _sort_descending if descending else _sort_ascending + + +def _get_radixsort_impl_fn(descending): + return _radix_sort_descending if descending else _radix_sort_ascending + + +def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None): + """sort(x, axis=-1, descending=False, stable=True) + + Returns a sorted copy of an input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. 
+ Returns: + usm_ndarray: + a sorted array. The returned array has the same data type and + the same shape as the input array `x`. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt_ext.copy(x, order="C") + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergesort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixsort_impl_fn(descending) + else: + impl_fn = _get_mergesort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if arr.flags.c_contiguous: + res = dpt_ext.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt_ext.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt_ext.empty_like(arr, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + 
trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt_ext.permute_dims(res, inv_perm) + return res diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index be6c52ae9d80..31a3e9cf0017 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -404,7 +404,7 @@ def sort(a, axis=-1, kind=None, order=None, *, descending=False, stable=None): return _wrap_sort_argsort( a, - dpt.sort, + dpt_ext.sort, axis=axis, kind=kind, order=order, From 82d202cceebcc6a8a01411c976886c1cf7792160 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 06:09:57 -0800 Subject: [PATCH 115/145] Move ti.unique_counts() and ti.unique_values() to dpctl_ext.tensor --- dpctl_ext/tensor/__init__.py | 6 + dpctl_ext/tensor/_set_functions.py | 274 +++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 4 +- 3 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_set_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 8fa57c6d5028..aa373a9dd5c0 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -82,6 +82,10 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._set_functions import ( # isin,; unique_all,; unique_inverse, + unique_counts, + unique_values, +) from ._sorting import sort from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type @@ -135,6 +139,8 @@ "to_numpy", "tril", "triu", + "unique_counts", + "unique_values", "unstack", "where", "zeros", diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py new file mode 100644 index 000000000000..d4608a5a2a0a --- /dev/null +++ b/dpctl_ext/tensor/_set_functions.py @@ -0,0 +1,274 @@ +# 
***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +from typing import NamedTuple + +import dpctl.tensor as dpt +import dpctl.utils as du +from dpctl.tensor._tensor_elementwise_impl import _not_equal, _subtract + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext + +from ._tensor_impl import ( + _copy_usm_ndarray_into_usm_ndarray, + _extract, + _full_usm_ndarray, + _linspace_step, + default_device_index_type, + mask_positions, +) +from ._tensor_sorting_impl import ( + _sort_ascending, +) + + +class UniqueCountsResult(NamedTuple): + values: dpt.usm_ndarray + counts: dpt.usm_ndarray + + +def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: + """unique_values(x) + + Returns the unique elements of an input array `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + usm_ndarray + an array containing the set of unique elements in `x`. The + returned array has the same data type as `x`. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + if fx.size == 0: + return fx + s = dpt_ext.empty_like(fx, order="C") + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # writing into new allocation, no dependencies + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return s + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q + ) + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + return unique_vals + + +def unique_counts(x: dpt.usm_ndarray) -> 
UniqueCountsResult: + """unique_counts(x) + + Returns the unique elements of an input array `x` and the corresponding + counts for each unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, counts)` whose + + * first element is the field name `values` and is an array + containing the unique elements of `x`. This array has the + same data type as `x`. + * second element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + ind_dt = default_device_index_type(exec_q) + if fx.size == 0: + return UniqueCountsResult(fx, dpt_ext.empty_like(fx, dtype=ind_dt)) + s = dpt_ext.empty_like(fx, order="C") + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _sort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=s, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _sort_ascending( + src=tmp, + dst=s, + trailing_dims_to_sort=1, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + unique_mask = dpt_ext.empty(s.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + 
sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency, since we write into new allocation + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty( + unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q + ) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev] + ) + if n_uniques == fx.size: + return UniqueCountsResult( + s, + dpt_ext.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ), + ) + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + # populate unique values + ht_ev, ex_e = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, ex_e) + unique_counts = dpt_ext.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + # writing into new allocation, no dependency + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + # no dependency, writing into disjoint segmenent of new allocation + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt_ext.empty_like(unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=unique_counts[1:], + src2=unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + return UniqueCountsResult(unique_vals, _counts) diff --git 
a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index c034d1c43793..c11538321e51 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -386,12 +386,12 @@ def _get_first_nan_index(usm_a): num_of_flags = (return_index, return_inverse, return_counts).count(True) if num_of_flags == 0: - usm_res = dpt.unique_values(usm_ar) + usm_res = dpt_ext.unique_values(usm_ar) usm_res = (usm_res,) # cast to a tuple to align with other cases elif num_of_flags == 1 and return_inverse: usm_res = dpt.unique_inverse(usm_ar) elif num_of_flags == 1 and return_counts: - usm_res = dpt.unique_counts(usm_ar) + usm_res = dpt_ext.unique_counts(usm_ar) else: usm_res = dpt.unique_all(usm_ar) From 893cdc3658f4220db7df940138047fb3bb4d45e2 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 10:35:26 -0800 Subject: [PATCH 116/145] Move _radix_argsort_ascending/descending to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../source/sorting/py_argsort_common.hpp | 185 +++++++++++++++++ .../source/sorting/radix_argsort.cpp | 190 ++++++++++++++++++ .../source/sorting/radix_argsort.hpp | 47 +++++ .../libtensor/source/tensor_sorting.cpp | 8 +- 5 files changed, 428 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index cf0a78efdd59..ed2a9014338c 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -72,9 +72,9 @@ set(_accumulator_sources set(_sorting_sources #{CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp new file mode 100644 index 000000000000..d204b492b2ff --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp @@ -0,0 +1,185 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +template +std::pair + py_argsort(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const sorting_contig_impl_fnT &sort_contig_fns) +{ + int src_nd = src.get_ndim(); + int dst_nd = dst.get_ndim(); + if (src_nd != dst_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_sort; + if (trailing_dims_to_sort <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_sort must be positive, but no " + "greater than rank of the array being sorted"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = 
dst.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + std::size_t sort_nelems(1); + for (int i = iteration_nd; same_shapes && (i < src_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == dst_shape_ptr[i]); + sort_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + if ((iter_nelems == 0) || (sort_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample( + dst, sort_nelems * iter_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if ((dst_typeid != static_cast(td_ns::typenum_t::INT64)) && + (dst_typeid != static_cast(td_ns::typenum_t::INT32))) + { + throw py::value_error( + "Output index array must have data type int32 or int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + if (sort_nelems > 1) { + static constexpr 
py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = sort_contig_fns[src_typeid][dst_typeid]; + + if (fn == nullptr) { + throw py::value_error( + "Not implemented for dtypes of input arrays"); + } + + sycl::event comp_ev = + fn(exec_q, iter_nelems, sort_nelems, src.get_data(), + dst.get_data(), zero_offset, zero_offset, zero_offset, + zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + else { + assert(dst.get_size() == iter_nelems); + int dst_elemsize = dst.get_elemsize(); + static constexpr int memset_val(0); + + sycl::event fill_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.memset(reinterpret_cast(dst.get_data()), memset_val, + iter_nelems * dst_elemsize); + }); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {fill_ev}); + + return std::make_pair(keep_args_alive_ev, fill_ev); + } + } + + throw py::value_error( + "Both source and destination arrays must be C-contiguous"); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp new file mode 100644 index 000000000000..ba9bbfd61b7c --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp @@ -0,0 +1,190 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/type_dispatch.hpp" + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "py_argsort_common.hpp" +#include "radix_argsort.hpp" +#include "radix_sort_support.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + +static sort_contig_fn_ptr_t + ascending_radix_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_radix_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +namespace +{ + +template +sycl::event argsort_axis1_contig_caller(sycl::queue &q, + std::size_t iter_nelems, + std::size_t sort_nelems, + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, + const std::vector &depends) +{ + using dpctl::tensor::kernels::radix_argsort_axis1_contig_impl; + + return radix_argsort_axis1_contig_impl( + q, is_ascending, iter_nelems, sort_nelems, arg_cp, res_cp, + iter_arg_offset, iter_res_offset, sort_arg_offset, sort_res_offset, + depends); +} + +} // end of anonymous namespace + +template +struct AscendingRadixArgSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined && + (std::is_same_v || + std::is_same_v)) + { + return argsort_axis1_contig_caller< + /*ascending*/ true, argTy, IndexTy>; + } + else { + return nullptr; + } + } +}; + +template +struct DescendingRadixArgSortContigFactory +{ + fnT get() + { + if constexpr (RadixSortSupportVector::is_defined && + (std::is_same_v 
|| + std::is_same_v)) + { + return argsort_axis1_contig_caller< + /*ascending*/ false, argTy, IndexTy>; + } + else { + return nullptr; + } + } +}; + +void init_radix_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_radix_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table( + descending_radix_argsort_contig_dispatch_table); +} + +void init_radix_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_radix_argsort_dispatch_tables(); + + auto py_radix_argsort_ascending = + [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_radix_argsort_contig_dispatch_table); + }; + m.def("_radix_argsort_ascending", py_radix_argsort_ascending, + py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_radix_argsort_descending = + [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_radix_argsort_contig_dispatch_table); + }; + m.def("_radix_argsort_descending", py_radix_argsort_descending, + py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp 
b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp new file mode 100644 index 000000000000..89013fbb1bdc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_radix_argsort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index d9ab1feca46a..8aacb8d08256 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -36,9 +36,9 @@ #include // #include "sorting/isin.hpp" -// #include "sorting/merge_argsort.hpp" +#include "sorting/merge_argsort.hpp" #include "sorting/merge_sort.hpp" -// #include "sorting/radix_argsort.hpp" +#include "sorting/radix_argsort.hpp" #include "sorting/radix_sort.hpp" // #include "sorting/searchsorted.hpp" // #include "sorting/topk.hpp" @@ -49,9 +49,9 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) { // dpctl::tensor::py_internal::init_isin_functions(m); dpctl::tensor::py_internal::init_merge_sort_functions(m); - // dpctl::tensor::py_internal::init_merge_argsort_functions(m); + dpctl::tensor::py_internal::init_merge_argsort_functions(m); // dpctl::tensor::py_internal::init_searchsorted_functions(m); dpctl::tensor::py_internal::init_radix_sort_functions(m); - // dpctl::tensor::py_internal::init_radix_argsort_functions(m); + dpctl::tensor::py_internal::init_radix_argsort_functions(m); // dpctl::tensor::py_internal::init_topk_functions(m); } From 40c2b8467df776299f1e99e9a0aa7a70d1eab0b0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 10:36:33 -0800 Subject: [PATCH 117/145] Move _argsort_ascending/descending to 
_tensor_sorting_impl --- .../source/sorting/merge_argsort.cpp | 159 ++++++++++++++++++ .../source/sorting/merge_argsort.hpp | 47 ++++++ 2 files changed, 206 insertions(+) create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp new file mode 100644 index 000000000000..f76a74e1f8bf --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp @@ -0,0 +1,159 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/sort_impl_fn_ptr_t.hpp" + +#include "merge_argsort.hpp" +#include "py_argsort_common.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::kernels::sort_contig_fn_ptr_t; +static sort_contig_fn_ptr_t + ascending_argsort_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static sort_contig_fn_ptr_t + descending_argsort_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct AscendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using dpctl::tensor::rich_comparisons::AscendingSorter; + using Comp = typename AscendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + 
+template +struct DescendingArgSortContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v || + std::is_same_v) + { + using dpctl::tensor::rich_comparisons::DescendingSorter; + using Comp = typename DescendingSorter::type; + + using dpctl::tensor::kernels::stable_argsort_axis1_contig_impl; + return stable_argsort_axis1_contig_impl; + } + else { + return nullptr; + } + } +}; + +void init_merge_argsort_dispatch_tables(void) +{ + using dpctl::tensor::kernels::sort_contig_fn_ptr_t; + + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(ascending_argsort_contig_dispatch_table); + + td_ns::DispatchTableBuilder< + sort_contig_fn_ptr_t, DescendingArgSortContigFactory, td_ns::num_types> + dtb2; + dtb2.populate_dispatch_table(descending_argsort_contig_dispatch_table); +} + +void init_merge_argsort_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_merge_argsort_dispatch_tables(); + + auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + ascending_argsort_contig_dispatch_table); + }; + m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), + py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto py_argsort_descending = [](const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_sort, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + return dpctl::tensor::py_internal::py_argsort( + src, trailing_dims_to_sort, dst, exec_q, depends, + dpctl::tensor::py_internal:: + descending_argsort_contig_dispatch_table); + }; + m.def("_argsort_descending", py_argsort_descending, py::arg("src"), + 
py::arg("trailing_dims_to_sort"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + return; +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp new file mode 100644 index 000000000000..10777b4bc2fd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_merge_argsort_functions(py::module_); + +} // namespace dpctl::tensor::py_internal From 6912311de96b88402c210fa7231e334ccd9b91e6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 10:38:59 -0800 Subject: [PATCH 118/145] Move ti.argsort() to dpctl_ext.tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_sorting.py | 127 ++++++++++++++++++++++++++++++++++- dpnp/dpnp_iface_sorting.py | 10 ++- 3 files changed, 131 insertions(+), 9 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index aa373a9dd5c0..4e0155ff5e6e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -86,11 +86,12 @@ unique_counts, unique_values, ) -from ._sorting import sort +from ._sorting import argsort, sort from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ "arange", + "argsort", "asarray", "asnumpy", "astype", diff --git a/dpctl_ext/tensor/_sorting.py 
b/dpctl_ext/tensor/_sorting.py index 5c16cbf9f17a..a5fd1d57bcdf 100644 --- a/dpctl_ext/tensor/_sorting.py +++ b/dpctl_ext/tensor/_sorting.py @@ -38,7 +38,11 @@ import dpctl_ext.tensor._tensor_impl as ti from ._numpy_helper import normalize_axis_index -from ._tensor_sorting_impl import ( # _argsort_ascending,; _argsort_descending,; _radix_argsort_ascending,; _radix_argsort_descending,; _topk, +from ._tensor_sorting_impl import ( + _argsort_ascending, + _argsort_descending, + _radix_argsort_ascending, + _radix_argsort_descending, _radix_sort_ascending, _radix_sort_descending, _radix_sort_dtype_supported, @@ -46,7 +50,7 @@ _sort_descending, ) -__all__ = ["sort"] +__all__ = ["sort", "argsort"] def _get_mergesort_impl_fn(descending): @@ -161,3 +165,122 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None): inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt_ext.permute_dims(res, inv_perm) return res + + +def _get_mergeargsort_impl_fn(descending): + return _argsort_descending if descending else _argsort_ascending + + +def _get_radixargsort_impl_fn(descending): + return _radix_argsort_descending if descending else _radix_argsort_ascending + + +def argsort(x, axis=-1, descending=False, stable=True, kind=None): + """argsort(x, axis=-1, descending=False, stable=True) + + Returns the indices that sort an array `x` along a specified axis. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to sort. If set to `-1`, the function + must sort along the last axis. Default: `-1`. + descending (Optional[bool]): + sort order. If `True`, the array must be sorted in descending + order (by value). If `False`, the array must be sorted in + ascending order (by value). Default: `False`. + stable (Optional[bool]): + sort stability. If `True`, the returned array must maintain the + relative order of `x` values which compare as equal. 
If `False`, + the returned array may or may not maintain the relative order of + `x` values which compare as equal. Default: `True`. + kind (Optional[Literal["stable", "mergesort", "radixsort"]]): + Sorting algorithm. The default is `"stable"`, which uses parallel + merge-sort or parallel radix-sort algorithms depending on the + array data type. + + Returns: + usm_ndarray: + an array of indices. The returned array has the same shape as + the input array `x`. The return array has default array index + data type. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + nd = x.ndim + if nd == 0: + axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis") + return dpt_ext.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue), order="C" + ) + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + if kind is None: + kind = "stable" + if not isinstance(kind, str) or kind not in [ + "stable", + "radixsort", + "mergesort", + ]: + raise ValueError( + "Unsupported kind value. 
Expected 'stable', 'mergesort', " + f"or 'radixsort', but got '{kind}'" + ) + if kind == "mergesort": + impl_fn = _get_mergeargsort_impl_fn(descending) + elif kind == "radixsort": + if _radix_sort_dtype_supported(x.dtype.num): + impl_fn = _get_radixargsort_impl_fn(descending) + else: + raise ValueError(f"Radix sort is not supported for {x.dtype}") + else: + dt = x.dtype + if dt in [dpt.bool, dpt.uint8, dpt.int8, dpt.int16, dpt.uint16]: + impl_fn = _get_radixargsort_impl_fn(descending) + else: + impl_fn = _get_mergeargsort_impl_fn(descending) + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + index_dt = ti.default_device_index_type(exec_q) + if arr.flags.c_contiguous: + res = dpt_ext.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=arr, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt_ext.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + res = dpt_ext.empty_like(arr, dtype=index_dt, order="C") + ht_ev, impl_ev = impl_fn( + src=tmp, + trailing_dims_to_sort=1, + dst=res, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt_ext.permute_dims(res, inv_perm) + return res diff --git a/dpnp/dpnp_iface_sorting.py b/dpnp/dpnp_iface_sorting.py index 31a3e9cf0017..e7abef1f4338 100644 --- a/dpnp/dpnp_iface_sorting.py +++ b/dpnp/dpnp_iface_sorting.py @@ -41,13 +41,11 @@ from collections.abc import Sequence -import dpctl.tensor as dpt -from dpctl.tensor._numpy_helper import normalize_axis_index - # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import 
dpnp +from dpctl_ext.tensor._numpy_helper import normalize_axis_index # pylint: disable=no-name-in-module from .dpnp_algo import ( @@ -87,7 +85,7 @@ def _wrap_sort_argsort( usm_a = dpnp.get_usm_ndarray(a) if axis is None: - usm_a = dpt_ext.reshape(usm_a, -1) + usm_a = dpt.reshape(usm_a, -1) axis = -1 axis = normalize_axis_index(axis, ndim=usm_a.ndim) @@ -404,7 +402,7 @@ def sort(a, axis=-1, kind=None, order=None, *, descending=False, stable=None): return _wrap_sort_argsort( a, - dpt_ext.sort, + dpt.sort, axis=axis, kind=kind, order=order, From 88a23a20a32681c9589ce893ebd5911f9c5ed816 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:12:09 -0800 Subject: [PATCH 119/145] Move _searchsorted_left/right to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/sorting/searchsorted.hpp | 260 ++++++++++ .../libtensor/source/sorting/searchsorted.cpp | 478 ++++++++++++++++++ .../libtensor/source/sorting/searchsorted.hpp | 47 ++ .../libtensor/source/tensor_sorting.cpp | 4 +- 5 files changed, 788 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index ed2a9014338c..1ad403aad573 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -75,7 +75,7 @@ set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) 
set(_tensor_accumulation_impl_sources diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp new file mode 100644 index 000000000000..a38522d7e4f7 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -0,0 +1,260 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct SearchSortedFunctor +{ +private: + const argTy *hay_tp; + const argTy *needles_tp; + indTy *positions_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + PositionsIndexerT positions_indexer; + +public: + SearchSortedFunctor(const argTy *hay_, + const argTy *needles_, + indTy *positions_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const PositionsIndexerT &positions_indexer_) + : hay_tp(hay_), needles_tp(needles_), positions_tp(positions_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), + positions_indexer(positions_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + const Compare comp{}; + + const std::size_t i = id[0]; + const argTy needle_v 
= needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + indTy pos{}; + + static constexpr std::size_t zero(0); + if constexpr (left_side) { + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + else { + // search in hay in right-closed interval: hay[pos - 1] <= needle_v + // < hay[pos] + + // upper_bound returns the first pos such that bool(needle_v < + // hay[pos]) is true, i.e. needle_v < hay[pos] + pos = search_sorted_detail::upper_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + } + + positions_tp[positions_indexer(i)] = pos; + } +}; + +typedef sycl::event (*searchsorted_contig_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class searchsorted_contig_impl_krn; + +template +sycl::event searchsorted_contig_impl(sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const argTy *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + indTy *positions_tp = + reinterpret_cast(positions_cp) + positions_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class searchsorted_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT positions_indexer{}; + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*searchsorted_strided_impl_fp_ptr_t)( + sycl::queue &, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class searchsorted_strided_impl_krn; + +template +sycl::event searchsorted_strided_impl( + sycl::queue &exec_q, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *positions_cp, + const ssize_t positions_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // positions_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const argTy *hay_tp = reinterpret_cast(hay_cp); + const argTy *needles_tp = reinterpret_cast(needles_cp); + + indTy *positions_tp = reinterpret_cast(positions_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = packed_shape_strides; + const 
NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using PositionsIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *positions_shape = packed_shape_strides; + const ssize_t *positions_strides = + packed_shape_strides + 2 * needles_nd; + const PositionsIndexerT positions_indexer( + needles_nd, positions_offset, positions_shape, positions_strides); + + const auto fnctr = + SearchSortedFunctor( + hay_tp, needles_tp, positions_tp, hay_nelems, hay_indexer, + needles_indexer, positions_indexer); + using KernelName = + class searchsorted_strided_impl_krn; + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp new file mode 100644 index 000000000000..1f2a45d217cb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp @@ -0,0 +1,478 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/searchsorted.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/rich_comparisons.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ + +namespace detail +{ + +using dpctl::tensor::kernels::searchsorted_contig_impl_fp_ptr_t; + +static searchsorted_contig_impl_fp_ptr_t + left_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types]; + +static searchsorted_contig_impl_fp_ptr_t + right_side_searchsorted_contig_impl[td_ns::num_types][td_ns::num_types]; + +template +struct LeftSideSearchSortedContigFactory +{ + constexpr LeftSideSearchSortedContigFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool left_side_search(true); + using dpctl::tensor::kernels::searchsorted_contig_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct RightSideSearchSortedContigFactory +{ + constexpr RightSideSearchSortedContigFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool right_side_search(false); + + using dpctl::tensor::kernels::searchsorted_contig_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_contig_impl; + } + else { + return nullptr; + } + } +}; + +using dpctl::tensor::kernels::searchsorted_strided_impl_fp_ptr_t; + +static 
searchsorted_strided_impl_fp_ptr_t + left_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types]; + +static searchsorted_strided_impl_fp_ptr_t + right_side_searchsorted_strided_impl[td_ns::num_types][td_ns::num_types]; + +template +struct LeftSideSearchSortedStridedFactory +{ + constexpr LeftSideSearchSortedStridedFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool left_side_search(true); + using dpctl::tensor::kernels::searchsorted_strided_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct RightSideSearchSortedStridedFactory +{ + constexpr RightSideSearchSortedStridedFactory() {} + + fnT get() const + { + if constexpr (std::is_same_v || + std::is_same_v) + { + static constexpr bool right_side_search(false); + using dpctl::tensor::kernels::searchsorted_strided_impl; + using dpctl::tensor::rich_comparisons::AscendingSorter; + + using Compare = typename AscendingSorter::type; + + return searchsorted_strided_impl; + } + else { + return nullptr; + } + } +}; + +void init_searchsorted_dispatch_table(void) +{ + + // Contiguous input function dispatch + td_ns::DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(left_side_searchsorted_contig_impl); + + td_ns::DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(right_side_searchsorted_contig_impl); + + // Strided input function dispatch + td_ns::DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(left_side_searchsorted_strided_impl); + + td_ns::DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(right_side_searchsorted_strided_impl); +} + +} // namespace detail + +/*! 
@brief search for needle from needles in sorted hay */ +std::pair + py_searchsorted(const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &positions, + sycl::queue &exec_q, + const bool search_left_side, + const std::vector &depends) +{ + const int hay_nd = hay.get_ndim(); + const int needles_nd = needles.get_ndim(); + const int positions_nd = positions.get_ndim(); + + if (hay_nd != 1 || needles_nd != positions_nd) { + throw py::value_error("Array dimensions mismatch"); + } + + // check that needle and positions have the same shape + std::size_t needles_nelems(1); + bool same_shape(true); + + const std::size_t hay_nelems = static_cast(hay.get_shape(0)); + + const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *positions_shape_ptr = needles.get_shape_raw(); + + for (int i = 0; (i < needles_nd) && same_shape; ++i) { + const auto needles_sh_i = needles_shape_ptr[i]; + const auto positions_sh_i = positions_shape_ptr[i]; + + same_shape = same_shape && (needles_sh_i == positions_sh_i); + needles_nelems *= static_cast(needles_sh_i); + } + + if (!same_shape) { + throw py::value_error( + "Array of values to search for and array of their " + "positions do not have the same shape"); + } + + // check that positions is ample enough + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(positions, + needles_nelems); + + // check that positions is writable + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(positions); + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, positions})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // if output array overlaps with input arrays, race condition results + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(positions, hay) || overlap(positions, needles)) { + throw py::value_error("Destination 
array overlaps with input."); + } + + const int hay_typenum = hay.get_typenum(); + const int needles_typenum = needles.get_typenum(); + const int positions_typenum = positions.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum); + const int needles_typeid = + array_types.typenum_to_lookup_id(needles_typenum); + const int positions_typeid = + array_types.typenum_to_lookup_id(positions_typenum); + + // check hay and needle have the same data-type + if (needles_typeid != hay_typeid) { + throw py::value_error( + "Hay array and needles array must have the same data types"); + } + // check that positions has indexing data-type (int32, or int64) + const auto positions_typenum_t_v = + static_cast(positions_typeid); + if (positions_typenum_t_v != td_ns::typenum_t::INT32 && + positions_typenum_t_v != td_ns::typenum_t::INT64) + { + throw py::value_error( + "Positions array must have data-type int32, or int64"); + } + + if (needles_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + // if all inputs are contiguous call contiguous implementations + // otherwise call strided implementation + const bool hay_is_c_contig = hay.is_c_contiguous(); + const bool hay_is_f_contig = hay.is_f_contiguous(); + + const bool needles_is_c_contig = needles.is_c_contiguous(); + const bool needles_is_f_contig = needles.is_f_contiguous(); + + const bool positions_is_c_contig = positions.is_c_contiguous(); + const bool positions_is_f_contig = positions.is_f_contiguous(); + + const bool all_c_contig = + (hay_is_c_contig && needles_is_c_contig && positions_is_c_contig); + const bool all_f_contig = + (hay_is_f_contig && needles_is_f_contig && positions_is_f_contig); + + const char *hay_data = hay.get_data(); + const char *needles_data = needles.get_data(); + + char *positions_data = positions.get_data(); + + if (all_c_contig || all_f_contig) { + auto fn = + 
(search_left_side) + ? detail::left_side_searchsorted_contig_impl[hay_typeid] + [positions_typeid] + : detail::right_side_searchsorted_contig_impl[hay_typeid] + [positions_typeid]; + + if (fn) { + static constexpr py::ssize_t zero_offset(0); + + sycl::event comp_ev = + fn(exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, + needles_data, zero_offset, positions_data, zero_offset, + depends); + + return std::make_pair( + dpctl::utils::keep_args_alive(exec_q, {hay, needles, positions}, + {comp_ev}), + comp_ev); + } + } + + // strided case + + const auto &needles_strides = needles.get_strides_vector(); + const auto &positions_strides = positions.get_strides_vector(); + + int simplified_nd = needles_nd; + + using shT = std::vector; + shT simplified_common_shape; + shT simplified_needles_strides; + shT simplified_positions_strides; + py::ssize_t needles_offset(0); + py::ssize_t positions_offset(0); + + if (simplified_nd == 0) { + // needles and positions have same nd + simplified_nd = 1; + simplified_common_shape.push_back(1); + simplified_needles_strides.push_back(0); + simplified_positions_strides.push_back(0); + } + else { + dpctl::tensor::py_internal::simplify_iteration_space( + // modified by reference + simplified_nd, + // read-only inputs + needles_shape_ptr, needles_strides, positions_strides, + // output, modified by reference + simplified_common_shape, simplified_needles_strides, + simplified_positions_strides, needles_offset, positions_offset); + } + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // vectors being packed + simplified_common_shape, simplified_needles_strides, + simplified_positions_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_strides_ev = + std::get<2>(ptr_size_event_tuple); + const py::ssize_t 
*packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + auto strided_fn = + (search_left_side) + ? detail::left_side_searchsorted_strided_impl[hay_typeid] + [positions_typeid] + : detail::right_side_searchsorted_strided_impl[hay_typeid] + [positions_typeid]; + + if (!strided_fn) { + throw std::runtime_error( + "No implementation for data types of input arrays"); + } + + static constexpr py::ssize_t zero_offset(0); + py::ssize_t hay_step = hay.get_strides_vector()[0]; + + const sycl::event &comp_ev = strided_fn( + exec_q, hay_nelems, needles_nelems, hay_data, zero_offset, hay_step, + needles_data, needles_offset, positions_data, positions_offset, + simplified_nd, packed_shape_strides, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + const sycl::event &ht_ev = dpctl::utils::keep_args_alive( + exec_q, {hay, needles, positions}, host_task_events); + + return std::make_pair(ht_ev, comp_ev); +} + +/*! @brief search for needle from needles in sorted hay, + * hay[pos] <= needle < hay[pos + 1] + */ +std::pair + py_searchsorted_left(const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &positions, + sycl::queue &exec_q, + const std::vector &depends) +{ + static constexpr bool side_left(true); + return py_searchsorted(hay, needles, positions, exec_q, side_left, depends); +} + +/*! 
@brief search for needle from needles in sorted hay, + * hay[pos] < needle <= hay[pos + 1] + */ +std::pair + py_searchsorted_right(const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &positions, + sycl::queue &exec_q, + const std::vector &depends) +{ + static constexpr bool side_right(false); + return py_searchsorted(hay, needles, positions, exec_q, side_right, + depends); +} + +void init_searchsorted_functions(py::module_ m) +{ + dpctl::tensor::py_internal::detail::init_searchsorted_dispatch_table(); + + using dpctl::tensor::py_internal::py_searchsorted_left; + using dpctl::tensor::py_internal::py_searchsorted_right; + + m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"), + py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_searchsorted_right", &py_searchsorted_right, py::arg("hay"), + py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp new file mode 100644 index 000000000000..b60dae1e0ec9 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_searchsorted_functions(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index 8aacb8d08256..b569317e5d68 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -40,7 +40,7 @@ #include "sorting/merge_sort.hpp" #include "sorting/radix_argsort.hpp" #include "sorting/radix_sort.hpp" -// #include "sorting/searchsorted.hpp" +#include "sorting/searchsorted.hpp" // #include "sorting/topk.hpp" namespace py = pybind11; @@ -50,7 +50,7 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) // dpctl::tensor::py_internal::init_isin_functions(m); dpctl::tensor::py_internal::init_merge_sort_functions(m); dpctl::tensor::py_internal::init_merge_argsort_functions(m); - // dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_searchsorted_functions(m); dpctl::tensor::py_internal::init_radix_sort_functions(m); dpctl::tensor::py_internal::init_radix_argsort_functions(m); // dpctl::tensor::py_internal::init_topk_functions(m); From 148b6e54a190d8278ca4a368c4515c8da867bf26 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:18:16 -0800 Subject: [PATCH 120/145] Fix bug: wrong shape pointer for positions array --- dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp index 1f2a45d217cb..5cebb74be2d0 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp @@ -236,7 
+236,7 @@ std::pair const std::size_t hay_nelems = static_cast(hay.get_shape(0)); const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); - const py::ssize_t *positions_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *positions_shape_ptr = positions.get_shape_raw(); for (int i = 0; (i < needles_nd) && same_shape; ++i) { const auto needles_sh_i = needles_shape_ptr[i]; From 8b010b0e4f39ff0aeed7cf5a1f8dc656e99fad17 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:20:54 -0800 Subject: [PATCH 121/145] Move ti.searchsorted() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_searchsorted.py | 189 ++++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 6 +- dpnp/dpnp_iface_searching.py | 2 +- 4 files changed, 196 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/_searchsorted.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 4e0155ff5e6e..63de04d1cdb8 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -82,6 +82,7 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._searchsorted import searchsorted from ._set_functions import ( # isin,; unique_all,; unique_inverse, unique_counts, unique_values, @@ -130,6 +131,7 @@ "reshape", "result_type", "roll", + "searchsorted", "sort", "squeeze", "stack", diff --git a/dpctl_ext/tensor/_searchsorted.py b/dpctl_ext/tensor/_searchsorted.py new file mode 100644 index 000000000000..2d4807fb0d0c --- /dev/null +++ b/dpctl_ext/tensor/_searchsorted.py @@ -0,0 +1,189 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +from typing import Literal, Union + +import dpctl +import dpctl.utils as du + +# TODO: revert to `from ._usmarray import...` +# when dpnp fully migrates dpctl/tensor +from dpctl.tensor._usmarray import usm_ndarray + +from ._copy_utils import _empty_like_orderK +from ._ctors import empty +from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy +from ._tensor_impl import _take as ti_take +from ._tensor_impl import ( + default_device_index_type as ti_default_device_index_type, +) +from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right +from ._type_utils import isdtype, result_type + + +def searchsorted( + x1: usm_ndarray, + x2: usm_ndarray, + /, + *, + side: Literal["left", "right"] = "left", + sorter: Union[usm_ndarray, None] = None, +) -> usm_ndarray: + """searchsorted(x1, x2, side='left', sorter=None) + + Finds the indices into `x1` such that, if the corresponding elements + in `x2` were inserted before the indices, the order of `x1`, when sorted + in ascending order, would be preserved. + + Args: + x1 (usm_ndarray): + input array. Must be a one-dimensional array. If `sorter` is + `None`, must be sorted in ascending order; otherwise, `sorter` must + be an array of indices that sort `x1` in ascending order. + x2 (usm_ndarray): + array containing search values. + side (Literal["left", "right]): + argument controlling which index is returned if a value lands + exactly on an edge. If `x2` is an array of rank `N` where + `v = x2[n, m, ..., j]`, the element `ret[n, m, ..., j]` in the + return array `ret` contains the position `i` such that + if `side="left"`, it is the first index such that + `x1[i-1] < v <= x1[i]`, `0` if `v <= x1[0]`, and `x1.size` + if `v > x1[-1]`; + and if `side="right"`, it is the first position `i` such that + `x1[i-1] <= v < x1[i]`, `0` if `v < x1[0]`, and `x1.size` + if `v >= x1[-1]`. Default: `"left"`. 
+ sorter (Optional[usm_ndarray]): + array of indices that sort `x1` in ascending order. The array must + have the same shape as `x1` and have an integral data type. + Out of bound index values of `sorter` array are treated using + `"wrap"` mode documented in :py:func:`dpctl.tensor.take`. + Default: `None`. + """ + if not isinstance(x1, usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x1)}") + if not isinstance(x2, usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x2)}") + if sorter is not None and not isinstance(sorter, usm_ndarray): + raise TypeError( + f"Expected dpctl.tensor.usm_ndarray, got {type(sorter)}" + ) + + if side not in ["left", "right"]: + raise ValueError( + "Unrecognized value of 'side' keyword argument. " + "Expected either 'left' or 'right'" + ) + + if sorter is None: + q = du.get_execution_queue([x1.sycl_queue, x2.sycl_queue]) + else: + q = du.get_execution_queue( + [x1.sycl_queue, x2.sycl_queue, sorter.sycl_queue] + ) + if q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously " + "inferred from input arguments." 
+ ) + + if x1.ndim != 1: + raise ValueError("First argument array must be one-dimensional") + + x1_dt = x1.dtype + x2_dt = x2.dtype + + _manager = du.SequentialOrderManager[q] + dep_evs = _manager.submitted_events + ev = dpctl.SyclEvent() + if sorter is not None: + if not isdtype(sorter.dtype, "integral"): + raise ValueError( + f"Sorter array must have integral data type, got {sorter.dtype}" + ) + if x1.shape != sorter.shape: + raise ValueError( + "Sorter array must be one-dimension with the same " + "shape as the first argument array" + ) + res = empty(x1.shape, dtype=x1_dt, usm_type=x1.usm_type, sycl_queue=q) + ind = (sorter,) + axis = 0 + wrap_out_of_bound_indices_mode = 0 + ht_ev, ev = ti_take( + x1, + ind, + res, + axis, + wrap_out_of_bound_indices_mode, + sycl_queue=q, + depends=dep_evs, + ) + x1 = res + _manager.add_event_pair(ht_ev, ev) + + if x1_dt != x2_dt: + dt = result_type(x1, x2) + if x1_dt != dt: + x1_buf = _empty_like_orderK(x1, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x1, dst=x1_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x1 = x1_buf + if x2_dt != dt: + x2_buf = _empty_like_orderK(x2, dt) + dep_evs = _manager.submitted_events + ht_ev, ev = ti_copy( + src=x2, dst=x2_buf, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + x2 = x2_buf + + dst_usm_type = du.get_coerced_usm_type([x1.usm_type, x2.usm_type]) + index_dt = ti_default_device_index_type(q) + + dst = _empty_like_orderK(x2, index_dt, usm_type=dst_usm_type) + + dep_evs = _manager.submitted_events + if side == "left": + ht_ev, s_ev = _searchsorted_left( + hay=x1, + needles=x2, + positions=dst, + sycl_queue=q, + depends=dep_evs, + ) + else: + ht_ev, s_ev = _searchsorted_right( + hay=x1, needles=x2, positions=dst, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index c11538321e51..6d6244d2534f 
100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -378,8 +378,10 @@ def _get_first_nan_index(usm_a): true_val = dpt_ext.asarray( True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type ) - return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left") - return dpt.searchsorted(usm_a, usm_a[-1], side="left") + return dpt_ext.searchsorted( + dpt.isnan(usm_a), true_val, side="left" + ) + return dpt_ext.searchsorted(usm_a, usm_a[-1], side="left") return None usm_ar = dpnp.get_usm_ndarray(ar) diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 15f52338ec7e..055aaa999c3a 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -382,7 +382,7 @@ def searchsorted(a, v, side="left", sorter=None): usm_sorter = None if sorter is None else dpnp.get_usm_ndarray(sorter) return dpnp_array._create_from_usm_ndarray( - dpt.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) + dpt_ext.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) ) From 9766d345c0874965a9a96d9d18405e05dd043de5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:27:16 -0800 Subject: [PATCH 122/145] Move dpt.unique_all() and dpt.unique_inverse() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 6 +- dpctl_ext/tensor/_set_functions.py | 364 +++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 4 +- 3 files changed, 371 insertions(+), 3 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 63de04d1cdb8..5eca7e61098a 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -83,8 +83,10 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip from ._searchsorted import searchsorted -from ._set_functions import ( # isin,; unique_all,; unique_inverse, +from ._set_functions import ( + unique_all, unique_counts, + unique_inverse, unique_values, ) from ._sorting import 
argsort, sort @@ -142,7 +144,9 @@ "to_numpy", "tril", "triu", + "unique_all", "unique_counts", + "unique_inverse", "unique_values", "unstack", "where", diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py index d4608a5a2a0a..0ca0cd9d3db0 100644 --- a/dpctl_ext/tensor/_set_functions.py +++ b/dpctl_ext/tensor/_set_functions.py @@ -41,19 +41,34 @@ _extract, _full_usm_ndarray, _linspace_step, + _take, default_device_index_type, mask_positions, ) from ._tensor_sorting_impl import ( + _argsort_ascending, + _searchsorted_left, _sort_ascending, ) +class UniqueAllResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + counts: dpt.usm_ndarray + + class UniqueCountsResult(NamedTuple): values: dpt.usm_ndarray counts: dpt.usm_ndarray +class UniqueInverseResult(NamedTuple): + values: dpt.usm_ndarray + inverse_indices: dpt.usm_ndarray + + def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray: """unique_values(x) @@ -272,3 +287,352 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult: ) _manager.add_event_pair(ht_ev, sub_ev) return UniqueCountsResult(unique_vals, _counts) + + +def unique_inverse(x): + """unique_inverse + + Returns the unique elements of an input array x and the indices from the + set of unique elements that reconstruct `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, inverse_indices)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + return UniqueInverseResult(fx, dpt_ext.reshape(unsorting_ids, x.shape)) + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, argsort_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, argsort_ev) + s = dpt_ext.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + # no dependency + ht_ev, one_ev = _full_usm_ndarray( + 
fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty( + unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q + ) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + return UniqueInverseResult(s, dpt_ext.reshape(unsorting_ids, x.shape)) + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt_ext.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt_ext.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + _manager.add_event_pair(ht_ev, ssl_ev) + + return UniqueInverseResult(unique_vals, inv) + + +def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: + """unique_all(x) + + Returns the unique elements of an input array `x`, the first 
occurring + indices for each unique element in `x`, the indices from the set of unique + elements that reconstruct `x`, and the corresponding counts for each + unique element in `x`. + + Args: + x (usm_ndarray): + input array. Inputs with more than one dimension are flattened. + Returns: + tuple[usm_ndarray, usm_ndarray, usm_ndarray, usm_ndarray] + a namedtuple `(values, indices, inverse_indices, counts)` whose + + * first element has the field name `values` and is an array + containing the unique elements of `x`. The array has the same + data type as `x`. + * second element has the field name `indices` and is an array + the indices (of first occurrences) of `x` that result in + `values`. The array has the same shape as `values` and has the + default array index data type. + * third element has the field name `inverse_indices` and is an + array containing the indices of values that reconstruct `x`. + The array has the same shape as `x` and has the default array + index data type. + * fourth element has the field name `counts` and is an array + containing the number of times each unique element occurs in `x`. + This array has the same shape as `values` and has the default + array index data type. 
+ """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + array_api_dev = x.device + exec_q = array_api_dev.sycl_queue + x_usm_type = x.usm_type + ind_dt = default_device_index_type(exec_q) + if x.ndim == 1: + fx = x + else: + fx = dpt_ext.reshape(x, (x.size,), order="C") + sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C") + unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C") + if fx.size == 0: + # original array contains no data + # so it can be safely returned as values + return UniqueAllResult( + fx, + sorting_ids, + dpt_ext.reshape(unsorting_ids, x.shape), + dpt_ext.empty_like(fx, dtype=ind_dt), + ) + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if fx.flags.c_contiguous: + ht_ev, sort_ev = _argsort_ascending( + src=fx, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, sort_ev) + else: + tmp = dpt_ext.empty_like(fx, order="C") + ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray( + src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + ht_ev, sort_ev = _argsort_ascending( + src=tmp, + trailing_dims_to_sort=1, + dst=sorting_ids, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, sort_ev) + ht_ev, args_ev = _argsort_ascending( + src=sorting_ids, + trailing_dims_to_sort=1, + dst=unsorting_ids, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, args_ev) + s = dpt_ext.empty_like(fx) + # s = fx[sorting_ids] + ht_ev, take_ev = _take( + src=fx, + ind=(sorting_ids,), + dst=s, + axis_start=0, + mode=0, + sycl_queue=exec_q, + depends=[sort_ev], + ) + _manager.add_event_pair(ht_ev, take_ev) + unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q) + ht_ev, uneq_ev = _not_equal( + src1=s[:-1], + src2=s[1:], + dst=unique_mask[1:], + sycl_queue=exec_q, + 
depends=[take_ev], + ) + _manager.add_event_pair(ht_ev, uneq_ev) + ht_ev, one_ev = _full_usm_ndarray( + fill_value=True, dst=unique_mask[0], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, one_ev) + cumsum = dpt_ext.empty( + unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q + ) + # synchronizing call + n_uniques = mask_positions( + unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev] + ) + if n_uniques == fx.size: + _counts = dpt_ext.ones( + n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + return UniqueAllResult( + s, + sorting_ids, + dpt_ext.reshape(unsorting_ids, x.shape), + _counts, + ) + unique_vals = dpt_ext.empty( + n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q + ) + ht_ev, uv_ev = _extract( + src=s, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=unique_vals, + sycl_queue=exec_q, + ) + _manager.add_event_pair(ht_ev, uv_ev) + cum_unique_counts = dpt_ext.empty( + n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q + ) + idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q) + ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q) + _manager.add_event_pair(ht_ev, id_ev) + ht_ev, extr_ev = _extract( + src=idx, + cumsum=cumsum, + axis_start=0, + axis_end=1, + dst=cum_unique_counts[:-1], + sycl_queue=exec_q, + depends=[id_ev], + ) + _manager.add_event_pair(ht_ev, extr_ev) + ht_ev, set_ev = _full_usm_ndarray( + x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q + ) + _manager.add_event_pair(ht_ev, set_ev) + _counts = dpt_ext.empty_like(cum_unique_counts[1:]) + ht_ev, sub_ev = _subtract( + src1=cum_unique_counts[1:], + src2=cum_unique_counts[:-1], + dst=_counts, + sycl_queue=exec_q, + depends=[set_ev, extr_ev], + ) + _manager.add_event_pair(ht_ev, sub_ev) + + inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C") + ht_ev, ssl_ev = _searchsorted_left( + hay=unique_vals, + needles=x, + positions=inv, + sycl_queue=exec_q, + depends=[ + uv_ev, + ], + ) + 
_manager.add_event_pair(ht_ev, ssl_ev) + return UniqueAllResult( + unique_vals, + sorting_ids[cum_unique_counts[:-1]], + inv, + _counts, + ) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index 6d6244d2534f..cb0d0d35b0d7 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -391,11 +391,11 @@ def _get_first_nan_index(usm_a): usm_res = dpt_ext.unique_values(usm_ar) usm_res = (usm_res,) # cast to a tuple to align with other cases elif num_of_flags == 1 and return_inverse: - usm_res = dpt.unique_inverse(usm_ar) + usm_res = dpt_ext.unique_inverse(usm_ar) elif num_of_flags == 1 and return_counts: usm_res = dpt_ext.unique_counts(usm_ar) else: - usm_res = dpt.unique_all(usm_ar) + usm_res = dpt_ext.unique_all(usm_ar) first_nan = None if equal_nan: From b8ad5ec1826f5f0e288fae7a0c20af7f0279faf3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:55:59 -0800 Subject: [PATCH 123/145] Move _isin to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/sorting/isin.hpp | 245 +++++++++++++ .../include/kernels/sorting/merge_sort.hpp | 3 +- .../kernels/sorting/search_sorted_detail.hpp | 3 +- .../include/kernels/sorting/searchsorted.hpp | 3 +- .../include/kernels/sorting/sort_utils.hpp | 3 +- .../tensor/libtensor/source/sorting/isin.cpp | 324 ++++++++++++++++++ .../tensor/libtensor/source/sorting/isin.hpp | 47 +++ .../libtensor/source/tensor_sorting.cpp | 4 +- 9 files changed, 623 insertions(+), 11 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/isin.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/isin.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 1ad403aad573..c03cba55b7e4 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -70,7 +70,7 @@ set(_accumulator_sources 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) set(_sorting_sources - #{CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp new file mode 100644 index 000000000000..847fa96ecdff --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/isin.hpp @@ -0,0 +1,245 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor membership operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "utils/offset_utils.hpp" +#include "utils/rich_comparisons.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; + +template +struct IsinFunctor +{ +private: + bool invert; + const T *hay_tp; + const T *needles_tp; + bool *out_tp; + std::size_t hay_nelems; + HayIndexerT hay_indexer; + NeedlesIndexerT needles_indexer; + OutIndexerT out_indexer; + +public: + IsinFunctor(const bool invert_, + const T *hay_, + const T *needles_, + bool *out_, + const std::size_t hay_nelems_, + const HayIndexerT &hay_indexer_, + const NeedlesIndexerT &needles_indexer_, + const OutIndexerT &out_indexer_) + : invert(invert_), hay_tp(hay_), needles_tp(needles_), out_tp(out_), + hay_nelems(hay_nelems_), hay_indexer(hay_indexer_), + needles_indexer(needles_indexer_), out_indexer(out_indexer_) + { + } + + void operator()(sycl::id<1> id) const + { + using Compare = + typename dpctl::tensor::rich_comparisons::AscendingSorter::type; + static constexpr Compare 
comp{}; + + const std::size_t i = id[0]; + const T needle_v = needles_tp[needles_indexer(i)]; + + // position of the needle_v in the hay array + std::size_t pos{}; + + static constexpr std::size_t zero(0); + // search in hay in left-closed interval, give `pos` such that + // hay[pos - 1] < needle_v <= hay[pos] + + // lower_bound returns the first pos such that bool(hay[pos] < + // needle_v) is false, i.e. needle_v <= hay[pos] + pos = search_sorted_detail::lower_bound_indexed_impl( + hay_tp, zero, hay_nelems, needle_v, comp, hay_indexer); + bool out = (pos == hay_nelems ? false : hay_tp[pos] == needle_v); + out_tp[out_indexer(i)] = (invert) ? !out : out; + } +}; + +typedef sycl::event (*isin_contig_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + const std::vector &); + +template +class isin_contig_impl_krn; + +template +sycl::event isin_contig_impl(sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp) + hay_offset; + const T *needles_tp = + reinterpret_cast(needles_cp) + needles_offset; + + bool *out_tp = reinterpret_cast(out_cp) + out_offset; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = class isin_contig_impl_krn; + + sycl::range<1> gRange(needles_nelems); + + using TrivialIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + static constexpr TrivialIndexerT hay_indexer{}; + static constexpr TrivialIndexerT needles_indexer{}; + static constexpr TrivialIndexerT out_indexer{}; + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + 
needles_indexer, out_indexer); + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +typedef sycl::event (*isin_strided_impl_fp_ptr_t)( + sycl::queue &, + const bool, + const std::size_t, + const std::size_t, + const char *, + const ssize_t, + const ssize_t, + const char *, + const ssize_t, + char *, + const ssize_t, + int, + const ssize_t *, + const std::vector &); + +template +class isin_strided_impl_krn; + +template +sycl::event isin_strided_impl( + sycl::queue &exec_q, + const bool invert, + const std::size_t hay_nelems, + const std::size_t needles_nelems, + const char *hay_cp, + const ssize_t hay_offset, + // hay is 1D, so hay_nelems, hay_offset, hay_stride describe strided array + const ssize_t hay_stride, + const char *needles_cp, + const ssize_t needles_offset, + char *out_cp, + const ssize_t out_offset, + const int needles_nd, + // packed_shape_strides is [needles_shape, needles_strides, + // out_strides] has length of 3*needles_nd + const ssize_t *packed_shape_strides, + const std::vector &depends) +{ + const T *hay_tp = reinterpret_cast(hay_cp); + const T *needles_tp = reinterpret_cast(needles_cp); + + bool *out_tp = reinterpret_cast(out_cp); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + sycl::range<1> gRange(needles_nelems); + + using HayIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + const HayIndexerT hay_indexer( + /* offset */ hay_offset, + /* size */ hay_nelems, + /* step */ hay_stride); + + using NeedlesIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + const ssize_t *needles_shape_strides = packed_shape_strides; + const NeedlesIndexerT needles_indexer(needles_nd, needles_offset, + needles_shape_strides); + using OutIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *out_shape = packed_shape_strides; + const ssize_t *out_strides = packed_shape_strides + 2 * needles_nd; + const OutIndexerT out_indexer(needles_nd, out_offset, out_shape, 
+ out_strides); + + const auto fnctr = + IsinFunctor( + invert, hay_tp, needles_tp, out_tp, hay_nelems, hay_indexer, + needles_indexer, out_indexer); + using KernelName = class isin_strided_impl_krn; + + cgh.parallel_for(gRange, fnctr); + }); + + return comp_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index e14425605fe2..a047c172f7bc 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. //===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp index c5b2e28411df..d184261aeabf 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. 
//===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp index a38522d7e4f7..b89ad922e102 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. //===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index 8a6b1a8131ca..4bbcacd56fcc 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -29,8 +29,7 @@ //===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_sorting_impl -/// extension. +/// This file defines kernels for tensor sort/argsort operations. //===----------------------------------------------------------------------===// #pragma once diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp new file mode 100644 index 000000000000..30f65f0d9bfa --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp @@ -0,0 +1,324 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/isin.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +#include "simplify_iteration_space.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl::tensor::py_internal +{ +namespace detail +{ + +using dpctl::tensor::kernels::isin_contig_impl_fp_ptr_t; + +static isin_contig_impl_fp_ptr_t + isin_contig_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinContigFactory +{ + constexpr IsinContigFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_contig_impl; + return isin_contig_impl; + } +}; + +using dpctl::tensor::kernels::isin_strided_impl_fp_ptr_t; + +static isin_strided_impl_fp_ptr_t + isin_strided_impl_dispatch_vector[td_ns::num_types]; + +template +struct IsinStridedFactory +{ + constexpr IsinStridedFactory() {} + + fnT get() const + { + using dpctl::tensor::kernels::isin_strided_impl; + return isin_strided_impl; + } +}; + +void init_isin_dispatch_vector(void) +{ + + // Contiguous input function dispatch + td_ns::DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isin_contig_impl_dispatch_vector); + + // Strided input function dispatch + td_ns::DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isin_strided_impl_dispatch_vector); +} + +} // namespace detail + +/*! 
@brief search for needle from needles in sorted hay */ +std::pair + py_isin(const dpctl::tensor::usm_ndarray &needles, + const dpctl::tensor::usm_ndarray &hay, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const bool invert, + const std::vector &depends) +{ + const int hay_nd = hay.get_ndim(); + const int needles_nd = needles.get_ndim(); + const int dst_nd = dst.get_ndim(); + + if (hay_nd != 1 || needles_nd != dst_nd) { + throw py::value_error("Array dimensions mismatch"); + } + + // check that needle and dst have the same shape + std::size_t needles_nelems(1); + bool same_shape(true); + + const std::size_t hay_nelems = static_cast(hay.get_shape(0)); + + const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = needles.get_shape_raw(); + + for (int i = 0; (i < needles_nd) && same_shape; ++i) { + const auto needles_sh_i = needles_shape_ptr[i]; + const auto dst_sh_i = dst_shape_ptr[i]; + + same_shape = same_shape && (needles_sh_i == dst_sh_i); + needles_nelems *= static_cast(needles_sh_i); + } + + if (!same_shape) { + throw py::value_error( + "Array of values to search for and array of their " + "dst do not have the same shape"); + } + + // check that dst is ample enough + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, + needles_nelems); + + // check that dst is writable + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + // check that queues are compatible + if (!dpctl::utils::queues_are_compatible(exec_q, {hay, needles, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + // if output array overlaps with input arrays, race condition results + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(dst, hay) || overlap(dst, needles)) { + throw py::value_error("Destination array overlaps with input."); + } + + const int hay_typenum = hay.get_typenum(); + const int needles_typenum = 
needles.get_typenum(); + const int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + const int hay_typeid = array_types.typenum_to_lookup_id(hay_typenum); + const int needles_typeid = + array_types.typenum_to_lookup_id(needles_typenum); + const int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // check hay and needle have the same data-type + if (needles_typeid != hay_typeid) { + throw py::value_error( + "Hay array and needles array must have the same data types"); + } + // check that dst has boolean data type + const auto dst_typenum_t_v = static_cast(dst_typeid); + if (dst_typenum_t_v != td_ns::typenum_t::BOOL) { + throw py::value_error("dst array must have data-type bool"); + } + + if (needles_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event{}, sycl::event{}); + } + + // if all inputs are contiguous call contiguous implementations + // otherwise call strided implementation + const bool hay_is_c_contig = hay.is_c_contiguous(); + const bool hay_is_f_contig = hay.is_f_contiguous(); + + const bool needles_is_c_contig = needles.is_c_contiguous(); + const bool needles_is_f_contig = needles.is_f_contiguous(); + + const bool dst_is_c_contig = dst.is_c_contiguous(); + const bool dst_is_f_contig = dst.is_f_contiguous(); + + const bool all_c_contig = + (hay_is_c_contig && needles_is_c_contig && dst_is_c_contig); + const bool all_f_contig = + (hay_is_f_contig && needles_is_f_contig && dst_is_f_contig); + + const char *hay_data = hay.get_data(); + const char *needles_data = needles.get_data(); + + char *dst_data = dst.get_data(); + + if (all_c_contig || all_f_contig) { + auto fn = detail::isin_contig_impl_dispatch_vector[hay_typeid]; + + static constexpr py::ssize_t zero_offset(0); + + sycl::event comp_ev = fn(exec_q, invert, hay_nelems, needles_nelems, + hay_data, zero_offset, needles_data, + zero_offset, dst_data, zero_offset, depends); + + return std::make_pair(dpctl::utils::keep_args_alive( 
+ exec_q, {hay, needles, dst}, {comp_ev}), + comp_ev); + } + + // strided case + + const auto &needles_strides = needles.get_strides_vector(); + const auto &dst_strides = dst.get_strides_vector(); + + int simplified_nd = needles_nd; + + using shT = std::vector; + shT simplified_common_shape; + shT simplified_needles_strides; + shT simplified_dst_strides; + py::ssize_t needles_offset(0); + py::ssize_t dst_offset(0); + + if (simplified_nd == 0) { + // needles and dst have same nd + simplified_nd = 1; + simplified_common_shape.push_back(1); + simplified_needles_strides.push_back(0); + simplified_dst_strides.push_back(0); + } + else { + dpctl::tensor::py_internal::simplify_iteration_space( + // modified by reference + simplified_nd, + // read-only inputs + needles_shape_ptr, needles_strides, dst_strides, + // output, modified by reference + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides, needles_offset, dst_offset); + } + std::vector host_task_events; + host_task_events.reserve(2); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto ptr_size_event_tuple = device_allocate_and_pack( + exec_q, host_task_events, + // vectors being packed + simplified_common_shape, simplified_needles_strides, + simplified_dst_strides); + auto packed_shape_strides_owner = + std::move(std::get<0>(ptr_size_event_tuple)); + const sycl::event ©_shape_strides_ev = + std::get<2>(ptr_size_event_tuple); + const py::ssize_t *packed_shape_strides = packed_shape_strides_owner.get(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shape_strides_ev); + + auto strided_fn = detail::isin_strided_impl_dispatch_vector[hay_typeid]; + + if (!strided_fn) { + throw std::runtime_error( + "No implementation for data types of input arrays"); + } + + static constexpr py::ssize_t zero_offset(0); + py::ssize_t hay_step = hay.get_strides_vector()[0]; + + 
const sycl::event &comp_ev = strided_fn( + exec_q, invert, hay_nelems, needles_nelems, hay_data, zero_offset, + hay_step, needles_data, needles_offset, dst_data, dst_offset, + simplified_nd, packed_shape_strides, all_deps); + + // free packed temporaries + sycl::event temporaries_cleanup_ev = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, packed_shape_strides_owner); + + host_task_events.push_back(temporaries_cleanup_ev); + const sycl::event &ht_ev = dpctl::utils::keep_args_alive( + exec_q, {hay, needles, dst}, host_task_events); + + return std::make_pair(ht_ev, comp_ev); +} + +void init_isin_functions(py::module_ m) +{ + dpctl::tensor::py_internal::detail::init_isin_dispatch_vector(); + + using dpctl::tensor::py_internal::py_isin; + m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("invert"), + py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp new file mode 100644 index 000000000000..236e8b5898c6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_isin_functions(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index b569317e5d68..2aa664fc94ff 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -35,7 +35,7 @@ #include -// #include "sorting/isin.hpp" +#include "sorting/isin.hpp" #include "sorting/merge_argsort.hpp" #include "sorting/merge_sort.hpp" #include "sorting/radix_argsort.hpp" @@ -47,7 +47,7 @@ namespace py = pybind11; PYBIND11_MODULE(_tensor_sorting_impl, m) { - // dpctl::tensor::py_internal::init_isin_functions(m); + dpctl::tensor::py_internal::init_isin_functions(m); dpctl::tensor::py_internal::init_merge_sort_functions(m); dpctl::tensor::py_internal::init_merge_argsort_functions(m); dpctl::tensor::py_internal::init_searchsorted_functions(m); From ce570a54c31f93ce2a542800cc68c616732ce14b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 11:58:42 -0800 Subject: [PATCH 124/145] Fix bug: wrong shape pointer for dst array --- dpctl_ext/tensor/libtensor/source/sorting/isin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp index 30f65f0d9bfa..728a7ca3b733 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp @@ -136,7 +136,7 @@ std::pair const std::size_t hay_nelems = static_cast(hay.get_shape(0)); const py::ssize_t *needles_shape_ptr = needles.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = needles.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); for (int i 
= 0; (i < needles_nd) && same_shape; ++i) { const auto needles_sh_i = needles_shape_ptr[i]; From a7c644007f5ea6ff874265d8f8c43f3b4fdc4403 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 12:02:02 -0800 Subject: [PATCH 125/145] Move dpt.isin() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_set_functions.py | 167 ++++++++++++++++++++++++++++- dpnp/dpnp_iface_logic.py | 5 +- 3 files changed, 172 insertions(+), 2 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 5eca7e61098a..0f2585c9b0e9 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -84,6 +84,7 @@ from ._clip import clip from ._searchsorted import searchsorted from ._set_functions import ( + isin, unique_all, unique_counts, unique_inverse, @@ -119,6 +120,7 @@ "full_like", "iinfo", "isdtype", + "isin", "linspace", "meshgrid", "moveaxis", diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py index 0ca0cd9d3db0..93f81f044fd2 100644 --- a/dpctl_ext/tensor/_set_functions.py +++ b/dpctl_ext/tensor/_set_functions.py @@ -26,7 +26,7 @@ # THE POSSIBILITY OF SUCH DAMAGE. 
# ***************************************************************************** -from typing import NamedTuple +from typing import NamedTuple, Optional, Union import dpctl.tensor as dpt import dpctl.utils as du @@ -36,6 +36,13 @@ # when dpnp fully migrates dpctl/tensor import dpctl_ext.tensor as dpt_ext +from ._copy_utils import _empty_like_orderK +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) from ._tensor_impl import ( _copy_usm_ndarray_into_usm_ndarray, _extract, @@ -47,9 +54,25 @@ ) from ._tensor_sorting_impl import ( _argsort_ascending, + _isin, _searchsorted_left, _sort_ascending, ) +from ._type_utils import ( + _resolve_weak_types_all_py_ints, + _to_device_supported_dtype, +) + +__all__ = [ + "isin", + "unique_values", + "unique_counts", + "unique_inverse", + "unique_all", + "UniqueAllResult", + "UniqueCountsResult", + "UniqueInverseResult", +] class UniqueAllResult(NamedTuple): @@ -636,3 +659,145 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult: inv, _counts, ) + + +def isin( + x: Union[dpt.usm_ndarray, int, float, complex, bool], + test_elements: Union[dpt.usm_ndarray, int, float, complex, bool], + /, + *, + invert: Optional[bool] = False, +) -> dpt.usm_ndarray: + """isin(x, test_elements, /, *, invert=False) + + Tests `x in test_elements` for each element of `x`. Returns a boolean array + with the same shape as `x` that is `True` where the element is in + `test_elements`, `False` otherwise. + + Args: + x (Union[usm_ndarray, bool, int, float, complex]): + input element or elements. + test_elements (Union[usm_ndarray, bool, int, float, complex]): + elements against which to test each value of `x`. + invert (Optional[bool]): + if `True`, the output results are inverted, i.e., are equivalent to + testing `x not in test_elements` for each element of `x`. + Default: `False`. + + Returns: + usm_ndarray: + an array of the inclusion test results. 
The returned array has a + boolean data type and the same shape as `x`. + """ + q1, x_usm_type = _get_queue_usm_type(x) + q2, test_usm_type = _get_queue_usm_type(test_elements) + if q1 is None and q2 is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments. " + "One of the arguments must represent USM allocation and " + "expose `__sycl_usm_array_interface__` property" + ) + if q1 is None: + exec_q = q2 + res_usm_type = test_usm_type + elif q2 is None: + exec_q = q1 + res_usm_type = x_usm_type + else: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + res_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + test_usm_type, + ) + ) + du.validate_usm_type(res_usm_type, allow_none=False) + sycl_dev = exec_q.sycl_device + + if not isinstance(invert, bool): + raise TypeError( + "`invert` keyword argument must be of boolean type, " + f"got {type(invert)}" + ) + + x_dt = _get_dtype(x, sycl_dev) + test_dt = _get_dtype(test_elements, sycl_dev) + if not all(_validate_dtype(dt) for dt in (x_dt, test_dt)): + raise ValueError("Operands have unsupported data types") + + x_sh = _get_shape(x) + if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0: + if invert: + return dpt_ext.ones( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + return dpt_ext.zeros( + x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q + ) + + dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev) + dt = _to_device_supported_dtype(dpt_ext.result_type(dt1, dt2), sycl_dev) + + if not isinstance(x, dpt.usm_ndarray): + x_arr = dpt_ext.asarray( + x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + x_arr = x + + if not isinstance(test_elements, dpt.usm_ndarray): + test_arr = dpt_ext.asarray( + test_elements, 
dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q + ) + else: + test_arr = test_elements + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + if x_dt != dt: + x_buf = _empty_like_orderK(x_arr, dt, res_usm_type, exec_q) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=x_arr, dst=x_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + x_buf = x_arr + + if test_dt != dt: + # copy into C-contiguous memory, because the array will be flattened + test_buf = dpt_ext.empty_like( + test_arr, dtype=dt, order="C", usm_type=res_usm_type + ) + ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray( + src=test_arr, dst=test_buf, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, ev) + else: + test_buf = test_arr + + test_buf = dpt_ext.reshape(test_buf, -1) + test_buf = dpt_ext.sort(test_buf) + + dst = dpt_ext.empty_like( + x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C" + ) + + dep_evs = _manager.submitted_events + ht_ev, s_ev = _isin( + needles=x_buf, + hay=test_buf, + dst=dst, + sycl_queue=exec_q, + invert=invert, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, s_ev) + return dst diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 1834f25a0485..3e3501b14c7c 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -49,6 +49,9 @@ import dpctl.utils as dpu import numpy +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc @@ -1273,7 +1276,7 @@ def isin( usm_element = dpnp.get_usm_ndarray(element) usm_test = dpnp.get_usm_ndarray(test_elements) return dpnp_array._create_from_usm_ndarray( - dpt.isin( + dpt_ext.isin( usm_element, usm_test, invert=invert, From 62d19f1b746ea7b0a4241d74e052915cc59ac712 Mon Sep 17 00:00:00 2001 
From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 12:24:42 -0800 Subject: [PATCH 126/145] Move _topk to _tensor_sorting_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../include/kernels/sorting/topk.hpp | 510 ++++++++++++++++++ .../tensor/libtensor/source/sorting/topk.cpp | 303 +++++++++++ .../tensor/libtensor/source/sorting/topk.hpp | 47 ++ .../libtensor/source/tensor_sorting.cpp | 4 +- 5 files changed, 863 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/topk.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/sorting/topk.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index c03cba55b7e4..056b7c425544 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -76,7 +76,7 @@ set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) set(_tensor_accumulation_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp new file mode 100644 index 000000000000..90e06a1e9eb8 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -0,0 +1,510 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor topk operation. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +namespace topk_detail +{ + +void scale_topk_params(const std::uint64_t nelems_per_slm, + const std::size_t sub_groups_per_work_group, + const std::uint32_t elems_per_wi, + const std::vector &sg_sizes, + std::size_t &lws, + std::size_t &nelems_wg_sorts) +{ + for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) { + auto sg_size = *it; + lws = sub_groups_per_work_group * sg_size; + nelems_wg_sorts = elems_per_wi * lws; + if (nelems_wg_sorts < nelems_per_slm) { + return; + } + } + // should never reach + throw std::runtime_error("Could not construct top k kernel parameters"); +} + +template +sycl::event write_out_impl(sycl::queue &exec_q, + std::size_t iter_nelems, + std::size_t k, + const argTy *arg_tp, + const IndexTy *index_data, + std::size_t iter_index_stride, + std::size_t axis_nelems, + argTy *vals_tp, + IndexTy *inds_tp, + const std::vector &depends) +{ + static constexpr std::uint32_t lws = 64; + static constexpr std::uint32_t n_wi = 4; + const std::size_t nelems = iter_nelems * k; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + 
const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const std::size_t iter_id = data_id / k; + + /* + const std::size_t axis_gid = data_id - (iter_gid * k); + const std::size_t src_idx = iter_gid * iter_index_stride + + axis_gid; + */ + const std::size_t src_idx = + data_id + iter_id * (iter_index_stride - k); + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + const std::size_t dst_idx = data_id; + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + } + } + }); + }); + + return write_out_ev; +} + +} // namespace topk_detail + +template +class topk_populate_index_data_krn; + +template +class topk_full_merge_map_back_krn; + +template +sycl::event + topk_full_merge_sort_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) +{ + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * axis_nelems, exec_q); + // extract USM pointer + IndexTy *index_data = index_data_owner.get(); + + using IotaKernelName = topk_populate_index_data_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, index_data, iter_nelems * axis_nelems, depends); + + std::size_t sorted_block_size; + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, index_data, comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = 
merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, + {base_sort_ev}); + + using WriteOutKernelName = topk_full_merge_map_back_krn; + + sycl::event write_out_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, + index_data_owner); + + return cleanup_host_task_event; +}; + +template +class topk_partial_merge_map_back_krn; + +template +class topk_over_work_group_krn; + +template > +sycl::event topk_merge_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. number of columns) + std::size_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + using dpctl::tensor::kernels::IndexComp; + const IndexComp index_comp{arg_tp, ValueComp{}}; + + if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); + } + else { + using PartialKernelName = + topk_over_work_group_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + 
sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy)); + + static constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + std::size_t lws = sub_groups_per_work_group * max_sg_size; + + std::size_t sorted_block_size = elems_per_wi * lws; + if (sorted_block_size > nelems_per_slm) { + const std::vector sg_sizes = + dev.get_info(); + topk_detail::scale_topk_params( + nelems_per_slm, sub_groups_per_work_group, elems_per_wi, + sg_sizes, + lws, // modified by reference + sorted_block_size // modified by reference + ); + } + + // This assumption permits doing away with using a loop + assert(sorted_block_size % lws == 0); + + using search_sorted_detail::quotient_ceil; + const std::size_t n_segments = + quotient_ceil(axis_nelems, sorted_block_size); + + // round k up for the later merge kernel if necessary + const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4; + std::size_t k_rounded = + (k < round_k_to) + ? k + : quotient_ceil(k, round_k_to) * round_k_to; + + // get length of tail for alloc size + auto rem = axis_nelems % sorted_block_size; + auto alloc_len = (rem && rem < k_rounded) + ? 
rem + k_rounded * (n_segments - 1) + : k_rounded * n_segments; + + // if allocation would be sufficiently large or k is larger than + // elements processed, use full sort + if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || + alloc_len >= axis_nelems / 2) + { + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, + k, arg_tp, vals_tp, inds_tp, + index_comp, depends); + } + + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * alloc_len, exec_q); + // get raw USM pointer + IndexTy *index_data = index_data_owner.get(); + + // no need to populate index data: SLM will be populated with default + // values + + sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{sorted_block_size}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = + group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = + segment_id * sorted_block_size; + const std::size_t segment_end_idx = std::min( + segment_start_idx + sorted_block_size, axis_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + IndexTy v = (array_id < axis_nelems) + ? 
iter_id * axis_nelems + array_id + : IndexTy{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = + quotient_ceil(sorted_block_size, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + merge_sort_detail::leaf_sort_impl( + work_space, chunk_start_idx, chunk_end_idx, index_comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) + { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = sycl::min( + 2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = sycl::min( + end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = + chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_sort_detail::merge_impl( + offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + else { + merge_sort_detail::merge_impl( + offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + // output assumed to be structured as (iter_nelems, + // alloc_len) + const std::size_t k_segment_start_idx = + segment_id * k_rounded; + const std::size_t k_segment_end_idx = std::min( + k_segment_start_idx + k_rounded, alloc_len); + const auto &out_src = + (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = k_segment_start_idx + lid; + array_id < k_segment_end_idx; array_id += lws) + { + if (lid < k_rounded) { + index_data[iter_id * alloc_len + array_id] = + out_src[array_id - k_segment_start_idx]; + } + } + }); + }); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, alloc_len, index_data, index_comp, + k_rounded, {base_sort_ev}); + + // Write out top k of the merge-sorted memory + using WriteOutKernelName = + topk_partial_merge_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, alloc_len, + axis_nelems, vals_tp, inds_tp, {merges_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, index_data_owner); + + return cleanup_host_task_event; + } +} + +template +class topk_iota_krn; + +template +class topk_radix_map_back_krn; + +template +sycl::event topk_radix_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool ascending, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp); + argTy *vals_tp = reinterpret_cast(vals_cp); + IndexTy *inds_tp = reinterpret_cast(inds_cp); + + const std::size_t total_nelems = iter_nelems * axis_nelems; + const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + padded_total_nelems + total_nelems, exec_q); + + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); + IndexTy *tmp_tp = workspace + padded_total_nelems; + + using IdentityProjT = 
radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp}; + + using IotaKernelName = topk_iota_krn; + + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op, + ascending, {iota_ev}); + + // Write out top k of the temporary + using WriteOutKernelName = topk_radix_map_back_krn; + + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems, + vals_tp, inds_tp, {radix_sort_ev}); + + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, workspace_owner); + + return cleanup_ev; +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp new file mode 100644 index 000000000000..23a8d2a4328f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp @@ -0,0 +1,303 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/sorting/topk.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/rich_comparisons.hpp" +#include "utils/type_dispatch.hpp" + +#include "topk.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + std::size_t, + bool, + const char *, + char *, + char *, + const std::vector &); + +static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types]; + +namespace +{ + +template +struct use_radix_sort : public std::false_type +{ +}; + +template +struct use_radix_sort< + T, + std::enable_if_t, + std::is_same, + std::is_same, + std::is_same, + std::is_same>::value>> + : public std::true_type +{ +}; + +template +sycl::event topk_caller(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool largest, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + const std::vector &depends) +{ + if constexpr (use_radix_sort::value) { + using dpctl::tensor::kernels::topk_radix_impl; + auto ascending = !largest; + return topk_radix_impl(exec_q, iter_nelems, axis_nelems, + k, ascending, arg_cp, vals_cp, + inds_cp, depends); + } + else { + using dpctl::tensor::kernels::topk_merge_impl; + if (largest) { + using CompTy = + typename dpctl::tensor::rich_comparisons::DescendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + depends); + } + else { + using CompTy = + typename dpctl::tensor::rich_comparisons::AscendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, 
+ depends); + } + } +} + +} // namespace + +std::pair + py_topk(const dpctl::tensor::usm_ndarray &src, + std::optional trailing_dims_to_search, + const std::size_t k, + const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int vals_nd = vals.get_ndim(); + int inds_nd = inds.get_ndim(); + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); + const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); + + std::size_t axis_nelems(1); + std::size_t iter_nelems(1); + if (trailing_dims_to_search.has_value()) { + if (src_nd != vals_nd || src_nd != inds_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + + auto trailing_dims = trailing_dims_to_search.value(); + int iter_nd = src_nd - trailing_dims; + if (trailing_dims <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_search must be positive, but no " + "greater than rank of the array being searched"); + } + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && + src_shape_i == inds_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + std::size_t vals_k(1); + std::size_t inds_k(1); + for (int i = iter_nd; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + vals_k *= static_cast(vals_shape_ptr[i]); + inds_k *= static_cast(inds_shape_ptr[i]); + } + + bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } + } + else { + if (vals_nd != 1 || inds_nd != 1) { + throw 
py::value_error("Output arrays must be one-dimensional"); + } + + for (int i = 0; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + } + + bool valid_k = (axis_nelems >= k && + static_cast(vals_shape_ptr[0]) == k && + static_cast(inds_shape_ptr[0]) == k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); + + if ((iter_nelems == 0) || (axis_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, vals) || overlap(src, inds)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, + k * iter_nelems); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, + k * iter_nelems); + + int src_typenum = src.get_typenum(); + int vals_typenum = vals.get_typenum(); + int inds_typenum = inds.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); + int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); + + if (src_typeid != vals_typeid) { + throw py::value_error("Input array and vals array must have " + "the same data type"); + } + + if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { + throw py::value_error("Inds array must have data type int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_vals_c_contig = vals.is_c_contiguous(); + bool is_inds_c_contig = 
inds.is_c_contiguous(); + + if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { + auto fn = topk_dispatch_vector[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), + vals.get_data(), inds.get_data(), depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +template +struct TopKFactory +{ + fnT get() + { + using IdxT = std::int64_t; + return topk_caller; + } +}; + +void init_topk_dispatch_vectors(void) +{ + td_ns::DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(topk_dispatch_vector); +} + +void init_topk_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_topk_dispatch_vectors(); + + m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), + py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp new file mode 100644 index 000000000000..d39c0eefdb93 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.hpp @@ -0,0 +1,47 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_topk_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index 2aa664fc94ff..98a2493a7c48 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -41,7 +41,7 @@ #include "sorting/radix_argsort.hpp" #include "sorting/radix_sort.hpp" #include "sorting/searchsorted.hpp" -// #include "sorting/topk.hpp" +#include "sorting/topk.hpp" namespace py = pybind11; @@ -53,5 +53,5 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) dpctl::tensor::py_internal::init_searchsorted_functions(m); dpctl::tensor::py_internal::init_radix_sort_functions(m); dpctl::tensor::py_internal::init_radix_argsort_functions(m); - // dpctl::tensor::py_internal::init_topk_functions(m); + dpctl::tensor::py_internal::init_topk_functions(m); } From 56d397d89ac0740cba12223935809da12a3f53e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 2 Mar 2026 12:26:31 -0800 Subject: [PATCH 127/145] Move dpt.top_k() --- dpctl_ext/tensor/__init__.py | 3 +- dpctl_ext/tensor/_sorting.py | 170 ++++++++++++++++++++++++++++++++++- 2 files changed, 169 insertions(+), 4 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 0f2585c9b0e9..9bfda2446889 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -90,7 +90,7 @@ unique_inverse, unique_values, ) -from ._sorting import argsort, sort +from ._sorting import argsort, sort, top_k from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -143,6 +143,7 @@ "take", "take_along_axis", "tile", + "top_k", "to_numpy", "tril", "triu", diff --git a/dpctl_ext/tensor/_sorting.py 
b/dpctl_ext/tensor/_sorting.py index a5fd1d57bcdf..24693a408889 100644 --- a/dpctl_ext/tensor/_sorting.py +++ b/dpctl_ext/tensor/_sorting.py @@ -26,8 +26,8 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** -# import operator -# from typing import NamedTuple +import operator +from typing import NamedTuple import dpctl.tensor as dpt import dpctl.utils as du @@ -48,9 +48,10 @@ _radix_sort_dtype_supported, _sort_ascending, _sort_descending, + _topk, ) -__all__ = ["sort", "argsort"] +__all__ = ["sort", "argsort", "top_k"] def _get_mergesort_impl_fn(descending): @@ -284,3 +285,166 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None): inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt_ext.permute_dims(res, inv_perm) return res + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." + ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + + Default: `"largest"`. 
+ + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, indices)` whose + + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + if k > 1: + raise ValueError(f"`k`={k} is out of bounds 1") + return TopKResult( + dpt_ext.copy(x, order="C"), + dpt_ext.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt_ext.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt_ext.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt_ext.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + 
ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt_ext.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt_ext.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt_ext.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt_ext.permute_dims(vals, inv_perm) + inds = dpt_ext.permute_dims(inds, inv_perm) + + return TopKResult(vals, inds) From ad814fb168cfb39a73b97c73b3ca4ee7b1852e38 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 03:44:05 -0800 Subject: [PATCH 128/145] Initialize _tensor_reductions_impl extension and move _all --- dpctl_ext/tensor/CMakeLists.txt | 25 +- .../libtensor/include/kernels/reductions.hpp | 3324 +++++++++++++++++ .../libtensor/source/reductions/all.cpp | 164 + .../libtensor/source/reductions/all.hpp | 46 + .../reductions/reduction_atomic_support.hpp | 148 + .../source/reductions/reduction_common.cpp | 69 + .../source/reductions/reduction_common.hpp | 46 + .../source/reductions/reduction_over_axis.hpp | 1325 +++++++ .../libtensor/source/tensor_reductions.cpp | 45 + 9 files changed, 5191 insertions(+), 1 deletion(-) create mode 100644 dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp create mode 100644 
dpctl_ext/tensor/libtensor/source/reductions/all.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/all.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 056b7c425544..872e3f786c65 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -69,6 +69,19 @@ set(_accumulator_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_prod.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators/cumulative_sum.cpp ) +set(_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp +) set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp @@ -82,6 +95,10 @@ set(_tensor_accumulation_impl_sources 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_accumulation.cpp ${_accumulator_sources} ) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${_reduction_sources} +) set(_tensor_sorting_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_sorting.cpp ${_sorting_sources} @@ -114,6 +131,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_accumulation_i target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) list(APPEND _py_trgts ${python_module_name}) +set(python_module_name _tensor_reductions_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +target_link_libraries(${python_module_name} PRIVATE ${_static_lib_trgt}) +list(APPEND _py_trgts ${python_module_name}) + set(python_module_name _tensor_sorting_impl) pybind11_add_module(${python_module_name} MODULE ${_tensor_sorting_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_sorting_impl_sources}) @@ -135,7 +158,7 @@ set(_no_fast_math_sources list( APPEND _no_fast_math_sources # ${_elementwise_sources} - # ${_reduction_sources} + ${_reduction_sources} ${_sorting_sources} # ${_linalg_sources} ${_accumulator_sources} diff --git a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp new file mode 100644 index 000000000000..d77d7df92609 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp @@ -0,0 +1,3324 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor reduction along axis. 
+//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpctl_tensor_types.hpp" +#include "utils/math_utils.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::kernels +{ + +using dpctl::tensor::ssize_t; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace reduction_detail +{ + +inline std::size_t get_work_group_size(const sycl::device &d) +{ + // prevents running out of resources on CPU + return std::min( + 2048, d.get_info() / 2); +} + +} // namespace reduction_detail + +template +struct needs_workaround +{ + static constexpr bool value = + (std::is_same_v> && + (std::is_same_v || + std::is_same_v)) || + (__LIBSYCL_MAJOR_VERSION < 7 && std::is_same_v && + std::is_same_v>); +}; + +template +struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !needs_workaround::value; +}; + +template +struct SequentialReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialReduction(const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = 
inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + outT red_val(identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + red_val = reduction_op_(red_val, val); + } + + out_[out_iter_offset] = red_val; + } +}; + +/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ + +/* + This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 + if the device has aspect atomic64 and only with those supported by + sycl::atomic_ref +*/ +template +struct ReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + 
reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = static_cast( + sycl::all_of_group(work_group, local_red_val)); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = static_cast( + sycl::any_of_group(work_group, local_red_val)); + } + else { + 
red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, reduction_op_); + } + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (su_ns::IsPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsLogicalOr::value) { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } 
+ + void operator()(sycl::nd_item<1> it) const + { + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + std::size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (std::size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + // retain these checks in case a reduce_over_group work-around is + // needed + if constexpr (su_ns::IsSyclPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsSyclMaximum::value) { + 
res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalAnd::value) { + res_ref.fetch_and(red_val_over_wg); + } + else if constexpr (su_ns::IsSyclLogicalOr::value) + { + res_ref.fetch_or(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +template +struct ReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + ReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // 
inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (su_ns::IsLogicalAnd::value || + su_ns::IsLogicalOr::value) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg; + if constexpr (su_ns::IsLogicalAnd::value) { + red_val_over_wg = sycl::all_of_group(work_group, local_red_val); + } + else if constexpr (su_ns::IsLogicalOr::value) { + red_val_over_wg = sycl::any_of_group(work_group, local_red_val); + } + else { + red_val_over_wg = sycl::reduce_over_group(work_group, local_red_val, + identity_, reduction_op_); + } + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ + +template +struct CustomReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT 
identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + const ReductionOp &reduction_op, + const outT &identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < 
reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val; + if constexpr (std::is_same_v> || + std::is_same_v>) + { + // handle nans + val = convert_impl(inp_[inp_offset]); + } + else { + val = convert_impl(inp_[inp_offset]); + } + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template < + typename argTy, + typename resTy, + typename ReductionOpT, + typename InputOutputIterIndexerT, + typename ReductionIndexerT, + template + class kernel_name_token> +sycl::event + sequential_reduction(sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t iter_nelems, + std::size_t reduction_nelems, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems)); + }); + + return red_ev; +} + +template +class custom_reduction_wrapper; + +template < + typename argTy, + typename resTy, + typename ReductionOpT, + typename InputOutputIterIndexerT, + typename ReductionIndexerT, + template + class kernel_name_token> +sycl::event + submit_atomic_reduction(sycl::queue &exec_q, + const argTy *arg, + 
resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupWithAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_with_atomics_init_krn; + +template +class reduction_seq_krn; + +template +class reduction_over_group_with_atomics_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event reduction_over_group_with_atomics_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of 
reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_with_atomics_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = 
identity_val; + }); + }); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +// Contig + +typedef sycl::event (*reduction_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +/* @brief Reduce rows in a matrix */ +template +sycl::event reduction_axis1_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + result_indexer}; + static constexpr ReductionIndexerT 
reduction_indexer{}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +/* @brief Reduce rows in a matrix */ +template +sycl::event reduction_axis0_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of cols in a + // matrix when reducing over cols) + std::size_t reduction_nelems, // size of each reduction (length of cols, + // i.e. number of rows) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ 
iter_nelems}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + static constexpr std::size_t preferred_reductions_per_wi = 8; + std::size_t reductions_per_wi = + (reduction_nelems < preferred_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferred_reductions_per_wi; + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + sycl::event comp_ev = + submit_atomic_reduction( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {res_init_ev}); + + return comp_ev; + } +} + +/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ + +template < + typename argTy, + typename resTy, + typename ReductionOpT, + typename InputOutputIterIndexerT, + typename ReductionIndexerT, + template + class kernel_name_token> +sycl::event submit_no_atomic_reduction( + sycl::queue &exec_q, + const argTy *arg, + resTy *res, + resTy identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class kernel_name_token; + + cgh.parallel_for( + ndRange, + ReductionOverGroupNoAtomicFunctor( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_wrapper< + kernel_name_token>; + + cgh.parallel_for( + ndRange, + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, 
SlmT>( + arg, res, ReductionOpT(), identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +class reduction_over_group_temps_krn; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class reduction_over_group_temps_empty_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr resTy identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class reduction_over_group_temps_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = 
exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, 
reduction_over_group_temps_krn>( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + ; + + sycl::event first_reduction_ev; + { + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + static constexpr ResIndexerT noop_tmp_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( 
+ exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev; + { + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{ + inp_indexer, res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + } + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = 
dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event reduction_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using 
InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const RowsIndexerT rows_indexer{/* size */ iter_nelems, + /* step */ reduction_nelems}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + noop_tmp_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, 
ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event 
reduction_axis0_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr resTy identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + sycl::event comp_ev = + sequential_reduction( + exec_q, arg_tp, res_tp, identity_val, iter_nelems, + reduction_nelems, in_out_iter_indexer, reduction_indexer, + depends); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using 
one 1 work-group per iteration, + // can output directly to res + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, res_tp, identity_val, wg, iter_nelems, + reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + 
partially_reduced_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT noop_tmp_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + /* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = submit_no_atomic_reduction< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, arg_tp, partially_reduced_tmp, identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT 
res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, temp2_arg, identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = submit_no_atomic_reduction< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, reduction_over_group_temps_krn>( + exec_q, temp_arg, res_tp, identity_val, wg, iter_nelems, + remaining_reduction_nelems, reductions_per_wi, reduction_groups, + in_out_iter_indexer, reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + 
dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +// Argmax and Argmin + +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction( + const argT *inp, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (std::size_t m = 0; m < reduction_max_gid_; ++m) { + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == red_val) { + idx_val = idx_reduction_op_(idx_val, static_cast(m)); + } + else { + if constexpr (su_ns::IsMinimum::value) { + using 
dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val < red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val > red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + } + } + out_[out_iter_offset] = idx_val; + } +}; + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const 
InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, 
inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? 
local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + std::size_t reduction_max_gid_ = 0; + std::size_t iter_gws_ = 1; + std::size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + const ReductionOp &reduction_op, + const argT &identity_val, + const IdxReductionOp &idx_reduction_op, + const outT &idx_identity_val, + const InputOutputIterIndexerT &arg_res_iter_indexer, + const InputRedIndexerT &arg_reduced_dims_indexer, + SlmT local_mem, + std::size_t reduction_size, + std::size_t iteration_size, + std::size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { 
+ const std::size_t reduction_lid = it.get_local_id(0); + const std::size_t wg = + it.get_local_range(0); // 0 <= reduction_lid < wg + + const std::size_t iter_gid = it.get_group(0) % iter_gws_; + const std::size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const std::size_t n_reduction_groups = + it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + const auto &inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + std::size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (std::size_t m = 0; m < reductions_per_wi; ++m) { + std::size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so + // check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr 
(std::is_floating_point_v || + std::is_same_v) + { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? 
local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_strided_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + int, + const ssize_t *, + ssize_t, + ssize_t, + int, + const ssize_t *, + ssize_t, + const std::vector &); + +template +class search_seq_strided_krn; + +template +class search_seq_contig_krn; + +template +class search_over_group_krn; + +template +class custom_search_over_group_krn; + +template +class search_empty_krn; + +template +sycl::event + submit_search_reduction(sycl::queue &exec_q, + const argTy *arg, + argTy *arg_tmp, + resTy *res_tmp, + resTy *res, + argTy identity_val, + resTy idx_identity_val, + std::size_t wg, + std::size_t iter_nelems, + std::size_t reduction_nelems, + std::size_t reductions_per_wi, + std::size_t reduction_groups, + const InputOutputIterIndexerT &in_out_iter_indexer, + const ReductionIndexerT &reduction_indexer, + const std::vector &depends) +{ + sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + auto ndRange = 
sycl::nd_range<1>(globalRange, localRange); + + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_over_group_krn; + cgh.parallel_for( + ndRange, SearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_search_over_group_krn< + argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT, First, Last>; + cgh.parallel_for( + ndRange, + CustomSearchReduction( + arg, arg_tmp, res_tmp, res, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + }); + return red_ev; +} + +template +sycl::event search_over_group_temps_strided_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + int red_nd, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + const IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class search_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = idx_identity_val; + }); + }); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, 
IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 4; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, iter_shape_and_strides}; + const ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto 
tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + const InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + static constexpr ResIndexerT noop_tmp_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + const ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while 
(remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + const ResIndexerT res_iter_indexer{ + iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + 
2 * iter_nd}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +typedef sycl::event (*search_contig_impl_fn_ptr)( + sycl::queue &, + std::size_t, + std::size_t, + const char *, + char *, + ssize_t, + ssize_t, + ssize_t, + const std::vector &); + +template +sycl::event search_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. 
number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + 
// Perform reduction using one 1 work-group per iteration, + // can output directly to res + using InputIterIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto val_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + 
tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = val_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + const InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{/* size */ iter_nelems, + /* step */ reduction_nelems}, + NoOpIndexerT{}}; + static constexpr ReductionIndexerT reduction_indexer{}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const 
InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event 
cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, val_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +sycl::event search_axis0_over_group_temps_contig_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of reductions (num. of rows in a + // matrix when reducing over rows) + std::size_t reduction_nelems, // size of each reduction (length of rows, + // i.e. number of columns) + const char *arg_cp, + char *res_cp, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + static constexpr argTy identity_val = + su_ns::Identity::value; + static constexpr resTy idx_identity_val = + su_ns::Identity::value; + + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + std::size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + + const InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = + class search_seq_contig_krn; + + sycl::range<1> 
iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + + static constexpr std::size_t preferred_reductions_per_wi = 8; + // prevents running out of resources on CPU + std::size_t max_wg = reduction_detail::get_work_group_size(d); + + std::size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{/* size */ reduction_nelems, + /* step */ iter_nelems}; + + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + std::size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event comp_ev = + submit_search_reduction( + exec_q, arg_tp, nullptr, nullptr, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, depends); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + std::size_t reduction_groups = + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + 
(preferred_reductions_per_wi * wg); + assert(reduction_groups > 1); + + std::size_t second_iter_reduction_groups_ = + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); + + const std::size_t tmp_alloc_size = + iter_nelems * (reduction_groups + second_iter_reduction_groups_); + auto tmp_owner = dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + + resTy *partially_reduced_tmp = tmp_owner.get(); + resTy *partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + + auto vals_tmp_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + tmp_alloc_size, exec_q); + argTy *partially_reduced_vals_tmp = vals_tmp_owner.get(); + argTy *partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + + sycl::event first_reduction_ev; + { + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + static constexpr NoOpIndexerT columns_indexer{}; + static constexpr NoOpIndexerT result_indexer{}; + const InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + const ReductionIndexerT reduction_indexer{ + /* size */ reduction_nelems, + /* step */ iter_nelems}; + + first_reduction_ev = + submit_search_reduction( + exec_q, arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, identity_val, idx_identity_val, wg, + iter_nelems, reduction_nelems, preferred_reductions_per_wi, + reduction_groups, in_out_iter_indexer, reduction_indexer, + depends); + } + + std::size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy 
*vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferred_reductions_per_wi * max_wg) { + std::size_t reduction_groups_ = + (remaining_reduction_nelems + preferred_reductions_per_wi * wg - + 1) / + (preferred_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ reduction_groups_}; + static constexpr ResIndexerT res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + sycl::event partial_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + identity_val, idx_identity_val, wg, iter_nelems, + remaining_reduction_nelems, preferred_reductions_per_wi, + reduction_groups_, in_out_iter_indexer, reduction_indexer, + {dependent_ev}); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + const InputIndexerT inp_indexer{/* size */ iter_nelems, + /* step */ remaining_reduction_nelems}; + static constexpr ResIndexerT 
res_iter_indexer{}; + + const InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + static constexpr ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = std::max( + 1, (remaining_reduction_nelems + wg - 1) / wg); + + reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + sycl::event final_reduction_ev = + submit_search_reduction( + exec_q, vals_temp_arg, nullptr, temp_arg, res_tp, identity_val, + idx_identity_val, wg, iter_nelems, remaining_reduction_nelems, + reductions_per_wi, reduction_groups, in_out_iter_indexer, + reduction_indexer, {dependent_ev}); + + sycl::event cleanup_host_task_event = + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {final_reduction_ev}, tmp_owner, vals_tmp_owner); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +} // namespace dpctl::tensor::kernels diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.cpp b/dpctl_ext/tensor/libtensor/source/reductions/all.cpp new file mode 100644 index 000000000000..a901b9e1d9a3 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/all.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + all_reduction_strided_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + all_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static reduction_contig_impl_fn_ptr + all_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; + +template +struct AllStridedFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } +}; + +template +struct AllAxis1ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl; + } +}; + +template +struct AllAxis0ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_and; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl; + } +}; + +void populate_all_dispatch_vectors(void) +{ + using td_ns::DispatchVectorBuilder; + + DispatchVectorBuilder + all_dvb1; + all_dvb1.populate_dispatch_vector(all_reduction_strided_dispatch_vector); + + DispatchVectorBuilder + all_dvb2; + all_dvb2.populate_dispatch_vector( + all_reduction_axis1_contig_dispatch_vector); + + DispatchVectorBuilder + all_dvb3; + all_dvb3.populate_dispatch_vector( + 
all_reduction_axis0_contig_dispatch_vector); +}; + +using atomic_support::atomic_support_fn_ptr_t; +using atomic_support::check_atomic_support; +static atomic_support_fn_ptr_t all_atomic_support = + check_atomic_support; + +} // namespace impl + +void init_all(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_all_dispatch_vectors(); + using impl::all_reduction_axis0_contig_dispatch_vector; + using impl::all_reduction_axis1_contig_dispatch_vector; + using impl::all_reduction_strided_dispatch_vector; + + using impl::all_atomic_support; + + auto all_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + all_reduction_axis1_contig_dispatch_vector, + all_reduction_axis0_contig_dispatch_vector, + all_reduction_strided_dispatch_vector, all_atomic_support); + }; + m.def("_all", all_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/all.hpp b/dpctl_ext/tensor/libtensor/source/reductions/all.hpp new file mode 100644 index 000000000000..5fb184e37c66 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/all.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_all(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp new file mode 100644 index 000000000000..b7aaf9313aa2 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -0,0 +1,148 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include +#include + +#include + +#include "utils/type_utils.hpp" + +namespace dpctl::tensor::py_internal::atomic_support +{ + +typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc); + +/*! @brief Function which returns a constant value for atomic support */ +template +bool fixed_decision(const sycl::queue &, sycl::usm::alloc) +{ + return return_value; +} + +/*! 
@brief Template for querying atomic support for a type on a device */ +template +bool check_atomic_support(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type) +{ + static constexpr bool atomic32 = (sizeof(T) == 4); + static constexpr bool atomic64 = (sizeof(T) == 8); + using dpctl::tensor::type_utils::is_complex; + if constexpr ((!atomic32 && !atomic64) || is_complex::value) { + return fixed_decision(exec_q, usm_alloc_type); + } + else { + bool supports_atomics = false; + const sycl::device &dev = exec_q.get_device(); + if constexpr (atomic64) { + if (!dev.has(sycl::aspect::atomic64)) { + return false; + } + } + switch (usm_alloc_type) { + case sycl::usm::alloc::shared: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_shared_allocations); + break; + case sycl::usm::alloc::host: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_host_allocations); + break; + case sycl::usm::alloc::device: + supports_atomics = true; + break; + default: + supports_atomics = false; + } + return supports_atomics; + } +} + +template +struct ArithmeticAtomicSupportFactory +{ + fnT get() + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (std::is_floating_point_v || + std::is_same_v || is_complex::value) + { + // for real- and complex- floating point types, tree reduction has + // better round-off accumulation properties (round-off error is + // proportional to the log2(reduction_size), while naive elementwise + // summation used by atomic implementation has round-off error + // growing proportional to the reduction_size.), hence reduction + // over floating point types should always use tree_reduction + // algorithm, even though atomic implementation may be applicable + return fixed_decision; + } + else { + return check_atomic_support; + } + } +}; + +template +struct MinMaxAtomicSupportFactory +{ + fnT get() + { + return check_atomic_support; + } +}; + +template +struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory +{ +}; + +template 
+struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory +{ +}; + +template +struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory +{ +}; + +template +struct ProductAtomicSupportFactory + : public ArithmeticAtomicSupportFactory +{ +}; + +} // namespace dpctl::tensor::py_internal::atomic_support diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp new file mode 100644 index 000000000000..e3858338171f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -0,0 +1,69 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include + +#include "all.hpp" +// #include "any.hpp" +// #include "argmax.hpp" +// #include "argmin.hpp" +// #include "logsumexp.hpp" +// #include "max.hpp" +// #include "min.hpp" +// #include "prod.hpp" +// #include "reduce_hypot.hpp" +// #include "sum.hpp" + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +/*! 
@brief Add reduction functions to Python module */ +void init_reduction_functions(py::module_ m) +{ + init_all(m); + // init_any(m); + // init_argmax(m); + // init_argmin(m); + // init_logsumexp(m); + // init_max(m); + // init_min(m); + // init_prod(m); + // init_reduce_hypot(m); + // init_sum(m); +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp new file mode 100644 index 000000000000..4df67c16bc4e --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reduction_functions(py::module_); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp new file mode 100644 index 000000000000..5397fe24693d --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -0,0 +1,1325 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension, specifically functions for reductions. 
+//===---------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/output_validation.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +/* ====================== dtype supported ======================== */ + +/*! @brief Template implementing Python API for querying type support by + * reduction which may support atomics */ +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == 
"device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = check_atomic_support[out_typeid](q, kind); + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + +/*! @brief Template implementing Python API for querying type support by tree + * reduction */ +template +bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + auto fn = temps_dispatch_table[arg_typeid][out_typeid]; + + return (fn != nullptr); +} + +/* ==================== Generic reductions ====================== */ + +/*! 
@brief Template implementing Python API for reduction over axis which may + * support atomics */ +template +std::pair py_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const contig_fnT &axis0_atomic_dispatch_table, + const contig_fnT &axis1_atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table, + const SupportAtomicFnT &check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + 
reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + 
return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + // remove_all_extents gets underlying type of table + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT 
iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + 
fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + using contig_fn_ptr_T = + typename std::remove_all_extents::type; + contig_fn_ptr_T fn; + if (supports_atomics) { + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; + } + else { + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + } + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + using dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + auto tmp_alloc_owner = + 
std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_alloc_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_alloc_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/* ================= No atomic reductions ====================== */ + +/*! 
@brief Template implementing Python API for reduction over axis without + * atomics */ +template +std::pair py_tree_reduction_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_temps_dispatch_table, + const contig_fnT &axis1_temps_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + 
throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = 
dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, 
iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast( + simplified_reduction_src_strides[0]) == iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + using 
dpctl::tensor::offset_utils::device_allocate_and_pack; + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {reduction_ev}, tmp_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +/*! 
@brief Template implementing Python API for searching over an axis */ +template +std::pair py_search_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const strided_fnT &strided_dispatch_table, + const contig_fnT &axis0_contig_dispatch_table, + const contig_fnT &axis1_contig_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + std::size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw 
py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + // handle special case when both reduction and iteration are 1D contiguous + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if (is_src_c_contig && is_dst_c_contig) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && dst_nd == 1) { + auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + std::size_t iter_nelems = dst_nelems; + + static constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return 
std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT compact_reduction_shape; + shT compact_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + compact_iteration_space( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + compact_reduction_shape, compact_reduction_src_strides); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if ((reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + std::size_t iter_nelems = dst_nelems; + + if 
(compact_reduction_src_strides[0] == 1) { + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast( + simplified_iteration_src_strides[0]) == reduction_nelems); + } + else if (static_cast(compact_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1) { + auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + auto fn = strided_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + auto arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + compact_reduction_shape, 
compact_reduction_src_strides); + auto tmp_owner = std::move(std::get<0>(arrays_metainfo_packing_triple_)); + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + const py::ssize_t *temp_allocation_ptr = tmp_owner.get(); + + const py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + const py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_nd, iter_shape_and_strides, + iteration_src_offset, iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {comp_ev}, tmp_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, comp_ev); +} + +/* ================= Atomic only reductions ====================== */ + +/*! 
@brief Template implementing Python API for boolean reductions over an axis + */ +template +std::pair + py_boolean_reduction(const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const contig_dispatchT &axis1_contig_dispatch_vector, + const contig_dispatchT &axis0_contig_dispatch_vector, + const strided_dispatchT &strided_dispatch_vector, + const atomic_support_fnT check_atomic_support) +{ + int src_nd = src.get_ndim(); + int iter_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iter_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iter_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(dst); + + std::size_t dst_nelems = dst.get_size(); + + std::size_t red_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + red_nelems *= static_cast(src_shape_ptr[i]); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(dst, src)) { + throw py::value_error("Arrays are expected to have no memory overlap"); + } + + 
dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(dst, dst_nelems); + + const char *src_data = src.get_data(); + char *dst_data = dst.get_data(); + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + static constexpr int int32_typeid = + static_cast(td_ns::typenum_t::INT32); + if (dst_typeid != int32_typeid) { + throw py::value_error( + "Unexpected data type of destination array, expecting 'int32'"); + } + + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + bool supports_atomics = check_atomic_support(exec_q, usm_type); + if (!supports_atomics) { + throw py::value_error( + "This reduction is not supported for this device and usm_type."); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 0)) + { + auto fn = axis1_contig_dispatch_vector[src_typeid]; + static constexpr py::ssize_t zero_offset = 0; + + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + static constexpr py::ssize_t zero_offset = 0; + + sycl::event red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, zero_offset, + zero_offset, zero_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, 
{src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + + auto src_shape_vecs = src.get_shape_vector(); + auto src_strides_vecs = src.get_strides_vector(); + auto dst_strides_vecs = dst.get_strides_vector(); + + int simplified_red_nd = trailing_dims_to_reduce; + + using shT = std::vector; + shT red_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_red_shape; + shT simplified_red_src_strides; + py::ssize_t red_src_offset(0); + + using dpctl::tensor::py_internal::simplify_iteration_space_1; + simplify_iteration_space_1( + simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides, + // output + simplified_red_shape, simplified_red_src_strides, red_src_offset); + + shT iter_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iter_nd); + shT const &iter_dst_strides = dst_strides_vecs; + + shT simplified_iter_shape; + shT simplified_iter_src_strides; + shT simplified_iter_dst_strides; + py::ssize_t iter_src_offset(0); + py::ssize_t iter_dst_offset(0); + + if (iter_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iter_nd = 1; + simplified_iter_shape.push_back(1); + simplified_iter_src_strides.push_back(0); + simplified_iter_dst_strides.push_back(0); + } + else { + using dpctl::tensor::py_internal::simplify_iteration_space; + simplify_iteration_space( + iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, + // output + simplified_iter_shape, simplified_iter_src_strides, + simplified_iter_dst_strides, iter_src_offset, iter_dst_offset); + } + + if (simplified_red_nd == 1 && iter_nd == 1) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + std::size_t iter_nelems = dst_nelems; + + if (simplified_red_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iter_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iter_dst_strides[0] == 
1) && + (static_cast(simplified_iter_src_strides[0]) == + red_nelems); + } + else if (static_cast(simplified_red_src_strides[0]) == + iter_nelems) { + mat_reduce_over_axis0 = (simplified_iter_dst_strides[0] == 1) && + (simplified_iter_src_strides[0] == 1); + } + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_contig_dispatch_vector[src_typeid]; + + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_contig_dispatch_vector[src_typeid]; + + sycl::event red_ev = + fn(exec_q, iter_nelems, red_nelems, src_data, dst_data, + iter_src_offset, iter_dst_offset, red_src_offset, depends); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, {red_ev}); + + return std::make_pair(keep_args_event, red_ev); + } + } + + auto fn = strided_dispatch_vector[src_typeid]; + + std::vector host_task_events{}; + auto iter_red_metadata_packing_triple_ = + dpctl::tensor::offset_utils::device_allocate_and_pack( + exec_q, host_task_events, simplified_iter_shape, + simplified_iter_src_strides, simplified_iter_dst_strides, + simplified_red_shape, simplified_red_src_strides); + auto packed_shapes_strides_owner = + std::move(std::get<0>(iter_red_metadata_packing_triple_)); + const auto ©_metadata_ev = + std::get<2>(iter_red_metadata_packing_triple_); + const py::ssize_t *packed_shapes_and_strides = + packed_shapes_strides_owner.get(); + + const py::ssize_t *iter_shape_and_strides = packed_shapes_and_strides; + const py::ssize_t *red_shape_stride = + packed_shapes_and_strides + 3 * simplified_iter_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), 
all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto red_ev = + fn(exec_q, dst_nelems, red_nelems, src_data, dst_data, iter_nd, + iter_shape_and_strides, iter_src_offset, iter_dst_offset, + simplified_red_nd, red_shape_stride, red_src_offset, all_deps); + + sycl::event temp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {red_ev}, packed_shapes_strides_owner); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, red_ev); +} + +extern void init_reduction_functions(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 000000000000..f0901bbf7ab3 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,45 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include + +#include "reductions/reduction_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_reduction_functions(m); +} From c4f249687952ea426a7ba190cc1b848ae9fcd6e5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 03:49:40 -0800 Subject: [PATCH 129/145] Add TODO with incorrect logic in reductions_over_axis.hpp --- .../tensor/libtensor/source/reductions/reduction_over_axis.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp index 5397fe24693d..e16a80f129fc 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -328,6 +328,7 @@ std::pair py_reduction_over_axis( using dpctl::tensor::py_internal::simplify_iteration_space; using dpctl::tensor::py_internal::simplify_iteration_space_1; + // TODO: not used anywhere auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); auto const &dst_strides_vecs = dst.get_strides_vector(); @@ -927,6 +928,7 @@ std::pair py_search_over_axis( shT compact_reduction_src_strides; py::ssize_t reduction_src_offset(0); + // TODO: not used anywhere compact_iteration_space( reduction_nd, reduction_shape_ptr, reduction_src_strides, // output @@ -1157,6 +1159,7 @@ std::pair bool is_src_f_contig = src.is_f_contiguous(); bool is_dst_c_contig = dst.is_c_contiguous(); + // TODO(review): the `dst_nelems == 0` check in the f-contig branch below looks suspicious — should it be `dst_nelems == 1`? Verify and fix.
if ((is_src_c_contig && is_dst_c_contig) || (is_src_f_contig && dst_nelems == 0)) { From bd3add0adc32a44f9a10d268bcdd774ef8be2d66 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 03:52:14 -0800 Subject: [PATCH 130/145] Move ti.all() to dpctl_ext.tensor and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 2 + dpctl_ext/tensor/_manipulation_functions.py | 4 +- dpctl_ext/tensor/_utility_functions.py | 136 ++++++++++++++++++++ dpnp/dpnp_iface_logic.py | 2 +- 4 files changed, 141 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/_utility_functions.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 9bfda2446889..5e4bbe91ea52 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -79,6 +79,7 @@ unstack, ) from dpctl_ext.tensor._reshape import reshape +from dpctl_ext.tensor._utility_functions import all from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip @@ -94,6 +95,7 @@ from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ + "all", "arange", "argsort", "asarray", diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py index 08459dcaea76..e2d55c533bc0 100644 --- a/dpctl_ext/tensor/_manipulation_functions.py +++ b/dpctl_ext/tensor/_manipulation_functions.py @@ -624,7 +624,7 @@ def repeat(x, repeats, /, *, axis=None): "'repeats' array must be broadcastable to the size of " "the repeated axis" ) - if not dpt.all(repeats >= 0): + if not dpt_ext.all(repeats >= 0): raise ValueError("'repeats' elements must be positive") elif isinstance(repeats, (tuple, list, range)): @@ -646,7 +646,7 @@ def repeat(x, repeats, /, *, axis=None): repeats = dpt_ext.asarray( repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q ) - if not dpt.all(repeats >= 0): + if not dpt_ext.all(repeats >= 0): raise ValueError("`repeats` elements must be positive") else: raise 
TypeError( diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py new file mode 100644 index 000000000000..678b93cdfeec --- /dev/null +++ b/dpctl_ext/tensor/_utility_functions.py @@ -0,0 +1,136 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + + +import dpctl.tensor as dpt +import dpctl.utils as du + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +import dpctl_ext.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple + + +def _boolean_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt_ext.astype(x, dpt.bool) + x_tmp = x + res_shape = () + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt_ext.astype(x, dpt.bool) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt_ext.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + # always allocate the temporary as + # int32 and usm-device to ensure that atomic updates + # are supported + res_tmp = dpt_ext.empty( + res_shape, + dtype=dpt.int32, + usm_type="device", + sycl_queue=exec_q, + ) + hev0, ev0 = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res_tmp, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev0, ev0) + + # copy to boolean result array + res = dpt_ext.empty( + res_shape, + dtype=dpt.bool, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev1, ev1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=res_tmp, dst=res, sycl_queue=exec_q, depends=[ev0] + ) + _manager.add_event_pair(hev1, ev1) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = 
sorted(range(nd), key=lambda d: perm[d]) + res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm) + return res + + +def all(x, /, *, axis=None, keepdims=False): + """ + all(x, axis=None, keepdims=False) + + Tests whether all input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical AND reduction. + When `axis` is `None`, a logical AND reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. + + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical AND reduction. 
+ """ + return _boolean_reduction(x, axis, keepdims, tri._all) diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 3e3501b14c7c..9713071476f4 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -214,7 +214,7 @@ def all(a, /, axis=None, out=None, keepdims=False, *, where=True): dpnp.check_limitations(where=where) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt.all(usm_a, axis=axis, keepdims=keepdims) + usm_res = dpt_ext.all(usm_a, axis=axis, keepdims=keepdims) # TODO: temporary solution until dpt.all supports out parameter return dpnp.get_result_array(usm_res, out) From b1953df0ab74d61d7282bf3a6925babe8114e21c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:06:06 -0800 Subject: [PATCH 131/145] Move _any to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 2 +- .../libtensor/source/reductions/any.cpp | 164 ++++++++++++++++++ .../libtensor/source/reductions/any.hpp | 46 +++++ .../source/reductions/reduction_common.cpp | 4 +- 4 files changed, 213 insertions(+), 3 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/any.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/any.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 872e3f786c65..e8bc17e39f55 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -72,7 +72,7 @@ set(_accumulator_sources set(_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp diff --git 
a/dpctl_ext/tensor/libtensor/source/reductions/any.cpp b/dpctl_ext/tensor/libtensor/source/reductions/any.cpp new file mode 100644 index 000000000000..6859e46cbc4a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/any.cpp @@ -0,0 +1,164 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + any_reduction_strided_dispatch_vector[td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + any_reduction_axis1_contig_dispatch_vector[td_ns::num_types]; +static reduction_contig_impl_fn_ptr + any_reduction_axis0_contig_dispatch_vector[td_ns::num_types]; + +template +struct AnyStridedFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_or; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } +}; + +template +struct AnyAxis1ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_or; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl; + } +}; + +template +struct AnyAxis0ContigFactory +{ + fnT get() const + { + using dstTy = std::int32_t; + using ReductionOpT = sycl::logical_or; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl; + } +}; + +void populate_any_dispatch_vectors(void) +{ + using td_ns::DispatchVectorBuilder; + + DispatchVectorBuilder + any_dvb1; + 
any_dvb1.populate_dispatch_vector(any_reduction_strided_dispatch_vector); + + DispatchVectorBuilder + any_dvb2; + any_dvb2.populate_dispatch_vector( + any_reduction_axis1_contig_dispatch_vector); + + DispatchVectorBuilder + any_dvb3; + any_dvb3.populate_dispatch_vector( + any_reduction_axis0_contig_dispatch_vector); +}; + +using atomic_support::atomic_support_fn_ptr_t; +using atomic_support::check_atomic_support; +static atomic_support_fn_ptr_t any_atomic_support = + check_atomic_support; + +} // namespace impl + +void init_any(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_any_dispatch_vectors(); + using impl::any_reduction_axis0_contig_dispatch_vector; + using impl::any_reduction_axis1_contig_dispatch_vector; + using impl::any_reduction_strided_dispatch_vector; + + using impl::any_atomic_support; + + auto any_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_boolean_reduction( + src, trailing_dims_to_reduce, dst, exec_q, depends, + any_reduction_axis1_contig_dispatch_vector, + any_reduction_axis0_contig_dispatch_vector, + any_reduction_strided_dispatch_vector, any_atomic_support); + }; + m.def("_any", any_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/any.hpp b/dpctl_ext/tensor/libtensor/source/reductions/any.hpp new file mode 100644 index 000000000000..4e368a674615 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/any.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_any(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index e3858338171f..027c80b27cc2 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -36,7 +36,7 @@ #include #include "all.hpp" -// #include "any.hpp" +#include "any.hpp" // #include "argmax.hpp" // #include "argmin.hpp" // #include "logsumexp.hpp" @@ -55,7 +55,7 @@ namespace dpctl::tensor::py_internal void init_reduction_functions(py::module_ m) { init_all(m); - // init_any(m); + init_any(m); // init_argmax(m); // init_argmin(m); // init_logsumexp(m); From 32802d682a8b52dd4254f7c3599314f1f84553e7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:08:40 -0800 Subject: [PATCH 132/145] Move ti.any()/ti.diff() and reuse them in dpnp --- dpctl_ext/tensor/__init__.py | 4 +- dpctl_ext/tensor/_utility_functions.py | 375 ++++++++++++++++++++++++- dpnp/dpnp_iface_logic.py | 7 +- dpnp/dpnp_iface_mathematical.py | 4 +- 4 files changed, 383 insertions(+), 7 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 5e4bbe91ea52..87bc9f544ab6 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -79,7 +79,7 @@ unstack, ) from dpctl_ext.tensor._reshape import reshape -from dpctl_ext.tensor._utility_functions import all +from dpctl_ext.tensor._utility_functions import all, any, diff from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip @@ -96,6 +96,7 @@ __all__ = [ "all", + "any", "arange", "argsort", "asarray", @@ -110,6 +111,7 @@ 
"cumulative_logsumexp", "cumulative_prod", "cumulative_sum", + "diff", "empty", "empty_like", "extract", diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py index 678b93cdfeec..a122ac3d6cea 100644 --- a/dpctl_ext/tensor/_utility_functions.py +++ b/dpctl_ext/tensor/_utility_functions.py @@ -26,6 +26,8 @@ # THE POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** +import builtins +import operator import dpctl.tensor as dpt import dpctl.utils as du @@ -36,7 +38,17 @@ import dpctl_ext.tensor._tensor_impl as ti import dpctl_ext.tensor._tensor_reductions_impl as tri -from ._numpy_helper import normalize_axis_tuple +from ._numpy_helper import normalize_axis_index, normalize_axis_tuple +from ._scalar_utils import ( + _get_dtype, + _get_queue_usm_type, + _get_shape, + _validate_dtype, +) +from ._type_utils import ( + _resolve_one_strong_one_weak_types, + _resolve_one_strong_two_weak_types, +) def _boolean_reduction(x, axis, keepdims, func): @@ -134,3 +146,364 @@ def all(x, /, *, axis=None, keepdims=False): containing the results of the logical AND reduction. """ return _boolean_reduction(x, axis, keepdims, tri._all) + + +def any(x, /, *, axis=None, keepdims=False): + """ + any(x, axis=None, keepdims=False) + + Tests whether any input array elements evaluate to True along a given axis. + + Args: + x (usm_ndarray): Input array. + axis (Optional[Union[int, Tuple[int,...]]]): Axis (or axes) + along which to perform a logical OR reduction. + When `axis` is `None`, a logical OR reduction + is performed over all dimensions of `x`. + If `axis` is negative, the axis is counted from + the last dimension to the first. + Default: `None`. + keepdims (bool, optional): If `True`, the reduced axes are included + in the result as singleton dimensions, and the result is + broadcastable to the input array shape. + If `False`, the reduced axes are not included in the result. + Default: `False`. 
+ + Returns: + usm_ndarray: + An array with a data type of `bool` + containing the results of the logical OR reduction. + """ + return _boolean_reduction(x, axis, keepdims, tri._any) + + +def _validate_diff_shape(sh1, sh2, axis): + """ + Utility for validating that two shapes `sh1` and `sh2` + are possible to concatenate along `axis`. + """ + if not sh2: + # scalars will always be accepted + return True + else: + sh1_ndim = len(sh1) + if sh1_ndim == len(sh2) and builtins.all( + sh1[i] == sh2[i] for i in range(sh1_ndim) if i != axis + ): + return True + else: + return False + + +def _concat_diff_input(arr, axis, prepend, append): + """ + Concatenates `arr`, `prepend` and, `append` along `axis`, + where `arr` is an array and `prepend` and `append` are + any mixture of arrays and scalars. + """ + if prepend is not None and append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + q3, append_usm_type = _get_queue_usm_type(append) + if q2 is None and q3 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + elif q3 is None: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + elif q2 is None: + exec_q = du.get_execution_queue((q1, q3)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + else: + exec_q = du.get_execution_queue((q1, q2, q3)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." 
+ ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + append_usm_type, + ) + ) + du.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + append_shape = _get_shape(append) + if not builtins.all( + isinstance(s, (tuple, list)) + for s in ( + prepend_shape, + append_shape, + ) + ): + raise TypeError( + "Shape of arguments can not be inferred. " + "Arguments are expected to be " + "lists, tuples, or both" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + append_dtype = _get_dtype(append, sycl_dev) + if not builtins.all( + _validate_dtype(o) for o in (prepend_dtype, append_dtype) + ): + raise ValueError("Operands have unsupported data types") + prepend_dtype, append_dtype = _resolve_one_strong_two_weak_types( + arr_dtype, prepend_dtype, append_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt_ext.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt_ext.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape) + if not append_shape: + append_shape = 
arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt_ext.broadcast_to(a_append, append_shape) + return dpt_ext.concat((a_prepend, arr, a_append), axis=axis) + elif prepend is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, prepend_usm_type = _get_queue_usm_type(prepend) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + prepend_usm_type, + ) + ) + du.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + prepend_shape = _get_shape(prepend) + if not isinstance(prepend_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. " + "Argument is expected to be a " + "list or tuple" + ) + valid_prepend_shape = _validate_diff_shape( + arr_shape, prepend_shape, axis + ) + if not valid_prepend_shape: + raise ValueError( + f"`diff` argument `prepend` with shape {prepend_shape} is " + f"invalid for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + prepend_dtype = _get_dtype(prepend, sycl_dev) + if not _validate_dtype(prepend_dtype): + raise ValueError("Operand has unsupported data type") + prepend_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, prepend_dtype, sycl_dev + ) + if isinstance(prepend, dpt.usm_ndarray): + a_prepend = prepend + else: + a_prepend = dpt_ext.asarray( + prepend, + dtype=prepend_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not prepend_shape: + prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape) + return dpt_ext.concat((a_prepend, arr), axis=axis) + elif append is not None: + q1, x_usm_type = arr.sycl_queue, arr.usm_type + q2, append_usm_type = 
_get_queue_usm_type(append) + if q2 is None: + exec_q = q1 + coerced_usm_type = x_usm_type + else: + exec_q = du.get_execution_queue((q1, q2)) + if exec_q is None: + raise du.ExecutionPlacementError( + "Execution placement can not be unambiguously inferred " + "from input arguments." + ) + coerced_usm_type = du.get_coerced_usm_type( + ( + x_usm_type, + append_usm_type, + ) + ) + du.validate_usm_type(coerced_usm_type, allow_none=False) + arr_shape = arr.shape + append_shape = _get_shape(append) + if not isinstance(append_shape, (tuple, list)): + raise TypeError( + "Shape of argument can not be inferred. " + "Argument is expected to be a " + "list or tuple" + ) + valid_append_shape = _validate_diff_shape(arr_shape, append_shape, axis) + if not valid_append_shape: + raise ValueError( + f"`diff` argument `append` with shape {append_shape} is invalid" + f" for first input with shape {arr_shape}" + ) + sycl_dev = exec_q.sycl_device + arr_dtype = arr.dtype + append_dtype = _get_dtype(append, sycl_dev) + if not _validate_dtype(append_dtype): + raise ValueError("Operand has unsupported data type") + append_dtype = _resolve_one_strong_one_weak_types( + arr_dtype, append_dtype, sycl_dev + ) + if isinstance(append, dpt.usm_ndarray): + a_append = append + else: + a_append = dpt_ext.asarray( + append, + dtype=append_dtype, + usm_type=coerced_usm_type, + sycl_queue=exec_q, + ) + if not append_shape: + append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :] + a_append = dpt_ext.broadcast_to(a_append, append_shape) + return dpt_ext.concat((arr, a_append), axis=axis) + else: + arr1 = arr + return arr1 + + +def diff(x, /, *, axis=-1, n=1, prepend=None, append=None): + """ + Calculates the `n`-th discrete forward difference of `x` along `axis`. + + Args: + x (usm_ndarray): + input array. + axis (int): + axis along which to compute the difference. A valid axis must be on + the interval `[-N, N)`, where `N` is the rank (number of + dimensions) of `x`. 
+ Default: `-1` + n (int): + number of times to recursively compute the difference. + Default: `1`. + prepend (Union[usm_ndarray, bool, int, float, complex]): + value or values to prepend to the specified axis before taking the + difference. + Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + append (Union[usm_ndarray, bool, int, float, complex]): + value or values to append to the specified axis before taking the + difference. + Must have the same shape as `x` except along `axis`, which can have + any shape. + Default: `None`. + + Returns: + usm_ndarray: + an array containing the `n`-th differences. The array will have the + same shape as `x`, except along `axis`, which will have shape: + ``prepend.shape[axis] + x.shape[axis] + append.shape[axis] - n`` + + The data type of the returned array is determined by the Type + Promotion Rules. + """ + + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + "Expecting dpctl.tensor.usm_ndarray type, " f"got {type(x)}" + ) + x_nd = x.ndim + axis = normalize_axis_index(operator.index(axis), x_nd) + n = operator.index(n) + if n < 0: + raise ValueError(f"`n` must be positive, got {n}") + arr = _concat_diff_input(x, axis, prepend, append) + if n == 0: + return arr + # form slices and recurse + sl0 = tuple( + slice(None) if i != axis else slice(1, None) for i in range(x_nd) + ) + sl1 = tuple( + slice(None) if i != axis else slice(None, -1) for i in range(x_nd) + ) + + diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract + if n > 1: + arr_tmp0 = diff_op(arr[sl0], arr[sl1]) + arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1]) + n = n - 2 + if n > 0: + sl3 = tuple( + slice(None) if i != axis else slice(None, -2) + for i in range(x_nd) + ) + for _ in range(n): + arr_tmp0_sliced = arr_tmp0[sl3] + diff_op(arr_tmp1[sl0], arr_tmp1[sl1], out=arr_tmp0_sliced) + arr_tmp0, arr_tmp1 = arr_tmp1, arr_tmp0_sliced + arr = arr_tmp1 + else: + arr = diff_op(arr[sl0], arr[sl1]) + 
return arr diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py index 9713071476f4..a81416a28e43 100644 --- a/dpnp/dpnp_iface_logic.py +++ b/dpnp/dpnp_iface_logic.py @@ -44,14 +44,13 @@ # pylint: disable=no-name-in-module -import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc @@ -214,7 +213,7 @@ def all(a, /, axis=None, out=None, keepdims=False, *, where=True): dpnp.check_limitations(where=where) usm_a = dpnp.get_usm_ndarray(a) - usm_res = dpt_ext.all(usm_a, axis=axis, keepdims=keepdims) + usm_res = dpt.all(usm_a, axis=axis, keepdims=keepdims) # TODO: temporary solution until dpt.all supports out parameter return dpnp.get_result_array(usm_res, out) @@ -1276,7 +1275,7 @@ def isin( usm_element = dpnp.get_usm_ndarray(element) usm_test = dpnp.get_usm_ndarray(test_elements) return dpnp_array._create_from_usm_ndarray( - dpt_ext.isin( + dpt.isin( usm_element, usm_test, invert=invert, diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index e5e379f8dc00..2c3975625c9e 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -1480,7 +1480,9 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): ) usm_app = None if append is None else dpnp.get_usm_ndarray_or_scalar(append) - usm_res = dpt.diff(usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app) + usm_res = dpt_ext.diff( + usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app + ) return dpnp_array._create_from_usm_ndarray(usm_res) From 1b4e49825c053455986ff852f0b6e042252f6794 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:27:45 -0800 Subject: [PATCH 133/145] Move 
_min_over_axis/_max_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/max.cpp | 411 +++++++++++++++++ .../libtensor/source/reductions/max.hpp | 46 ++ .../libtensor/source/reductions/min.cpp | 413 ++++++++++++++++++ .../libtensor/source/reductions/min.hpp | 46 ++ .../source/reductions/reduction_common.cpp | 8 +- 6 files changed, 922 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/max.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/max.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/min.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/min.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index e8bc17e39f55..13c1bd4a6d7f 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -76,8 +76,8 @@ set(_reduction_sources #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp new file mode 100644 index 000000000000..dfcdc6a7b2a5 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp @@ -0,0 +1,411 @@ +//***************************************************************************** +// Copyright (c) 2026, 
Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by max reduction code based on atomic_ref */ +template +struct TypePairSupportDataForMaxReductionAtomic +{ + /* value is true if a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // 
fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForMaxReductionTemps +{ + static constexpr bool is_defined = std::disjunction< + // input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + 
reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMaxReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v 
&& + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +void populate_max_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types]; + +void populate_max_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MaxAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(max_atomic_support_vector); +} + +} // namespace impl + +void init_max(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis0_contig_temps_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_temps_dispatch_table; + using impl::max_over_axis_strided_atomic_dispatch_table; + using 
impl::max_over_axis_strided_temps_dispatch_table; + + using impl::populate_max_atomic_support_dispatch_vector; + populate_max_atomic_support_dispatch_vector(); + using impl::max_atomic_support_vector; + + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_temps_dispatch_table, + max_over_axis1_contig_temps_dispatch_table, + max_atomic_support_vector); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.hpp b/dpctl_ext/tensor/libtensor/source/reductions/max.hpp new file mode 100644 index 000000000000..bc242dc8d74b --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/max.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_max(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp new file mode 100644 index 000000000000..b57c35736adc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp @@ -0,0 +1,413 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by min reduction code based on atomic_ref */ +template +struct TypePairSupportDataForMinReductionAtomic +{ + /* value is true if a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // 
fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForMinReductionTemps +{ + static constexpr bool is_defined = std::disjunction< + // input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MinOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + 
reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForMinReductionTemps< + srcTy, dstTy>::is_defined) { + if constexpr (std::is_integral_v 
&& + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types]; + +void populate_min_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MinAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(min_atomic_support_vector); +} + +} // namespace impl + +void init_min(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis0_contig_temps_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using 
impl::min_over_axis1_contig_temps_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + using impl::populate_min_atomic_support_dispatch_vector; + populate_min_atomic_support_dispatch_vector(); + using impl::min_atomic_support_vector; + + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_temps_dispatch_table, + min_over_axis1_contig_temps_dispatch_table, + min_atomic_support_vector); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.hpp b/dpctl_ext/tensor/libtensor/source/reductions/min.hpp new file mode 100644 index 000000000000..e054f44539f3 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/min.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_min(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index 027c80b27cc2..e7b00b504406 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -40,8 +40,8 @@ // #include "argmax.hpp" // #include "argmin.hpp" // #include "logsumexp.hpp" -// #include "max.hpp" -// #include "min.hpp" +#include "max.hpp" +#include "min.hpp" // #include "prod.hpp" // #include "reduce_hypot.hpp" // #include "sum.hpp" @@ -59,8 +59,8 @@ void init_reduction_functions(py::module_ m) // init_argmax(m); // init_argmin(m); // init_logsumexp(m); - // init_max(m); - // init_min(m); + init_max(m); + init_min(m); // init_prod(m); // init_reduce_hypot(m); // init_sum(m); From aa313ff8f6496d247bc85424af81188d6e7c5c53 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:28:22 -0800 Subject: [PATCH 134/145] Move ti.min()/max() and reuse it in dpnp --- dpctl_ext/tensor/__init__.py | 6 + dpctl_ext/tensor/_reduction.py | 419 +++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_statistics.py | 4 +- 3 files changed, 427 insertions(+), 2 deletions(-) create mode 100644 dpctl_ext/tensor/_reduction.py diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 87bc9f544ab6..62dca856a8b4 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -83,6 +83,10 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip +from ._reduction import ( + max, + min, +) from ._searchsorted import searchsorted from ._set_functions import ( isin, @@ -126,7 +130,9 @@ 
"isdtype", "isin", "linspace", + "max", "meshgrid", + "min", "moveaxis", "permute_dims", "nonzero", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py new file mode 100644 index 000000000000..7759aebb0b71 --- /dev/null +++ b/dpctl_ext/tensor/_reduction.py @@ -0,0 +1,419 @@ +# ***************************************************************************** +# Copyright (c) 2026, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# - Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# - Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+# ***************************************************************************** + +import dpctl +import dpctl.tensor as dpt +from dpctl.utils import ExecutionPlacementError, SequentialOrderManager + +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor._tensor_impl as ti +import dpctl_ext.tensor._tensor_reductions_impl as tri + +from ._numpy_helper import normalize_axis_tuple + + +def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt_ext.permute_dims(x, perm) + red_nd = len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = x.dtype + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt_ext.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out): + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=x_tmp, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm) + return out + + +def max(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the maximum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which maxima must be computed. 
If a tuple + of unique integers, the maxima are computed over multiple axes. + If ``None``, the max is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) + + +def min(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the minimum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If ``None``, the min is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. 
+ If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. + """ + return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) + + +def _search_over_axis(x, axis, keepdims, out, _reduction_fn): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + x_tmp = x + else: + if isinstance(axis, int): + axis = (axis,) + else: + raise TypeError( + f"'axis' argument expected to have type 'int' " + r"or be `None`, " + f"got type {type(axis)}" + ) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt_ext.permute_dims(x, perm) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]): + raise ValueError("reduction cannot be performed over zero-size axes") + res_shape = x_tmp.shape[: nd - red_nd] + exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) + res_usm_type = x.usm_type + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt_ext.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and red_nd > 0: + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) + + _manager = SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_fill, fill_ev = ti._full_usm_ndarray( + fill_value=0, dst=out, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_fill, fill_ev) + return out + + hev, red_ev = _reduction_fn( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(hev, red_ev) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=exec_q, depends=[red_ev] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm) + return out + + +def argmax(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the maximum values of the input array ``x`` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + maximum value of the flattened array. + Default: ``None``. 
+ keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmax_over_axis) + + +def argmin(x, /, *, axis=None, keepdims=False, out=None): + """ + Returns the indices of the minimum values of the input array ``x`` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If ``None``, returns the index of the + minimum value of the flattened array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. 
+ + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of ``x``. + """ + return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) + + +def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): + """ + Counts the number of elements in the input array ``x`` which are non-zero. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which to count. If a tuple of unique integers, + the number of non-zero values are computed over multiple axes. + If ``None``, the number of non-zero values is computed over the + entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and data + type. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the count of non-zero values. If the sum was + computed over the entire array, a zero-dimensional array is + returned. The returned array will have the default array index data + type. 
+ """ + if x.dtype != dpt.bool: + x = dpt_ext.astype(x, dpt.bool, copy=False) + return sum( + x, + axis=axis, + dtype=ti.default_device_index_type(x.sycl_device), + keepdims=keepdims, + out=out, + ) diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py index daff981d5cc4..97c6aab14058 100644 --- a/dpnp/dpnp_iface_statistics.py +++ b/dpnp/dpnp_iface_statistics.py @@ -1118,7 +1118,7 @@ def max(a, axis=None, out=None, keepdims=False, initial=None, where=True): return dpnp_wrap_reduction_call( usm_a, out, - dpt.max, + dpt_ext.max, a.dtype, axis=axis, keepdims=keepdims, @@ -1395,7 +1395,7 @@ def min(a, axis=None, out=None, keepdims=False, initial=None, where=True): return dpnp_wrap_reduction_call( usm_a, out, - dpt.min, + dpt_ext.min, a.dtype, axis=axis, keepdims=keepdims, From c6d600a6be411a1e3181a2c31ad0b34a2c49467e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:41:38 -0800 Subject: [PATCH 135/145] Move _argmax/argmin_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/argmax.cpp | 280 ++++++++++++++++++ .../libtensor/source/reductions/argmax.hpp | 46 +++ .../libtensor/source/reductions/argmin.cpp | 280 ++++++++++++++++++ .../libtensor/source/reductions/argmin.hpp | 46 +++ .../source/reductions/reduction_common.cpp | 8 +- 6 files changed, 658 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 13c1bd4a6d7f..7425ab2bc6b3 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -73,8 +73,8 @@ set(_reduction_sources 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/all.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp new file mode 100644 index 000000000000..bf6592798b4f --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp @@ -0,0 +1,280 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportForArgmaxReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + 
+template +struct ArgmaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgmaxReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgmaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgmaxReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgmaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgmaxReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using 
IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmax(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis0_contig_temps_dispatch_table; + using impl::argmax_over_axis1_contig_temps_dispatch_table; + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table, + argmax_over_axis0_contig_temps_dispatch_table, + argmax_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp new file mode 100644 index 000000000000..3274f8c7d0cb --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.hpp @@ -0,0 
+1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmax(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp new file mode 100644 index 000000000000..acd685c59802 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp @@ -0,0 +1,280 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportForArgminReductionTemps +{ + + static constexpr bool is_defined = 
std::disjunction< + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ArgminOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgminReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgminReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // 
op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportForArgminReductionTemps::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis0_contig_temps_dispatch_table; + using impl::argmin_over_axis1_contig_temps_dispatch_table; + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const 
event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table, + argmin_over_axis0_contig_temps_dispatch_table, + argmin_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp new file mode 100644 index 000000000000..1865c258a527 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_argmin(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index e7b00b504406..ff6b5d994f33 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -37,8 +37,8 @@ #include "all.hpp" #include "any.hpp" -// #include "argmax.hpp" -// #include "argmin.hpp" +#include "argmax.hpp" +#include "argmin.hpp" // #include "logsumexp.hpp" #include "max.hpp" #include "min.hpp" @@ -56,8 +56,8 @@ void init_reduction_functions(py::module_ m) { init_all(m); init_any(m); - // init_argmax(m); - // init_argmin(m); + init_argmax(m); + init_argmin(m); // init_logsumexp(m); init_max(m); init_min(m); From 8ae933d85d3d11b552d86c1e44c520b1bbedefc8 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 04:45:22 -0800 Subject: 
[PATCH 136/145] Move ti.argmax()/argmin() to dpctl_ext.tensor and reuse them in dpnp --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_reduction.py | 131 +++++++++++---------------------- dpnp/dpnp_iface_searching.py | 12 +-- 3 files changed, 52 insertions(+), 95 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 62dca856a8b4..a3b6a209a47e 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -84,6 +84,8 @@ from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum from ._clip import clip from ._reduction import ( + argmax, + argmin, max, min, ) @@ -102,6 +104,8 @@ "all", "any", "arange", + "argmax", + "argmin", "argsort", "asarray", "asnumpy", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py index 7759aebb0b71..e39c853284d9 100644 --- a/dpctl_ext/tensor/_reduction.py +++ b/dpctl_ext/tensor/_reduction.py @@ -137,72 +137,6 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): return out -def max(x, /, *, axis=None, keepdims=False, out=None): - """ - Calculates the maximum value of the input array ``x``. - - Args: - x (usm_ndarray): - input array. - axis (Optional[int, Tuple[int, ...]]): - axis or axes along which maxima must be computed. If a tuple - of unique integers, the maxima are computed over multiple axes. - If ``None``, the max is computed over the entire array. - Default: ``None``. - keepdims (Optional[bool]): - if ``True``, the reduced axes (dimensions) are included in the - result as singleton dimensions, so that the returned array remains - compatible with the input arrays according to Array Broadcasting - rules. Otherwise, if ``False``, the reduced axes are not included - in the returned array. Default: ``False``. - out (Optional[usm_ndarray]): - the array into which the result is written. - The data type of ``out`` must match the expected shape and the - expected data type of the result. 
- If ``None`` then a new array is returned. Default: ``None``. - - Returns: - usm_ndarray: - an array containing the maxima. If the max was computed over the - entire array, a zero-dimensional array is returned. The returned - array has the same data type as ``x``. - """ - return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) - - -def min(x, /, *, axis=None, keepdims=False, out=None): - """ - Calculates the minimum value of the input array ``x``. - - Args: - x (usm_ndarray): - input array. - axis (Optional[int, Tuple[int, ...]]): - axis or axes along which minima must be computed. If a tuple - of unique integers, the minima are computed over multiple axes. - If ``None``, the min is computed over the entire array. - Default: ``None``. - keepdims (Optional[bool]): - if ``True``, the reduced axes (dimensions) are included in the - result as singleton dimensions, so that the returned array remains - compatible with the input arrays according to Array Broadcasting - rules. Otherwise, if ``False``, the reduced axes are not included - in the returned array. Default: ``False``. - out (Optional[usm_ndarray]): - the array into which the result is written. - The data type of ``out`` must match the expected shape and the - expected data type of the result. - If ``None`` then a new array is returned. Default: ``None``. - - Returns: - usm_ndarray: - an array containing the minima. If the min was computed over the - entire array, a zero-dimensional array is returned. The returned - array has the same data type as ``x``. 
- """ - return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) - - def _search_over_axis(x, axis, keepdims, out, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -376,18 +310,17 @@ def argmin(x, /, *, axis=None, keepdims=False, out=None): return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) -def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): +def max(x, /, *, axis=None, keepdims=False, out=None): """ - Counts the number of elements in the input array ``x`` which are non-zero. + Calculates the maximum value of the input array ``x``. Args: x (usm_ndarray): input array. axis (Optional[int, Tuple[int, ...]]): - axis or axes along which to count. If a tuple of unique integers, - the number of non-zero values are computed over multiple axes. - If ``None``, the number of non-zero values is computed over the - entire array. + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If ``None``, the max is computed over the entire array. Default: ``None``. keepdims (Optional[bool]): if ``True``, the reduced axes (dimensions) are included in the @@ -397,23 +330,47 @@ def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): in the returned array. Default: ``False``. out (Optional[usm_ndarray]): the array into which the result is written. - The data type of ``out`` must match the expected shape and data - type. + The data type of ``out`` must match the expected shape and the + expected data type of the result. If ``None`` then a new array is returned. Default: ``None``. Returns: usm_ndarray: - an array containing the count of non-zero values. If the sum was - computed over the entire array, a zero-dimensional array is - returned. The returned array will have the default array index data - type. + an array containing the maxima. 
If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. """ - if x.dtype != dpt.bool: - x = dpt_ext.astype(x, dpt.bool, copy=False) - return sum( - x, - axis=axis, - dtype=ti.default_device_index_type(x.sycl_device), - keepdims=keepdims, - out=out, - ) + return _comparison_over_axis(x, axis, keepdims, out, tri._max_over_axis) + + +def min(x, /, *, axis=None, keepdims=False, out=None): + """ + Calculates the minimum value of the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If ``None``, the min is computed over the entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as ``x``. 
+ """ + return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) diff --git a/dpnp/dpnp_iface_searching.py b/dpnp/dpnp_iface_searching.py index 055aaa999c3a..19279f81286a 100644 --- a/dpnp/dpnp_iface_searching.py +++ b/dpnp/dpnp_iface_searching.py @@ -39,12 +39,10 @@ """ -import dpctl.tensor as dpt - # pylint: disable=no-name-in-module # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpctl_ext.tensor._tensor_impl as dti import dpnp @@ -376,13 +374,13 @@ def searchsorted(a, v, side="left", sorter=None): usm_a = dpnp.get_usm_ndarray(a) if dpnp.isscalar(v): - usm_v = dpt_ext.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) + usm_v = dpt.asarray(v, sycl_queue=a.sycl_queue, usm_type=a.usm_type) else: usm_v = dpnp.get_usm_ndarray(v) usm_sorter = None if sorter is None else dpnp.get_usm_ndarray(sorter) return dpnp_array._create_from_usm_ndarray( - dpt_ext.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) + dpt.searchsorted(usm_a, usm_v, side=side, sorter=usm_sorter) ) @@ -474,7 +472,5 @@ def where(condition, x=None, y=None, /, *, order="K", out=None): usm_condition = dpnp.get_usm_ndarray(condition) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt_ext.where( - usm_condition, usm_x, usm_y, order=order, out=usm_out - ) + usm_res = dpt.where(usm_condition, usm_x, usm_y, order=order, out=usm_out) return dpnp.get_result_array(usm_res, out) From 2ec3cc7ae429171a7c529c05633f7fe9e55328e7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:11:55 -0800 Subject: [PATCH 137/145] Move _prod/sum_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/prod.cpp | 466 ++++++++++++++++++ .../libtensor/source/reductions/prod.hpp | 46 ++ .../source/reductions/reduction_common.cpp | 8 +- .../libtensor/source/reductions/sum.cpp | 463 
+++++++++++++++++ .../libtensor/source/reductions/sum.hpp | 46 ++ 6 files changed, 1027 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/prod.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/prod.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/sum.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/sum.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 7425ab2bc6b3..350a0b2518ba 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -78,9 +78,9 @@ set(_reduction_sources #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp ) set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/isin.cpp diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp new file mode 100644 index 000000000000..dc9112a7b39a --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp @@ -0,0 +1,466 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< 
+ srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = std::conditional_t, + sycl::logical_and, + sycl::multiplies>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + 
dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types]; + +void populate_prod_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::ProductAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(prod_atomic_support_vector); +} + +} // namespace impl + +void init_prod(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis0_contig_temps_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_temps_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + using impl::populate_prod_atomic_support_dispatch_vector; + populate_prod_atomic_support_dispatch_vector(); + using impl::prod_atomic_support_vector; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + prod_over_axis1_contig_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_temps_dispatch_table, + 
prod_over_axis1_contig_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_atomic_support_vector); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp new file mode 100644 index 000000000000..15b1c07e5ddd --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_prod(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index ff6b5d994f33..40c93fdc26fc 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -42,9 +42,9 @@ // #include "logsumexp.hpp" #include "max.hpp" #include "min.hpp" -// #include "prod.hpp" +#include "prod.hpp" // #include "reduce_hypot.hpp" -// #include "sum.hpp" +#include "sum.hpp" namespace py = pybind11; @@ -61,9 +61,9 @@ void init_reduction_functions(py::module_ m) // init_logsumexp(m); init_max(m); init_min(m); - // init_prod(m); + init_prod(m); // init_reduce_hypot(m); - // init_sum(m); + init_sum(m); } } // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp new file mode 100644 index 000000000000..32e31e9707ed --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp @@ -0,0 +1,463 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + 
using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = + std::conditional_t, + sycl::logical_or, sycl::plus>; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); + + DispatchTableBuilder + dtb5; + 
dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types]; + +void populate_sum_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::SumAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(sum_atomic_support_vector); +} + +} // namespace impl + +void init_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis0_contig_temps_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_temps_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + using impl::populate_sum_atomic_support_dispatch_vector; + populate_sum_atomic_support_dispatch_vector(); + using impl::sum_atomic_support_vector; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_temps_dispatch_table, + sum_over_axis1_contig_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + 
py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp new file mode 100644 index 000000000000..08add902a049 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_sum(py::module_ m); + +} // namespace dpctl::tensor::py_internal From 6b27b1bd1ca53d7dda9070051fb475a1610525f1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:13:30 -0800 Subject: [PATCH 138/145] Move ti.sum()/prod() to dpctl_ext.tensor and reuse them in dpnp --- dpctl_ext/tensor/__init__.py | 4 + dpctl_ext/tensor/_reduction.py | 291 ++++++++++++++++++++++++++++++++ dpnp/dpnp_iface_manipulation.py | 4 +- dpnp/dpnp_iface_mathematical.py | 17 +- 4 files changed, 305 insertions(+), 11 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index a3b6a209a47e..1ef8d8eb24e5 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -88,6 +88,8 @@ argmin, max, min, + prod, + sum, ) from ._searchsorted import searchsorted from ._set_functions import ( @@ -143,6 +145,7 @@ "ones", "ones_like", "place", + "prod", "put", "put_along_axis", "repeat", @@ -153,6 +156,7 @@ "sort", "squeeze", "stack", + "sum", "swapaxes", "take", "take_along_axis", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py index e39c853284d9..7e1113591451 100644 --- a/dpctl_ext/tensor/_reduction.py +++ b/dpctl_ext/tensor/_reduction.py @@ -37,6 +37,10 @@ import dpctl_ext.tensor._tensor_reductions_impl as tri from ._numpy_helper import normalize_axis_tuple +from ._type_utils import ( + _default_accumulation_dtype, + _to_device_supported_dtype, +) def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): @@ -137,6 +141,164 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn): return out +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, +): + if not isinstance(x, dpt.usm_ndarray): + raise 
TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + perm = list(axis) + arr = x + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [i for i in range(nd) if i not in axis] + list(axis) + arr = dpt_ext.permute_dims(x, perm) + red_nd = len(axis) + res_shape = arr.shape[: nd - red_nd] + q = x.sycl_queue + inp_dt = x.dtype + if dtype is None: + res_dt = _default_reduction_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + res_usm_type = x.usm_type + + implemented_types = _dtype_supported(inp_dt, res_dt, res_usm_type, q) + if dtype is None and not implemented_types: + raise RuntimeError( + "Automatically determined reduction data type does not " + "have direct implementation" + ) + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + if not out.flags.writable: + raise ValueError("provided `out` array is read-only") + if not keepdims: + final_res_shape = res_shape + else: + inp_shape = x.shape + final_res_shape = tuple( + inp_shape[i] if i not in axis else 1 for i in range(nd) + ) + if not out.shape == final_res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. 
" + f"Expected output shape is {final_res_shape}, got {out.shape}" + ) + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + if dpctl.utils.get_execution_queue((q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + if keepdims: + out = dpt_ext.squeeze(out, axis=axis) + orig_out = out + if ti._array_overlap(x, out) and implemented_types: + out = dpt_ext.empty_like(out) + else: + out = dpt_ext.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + + _manager = SequentialOrderManager[q] + dep_evs = _manager.submitted_events + if red_nd == 0: + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=out, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[cpy_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + out = orig_out + return out + + if implemented_types: + ht_e, red_e = _reduction_fn( + src=arr, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_e, red_e) + if not (orig_out is None or orig_out is out): + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, dst=orig_out, sycl_queue=q, depends=[red_e] + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + out = orig_out + else: + if _dtype_supported(res_dt, res_dt, res_usm_type, q): + tmp = dpt_ext.empty( + arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + ht_e_red, red_ev = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=out, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, red_ev) 
+ else: + buf_dt = _default_reduction_type_fn(inp_dt, q) + tmp = dpt_ext.empty( + arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=q, depends=dep_evs + ) + _manager.add_event_pair(ht_e_cpy, cpy_e) + tmp_res = dpt_ext.empty( + res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + _manager.add_event_pair(ht_e_red, r_e) + ht_e_cpy2, cpy2_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=out, sycl_queue=q, depends=[r_e] + ) + _manager.add_event_pair(ht_e_cpy2, cpy2_e) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm) + return out + + def _search_over_axis(x, axis, keepdims, out, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -374,3 +536,132 @@ def min(x, /, *, axis=None, keepdims=False, out=None): array has the same data type as ``x``. """ return _comparison_over_axis(x, axis, keepdims, out, tri._min_over_axis) + + +def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the product of elements in the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If ``None``, the product is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. 
+ + * If ``x`` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + ``x``. + * If ``x`` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array ``x`` is allocated. + * If ``x`` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array ``x`` is allocated. + * If ``x`` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array ``x`` is allocated. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the product. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The + returned array has the data type as described in the ``dtype`` + parameter description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, + _default_accumulation_dtype, + ) + + +def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the sum of elements in the input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which sums must be computed. If a tuple + of unique integers, sums are computed over multiple axes. + If ``None``, the sum is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real- or complex-valued floating-point data + type, the returned array will have the same data type as + ``x``. + * If ``x`` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array ``x`` is allocated. + * If ``x`` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array ``x`` is allocated. + array ``x`` is allocated. + * If ``x`` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array ``x`` is allocated. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the sum. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. 
+ out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the sums. If the sum was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the ``dtype`` parameter + description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._sum_over_axis, + tri._sum_over_axis_dtype_supported, + _default_accumulation_dtype, + ) diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py index cb0d0d35b0d7..7c9e4957c32b 100644 --- a/dpnp/dpnp_iface_manipulation.py +++ b/dpnp/dpnp_iface_manipulation.py @@ -428,7 +428,9 @@ def _get_first_nan_index(usm_a): if first_nan is not None: # all NaNs are collapsed, so need to put a count of all NaNs # at the last index - dpt.sum(usm_res.counts[first_nan:], out=usm_res.counts[first_nan]) + dpt_ext.sum( + usm_res.counts[first_nan:], out=usm_res.counts[first_nan] + ) result += (usm_res.counts[: first_nan + 1],) else: result += (usm_res.counts,) diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py index 2c3975625c9e..7aa0a850cedb 100644 --- a/dpnp/dpnp_iface_mathematical.py +++ b/dpnp/dpnp_iface_mathematical.py @@ -47,7 +47,6 @@ import builtins import warnings -import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti import dpctl.utils as dpu import numpy @@ -58,7 +57,7 @@ # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -730,7 +729,7 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", 
**kwargs): usm_max = None if max is None else dpnp.get_usm_ndarray_or_scalar(max) usm_out = None if out is None else dpnp.get_usm_ndarray(out) - usm_res = dpt_ext.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) + usm_res = dpt.clip(usm_arr, usm_min, usm_max, out=usm_out, order=order) if out is not None and isinstance(out, dpnp_array): return out return dpnp_array._create_from_usm_ndarray(usm_res) @@ -1126,7 +1125,7 @@ def cumprod(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt_ext.cumulative_prod, + dpt.cumulative_prod, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1218,7 +1217,7 @@ def cumsum(a, axis=None, dtype=None, out=None): return dpnp_wrap_reduction_call( usm_a, out, - dpt_ext.cumulative_sum, + dpt.cumulative_sum, _get_reduction_res_dt(a, dtype), axis=axis, dtype=dtype, @@ -1307,7 +1306,7 @@ def cumulative_prod( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt_ext.cumulative_prod, + dpt.cumulative_prod, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, @@ -1403,7 +1402,7 @@ def cumulative_sum( return dpnp_wrap_reduction_call( dpnp.get_usm_ndarray(x), out, - dpt_ext.cumulative_sum, + dpt.cumulative_sum, _get_reduction_res_dt(x, dtype), axis=axis, dtype=dtype, @@ -1480,9 +1479,7 @@ def diff(a, n=1, axis=-1, prepend=None, append=None): ) usm_app = None if append is None else dpnp.get_usm_ndarray_or_scalar(append) - usm_res = dpt_ext.diff( - usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app - ) + usm_res = dpt.diff(usm_a, axis=axis, n=n, prepend=usm_pre, append=usm_app) return dpnp_array._create_from_usm_ndarray(usm_res) From 3041d7d6bb9930b1e100a88e960e444d741d8d28 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:29:21 -0800 Subject: [PATCH 139/145] Move _logsumexp/hypot_over_axis to _tensor_reductions_impl --- dpctl_ext/tensor/CMakeLists.txt | 4 +- .../libtensor/source/reductions/logsumexp.cpp | 259 ++++++++++++++++++ 
.../libtensor/source/reductions/logsumexp.hpp | 46 ++++ .../source/reductions/reduce_hypot.cpp | 255 +++++++++++++++++ .../source/reductions/reduce_hypot.hpp | 46 ++++ .../source/reductions/reduction_common.cpp | 8 +- 6 files changed, 612 insertions(+), 6 deletions(-) create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp create mode 100644 dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt index 350a0b2518ba..cf55035c23d9 100644 --- a/dpctl_ext/tensor/CMakeLists.txt +++ b/dpctl_ext/tensor/CMakeLists.txt @@ -75,11 +75,11 @@ set(_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/any.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp - #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp ) set(_sorting_sources diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp new file mode 100644 index 000000000000..8be813203598 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp @@ -0,0 +1,259 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel 
Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch_building.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForLogSumExpReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< +#if 1 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, +#endif + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct LogSumExpOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_logsumexp_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table( + logsumexp_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table( + logsumexp_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + logsumexp_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; 
+ using event_vecT = std::vector; + { + using impl::populate_logsumexp_over_axis_dispatch_tables; + populate_logsumexp_over_axis_dispatch_tables(); + using impl::logsumexp_over_axis0_contig_temps_dispatch_table; + using impl::logsumexp_over_axis1_contig_temps_dispatch_table; + using impl::logsumexp_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto logsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + logsumexp_over_axis_strided_temps_dispatch_table, + logsumexp_over_axis0_contig_temps_dispatch_table, + logsumexp_over_axis1_contig_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + logsumexp_over_axis_strided_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis_dtype_supported", logsumexp_dtype_supported, + "", py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp new file mode 100644 index 000000000000..2e2c19877db6 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 
2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_logsumexp(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp new file mode 100644 index 000000000000..d7903d12b0bc --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -0,0 +1,255 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#include +#include +#include + +#include + +#include "dpnp4pybind11.hpp" +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch_building.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace dpctl::tensor::py_internal +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +template +struct TypePairSupportDataForHypotReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< + td_ns::TypePairDefinedEntry, + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct HypotOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + 
reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +void populate_hypot_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table); + + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table); + + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_reduce_hypot(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_hypot_over_axis_dispatch_tables; + populate_hypot_over_axis_dispatch_tables(); + using impl::hypot_over_axis0_contig_temps_dispatch_table; + using impl::hypot_over_axis1_contig_temps_dispatch_table; + using impl::hypot_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + hypot_over_axis_strided_temps_dispatch_table, + hypot_over_axis0_contig_temps_dispatch_table, + hypot_over_axis1_contig_temps_dispatch_table); + }; + m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto hypot_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; 
+ return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + hypot_over_axis_strided_temps_dispatch_table); + }; + m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp new file mode 100644 index 000000000000..c0a16345af75 --- /dev/null +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.hpp @@ -0,0 +1,46 @@ +//***************************************************************************** +// Copyright (c) 2026, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// - Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_reductions_impl +/// extension. +//===---------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl::tensor::py_internal +{ + +extern void init_reduce_hypot(py::module_ m); + +} // namespace dpctl::tensor::py_internal diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp index 40c93fdc26fc..fca5e09e2fe5 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_common.cpp @@ -39,11 +39,11 @@ #include "any.hpp" #include "argmax.hpp" #include "argmin.hpp" -// #include "logsumexp.hpp" +#include "logsumexp.hpp" #include "max.hpp" #include "min.hpp" #include "prod.hpp" -// #include "reduce_hypot.hpp" +#include "reduce_hypot.hpp" #include "sum.hpp" namespace py = pybind11; @@ -58,11 +58,11 @@ void init_reduction_functions(py::module_ m) init_any(m); init_argmax(m); init_argmin(m); - // init_logsumexp(m); + init_logsumexp(m); init_max(m); init_min(m); init_prod(m); - // init_reduce_hypot(m); + init_reduce_hypot(m); init_sum(m); } From 
4872bb6bbd2c839af5825064805de40758144d32 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 3 Mar 2026 05:31:03 -0800 Subject: [PATCH 140/145] Move ti.count_nonzero()/logsumexp()/reduce_hypot() to dpctl_ext.tensor and reuse them --- dpctl_ext/tensor/__init__.py | 6 ++ dpctl_ext/tensor/_reduction.py | 167 +++++++++++++++++++++++++++++++ dpnp/dpnp_iface_counting.py | 5 +- dpnp/dpnp_iface_trigonometric.py | 6 +- 4 files changed, 178 insertions(+), 6 deletions(-) diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py index 1ef8d8eb24e5..c40d9dacdadd 100644 --- a/dpctl_ext/tensor/__init__.py +++ b/dpctl_ext/tensor/__init__.py @@ -86,9 +86,12 @@ from ._reduction import ( argmax, argmin, + count_nonzero, + logsumexp, max, min, prod, + reduce_hypot, sum, ) from ._searchsorted import searchsorted @@ -117,6 +120,7 @@ "can_cast", "concat", "copy", + "count_nonzero", "clip", "cumulative_logsumexp", "cumulative_prod", @@ -136,6 +140,7 @@ "isdtype", "isin", "linspace", + "logsumexp", "max", "meshgrid", "min", @@ -148,6 +153,7 @@ "prod", "put", "put_along_axis", + "reduce_hypot", "repeat", "reshape", "result_type", diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py index 7e1113591451..b8fdcf4f37e6 100644 --- a/dpctl_ext/tensor/_reduction.py +++ b/dpctl_ext/tensor/_reduction.py @@ -39,6 +39,7 @@ from ._numpy_helper import normalize_axis_tuple from ._type_utils import ( _default_accumulation_dtype, + _default_accumulation_dtype_fp_types, _to_device_supported_dtype, ) @@ -472,6 +473,111 @@ def argmin(x, /, *, axis=None, keepdims=False, out=None): return _search_over_axis(x, axis, keepdims, out, tri._argmin_over_axis) +def count_nonzero(x, /, *, axis=None, keepdims=False, out=None): + """ + Counts the number of elements in the input array ``x`` which are non-zero. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which to count. 
If a tuple of unique integers, + the number of non-zero values are computed over multiple axes. + If ``None``, the number of non-zero values is computed over the + entire array. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and data + type. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the count of non-zero values. If the sum was + computed over the entire array, a zero-dimensional array is + returned. The returned array will have the default array index data + type. + """ + if x.dtype != dpt.bool: + x = dpt.astype(x, dpt.bool, copy=False) + return sum( + x, + axis=axis, + dtype=ti.default_device_index_type(x.sycl_device), + keepdims=keepdims, + out=out, + ) + + +def logsumexp(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the logarithm of the sum of exponentials of elements in the + input array ``x``. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If ``None``, the result is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real-valued floating-point data type, the + returned array will have the same data type as ``x``. 
+ * If ``x`` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array ``x`` is allocated. + * If ``x`` has a complex-valued floating-point data type, + an error is raised. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the result. + Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. + The returned array has the data type as described in the + ``dtype`` parameter description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_accumulation_dtype_fp_types, + ) + + def max(x, /, *, axis=None, keepdims=False, out=None): """ Calculates the maximum value of the input array ``x``. @@ -602,6 +708,67 @@ def prod(x, /, *, axis=None, dtype=None, keepdims=False, out=None): ) +def reduce_hypot(x, /, *, axis=None, dtype=None, keepdims=False, out=None): + """ + Calculates the square root of the sum of squares of elements in the input + array ``x``. + + Args: + x (usm_ndarray): + input array. 
+ axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If ``None``, the result is computed over the entire array. + Default: ``None``. + dtype (Optional[dtype]): + data type of the returned array. If ``None``, the default data + type is inferred from the "kind" of the input array data type. + + * If ``x`` has a real-valued floating-point data type, the + returned array will have the same data type as ``x``. + * If ``x`` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array ``x`` is allocated. + * If ``x`` has a complex-valued floating-point data type, + an error is raised. + + If the data type (either specified or resolved) differs from the + data type of ``x``, the input array elements are cast to the + specified data type before computing the result. Default: ``None``. + keepdims (Optional[bool]): + if ``True``, the reduced axes (dimensions) are included in the + result as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if ``False``, the reduced axes are not included + in the returned array. Default: ``False``. + out (Optional[usm_ndarray]): + the array into which the result is written. + The data type of ``out`` must match the expected shape and the + expected data type of the result or (if provided) ``dtype``. + If ``None`` then a new array is returned. Default: ``None``. + + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The + returned array has the data type as described in the ``dtype`` + parameter description above. 
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + out, + tri._hypot_over_axis, + lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_accumulation_dtype_fp_types, + ) + + def sum(x, /, *, axis=None, dtype=None, keepdims=False, out=None): """ Calculates the sum of elements in the input array ``x``. diff --git a/dpnp/dpnp_iface_counting.py b/dpnp/dpnp_iface_counting.py index a4b85aa85294..a8ebafbcead7 100644 --- a/dpnp/dpnp_iface_counting.py +++ b/dpnp/dpnp_iface_counting.py @@ -39,8 +39,9 @@ """ -import dpctl.tensor as dpt - +# TODO: revert to `import dpctl.tensor...` +# when dpnp fully migrates dpctl/tensor +import dpctl_ext.tensor as dpt import dpnp diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py index 460a0dc80f0f..a17c7dfd9d9a 100644 --- a/dpnp/dpnp_iface_trigonometric.py +++ b/dpnp/dpnp_iface_trigonometric.py @@ -42,13 +42,11 @@ # pylint: disable=protected-access # pylint: disable=no-name-in-module - -import dpctl.tensor as dpt import dpctl.tensor._tensor_elementwise_impl as ti # TODO: revert to `import dpctl.tensor...` # when dpnp fully migrates dpctl/tensor -import dpctl_ext.tensor as dpt_ext +import dpctl_ext.tensor as dpt import dpctl_ext.tensor._type_utils as dtu import dpnp import dpnp.backend.extensions.ufunc._ufunc_impl as ufi @@ -935,7 +933,7 @@ def cumlogsumexp( return dpnp_wrap_reduction_call( usm_x, out, - dpt_ext.cumulative_logsumexp, + dpt.cumulative_logsumexp, _get_accumulation_res_dt(x, dtype), axis=axis, dtype=dtype, From 85ef3e0ef0c3248fe39d55c5c428e9a884297afa Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 5 Mar 2026 10:45:57 -0800 Subject: [PATCH 141/145] Apply remarks --- dpctl_ext/tensor/_accumulation.py | 6 +++--- .../libtensor/source/accumulators/accumulate_over_axis.hpp | 5 ++++- .../libtensor/source/accumulators/cumulative_logsumexp.cpp | 3 --- .../libtensor/source/accumulators/cumulative_prod.cpp | 3 --- 
.../tensor/libtensor/source/accumulators/cumulative_sum.cpp | 3 --- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py index 17596e5647fc..2dfe9656e198 100644 --- a/dpctl_ext/tensor/_accumulation.py +++ b/dpctl_ext/tensor/_accumulation.py @@ -35,14 +35,14 @@ import dpctl_ext.tensor as dpt_ext import dpctl_ext.tensor._tensor_accumulation_impl as tai import dpctl_ext.tensor._tensor_impl as ti -from dpctl_ext.tensor._type_utils import ( + +from ._numpy_helper import normalize_axis_index +from ._type_utils import ( _default_accumulation_dtype, _default_accumulation_dtype_fp_types, _to_device_supported_dtype, ) -from ._numpy_helper import normalize_axis_index - def _accumulate_common( x, diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp index 712ab180ef37..4dd00620a260 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/accumulate_over_axis.hpp @@ -35,9 +35,12 @@ #pragma once +#include #include -#include +#include +#include #include +#include #include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp index 572c93c0066e..f1ad170caa58 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -311,7 +311,6 @@ void init_cumulative_logsumexp(py::module_ m) int trailing_dims_to_accumulate, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_accumulate_over_axis; return py_accumulate_over_axis(src, trailing_dims_to_accumulate, dst, exec_q, depends, cumlogsumexp_strided_dispatch_table, @@ -326,8 +325,6 @@ void 
init_cumulative_logsumexp(py::module_ m) auto cumlogsumexp_include_initial_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal:: - py_accumulate_final_axis_include_initial; return py_accumulate_final_axis_include_initial( src, dst, exec_q, depends, cumlogsumexp_include_initial_strided_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp index 07c802c8cffa..9a9961441d35 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -322,7 +322,6 @@ void init_cumulative_prod(py::module_ m) auto cumprod_pyapi = [&](const arrayT &src, int trailing_dims_to_accumulate, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_accumulate_over_axis; return py_accumulate_over_axis( src, trailing_dims_to_accumulate, dst, exec_q, depends, cumprod_strided_dispatch_table, cumprod_1d_contig_dispatch_table); @@ -336,8 +335,6 @@ void init_cumulative_prod(py::module_ m) auto cumprod_include_initial_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal:: - py_accumulate_final_axis_include_initial; return py_accumulate_final_axis_include_initial( src, dst, exec_q, depends, cumprod_include_initial_strided_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 3c422bfd0998..3a0ed6cf3ab5 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -320,7 +320,6 @@ void init_cumulative_sum(py::module_ m) auto cumsum_pyapi = [&](const arrayT &src, int 
trailing_dims_to_accumulate, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_accumulate_over_axis; return py_accumulate_over_axis( src, trailing_dims_to_accumulate, dst, exec_q, depends, cumsum_strided_dispatch_table, cumsum_1d_contig_dispatch_table); @@ -334,8 +333,6 @@ void init_cumulative_sum(py::module_ m) auto cumsum_include_initial_pyapi = [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal:: - py_accumulate_final_axis_include_initial; return py_accumulate_final_axis_include_initial( src, dst, exec_q, depends, cumsum_include_initial_strided_dispatch_table, From e335b3d5352b5816de78930107619ad45c2f33fa Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 02:21:08 -0700 Subject: [PATCH 142/145] Apply remarks --- .../libtensor/source/accumulators/cumulative_logsumexp.cpp | 1 - .../tensor/libtensor/source/accumulators/cumulative_prod.cpp | 1 - .../tensor/libtensor/source/accumulators/cumulative_sum.cpp | 1 - 3 files changed, 3 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp index f1ad170caa58..e24cf56ddd62 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_logsumexp.cpp @@ -337,7 +337,6 @@ void init_cumulative_logsumexp(py::module_ m) auto cumlogsumexp_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_accumulate_dtype_supported; return py_accumulate_dtype_supported( input_dtype, output_dtype, cumlogsumexp_strided_dispatch_table); }; diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp index 
9a9961441d35..65f3c311eda1 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_prod.cpp @@ -346,7 +346,6 @@ void init_cumulative_prod(py::module_ m) auto cumprod_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_accumulate_dtype_supported; return py_accumulate_dtype_supported(input_dtype, output_dtype, cumprod_strided_dispatch_table); }; diff --git a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp index 3a0ed6cf3ab5..60b46946acc9 100644 --- a/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/accumulators/cumulative_sum.cpp @@ -344,7 +344,6 @@ void init_cumulative_sum(py::module_ m) auto cumsum_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_accumulate_dtype_supported; return py_accumulate_dtype_supported(input_dtype, output_dtype, cumsum_strided_dispatch_table); }; From d7d7a2b690ddefa43d66174fd9f01e4710603804 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 03:02:12 -0700 Subject: [PATCH 143/145] Apply remarks --- .../kernels/sorting/search_sorted_detail.hpp | 9 ++------- .../include/kernels/sorting/searchsorted.hpp | 1 - .../include/kernels/sorting/sort_utils.hpp | 7 ++----- .../libtensor/include/kernels/sorting/topk.hpp | 2 ++ .../tensor/libtensor/source/sorting/isin.cpp | 7 ++++--- .../libtensor/source/sorting/merge_argsort.cpp | 16 +++++++--------- .../libtensor/source/sorting/merge_sort.cpp | 12 +++++------- .../source/sorting/py_argsort_common.hpp | 1 - .../libtensor/source/sorting/py_sort_common.hpp | 1 - .../libtensor/source/sorting/radix_argsort.cpp | 15 ++++++--------- .../libtensor/source/sorting/radix_sort.cpp | 16 ++++++---------- 
.../libtensor/source/sorting/searchsorted.cpp | 10 +++++----- .../tensor/libtensor/source/sorting/topk.cpp | 2 +- .../tensor/libtensor/source/tensor_sorting.cpp | 2 -- 14 files changed, 40 insertions(+), 61 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp index d184261aeabf..1f3576402511 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/search_sorted_detail.hpp @@ -36,10 +36,7 @@ #include -namespace dpctl::tensor::kernels -{ - -namespace search_sorted_detail +namespace dpctl::tensor::kernels::search_sorted_detail { template @@ -119,6 +116,4 @@ std::size_t upper_bound_indexed_impl(const Acc acc, acc_indexer); } -} // namespace search_sorted_detail - -} // namespace dpctl::tensor::kernels +} // namespace dpctl::tensor::kernels::search_sorted_detail diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp index b89ad922e102..bc400c9e569a 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/searchsorted.hpp @@ -35,7 +35,6 @@ #pragma once #include -#include #include #include diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index 4bbcacd56fcc..fd32905b808e 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -78,10 +77,8 @@ sycl::event iota_impl(sycl::queue &exec_q, } if (offset + n_wi * max_sgSize < nelems) { - static constexpr auto group_ls_props = syclexp::properties{ - syclexp::data_placement_striped - 
// , syclexp::full_group - }; + static constexpr auto group_ls_props = + syclexp::properties{syclexp::data_placement_striped}; auto out_multi_ptr = sycl::address_space_cast< sycl::access::address_space::global_space, diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp index 90e06a1e9eb8..e66d82247434 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -34,8 +34,10 @@ #pragma once +#include #include #include +#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp index 728a7ca3b733..f1ae5863bbb9 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/isin.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include @@ -46,6 +47,7 @@ #include "kernels/sorting/isin.hpp" #include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/sycl_alloc_utils.hpp" #include "utils/type_dispatch.hpp" @@ -254,7 +256,7 @@ std::pair simplified_dst_strides.push_back(0); } else { - dpctl::tensor::py_internal::simplify_iteration_space( + simplify_iteration_space( // modified by reference simplified_nd, // read-only inputs @@ -313,9 +315,8 @@ std::pair void init_isin_functions(py::module_ m) { - dpctl::tensor::py_internal::detail::init_isin_dispatch_vector(); + detail::init_isin_dispatch_vector(); - using dpctl::tensor::py_internal::py_isin; m.def("_isin", &py_isin, py::arg("needles"), py::arg("hay"), py::arg("dst"), py::arg("sycl_queue"), py::arg("invert"), py::arg("depends") = py::list()); diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp index f76a74e1f8bf..2b6dcc8bf447 100644 --- 
a/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_argsort.cpp @@ -35,6 +35,8 @@ #include #include +#include +#include #include @@ -121,7 +123,7 @@ void init_merge_argsort_dispatch_tables(void) void init_merge_argsort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_merge_argsort_dispatch_tables(); + init_merge_argsort_dispatch_tables(); auto py_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_sort, @@ -129,10 +131,8 @@ void init_merge_argsort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - ascending_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_argsort_contig_dispatch_table); }; m.def("_argsort_ascending", py_argsort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -144,10 +144,8 @@ void init_merge_argsort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - descending_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_argsort_contig_dispatch_table); }; m.def("_argsort_descending", py_argsort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp index 7bb0e37c6aed..23d95fb27691 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp @@ -102,7 +102,7 @@ void init_merge_sort_dispatch_vectors(void) void 
init_merge_sort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_merge_sort_dispatch_vectors(); + init_merge_sort_dispatch_vectors(); auto py_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_sort, @@ -110,9 +110,8 @@ void init_merge_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal::ascending_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_sort_contig_dispatch_vector); }; m.def("_sort_ascending", py_sort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -124,9 +123,8 @@ void init_merge_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal::descending_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_sort_contig_dispatch_vector); }; m.def("_sort_descending", py_sort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp index d204b492b2ff..6328b3339376 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/py_argsort_common.hpp @@ -44,7 +44,6 @@ #include "dpnp4pybind11.hpp" #include -#include #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp index 0fdf7bec80fd..ee8777f35077 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp 
+++ b/dpctl_ext/tensor/libtensor/source/sorting/py_sort_common.hpp @@ -44,7 +44,6 @@ #include "dpnp4pybind11.hpp" #include -#include #include "utils/memory_overlap.hpp" #include "utils/output_validation.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp index ba9bbfd61b7c..e54b8f739a4b 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_argsort.cpp @@ -62,6 +62,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; +using dpctl::tensor::ssize_t; using dpctl::tensor::kernels::sort_contig_fn_ptr_t; static sort_contig_fn_ptr_t @@ -152,7 +153,7 @@ void init_radix_argsort_dispatch_tables(void) void init_radix_argsort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_radix_argsort_dispatch_tables(); + init_radix_argsort_dispatch_tables(); auto py_radix_argsort_ascending = [](const dpctl::tensor::usm_ndarray &src, @@ -160,10 +161,8 @@ void init_radix_argsort_functions(py::module_ m) const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - ascending_radix_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_radix_argsort_contig_dispatch_table); }; m.def("_radix_argsort_ascending", py_radix_argsort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -175,10 +174,8 @@ void init_radix_argsort_functions(py::module_ m) const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_argsort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - 
descending_radix_argsort_contig_dispatch_table); + return py_argsort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_radix_argsort_contig_dispatch_table); }; m.def("_radix_argsort_descending", py_radix_argsort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp index fb3cab487df8..35c71a0eb7d3 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/radix_sort.cpp @@ -34,7 +34,6 @@ //===----------------------------------------------------------------------===// #include -#include #include #include #include @@ -62,6 +61,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace impl_ns = dpctl::tensor::kernels::radix_sort_details; +using dpctl::tensor::ssize_t; using dpctl::tensor::kernels::sort_contig_fn_ptr_t; static sort_contig_fn_ptr_t ascending_radix_sort_contig_dispatch_vector[td_ns::num_types]; @@ -152,7 +152,7 @@ bool py_radix_sort_defined(int typenum) void init_radix_sort_functions(py::module_ m) { - dpctl::tensor::py_internal::init_radix_sort_dispatch_vectors(); + init_radix_sort_dispatch_vectors(); auto py_radix_sort_ascending = [](const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_sort, @@ -160,10 +160,8 @@ void init_radix_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - ascending_radix_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + ascending_radix_sort_contig_dispatch_vector); }; m.def("_radix_sort_ascending", py_radix_sort_ascending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), @@ -175,10 +173,8 @@ void 
init_radix_sort_functions(py::module_ m) sycl::queue &exec_q, const std::vector &depends) -> std::pair { - return dpctl::tensor::py_internal::py_sort( - src, trailing_dims_to_sort, dst, exec_q, depends, - dpctl::tensor::py_internal:: - descending_radix_sort_contig_dispatch_vector); + return py_sort(src, trailing_dims_to_sort, dst, exec_q, depends, + descending_radix_sort_contig_dispatch_vector); }; m.def("_radix_sort_descending", py_radix_sort_descending, py::arg("src"), py::arg("trailing_dims_to_sort"), py::arg("dst"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp index 5cebb74be2d0..8b1ce04a97d6 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/searchsorted.cpp @@ -35,6 +35,8 @@ #include #include +#include +#include #include #include #include @@ -47,6 +49,7 @@ #include "kernels/sorting/searchsorted.hpp" #include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" #include "utils/output_validation.hpp" #include "utils/rich_comparisons.hpp" #include "utils/sycl_alloc_utils.hpp" @@ -369,7 +372,7 @@ std::pair simplified_positions_strides.push_back(0); } else { - dpctl::tensor::py_internal::simplify_iteration_space( + simplify_iteration_space( // modified by reference simplified_nd, // read-only inputs @@ -462,10 +465,7 @@ std::pair void init_searchsorted_functions(py::module_ m) { - dpctl::tensor::py_internal::detail::init_searchsorted_dispatch_table(); - - using dpctl::tensor::py_internal::py_searchsorted_left; - using dpctl::tensor::py_internal::py_searchsorted_right; + detail::init_searchsorted_dispatch_table(); m.def("_searchsorted_left", &py_searchsorted_left, py::arg("hay"), py::arg("needles"), py::arg("positions"), py::arg("sycl_queue"), diff --git a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp index 23a8d2a4328f..6b8344df12c8 100644 --- 
a/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/topk.cpp @@ -293,7 +293,7 @@ void init_topk_dispatch_vectors(void) void init_topk_functions(py::module_ m) { - dpctl::tensor::py_internal::init_topk_dispatch_vectors(); + init_topk_dispatch_vectors(); m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), diff --git a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp index 98a2493a7c48..318c3559d77c 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_sorting.cpp @@ -43,8 +43,6 @@ #include "sorting/searchsorted.hpp" #include "sorting/topk.hpp" -namespace py = pybind11; - PYBIND11_MODULE(_tensor_sorting_impl, m) { dpctl::tensor::py_internal::init_isin_functions(m); From 3c26859982f580b194ea26720913220ae7f70ce7 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 03:14:22 -0700 Subject: [PATCH 144/145] Apply remaining remarks --- dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp | 1 - dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp index e66d82247434..a1e57dc9ef30 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -41,7 +41,6 @@ #include #include -#include #include #include "kernels/sorting/merge_sort.hpp" diff --git a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp index 23d95fb27691..fbd60621b3bb 100644 --- a/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp +++ b/dpctl_ext/tensor/libtensor/source/sorting/merge_sort.cpp 
@@ -33,6 +33,9 @@ /// extension. //===----------------------------------------------------------------------===// +#include +#include + #include #include "dpnp4pybind11.hpp" From e0bba1d4238b96d35e7625c45779519192eefa90 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 9 Mar 2026 03:41:02 -0700 Subject: [PATCH 145/145] Apply remarks --- .../tensor/libtensor/include/kernels/reductions.hpp | 1 - .../tensor/libtensor/source/reductions/argmax.cpp | 1 - .../tensor/libtensor/source/reductions/argmin.cpp | 1 - .../libtensor/source/reductions/logsumexp.cpp | 2 -- .../tensor/libtensor/source/reductions/max.cpp | 1 - .../tensor/libtensor/source/reductions/min.cpp | 1 - .../tensor/libtensor/source/reductions/prod.cpp | 2 -- .../libtensor/source/reductions/reduce_hypot.cpp | 4 +--- .../source/reductions/reduction_atomic_support.hpp | 1 - .../source/reductions/reduction_over_axis.hpp | 13 +------------ .../tensor/libtensor/source/reductions/sum.cpp | 2 -- .../tensor/libtensor/source/tensor_reductions.cpp | 2 -- 12 files changed, 2 insertions(+), 29 deletions(-) diff --git a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp index d77d7df92609..ee6431dec637 100644 --- a/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl_ext/tensor/libtensor/include/kernels/reductions.hpp @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp index bf6592798b4f..10fc49759168 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmax.cpp @@ -264,7 +264,6 @@ void init_argmax(py::module_ m) auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using 
dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, argmax_over_axis_strided_temps_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp index acd685c59802..ec4637b62d49 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/argmin.cpp @@ -264,7 +264,6 @@ void init_argmin(py::module_ m) auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, argmin_over_axis_strided_temps_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp index 8be813203598..08ed4f12dbda 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/logsumexp.cpp @@ -233,7 +233,6 @@ void init_logsumexp(py::module_ m) int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_tree_reduction_over_axis; return py_tree_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, logsumexp_over_axis_strided_temps_dispatch_table, @@ -246,7 +245,6 @@ void init_logsumexp(py::module_ m) auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; return py_tree_reduction_dtype_supported( input_dtype, output_dtype, logsumexp_over_axis_strided_temps_dispatch_table); diff --git a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp index 
dfcdc6a7b2a5..d19ed226d3b4 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/max.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/max.cpp @@ -391,7 +391,6 @@ void init_max(py::module_ m) auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, max_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp index b57c35736adc..97d3432b13ed 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/min.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/min.cpp @@ -393,7 +393,6 @@ void init_min(py::module_ m) auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, min_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp index dc9112a7b39a..e16e0cf25e1d 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/prod.cpp @@ -432,7 +432,6 @@ void init_prod(py::module_ m) auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, prod_over_axis_strided_atomic_dispatch_table, @@ -450,7 +449,6 @@ void init_prod(py::module_ m) auto prod_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, 
const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, prod_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp index d7903d12b0bc..3c343b702238 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -44,9 +44,9 @@ #include #include "kernels/reductions.hpp" +#include "utils/sycl_utils.hpp" #include "utils/type_dispatch_building.hpp" -#include "reduction_atomic_support.hpp" #include "reduction_over_axis.hpp" namespace dpctl::tensor::py_internal @@ -229,7 +229,6 @@ void init_reduce_hypot(py::module_ m) auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_tree_reduction_over_axis; return py_tree_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, hypot_over_axis_strided_temps_dispatch_table, @@ -242,7 +241,6 @@ void init_reduce_hypot(py::module_ m) auto hypot_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype) { - using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; return py_tree_reduction_dtype_supported( input_dtype, output_dtype, hypot_over_axis_strided_temps_dispatch_table); diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp index b7aaf9313aa2..5f9cc32f1203 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -34,7 +34,6 @@ 
//===---------------------------------------------------------------------===// #pragma once -#include #include #include diff --git a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp index e16a80f129fc..130e61eb8e7d 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -37,8 +37,8 @@ #include #include -#include #include +#include #include #include #include @@ -49,7 +49,6 @@ #include "dpnp4pybind11.hpp" #include #include -#include #include "kernels/reductions.hpp" #include "simplify_iteration_space.hpp" @@ -325,9 +324,6 @@ std::pair py_reduction_over_axis( } } - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - // TODO: not used anywhere auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); @@ -637,9 +633,6 @@ std::pair py_tree_reduction_over_axis( } } - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); auto const &dst_strides_vecs = dst.get_strides_vector(); @@ -912,8 +905,6 @@ std::pair py_search_over_axis( } } - using dpctl::tensor::py_internal::simplify_iteration_space; - auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); auto const &dst_strides_vecs = dst.get_strides_vector(); @@ -1205,7 +1196,6 @@ std::pair shT simplified_red_src_strides; py::ssize_t red_src_offset(0); - using dpctl::tensor::py_internal::simplify_iteration_space_1; simplify_iteration_space_1( simplified_red_nd, src_shape_ptr + dst_nd, red_src_strides, // output @@ -1231,7 +1221,6 @@ std::pair 
simplified_iter_dst_strides.push_back(0); } else { - using dpctl::tensor::py_internal::simplify_iteration_space; simplify_iteration_space( iter_nd, src_shape_ptr, iter_src_strides, iter_dst_strides, // output diff --git a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp index 32e31e9707ed..294adfc93a26 100644 --- a/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp +++ b/dpctl_ext/tensor/libtensor/source/reductions/sum.cpp @@ -429,7 +429,6 @@ void init_sum(py::module_ m) auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, sum_over_axis_strided_atomic_dispatch_table, @@ -447,7 +446,6 @@ void init_sum(py::module_ m) auto sum_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, sum_over_axis_strided_atomic_dispatch_table, diff --git a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp index f0901bbf7ab3..6e6a24f7b934 100644 --- a/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp +++ b/dpctl_ext/tensor/libtensor/source/tensor_reductions.cpp @@ -37,8 +37,6 @@ #include "reductions/reduction_common.hpp" -namespace py = pybind11; - PYBIND11_MODULE(_tensor_reductions_impl, m) { dpctl::tensor::py_internal::init_reduction_functions(m);