diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 7e1170f4ebf..6f286a8d719 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -91,7 +91,7 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp
@@ -114,22 +114,22 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/nextafter.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/reciprocal.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp
@@ -138,7 +138,7 @@ set(_elementwise_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp
-    #${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 279e3a95fd0..71ef714c642 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -73,6 +73,7 @@
     cbrt,
     ceil,
     conj,
+    copysign,
     cos,
     cosh,
     divide,
@@ -96,12 +97,22 @@
     log2,
     log10,
     logaddexp,
+    logical_and,
     logical_not,
+    logical_or,
+    logical_xor,
+    maximum,
+    minimum,
+    multiply,
     negative,
+    nextafter,
+    not_equal,
     positive,
+    pow,
     proj,
     real,
     reciprocal,
+    remainder,
     round,
     rsqrt,
     sign,
@@ -110,6 +121,7 @@
     sinh,
     sqrt,
     square,
+    subtract,
     tan,
     tanh,
     trunc,
@@ -204,6 +216,7 @@
     "concat",
     "conj",
     "copy",
+    "copysign",
     "cos",
     "cosh",
     "count_nonzero",
@@ -244,24 +257,33 @@
     "linspace",
     "log",
     "logaddexp",
+    "logical_and",
     "logical_not",
+    "logical_or",
+    "logical_xor",
     "logsumexp",
     "log1p",
     "log2",
     "log10",
     "max",
+    "maximum",
     "meshgrid",
     "min",
+    "minimum",
     "moveaxis",
+    "multiply",
     "permute_dims",
     "matmul",
     "matrix_transpose",
     "negative",
+    "nextafter",
     "nonzero",
+    "not_equal",
     "ones",
     "ones_like",
     "place",
     "positive",
+    "pow",
     "prod",
     "proj",
     "put",
@@ -269,6 +291,7 @@
     "real",
     "reciprocal",
     "reduce_hypot",
+    "remainder",
     "repeat",
     "reshape",
     "result_type",
@@ -285,6 +308,7 @@
     "square",
     "squeeze",
     "stack",
+    "subtract",
     "sum",
     "swapaxes",
     "take",
diff --git a/dpctl_ext/tensor/_elementwise_funcs.py b/dpctl_ext/tensor/_elementwise_funcs.py
index 17bdf94d9be..6442ef0b459 100644
--- a/dpctl_ext/tensor/_elementwise_funcs.py
+++ b/dpctl_ext/tensor/_elementwise_funcs.py
@@ -35,6 +35,7 @@
     _acceptance_fn_divide,
     _acceptance_fn_negative,
     _acceptance_fn_reciprocal,
+    _acceptance_fn_subtract,
     _resolve_weak_types_all_py_ints,
 )
 
@@ -1243,6 +1244,102 @@
 )
 del _logaddexp_docstring_
 
+# B16: ==== LOGICAL_AND (x1, x2)
+_logical_and_docstring_ = r"""
+logical_and(x1, x2, /, \*, out=None, order='K')
+
+Computes the logical AND for each element `x1_i` of the input array `x1` with
+the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise logical AND results.
+"""
+logical_and = BinaryElementwiseFunc(
+    "logical_and",
+    ti._logical_and_result_type,
+    ti._logical_and,
+    _logical_and_docstring_,
+)
+del _logical_and_docstring_
+
+# B17: ==== LOGICAL_OR  (x1, x2)
+_logical_or_docstring_ = r"""
+logical_or(x1, x2, /, \*, out=None, order='K')
+
+Computes the logical OR for each element `x1_i` of the input array `x1`
+with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise logical OR results.
+"""
+logical_or = BinaryElementwiseFunc(
+    "logical_or",
+    ti._logical_or_result_type,
+    ti._logical_or,
+    _logical_or_docstring_,
+)
+del _logical_or_docstring_
+
+# B18: ==== LOGICAL_XOR (x1, x2)
+_logical_xor_docstring_ = r"""
+logical_xor(x1, x2, /, \*, out=None, order='K')
+
+Computes the logical XOR for each element `x1_i` of the input array `x1`
+with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise logical XOR results.
+"""
+logical_xor = BinaryElementwiseFunc(
+    "logical_xor",
+    ti._logical_xor_result_type,
+    ti._logical_xor,
+    _logical_xor_docstring_,
+)
+del _logical_xor_docstring_
+
 # U24: ==== LOGICAL_NOT (x)
 _logical_not_docstring = r"""
 logical_not(x, /, \*, out=None, order='K')
@@ -1272,6 +1369,106 @@
 )
 del _logical_not_docstring
 
+# B26: ==== MAXIMUM    (x1, x2)
+_maximum_docstring_ = r"""
+maximum(x1, x2, /, \*, out=None, order='K')
+
+Compares two input arrays `x1` and `x2` and returns a new array containing the
+element-wise maxima.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise maxima. The data type of
+        the returned array is determined by the Type Promotion Rules.
+"""
+maximum = BinaryElementwiseFunc(
+    "maximum",
+    ti._maximum_result_type,
+    ti._maximum,
+    _maximum_docstring_,
+)
+del _maximum_docstring_
+
+# B27: ==== MINIMUM    (x1, x2)
+_minimum_docstring_ = r"""
+minimum(x1, x2, /, \*, out=None, order='K')
+
+Compares two input arrays `x1` and `x2` and returns a new array containing the
+element-wise minima.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise minima. The data type of
+        the returned array is determined by the Type Promotion Rules.
+"""
+minimum = BinaryElementwiseFunc(
+    "minimum",
+    ti._minimum_result_type,
+    ti._minimum,
+    _minimum_docstring_,
+)
+del _minimum_docstring_
+
+# B19: ==== MULTIPLY    (x1, x2)
+_multiply_docstring_ = r"""
+multiply(x1, x2, /, \*, out=None, order='K')
+
+Calculates the product for each element `x1_i` of the input array `x1` with the
+respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array. May have any data type.
+    x2 (usm_ndarray):
+        Second input array. May have any data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise products. The data type of
+        the returned array is determined by the Type Promotion Rules.
+"""
+multiply = BinaryElementwiseFunc(
+    "multiply",
+    ti._multiply_result_type,
+    ti._multiply,
+    _multiply_docstring_,
+    binary_inplace_fn=ti._multiply_inplace,
+)
+del _multiply_docstring_
+
 # U25: ==== NEGATIVE    (x)
 _negative_docstring_ = r"""
 negative(x, /, \*, out=None, order='K')
@@ -1302,6 +1499,77 @@
 )
 del _negative_docstring_
 
+# B28: ==== NEXTAFTER    (x1, x2)
+_nextafter_docstring_ = r"""
+nextafter(x1, x2, /, \*, out=None, order='K')
+
+Calculates the next floating-point value after element `x1_i` of the input
+array `x1` toward the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, expected to have a real-valued floating-point data
+        type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise next representable values of `x1`
+        in the direction of `x2`. The data type of the returned array is
+        determined by the Type Promotion Rules.
+"""
+nextafter = BinaryElementwiseFunc(
+    "nextafter",
+    ti._nextafter_result_type,
+    ti._nextafter,
+    _nextafter_docstring_,
+)
+del _nextafter_docstring_
+
+# B20: ==== NOT_EQUAL   (x1, x2)
+_not_equal_docstring_ = r"""
+not_equal(x1, x2, /, \*, out=None, order='K')
+
+Calculates inequality test results for each element `x1_i` of the
+input array `x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array.
+    x2 (usm_ndarray):
+        Second input array.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the result of element-wise inequality comparison.
+        The returned array has a data type of `bool`.
+"""
+
+not_equal = BinaryElementwiseFunc(
+    "not_equal",
+    ti._not_equal_result_type,
+    ti._not_equal,
+    _not_equal_docstring_,
+    weak_type_resolver=_resolve_weak_types_all_py_ints,
+)
+del _not_equal_docstring_
+
 # U26: ==== POSITIVE    (x)
 _positive_docstring_ = r"""
 positive(x, /, \*, out=None, order='K')
@@ -1328,6 +1596,40 @@
 )
 del _positive_docstring_
 
+# B21: ==== POW         (x1, x2)
+_pow_docstring_ = r"""
+pow(x1, x2, /, \*, out=None, order='K')
+
+Calculates `x1_i` raised to `x2_i` for each element `x1_i` of the input array
+`x1` with the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a numeric data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a numeric data type.
+    out (usm_ndarray):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the bases in `x1` raised to the exponents in `x2`
+        element-wise. The data type of the returned array is determined by the
+        Type Promotion Rules.
+"""
+pow = BinaryElementwiseFunc(
+    "pow",
+    ti._pow_result_type,
+    ti._pow,
+    _pow_docstring_,
+    binary_inplace_fn=ti._pow_inplace,
+)
+del _pow_docstring_
+
 # U27: ==== REAL        (x)
 _real_docstring = r"""
 real(x, /, \*, out=None, order='K')
@@ -1359,6 +1661,43 @@
 )
 del _real_docstring
 
+# B22: ==== REMAINDER   (x1, x2)
+_remainder_docstring_ = r"""
+remainder(x1, x2, /, \*, out=None, order='K')
+
+Calculates the remainder of division for each element `x1_i` of the input array
+`x1` with the respective element `x2_i` of the input array `x2`.
+
+This function is equivalent to the Python modulus operator.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise remainders. Each remainder has the
+        same sign as respective element `x2_i`. The data type of the returned
+        array is determined by the Type Promotion Rules.
+"""
+remainder = BinaryElementwiseFunc(
+    "remainder",
+    ti._remainder_result_type,
+    ti._remainder,
+    _remainder_docstring_,
+    binary_inplace_fn=ti._remainder_inplace,
+)
+del _remainder_docstring_
+
 # U28: ==== ROUND       (x)
 _round_docstring = r"""
 round(x, /, \*, out=None, order='K')
@@ -1534,6 +1873,41 @@
 )
 del _sqrt_docstring_
 
+# B23: ==== SUBTRACT    (x1, x2)
+_subtract_docstring_ = r"""
+subtract(x1, x2, /, \*, out=None, order='K')
+
+Calculates the difference between each element `x1_i` of the input
+array `x1` and the respective element `x2_i` of the input array `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a numeric data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a numeric data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise differences. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+subtract = BinaryElementwiseFunc(
+    "subtract",
+    ti._subtract_result_type,
+    ti._subtract,
+    _subtract_docstring_,
+    binary_inplace_fn=ti._subtract_inplace,
+    acceptance_fn=_acceptance_fn_subtract,
+)
+del _subtract_docstring_
+
 # U34: ==== TAN         (x)
 _tan_docstring = r"""
 tan(x, /, \*, out=None, order='K')
@@ -1710,6 +2084,41 @@
 )
 del _exp2_docstring_
 
+# B25: ==== COPYSIGN    (x1, x2)
+_copysign_docstring_ = r"""
+copysign(x1, x2, /, \*, out=None, order='K')
+
+Composes a floating-point value with the magnitude of `x1_i` and the sign of
+`x2_i` for each element of input arrays `x1` and `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real-valued floating-point data
+        type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real-valued floating-point
+        data type.
+    out (Union[usm_ndarray, None], optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the new output array, if parameter
+        `out` is ``None``.
+        Default: "K".
+
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+copysign = BinaryElementwiseFunc(
+    "copysign",
+    ti._copysign_result_type,
+    ti._copysign,
+    _copysign_docstring_,
+)
+del _copysign_docstring_
+
 # U39: ==== RSQRT        (x)
 _rsqrt_docstring_ = r"""
 rsqrt(x, /, \*, out=None, order='K')
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
new file mode 100644
index 00000000000..473c8f6236c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -0,0 +1,247 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <vector>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::copysign
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct CopysignFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return sycl::copysign(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = sycl::copysign(in1, in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using CopysignContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            CopysignFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    CopysignFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct CopysignOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct CopysignContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class copysign_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event copysign_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg1_p,
+                                 ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 ssize_t arg2_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using CopySignHS =
+        hyperparam_detail::CopysignContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = CopySignHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = CopySignHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignContigFunctor,
+        copysign_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignTypeMapFactory
+{
+    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    copysign_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!CopysignOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::copysign
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
new file mode 100644
index 00000000000..8149c2e2eae
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -0,0 +1,291 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of comparison of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::logical_and
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LogicalAndFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using tu_ns::convert_impl;
+
+        return (convert_impl<bool, argT1>(in1) &&
+                convert_impl<bool, argT2>(in2));
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        auto tmp = (in1 && in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogicalAndContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogicalAndFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LogicalAndStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogicalAndFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LogicalAndOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalAndContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class logical_and_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_and_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using LogicalAndHS =
+        hyperparam_detail::LogicalAndContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LogicalAndHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogicalAndHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogicalAndOutputType, LogicalAndContigFunctor,
+        logical_and_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalAndContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalAndOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_and_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalAndTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogicalAndOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class logical_and_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_and_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LogicalAndOutputType, LogicalAndStridedFunctor,
+        logical_and_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalAndStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalAndOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_and_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::logical_and
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
new file mode 100644
index 00000000000..e2e609b487e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -0,0 +1,290 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of comparison of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::logical_or
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LogicalOrFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using tu_ns::convert_impl;
+
+        return (convert_impl<bool, argT1>(in1) ||
+                convert_impl<bool, argT2>(in2));
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+
+        auto tmp = (in1 || in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogicalOrContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogicalOrFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LogicalOrStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogicalOrFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LogicalOrOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalOrContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class logical_or_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event logical_or_contig_impl(sycl::queue &exec_q,
+                                   std::size_t nelems,
+                                   const char *arg1_p,
+                                   ssize_t arg1_offset,
+                                   const char *arg2_p,
+                                   ssize_t arg2_offset,
+                                   char *res_p,
+                                   ssize_t res_offset,
+                                   const std::vector<sycl::event> &depends = {})
+{
+    using LogicalOrHS =
+        hyperparam_detail::LogicalOrContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LogicalOrHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogicalOrHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogicalOrOutputType, LogicalOrContigFunctor,
+        logical_or_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalOrContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalOrOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_or_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalOrTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogicalOrOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class logical_or_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_or_strided_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            int nd,
+                            const ssize_t *shape_and_strides,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends,
+                            const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LogicalOrOutputType, LogicalOrStridedFunctor,
+        logical_or_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalOrStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalOrOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_or_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::logical_or
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
new file mode 100644
index 00000000000..8f725e090b1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,292 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of comparison of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::logical_xor
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct LogicalXorFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        using tu_ns::convert_impl;
+
+        return (convert_impl<bool, argT1>(in1) !=
+                convert_impl<bool, argT2>(in2));
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        using tu_ns::vec_cast;
+        auto tmp1 = vec_cast<bool, argT1, vec_sz>(in1);
+        auto tmp2 = vec_cast<bool, argT2, vec_sz>(in2);
+
+        auto tmp = (tmp1 != tmp2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using LogicalXorContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    LogicalXorFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using LogicalXorStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    LogicalXorFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct LogicalXorOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct LogicalXorContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class logical_xor_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_xor_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using LogicalXorHS =
+        hyperparam_detail::LogicalXorContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = LogicalXorHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = LogicalXorHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, LogicalXorOutputType, LogicalXorContigFunctor,
+        logical_xor_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalXorContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalXorOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_xor_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalXorTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()>(x, y), always bool
+     */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename LogicalXorOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class logical_xor_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    logical_xor_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, LogicalXorOutputType, LogicalXorStridedFunctor,
+        logical_xor_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct LogicalXorStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!LogicalXorOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = logical_xor_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::logical_xor
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index f204b664004..af6f95863e6 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -25,6 +25,8 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
 ///
 /// \file
 /// This file defines kernels for elementwise evaluation of MAXIMUM(x1, x2)
@@ -52,6 +54,7 @@
 
 namespace dpctl::tensor::kernels::maximum
 {
+
 using dpctl::tensor::ssize_t;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index d18577a5cf4..5541e850485 100644
--- a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -25,6 +25,8 @@
 // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 // THE POSSIBILITY OF SUCH DAMAGE.
 //*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
 ///
 /// \file
 /// This file defines kernels for elementwise evaluation of MINIMUM(x1, x2)
@@ -51,6 +53,7 @@
 
 namespace dpctl::tensor::kernels::minimum
 {
+
 using dpctl::tensor::ssize_t;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
new file mode 100644
index 00000000000..587a05106ea
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -0,0 +1,648 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of MUL(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::multiply
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct MultiplyFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<realT1>(in1) *
+                   exprm_ns::complex<realT2>(in2);
+        }
+        else {
+            return in1 * in2;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = in1 * in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using MultiplyContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            MultiplyFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using MultiplyStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    MultiplyFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct MultiplyOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct MultiplyContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class multiply_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event multiply_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg1_p,
+                                 ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 ssize_t arg2_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using MulHS =
+        hyperparam_detail::MultiplyContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = MulHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = MulHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, MultiplyOutputType, MultiplyContigFunctor,
+        multiply_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyTypeMapFactory
+{
+    /*! @brief get typeid for output type of multiply(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename MultiplyOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class multiply_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    multiply_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, MultiplyOutputType, MultiplyStridedFunctor,
+        multiply_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+class multiply_matrix_row_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+using MultiplyContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        MultiplyFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event multiply_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] * vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
+        argT1, argT2, resT, MultiplyContigMatrixContigRowBroadcastingFunctor,
+        multiply_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
+                                              mat_offset, vec_p, vec_offset,
+                                              res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyContigMatrixContigRowBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename MultiplyOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    multiply_contig_matrix_contig_row_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event multiply_contig_row_contig_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] * vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return multiply_contig_matrix_contig_row_broadcast_impl<argT2, argT1, resT>(
+        exec_q, host_tasks, n0, n1, mat_p, mat_offset, vec_p, vec_offset, res_p,
+        res_offset, depends);
+};
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyContigRowContigMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename MultiplyOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    multiply_contig_row_contig_matrix_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct MultiplyInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        res *= in;
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        res *= in;
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using MultiplyInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        MultiplyInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using MultiplyInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        MultiplyInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class multiply_inplace_contig_kernel;
+
+/* @brief Types supported by in-place multiplication */
+template <typename argTy, typename resTy>
+struct MultiplyInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, bool, resTy, bool>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct MultiplyInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x *= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (MultiplyInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    multiply_inplace_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg_p,
+                                 ssize_t arg_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using MulHS =
+        hyperparam_detail::MultiplyContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = MulHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = MulHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, MultiplyInplaceContigFunctor,
+        multiply_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class multiply_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event multiply_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, MultiplyInplaceStridedFunctor,
+        multiply_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                         arg_p, arg_offset, res_p, res_offset,
+                                         depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = multiply_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+class multiply_inplace_row_matrix_broadcast_sg_krn;
+
+template <typename argT, typename resT>
+using MultiplyInplaceRowMatrixBroadcastingFunctor =
+    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
+        argT,
+        resT,
+        MultiplyInplaceFunctor<argT, resT>>;
+
+template <typename argT, typename resT>
+sycl::event multiply_inplace_row_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
+        argT, resT, MultiplyInplaceRowMatrixBroadcastingFunctor,
+        multiply_inplace_row_matrix_broadcast_sg_krn>(
+        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
+        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct MultiplyInplaceRowMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!MultiplyInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn = multiply_inplace_row_matrix_broadcast_impl<T1, T2>;
+                return fn;
+            }
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::multiply
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
new file mode 100644
index 00000000000..a703892a760
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/nextafter.hpp
@@ -0,0 +1,248 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of NEXTAFTER(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::nextafter
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct NextafterFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return sycl::nextafter(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto res = sycl::nextafter(in1, in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(res)::element_type>) {
+            return res;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(res)::element_type, vec_sz>(
+                res);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using NextafterContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    NextafterFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using NextafterStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    NextafterFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct NextafterOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct NextafterContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class nextafter_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event nextafter_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using NextafterHS =
+        hyperparam_detail::NextafterContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = NextafterHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = NextafterHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, NextafterOutputType, NextafterContigFunctor,
+        nextafter_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NextafterContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!NextafterOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = nextafter_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct NextafterTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::nextafter(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename NextafterOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class nextafter_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    nextafter_strided_impl(sycl::queue &exec_q,
+                           std::size_t nelems,
+                           int nd,
+                           const ssize_t *shape_and_strides,
+                           const char *arg1_p,
+                           ssize_t arg1_offset,
+                           const char *arg2_p,
+                           ssize_t arg2_offset,
+                           char *res_p,
+                           ssize_t res_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, NextafterOutputType, NextafterStridedFunctor,
+        nextafter_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                  arg1_offset, arg2_p, arg2_offset, res_p,
+                                  res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NextafterStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!NextafterOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = nextafter_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::nextafter
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
new file mode 100644
index 00000000000..224e3fbe5b7
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -0,0 +1,304 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of inequality of
+/// tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace dpctl::tensor::kernels::not_equal
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct NotEqualFunctor
+{
+    static_assert(std::is_same_v<resT, bool>);
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::conjunction<
+        std::is_same<argT1, argT2>,
+        std::negation<std::disjunction<tu_ns::is_complex<argT1>,
+                                       tu_ns::is_complex<argT2>>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> && std::is_integral_v<argT2> &&
+                      std::is_signed_v<argT1> != std::is_signed_v<argT2>)
+        {
+            if constexpr (std::is_signed_v<argT1> && !std::is_signed_v<argT2>) {
+                return (in1 < 0) ? true : (static_cast<argT2>(in1) != in2);
+            }
+            else {
+                if constexpr (!std::is_signed_v<argT1> &&
+                              std::is_signed_v<argT2>) {
+                    return (in2 < 0) ? true : (in1 != static_cast<argT1>(in2));
+                }
+            }
+        }
+        else {
+            return (in1 != in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = (in1 != in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using NotEqualContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            NotEqualFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using NotEqualStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    NotEqualFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct NotEqualOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1, bool, T2, bool, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint8_t, T2, std::uint8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, std::int8_t, T2, std::int8_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int16_t, T2, std::int16_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int32_t, T2, std::int32_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::uint64_t, T2, std::int64_t, bool>,
+        td_ns::
+            BinaryTypeMapResultEntry<T1, std::int64_t, T2, std::uint64_t, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, sycl::half, T2, sycl::half, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        bool>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        bool>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct NotEqualContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class not_equal_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event not_equal_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using NotEqHS =
+        hyperparam_detail::NotEqualContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = NotEqHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = NotEqHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, NotEqualOutputType, NotEqualContigFunctor,
+        not_equal_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NotEqualContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!NotEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = not_equal_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct NotEqualTypeMapFactory
+{
+    /*! @brief get typeid for output type of operator()!=(x, y), always bool */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename NotEqualOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class not_equal_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    not_equal_strided_impl(sycl::queue &exec_q,
+                           std::size_t nelems,
+                           int nd,
+                           const ssize_t *shape_and_strides,
+                           const char *arg1_p,
+                           ssize_t arg1_offset,
+                           const char *arg2_p,
+                           ssize_t arg2_offset,
+                           char *res_p,
+                           ssize_t res_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, NotEqualOutputType, NotEqualStridedFunctor,
+        not_equal_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                  arg1_offset, arg2_p, arg2_offset, res_p,
+                                  res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct NotEqualStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!NotEqualOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = not_equal_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::not_equal
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
new file mode 100644
index 00000000000..46489f45985
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -0,0 +1,602 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of POW(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "sycl_complex.hpp"
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::pow
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct PowFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            auto tmp1 = in1;
+            auto tmp2 = in2;
+            if constexpr (std::is_signed_v<argT2>) {
+                if (tmp2 < 0) {
+                    // invalid; return 0
+                    return resT(0);
+                }
+            }
+            resT res = 1;
+            if (tmp1 == 1 || tmp2 == 0) {
+                return res;
+            }
+            while (tmp2 > 0) {
+                if (tmp2 & 1) {
+                    res *= tmp1;
+                }
+                tmp2 >>= 1;
+                tmp1 *= tmp1;
+            }
+            return res;
+        }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::pow(exprm_ns::complex<realT1>(in1),
+                                 exprm_ns::complex<realT2>(in2));
+        }
+        else {
+            return sycl::pow(in1, in2);
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            sycl::vec<resT, vec_sz> res;
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                auto tmp1 = in1[i];
+                auto tmp2 = in2[i];
+                if constexpr (std::is_signed_v<argT2>) {
+                    if (tmp2 < 0) {
+                        // invalid; yield 0
+                        res[i] = 0;
+                        continue;
+                    }
+                }
+                resT res_tmp = 1;
+                if (tmp1 == 1 || tmp2 == 0) {
+                    res[i] = res_tmp;
+                    continue;
+                }
+                while (tmp2 > 0) {
+                    if (tmp2 & 1) {
+                        res_tmp *= tmp1;
+                    }
+                    tmp2 >>= 1;
+                    tmp1 *= tmp1;
+                }
+                res[i] = res_tmp;
+            }
+            return res;
+        }
+        else {
+            auto res = sycl::pow(in1, in2);
+            if constexpr (std::is_same_v<resT,
+                                         typename decltype(res)::element_type>)
+            {
+                return res;
+            }
+            else {
+                using dpctl::tensor::type_utils::vec_cast;
+
+                return vec_cast<resT, typename decltype(res)::element_type,
+                                vec_sz>(res);
+            }
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using PowContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            PowFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using PowStridedFunctor =
+    elementwise_common::BinaryStridedFunctor<argT1,
+                                             argT2,
+                                             resT,
+                                             IndexerT,
+                                             PowFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct PowOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct PowContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class pow_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event pow_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg1_p,
+                            ssize_t arg1_offset,
+                            const char *arg2_p,
+                            ssize_t arg2_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using PowHS = hyperparam_detail::PowContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = PowHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = PowHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, PowOutputType, PowContigFunctor, pow_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                        arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct PowTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::pow(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename PowOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class pow_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event pow_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg1_p,
+                             ssize_t arg1_offset,
+                             const char *arg2_p,
+                             ssize_t arg2_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, PowOutputType, PowStridedFunctor, pow_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
+        arg2_offset, res_p, res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct PowInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+            auto tmp1 = res;
+            auto tmp2 = in;
+            if constexpr (std::is_signed_v<argT>) {
+                if (tmp2 < 0) {
+                    // invalid; return 0
+                    res = 0;
+                    return;
+                }
+            }
+            if (tmp1 == 1) {
+                return;
+            }
+            if (tmp2 == 0) {
+                res = 1;
+                return;
+            }
+            resT res_tmp = 1;
+            while (tmp2 > 0) {
+                if (tmp2 & 1) {
+                    res_tmp *= tmp1;
+                }
+                tmp2 >>= 1;
+                tmp1 *= tmp1;
+            }
+            res = res_tmp;
+        }
+        else if constexpr (tu_ns::is_complex<argT>::value &&
+                           tu_ns::is_complex<resT>::value)
+        {
+            using r_resT = typename resT::value_type;
+            using r_argT = typename argT::value_type;
+
+            res = exprm_ns::pow(exprm_ns::complex<r_resT>(res),
+                                exprm_ns::complex<r_argT>(in));
+        }
+        else {
+            res = sycl::pow(res, in);
+        }
+        return;
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+#pragma unroll
+            for (int i = 0; i < vec_sz; ++i) {
+                auto tmp1 = res[i];
+                auto tmp2 = in[i];
+                if constexpr (std::is_signed_v<argT>) {
+                    if (tmp2 < 0) {
+                        // invalid; return 0
+                        res[i] = 0;
+                        continue;
+                    }
+                }
+                if (tmp1 == 1) {
+                    continue;
+                }
+                if (tmp2 == 0) {
+                    res[i] = 1;
+                    continue;
+                }
+                resT res_tmp = 1;
+                while (tmp2 > 0) {
+                    if (tmp2 & 1) {
+                        res_tmp *= tmp1;
+                    }
+                    tmp2 >>= 1;
+                    tmp1 *= tmp1;
+                }
+                res[i] = res_tmp;
+            }
+        }
+        else {
+            res = sycl::pow(res, in);
+        }
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
+    argT,
+    resT,
+    PowInplaceFunctor<argT, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using PowInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        PowInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class pow_inplace_contig_kernel;
+
+/* @brief Types supported by in-place pow */
+template <typename argTy, typename resTy>
+struct PowInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct PowInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x **= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (PowInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    pow_inplace_contig_impl(sycl::queue &exec_q,
+                            std::size_t nelems,
+                            const char *arg_p,
+                            ssize_t arg_offset,
+                            char *res_p,
+                            ssize_t res_offset,
+                            const std::vector<sycl::event> &depends = {})
+{
+    using PowHS = hyperparam_detail::PowContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = PowHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = PowHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel,
+        vec_sz, n_vecs>(exec_q, nelems, arg_p, arg_offset, res_p, res_offset,
+                        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class pow_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event
+    pow_inplace_strided_impl(sycl::queue &exec_q,
+                             std::size_t nelems,
+                             int nd,
+                             const ssize_t *shape_and_strides,
+                             const char *arg_p,
+                             ssize_t arg_offset,
+                             char *res_p,
+                             ssize_t res_offset,
+                             const std::vector<sycl::event> &depends,
+                             const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct PowInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!PowInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = pow_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::pow
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
new file mode 100644
index 00000000000..d7583286210
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -0,0 +1,577 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of the
+/// modulo of tensor elements.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <vector>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::remainder
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct RemainderFunctor
+{
+    static_assert(std::is_same_v<argT1, argT2>);
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            if (in2 == argT2(0)) {
+                return resT(0);
+            }
+            if constexpr (std::is_signed_v<argT1> || std::is_signed_v<argT2>) {
+                auto out = (in1 % in2);
+                if (out != 0 && l_xor(in1 < 0, in2 < 0)) {
+                    out += in2;
+                }
+                return out;
+            }
+            else {
+                return (in1 % in2);
+            }
+        }
+        else {
+            auto rem = sycl::fmod(in1, in2);
+            if (rem) {
+                if (l_xor(in2 < 0, rem < 0)) {
+                    rem += in2;
+                }
+            }
+            else {
+                rem = sycl::copysign(resT(0), in2);
+            }
+            return rem;
+        }
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        if constexpr (std::is_integral_v<argT1> || std::is_integral_v<argT2>) {
+            sycl::vec<resT, vec_sz> rem;
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (in2[i] == argT2(0)) {
+                    rem[i] = resT(0);
+                }
+                else {
+                    rem[i] = in1[i] % in2[i];
+                    if constexpr (std::is_signed_v<argT1> ||
+                                  std::is_signed_v<argT2>) {
+                        if (rem[i] != 0 && l_xor(in1[i] < 0, in2[i] < 0)) {
+                            rem[i] += in2[i];
+                        }
+                    }
+                }
+            }
+            return rem;
+        }
+        else {
+            auto rem = sycl::fmod(in1, in2);
+            using remT = typename decltype(rem)::element_type;
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (rem[i]) {
+                    if (l_xor(in2[i] < 0, rem[i] < 0)) {
+                        rem[i] += in2[i];
+                    }
+                }
+                else {
+                    rem[i] = sycl::copysign(remT(0), in2[i]);
+                }
+            }
+            if constexpr (std::is_same_v<resT, remT>) {
+                return rem;
+            }
+            else {
+                using dpctl::tensor::type_utils::vec_cast;
+
+                return vec_cast<resT, remT, vec_sz>(rem);
+            }
+        }
+    }
+
+private:
+    bool l_xor(bool b1, bool b2) const
+    {
+        return (b1 != b2);
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using RemainderContigFunctor = elementwise_common::BinaryContigFunctor<
+    argT1,
+    argT2,
+    resT,
+    RemainderFunctor<argT1, argT2, resT>,
+    vec_sz,
+    n_vecs,
+    enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using RemainderStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    RemainderFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct RemainderOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct RemainderContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class remainder_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event remainder_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg1_p,
+                                  ssize_t arg1_offset,
+                                  const char *arg2_p,
+                                  ssize_t arg2_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using RemHS =
+        hyperparam_detail::RemainderContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = RemHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RemHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, RemainderOutputType, RemainderContigFunctor,
+        remainder_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderTypeMapFactory
+{
+    /*! @brief get typeid for output type of remainder(T x, T y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RemainderOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class remainder_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    remainder_strided_impl(sycl::queue &exec_q,
+                           std::size_t nelems,
+                           int nd,
+                           const ssize_t *shape_and_strides,
+                           const char *arg1_p,
+                           ssize_t arg1_offset,
+                           const char *arg2_p,
+                           ssize_t arg2_offset,
+                           char *res_p,
+                           ssize_t res_offset,
+                           const std::vector<sycl::event> &depends,
+                           const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, RemainderOutputType, RemainderStridedFunctor,
+        remainder_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                  arg1_offset, arg2_p, arg2_offset, res_p,
+                                  res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct RemainderInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    // functor is only well-defined when argT and resT are the same
+    static_assert(std::is_same_v<argT, resT>);
+
+    void operator()(resT &res, const argT &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+            if (in == argT(0)) {
+                res = 0;
+                return;
+            }
+            if constexpr (std::is_signed_v<argT> || std::is_signed_v<resT>) {
+                auto tmp = res;
+                res %= in;
+                if (res != resT(0) && l_xor(tmp < 0, in < 0)) {
+                    res += in;
+                }
+            }
+            else {
+                res %= in;
+            }
+        }
+        else {
+            res = sycl::fmod(res, in);
+            if (res) {
+                if (l_xor(in < 0, res < 0)) {
+                    res += in;
+                }
+            }
+            else {
+                res = sycl::copysign(resT(0), in);
+            }
+        }
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        if constexpr (std::is_integral_v<argT> || std::is_integral_v<resT>) {
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (in[i] == argT(0)) {
+                    res[i] = 0;
+                }
+                else {
+                    auto rem = res[i] % in[i];
+                    if constexpr (std::is_signed_v<argT> ||
+                                  std::is_signed_v<resT>) {
+                        if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) {
+                            rem += in[i];
+                        }
+                    }
+                    res[i] = rem;
+                }
+            }
+        }
+        else {
+            res = sycl::fmod(res, in);
+#pragma unroll
+            for (auto i = 0; i < vec_sz; ++i) {
+                if (res[i]) {
+                    if (l_xor(in[i] < 0, res[i] < 0)) {
+                        res[i] += in[i];
+                    }
+                }
+                else {
+                    res[i] = sycl::copysign(resT(0), in[i]);
+                }
+            }
+        }
+    }
+
+private:
+    bool l_xor(bool b1, bool b2) const
+    {
+        return (b1 != b2);
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using RemainderInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        RemainderInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using RemainderInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        RemainderInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class remainder_inplace_contig_kernel;
+
+/* @brief Types supported by in-place remainder */
+template <typename argTy, typename resTy>
+struct RemainderInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct RemainderInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x %= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (RemainderInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    remainder_inplace_contig_impl(sycl::queue &exec_q,
+                                  std::size_t nelems,
+                                  const char *arg_p,
+                                  ssize_t arg_offset,
+                                  char *res_p,
+                                  ssize_t res_offset,
+                                  const std::vector<sycl::event> &depends = {})
+{
+    using RemHS =
+        hyperparam_detail::RemainderContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = RemHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = RemHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, RemainderInplaceContigFunctor,
+        remainder_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class remainder_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event remainder_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, RemainderInplaceStridedFunctor,
+        remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                          arg_p, arg_offset, res_p, res_offset,
+                                          depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct RemainderInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!RemainderInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = remainder_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::remainder
diff --git a/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
new file mode 100644
index 00000000000..e43abdbe009
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -0,0 +1,646 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of DIVIDE(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+#include <vector>
+
+#include "vec_size_util.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch_building.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/dpctl_tensor_types.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace dpctl::tensor::kernels::subtract
+{
+
+using dpctl::tensor::ssize_t;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT>
+struct SubtractFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT1>, tu_ns::is_complex<argT2>>>;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return in1 - in2;
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+        operator()(const sycl::vec<argT1, vec_sz> &in1,
+                   const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = in1 - in2;
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SubtractContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            SubtractFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs,
+                                            enable_sg_loadstore>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using SubtractStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2>
+struct SubtractOutputType
+{
+    using value_type = typename std::disjunction<
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint8_t,
+                                        T2,
+                                        std::uint8_t,
+                                        std::uint8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int8_t,
+                                        T2,
+                                        std::int8_t,
+                                        std::int8_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint16_t,
+                                        T2,
+                                        std::uint16_t,
+                                        std::uint16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int16_t,
+                                        T2,
+                                        std::int16_t,
+                                        std::int16_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint32_t,
+                                        T2,
+                                        std::uint32_t,
+                                        std::uint32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int32_t,
+                                        T2,
+                                        std::int32_t,
+                                        std::int32_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::uint64_t,
+                                        T2,
+                                        std::uint64_t,
+                                        std::uint64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::int64_t,
+                                        T2,
+                                        std::int64_t,
+                                        std::int64_t>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<float>,
+                                        T2,
+                                        std::complex<float>,
+                                        std::complex<float>>,
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        std::complex<double>,
+                                        T2,
+                                        std::complex<double>,
+                                        std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+
+    static constexpr bool is_defined = !std::is_same_v<value_type, void>;
+};
+
+namespace hyperparam_detail
+{
+
+namespace vsu_ns = dpctl::tensor::kernels::vec_size_utils;
+
+using vsu_ns::BinaryContigHyperparameterSetEntry;
+using vsu_ns::ContigHyperparameterSetDefault;
+
+template <typename argTy1, typename argTy2>
+struct SubtractContigHyperparameterSet
+{
+    using value_type =
+        typename std::disjunction<ContigHyperparameterSetDefault<4u, 2u>>;
+
+    constexpr static auto vec_sz = value_type::vec_sz;
+    constexpr static auto n_vecs = value_type::n_vecs;
+};
+
+} // end of namespace hyperparam_detail
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class subtract_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event subtract_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg1_p,
+                                 ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 ssize_t arg2_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using SubHS =
+        hyperparam_detail::SubtractContigHyperparameterSet<argTy1, argTy2>;
+    static constexpr std::uint8_t vec_sz = SubHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SubHS::n_vecs;
+
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, SubtractOutputType, SubtractContigFunctor,
+        subtract_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
+        res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractTypeMapFactory
+{
+    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename SubtractOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class subtract_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+    subtract_strided_impl(sycl::queue &exec_q,
+                          std::size_t nelems,
+                          int nd,
+                          const ssize_t *shape_and_strides,
+                          const char *arg1_p,
+                          ssize_t arg1_offset,
+                          const char *arg2_p,
+                          ssize_t arg2_offset,
+                          char *res_p,
+                          ssize_t res_offset,
+                          const std::vector<sycl::event> &depends,
+                          const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, SubtractOutputType, SubtractStridedFunctor,
+        subtract_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigMatrixContigRowBroadcastingFunctor =
+    elementwise_common::BinaryContigMatrixContigRowBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+using SubtractContigRowContigMatrixBroadcastingFunctor =
+    elementwise_common::BinaryContigRowContigMatrixBroadcastingFunctor<
+        argT1,
+        argT2,
+        resT,
+        SubtractFunctor<argT1, argT2, resT>>;
+
+template <typename argT1, typename argT2, typename resT>
+class subtract_matrix_row_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+class subtract_row_matrix_broadcast_sg_krn;
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event subtract_contig_matrix_contig_row_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = mat[i,j] - vec[j]
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
+        argT1, argT2, resT, SubtractContigMatrixContigRowBroadcastingFunctor,
+        subtract_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
+                                              mat_offset, vec_p, vec_offset,
+                                              res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractContigMatrixContigRowBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename SubtractOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    subtract_contig_matrix_contig_row_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT1, typename argT2, typename resT>
+sycl::event subtract_contig_row_contig_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix,
+                 //    res[i,j] = op(vec[j], mat[i,j])
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl<
+        argT1, argT2, resT, SubtractContigRowContigMatrixBroadcastingFunctor,
+        subtract_row_matrix_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, vec_p,
+                                              vec_offset, mat_p, mat_offset,
+                                              res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractContigRowContigMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractOutputType<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            using resT = typename SubtractOutputType<T1, T2>::value_type;
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value ||
+                          dpctl::tensor::type_utils::is_complex<resT>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn =
+                    subtract_contig_row_contig_matrix_broadcast_impl<T1, T2,
+                                                                     resT>;
+                return fn;
+            }
+        }
+    }
+};
+
+template <typename argT, typename resT>
+struct SubtractInplaceFunctor
+{
+
+    using supports_sg_loadstore = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+    using supports_vec = std::negation<
+        std::disjunction<tu_ns::is_complex<argT>, tu_ns::is_complex<resT>>>;
+
+    void operator()(resT &res, const argT &in)
+    {
+        res -= in;
+    }
+
+    template <int vec_sz>
+    void operator()(sycl::vec<resT, vec_sz> &res,
+                    const sycl::vec<argT, vec_sz> &in)
+    {
+        res -= in;
+    }
+};
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz = 4u,
+          std::uint8_t n_vecs = 2u,
+          bool enable_sg_loadstore = true>
+using SubtractInplaceContigFunctor =
+    elementwise_common::BinaryInplaceContigFunctor<
+        argT,
+        resT,
+        SubtractInplaceFunctor<argT, resT>,
+        vec_sz,
+        n_vecs,
+        enable_sg_loadstore>;
+
+template <typename argT, typename resT, typename IndexerT>
+using SubtractInplaceStridedFunctor =
+    elementwise_common::BinaryInplaceStridedFunctor<
+        argT,
+        resT,
+        IndexerT,
+        SubtractInplaceFunctor<argT, resT>>;
+
+template <typename argT,
+          typename resT,
+          std::uint8_t vec_sz,
+          std::uint8_t n_vecs>
+class subtract_inplace_contig_kernel;
+
+/* @brief Types supported by in-place subtraction */
+template <typename argTy, typename resTy>
+struct SubtractInplaceTypePairSupport
+{
+    /* value if true a kernel for <argTy, resTy> must be instantiated  */
+    static constexpr bool is_defined = std::disjunction<
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, resTy, std::int8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, resTy, std::uint8_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, resTy, std::int16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, resTy, std::uint16_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, resTy, std::int32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, resTy, std::uint32_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, resTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, resTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, sycl::half, resTy, sycl::half>,
+        td_ns::TypePairDefinedEntry<argTy, float, resTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, double, resTy, double>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<float>,
+                                    resTy,
+                                    std::complex<float>>,
+        td_ns::TypePairDefinedEntry<argTy,
+                                    std::complex<double>,
+                                    resTy,
+                                    std::complex<double>>,
+        // fall-through
+        td_ns::NotDefinedEntry>::is_defined;
+};
+
+template <typename fnT, typename argT, typename resT>
+struct SubtractInplaceTypeMapFactory
+{
+    /*! @brief get typeid for output type of x -= y */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        if constexpr (SubtractInplaceTypePairSupport<argT, resT>::is_defined) {
+            return td_ns::GetTypeid<resT>{}.get();
+        }
+        else {
+            return td_ns::GetTypeid<void>{}.get();
+        }
+    }
+};
+
+template <typename argTy, typename resTy>
+sycl::event
+    subtract_inplace_contig_impl(sycl::queue &exec_q,
+                                 std::size_t nelems,
+                                 const char *arg_p,
+                                 ssize_t arg_offset,
+                                 char *res_p,
+                                 ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    using SubHS =
+        hyperparam_detail::SubtractContigHyperparameterSet<resTy, argTy>;
+    static constexpr std::uint8_t vec_sz = SubHS::vec_sz;
+    static constexpr std::uint8_t n_vecs = SubHS::n_vecs;
+
+    return elementwise_common::binary_inplace_contig_impl<
+        argTy, resTy, SubtractInplaceContigFunctor,
+        subtract_inplace_contig_kernel, vec_sz, n_vecs>(
+        exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractInplaceContigFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_inplace_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename resT, typename argT, typename IndexerT>
+class subtract_inplace_strided_kernel;
+
+template <typename argTy, typename resTy>
+sycl::event subtract_inplace_strided_impl(
+    sycl::queue &exec_q,
+    std::size_t nelems,
+    int nd,
+    const ssize_t *shape_and_strides,
+    const char *arg_p,
+    ssize_t arg_offset,
+    char *res_p,
+    ssize_t res_offset,
+    const std::vector<sycl::event> &depends,
+    const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_inplace_strided_impl<
+        argTy, resTy, SubtractInplaceStridedFunctor,
+        subtract_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides,
+                                         arg_p, arg_offset, res_p, res_offset,
+                                         depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractInplaceStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = subtract_inplace_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename argT, typename resT>
+class subtract_inplace_row_matrix_broadcast_sg_krn;
+
+template <typename argT, typename resT>
+using SubtractInplaceRowMatrixBroadcastingFunctor =
+    elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor<
+        argT,
+        resT,
+        SubtractInplaceFunctor<argT, resT>>;
+
+template <typename argT, typename resT>
+sycl::event subtract_inplace_row_matrix_broadcast_impl(
+    sycl::queue &exec_q,
+    std::vector<sycl::event> &host_tasks,
+    std::size_t n0,
+    std::size_t n1,
+    const char *vec_p, // typeless pointer to (n1,) contiguous row
+    ssize_t vec_offset,
+    char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix
+    ssize_t mat_offset,
+    const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_inplace_row_matrix_broadcast_impl<
+        argT, resT, SubtractInplaceRowMatrixBroadcastingFunctor,
+        subtract_inplace_row_matrix_broadcast_sg_krn>(
+        exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset,
+        depends);
+}
+
+template <typename fnT, typename T1, typename T2>
+struct SubtractInplaceRowMatrixBroadcastFactory
+{
+    fnT get()
+    {
+        if constexpr (!SubtractInplaceTypePairSupport<T1, T2>::is_defined) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            if constexpr (dpctl::tensor::type_utils::is_complex<T1>::value ||
+                          dpctl::tensor::type_utils::is_complex<T2>::value)
+            {
+                fnT fn = nullptr;
+                return fn;
+            }
+            else {
+                fnT fn = subtract_inplace_row_matrix_broadcast_impl<T1, T2>;
+                return fn;
+            }
+        }
+    }
+};
+
+} // namespace dpctl::tensor::kernels::subtract
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp
new file mode 100644
index 00000000000..dea66b6c940
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "copysign.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/copysign.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B25: ===== COPYSIGN (x1, x2)
+namespace impl
+{
+namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
+
+static binary_contig_impl_fn_ptr_t
+    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_copysign_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = copysign_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::CopysignTypeMapFactory;
+    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(copysign_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::CopysignStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::CopysignContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_copysign(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_copysign_dispatch_tables();
+        using impl::copysign_contig_dispatch_table;
+        using impl::copysign_output_id_table;
+        using impl::copysign_strided_dispatch_table;
+
+        auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, copysign_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                copysign_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                copysign_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto copysign_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               copysign_output_id_table);
+        };
+        m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_copysign_result_type", copysign_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp
new file mode 100644
index 00000000000..875443d792c
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/copysign.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_copysign(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
index efee3ef529f..dc09318d66a 100644
--- a/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp
@@ -54,7 +54,7 @@
 #include "cbrt.hpp"
 #include "ceil.hpp"
 #include "conj.hpp"
-// #include "copysign.hpp"
+#include "copysign.hpp"
 #include "cos.hpp"
 #include "cosh.hpp"
 #include "equal.hpp"
@@ -77,22 +77,22 @@
 #include "log1p.hpp"
 #include "log2.hpp"
 #include "logaddexp.hpp"
-// #include "logical_and.hpp"
+#include "logical_and.hpp"
 #include "logical_not.hpp"
-// #include "logical_or.hpp"
-// #include "logical_xor.hpp"
-// #include "maximum.hpp"
-// #include "minimum.hpp"
-// #include "multiply.hpp"
+#include "logical_or.hpp"
+#include "logical_xor.hpp"
+#include "maximum.hpp"
+#include "minimum.hpp"
+#include "multiply.hpp"
 #include "negative.hpp"
-// #include "nextafter.hpp"
-// #include "not_equal.hpp"
+#include "nextafter.hpp"
+#include "not_equal.hpp"
 #include "positive.hpp"
-// #include "pow.hpp"
+#include "pow.hpp"
 #include "proj.hpp"
 #include "real.hpp"
 #include "reciprocal.hpp"
-// #include "remainder.hpp"
+#include "remainder.hpp"
 #include "round.hpp"
 #include "rsqrt.hpp"
 #include "sign.hpp"
@@ -101,7 +101,7 @@
 #include "sinh.hpp"
 #include "sqrt.hpp"
 #include "square.hpp"
-// #include "subtract.hpp"
+#include "subtract.hpp"
 #include "tan.hpp"
 #include "tanh.hpp"
 #include "true_divide.hpp"
@@ -134,7 +134,7 @@ void init_elementwise_functions(py::module_ m)
     init_cbrt(m);
     init_ceil(m);
     init_conj(m);
-    // init_copysign(m);
+    init_copysign(m);
     init_cos(m);
     init_cosh(m);
     init_divide(m);
@@ -158,22 +158,22 @@ void init_elementwise_functions(py::module_ m)
     init_log1p(m);
     init_log2(m);
     init_logaddexp(m);
-    // init_logical_and(m);
+    init_logical_and(m);
     init_logical_not(m);
-    // init_logical_or(m);
-    // init_logical_xor(m);
-    // init_maximum(m);
-    // init_minimum(m);
-    // init_multiply(m);
-    // init_nextafter(m);
+    init_logical_or(m);
+    init_logical_xor(m);
+    init_maximum(m);
+    init_minimum(m);
+    init_multiply(m);
+    init_nextafter(m);
     init_negative(m);
-    // init_not_equal(m);
+    init_not_equal(m);
     init_positive(m);
-    // init_pow(m);
+    init_pow(m);
     init_proj(m);
     init_real(m);
     init_reciprocal(m);
-    // init_remainder(m);
+    init_remainder(m);
     init_round(m);
     init_rsqrt(m);
     init_sign(m);
@@ -182,7 +182,7 @@ void init_elementwise_functions(py::module_ m)
     init_sinh(m);
     init_sqrt(m);
     init_square(m);
-    // init_subtract(m);
+    init_subtract(m);
     init_tan(m);
     init_tanh(m);
     init_trunc(m);
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp
new file mode 100644
index 00000000000..7b7d21301e4
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_and.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_and.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B16: ===== LOGICAL_AND (x1, x2)
+namespace impl
+{
+namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and;
+
+static binary_contig_impl_fn_ptr_t
+    logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalAndTypeMapFactory;
+    DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_and_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_and_dispatch_tables();
+        using impl::logical_and_contig_dispatch_table;
+        using impl::logical_and_output_id_table;
+        using impl::logical_and_strided_dispatch_table;
+
+        auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_and_output_id_table);
+        };
+        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_and_result_type", logical_and_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp
new file mode 100644
index 00000000000..c22a98f2414
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_and.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_and(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp
new file mode 100644
index 00000000000..270b257a80e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_or.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_or.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B17: ===== LOGICAL_OR (x1, x2)
+namespace impl
+{
+namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
+
+static binary_contig_impl_fn_ptr_t
+    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalOrTypeMapFactory;
+    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_or_dispatch_tables();
+        using impl::logical_or_contig_dispatch_table;
+        using impl::logical_or_output_id_table;
+        using impl::logical_or_strided_dispatch_table;
+
+        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_or_output_id_table);
+        };
+        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_or_result_type", logical_or_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp
new file mode 100644
index 00000000000..11e83fe8ced
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_or.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_or(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
new file mode 100644
index 00000000000..d49b52cfc42
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "logical_xor.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_xor.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B18: ===== LOGICAL_XOR (x1, x2)
+namespace impl
+{
+namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
+
+static binary_contig_impl_fn_ptr_t
+    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalXorTypeMapFactory;
+    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalXorStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_xor_dispatch_tables();
+        using impl::logical_xor_contig_dispatch_table;
+        using impl::logical_xor_output_id_table;
+        using impl::logical_xor_strided_dispatch_table;
+
+        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                logical_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_xor_output_id_table);
+        };
+        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
new file mode 100644
index 00000000000..24c16324912
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/logical_xor.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_logical_xor(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp
new file mode 100644
index 00000000000..dc88f2b9ad8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "maximum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/maximum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B26: ===== MAXIMUM (x1, x2)
+namespace impl
+{
+namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
+
+static binary_contig_impl_fn_ptr_t
+    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_maximum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = maximum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MaximumTypeMapFactory;
+    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(maximum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MaximumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MaximumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_maximum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_maximum_dispatch_tables();
+        using impl::maximum_contig_dispatch_table;
+        using impl::maximum_output_id_table;
+        using impl::maximum_strided_dispatch_table;
+
+        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, maximum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                maximum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                maximum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               maximum_output_id_table);
+        };
+        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp
new file mode 100644
index 00000000000..1f8fc027ac1
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/maximum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_maximum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp
new file mode 100644
index 00000000000..ef48b765860
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "minimum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/minimum.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B27: ===== MINIMUM (x1, x2)
+namespace impl
+{
+namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
+
+static binary_contig_impl_fn_ptr_t
+    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_minimum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = minimum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MinimumTypeMapFactory;
+    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(minimum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MinimumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MinimumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_minimum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_minimum_dispatch_tables();
+        using impl::minimum_contig_dispatch_table;
+        using impl::minimum_output_id_table;
+        using impl::minimum_strided_dispatch_table;
+
+        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, minimum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                minimum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                minimum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               minimum_output_id_table);
+        };
+        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp
new file mode 100644
index 00000000000..be2e18a9b37
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/minimum.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_minimum(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp
new file mode 100644
index 00000000000..49611aaaf38
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.cpp
@@ -0,0 +1,243 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "multiply.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/multiply.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B19: ===== MULTIPLY (x1, x2)
+namespace impl
+{
+
+namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
+
+static binary_contig_impl_fn_ptr_t
+    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
+static int multiply_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// mul(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    multiply_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// mul(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    multiply_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_multiply_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = multiply_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MultiplyTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(multiply_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MultiplyStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MultiplyContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::MultiplyInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         MultiplyInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::MultiplyInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         MultiplyInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::MultiplyInplaceTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(multiply_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_multiply(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_multiply_dispatch_tables();
+        using impl::multiply_contig_dispatch_table;
+        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::multiply_output_id_table;
+        using impl::multiply_strided_dispatch_table;
+
+        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                multiply_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                multiply_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               multiply_output_id_table);
+        };
+        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
+
+        using impl::multiply_inplace_contig_dispatch_table;
+        using impl::multiply_inplace_output_id_table;
+        using impl::multiply_inplace_row_matrix_dispatch_table;
+        using impl::multiply_inplace_strided_dispatch_table;
+
+        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, multiply_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                multiply_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                multiply_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                multiply_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp
new file mode 100644
index 00000000000..a4ed946a850
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/multiply.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_multiply(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp
new file mode 100644
index 00000000000..2fd706f667e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "nextafter.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/nextafter.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B28: ===== NEXTAFTER (x1, x2)
+namespace impl
+{
+namespace nextafter_fn_ns = dpctl::tensor::kernels::nextafter;
+
+static binary_contig_impl_fn_ptr_t
+    nextafter_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int nextafter_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    nextafter_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_nextafter_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = nextafter_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::NextafterTypeMapFactory;
+    DispatchTableBuilder<int, NextafterTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(nextafter_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::NextafterStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NextafterStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(nextafter_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::NextafterContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NextafterContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(nextafter_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_nextafter(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_nextafter_dispatch_tables();
+        using impl::nextafter_contig_dispatch_table;
+        using impl::nextafter_output_id_table;
+        using impl::nextafter_strided_dispatch_table;
+
+        auto nextafter_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, nextafter_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                nextafter_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                nextafter_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto nextafter_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               nextafter_output_id_table);
+        };
+        m.def("_nextafter", nextafter_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_nextafter_result_type", nextafter_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp
new file mode 100644
index 00000000000..76ad701d401
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/nextafter.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_nextafter(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp
new file mode 100644
index 00000000000..967bfda4e8d
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.cpp
@@ -0,0 +1,145 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "not_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/not_equal.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B20: ===== NOT_EQUAL (x1, x2)
+namespace impl
+{
+namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal;
+
+static binary_contig_impl_fn_ptr_t
+    not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_not_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = not_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::NotEqualTypeMapFactory;
+    DispatchTableBuilder<int, NotEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(not_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::NotEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NotEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(not_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::NotEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NotEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(not_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_not_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_not_equal_dispatch_tables();
+        using impl::not_equal_contig_dispatch_table;
+        using impl::not_equal_output_id_table;
+        using impl::not_equal_strided_dispatch_table;
+
+        auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                not_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                not_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               not_equal_output_id_table);
+        };
+        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp
new file mode 100644
index 00000000000..c6c99bb793b
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/not_equal.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_not_equal(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp
new file mode 100644
index 00000000000..cfb0649e317
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.cpp
@@ -0,0 +1,202 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "pow.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/pow.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B21: ===== POW (x1, x2)
+namespace impl
+{
+
+namespace pow_fn_ns = dpctl::tensor::kernels::pow;
+
+static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types]
+                                                            [td_ns::num_types];
+
+static int pow_output_id_table[td_ns::num_types][td_ns::num_types];
+static int pow_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_pow_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = pow_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::PowTypeMapFactory;
+    DispatchTableBuilder<int, PowTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(pow_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::PowStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, PowStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(pow_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::PowContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, PowContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(pow_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::PowInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         PowInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::PowInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         PowInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::PowInplaceTypeMapFactory;
+    DispatchTableBuilder<int, PowInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(pow_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_pow(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_pow_dispatch_tables();
+        using impl::pow_contig_dispatch_table;
+        using impl::pow_output_id_table;
+        using impl::pow_strided_dispatch_table;
+
+        auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                pow_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                pow_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto pow_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               pow_output_id_table);
+        };
+        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_pow_result_type", pow_result_type_pyapi, "");
+
+        using impl::pow_inplace_contig_dispatch_table;
+        using impl::pow_inplace_output_id_table;
+        using impl::pow_inplace_strided_dispatch_table;
+
+        auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, pow_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                pow_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                pow_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp
new file mode 100644
index 00000000000..197a23b80d8
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/pow.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_pow(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp
new file mode 100644
index 00000000000..a4a8c7fbd6e
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.cpp
@@ -0,0 +1,204 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "remainder.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/remainder.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B22: ===== REMAINDER (x1, x2)
+namespace impl
+{
+
+namespace remainder_fn_ns = dpctl::tensor::kernels::remainder;
+
+static binary_contig_impl_fn_ptr_t
+    remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int remainder_output_id_table[td_ns::num_types][td_ns::num_types];
+static int remainder_inplace_output_id_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    remainder_inplace_strided_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+void populate_remainder_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = remainder_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::RemainderTypeMapFactory;
+    DispatchTableBuilder<int, RemainderTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(remainder_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::RemainderStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, RemainderStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(remainder_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::RemainderContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, RemainderContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(remainder_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::RemainderInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         RemainderInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::RemainderInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         RemainderInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::RemainderInplaceTypeMapFactory;
+    DispatchTableBuilder<int, RemainderInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(remainder_inplace_output_id_table);
+}
+
+} // namespace impl
+
+void init_remainder(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_remainder_dispatch_tables();
+        using impl::remainder_contig_dispatch_table;
+        using impl::remainder_output_id_table;
+        using impl::remainder_strided_dispatch_table;
+
+        auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, remainder_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                remainder_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                remainder_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               remainder_output_id_table);
+        };
+        m.def("_remainder", remainder_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
+
+        using impl::remainder_inplace_contig_dispatch_table;
+        using impl::remainder_inplace_output_id_table;
+        using impl::remainder_inplace_strided_dispatch_table;
+
+        auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                           sycl::queue &exec_q,
+                                           const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, remainder_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                remainder_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                remainder_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_remainder_inplace", remainder_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp
new file mode 100644
index 00000000000..c00bdc9e0e6
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/remainder.hpp
@@ -0,0 +1,46 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl::tensor::py_internal
+{
+
+extern void init_remainder(py::module_ m);
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp
new file mode 100644
index 00000000000..bcd15c2b0b5
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.cpp
@@ -0,0 +1,242 @@
+//*****************************************************************************
+// Copyright (c) 2026, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+// - Neither the name of the copyright holder nor the names of its contributors
+//   may be used to endorse or promote products derived from this software
+//   without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_elementwise_impl
+/// extension, specifically functions for elementwise operations.
+//===---------------------------------------------------------------------===//
+
+#include <vector>
+
+#include <sycl/sycl.hpp>
+
+#include "dpnp4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "elementwise_functions.hpp"
+#include "subtract.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/subtract.hpp"
+
+namespace dpctl::tensor::py_internal
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B23: ===== SUBTRACT (x1, x2)
+namespace impl
+{
+namespace subtract_fn_ns = dpctl::tensor::kernels::subtract;
+
+static binary_contig_impl_fn_ptr_t
+    subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static int subtract_output_id_table[td_ns::num_types][td_ns::num_types];
+static int subtract_inplace_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// sub(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    subtract_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// sub(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    subtract_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    subtract_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_subtract_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = subtract_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::SubtractTypeMapFactory;
+    DispatchTableBuilder<int, SubtractTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(subtract_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::SubtractStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, SubtractStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(subtract_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::SubtractContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, SubtractContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(subtract_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::SubtractContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        SubtractContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        subtract_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::SubtractContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        SubtractContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::SubtractInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         SubtractInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::SubtractInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         SubtractInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::SubtractInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         SubtractInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table);
+
+    // which types are supported by the in-place kernels
+    using fn_ns::SubtractInplaceTypeMapFactory;
+    DispatchTableBuilder<int, SubtractInplaceTypeMapFactory, num_types> dtb9;
+    dtb9.populate_dispatch_table(subtract_inplace_output_id_table);
+};
+
+} // namespace impl
+
+void init_subtract(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_subtract_dispatch_tables();
+        using impl::subtract_contig_dispatch_table;
+        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::subtract_output_id_table;
+        using impl::subtract_strided_dispatch_table;
+
+        auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, subtract_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                subtract_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                subtract_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               subtract_output_id_table);
+        };
+        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
+
+        using impl::subtract_inplace_contig_dispatch_table;
+        using impl::subtract_inplace_output_id_table;
+        using impl::subtract_inplace_row_matrix_dispatch_table;
+        using impl::subtract_inplace_strided_dispatch_table;
+
+        auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, subtract_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                subtract_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                subtract_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                subtract_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace dpctl::tensor::py_internal
diff --git a/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp
new file mode 100644
index 00000000000..89cdfd6d0ea
--- /dev/null
+++ b/dpctl_ext/tensor/libtensor/source/elementwise_functions/subtract.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2025 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_subtract(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
index a9ec3ebd0a8..711ecaf9722 100644
--- a/dpnp/dpnp_iface_logic.py
+++ b/dpnp/dpnp_iface_logic.py
@@ -43,15 +43,13 @@
 # pylint: disable=duplicate-code
 # pylint: disable=no-name-in-module
 
-
-import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
 from dpnp.dpnp_algo.dpnp_elementwise_common import DPNPBinaryFunc, DPNPUnaryFunc
@@ -661,8 +659,8 @@ def array_equiv(a1, a2):
 
 equal = DPNPBinaryFunc(
     "equal",
-    ti_ext._equal_result_type,
-    ti_ext._equal,
+    ti._equal_result_type,
+    ti._equal,
     _EQUAL_DOCSTRING,
 )
 
@@ -737,8 +735,8 @@ def array_equiv(a1, a2):
 
 greater = DPNPBinaryFunc(
     "greater",
-    ti_ext._greater_result_type,
-    ti_ext._greater,
+    ti._greater_result_type,
+    ti._greater,
     _GREATER_DOCSTRING,
 )
 
@@ -814,8 +812,8 @@ def array_equiv(a1, a2):
 
 greater_equal = DPNPBinaryFunc(
     "greater_equal",
-    ti_ext._greater_equal_result_type,
-    ti_ext._greater_equal,
+    ti._greater_equal_result_type,
+    ti._greater_equal,
     _GREATER_EQUAL_DOCSTRING,
 )
 
@@ -1095,8 +1093,8 @@ def iscomplexobj(x):
 
 isfinite = DPNPUnaryFunc(
     "isfinite",
-    ti_ext._isfinite_result_type,
-    ti_ext._isfinite,
+    ti._isfinite_result_type,
+    ti._isfinite,
     _ISFINITE_DOCSTRING,
 )
 
@@ -1338,8 +1336,8 @@ def isin(
 
 isinf = DPNPUnaryFunc(
     "isinf",
-    ti_ext._isinf_result_type,
-    ti_ext._isinf,
+    ti._isinf_result_type,
+    ti._isinf,
     _ISINF_DOCSTRING,
 )
 
@@ -1396,8 +1394,8 @@ def isin(
 
 isnan = DPNPUnaryFunc(
     "isnan",
-    ti_ext._isnan_result_type,
-    ti_ext._isnan,
+    ti._isnan_result_type,
+    ti._isnan,
     _ISNAN_DOCSTRING,
 )
 
@@ -1750,8 +1748,8 @@ def isscalar(element):
 
 less = DPNPBinaryFunc(
     "less",
-    ti_ext._less_result_type,
-    ti_ext._less,
+    ti._less_result_type,
+    ti._less,
     _LESS_DOCSTRING,
 )
 
@@ -1826,8 +1824,8 @@ def isscalar(element):
 
 less_equal = DPNPBinaryFunc(
     "less_equal",
-    ti_ext._less_equal_result_type,
-    ti_ext._less_equal,
+    ti._less_equal_result_type,
+    ti._less_equal,
     _LESS_EQUAL_DOCSTRING,
 )
 
@@ -1969,8 +1967,8 @@ def isscalar(element):
 
 logical_not = DPNPUnaryFunc(
     "logical_not",
-    ti_ext._logical_not_result_type,
-    ti_ext._logical_not,
+    ti._logical_not_result_type,
+    ti._logical_not,
     _LOGICAL_NOT_DOCSTRING,
 )
 
diff --git a/dpnp/dpnp_iface_mathematical.py b/dpnp/dpnp_iface_mathematical.py
index 54a17cec0c3..89bc0868160 100644
--- a/dpnp/dpnp_iface_mathematical.py
+++ b/dpnp/dpnp_iface_mathematical.py
@@ -47,14 +47,13 @@
 import builtins
 import warnings
 
-import dpctl.tensor._tensor_elementwise_impl as ti
 import dpctl.utils as dpu
 import numpy
 
 # TODO: revert to `import dpctl.tensor...`
 # when dpnp fully migrates dpctl/tensor
 import dpctl_ext.tensor as dpt
-import dpctl_ext.tensor._tensor_elementwise_impl as ti_ext
+import dpctl_ext.tensor._tensor_elementwise_impl as ti
 import dpctl_ext.tensor._type_utils as dtu
 import dpnp
 import dpnp.backend.extensions.ufunc._ufunc_impl as ufi
@@ -385,8 +384,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 abs = DPNPUnaryFunc(
     "abs",
-    ti_ext._abs_result_type,
-    ti_ext._abs,
+    ti._abs_result_type,
+    ti._abs,
     _ABS_DOCSTRING,
     mkl_fn_to_call="_mkl_abs_to_call",
     mkl_impl_fn="_abs",
@@ -469,8 +468,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 add = DPNPBinaryFunc(
     "add",
-    ti_ext._add_result_type,
-    ti_ext._add,
+    ti._add_result_type,
+    ti._add,
     _ADD_DOCSTRING,
     mkl_fn_to_call="_mkl_add_to_call",
     mkl_impl_fn="_add",
@@ -541,8 +540,8 @@ def _validate_interp_param(param, name, exec_q, usm_type, dtype=None):
 
 angle = DPNPAngle(
     "angle",
-    ti_ext._angle_result_type,
-    ti_ext._angle,
+    ti._angle_result_type,
+    ti._angle,
     _ANGLE_DOCSTRING,
     mkl_fn_to_call="_mkl_arg_to_call",
     mkl_impl_fn="_arg",
@@ -647,8 +646,8 @@ def around(x, /, decimals=0, out=None):
 
 ceil = DPNPUnaryFunc(
     "ceil",
-    ti_ext._ceil_result_type,
-    ti_ext._ceil,
+    ti._ceil_result_type,
+    ti._ceil,
     _CEIL_DOCSTRING,
     mkl_fn_to_call="_mkl_ceil_to_call",
     mkl_impl_fn="_ceil",
@@ -782,8 +781,8 @@ def clip(a, /, min=None, max=None, *, out=None, order="K", **kwargs):
 
 conj = DPNPUnaryFunc(
     "conj",
-    ti_ext._conj_result_type,
-    ti_ext._conj,
+    ti._conj_result_type,
+    ti._conj,
     _CONJ_DOCSTRING,
     mkl_fn_to_call="_mkl_conj_to_call",
     mkl_impl_fn="_conj",
@@ -1558,12 +1557,12 @@ def diff(a, n=1, axis=-1, prepend=None, append=None):
 
 divide = DPNPBinaryFunc(
     "divide",
-    ti_ext._divide_result_type,
-    ti_ext._divide,
+    ti._divide_result_type,
+    ti._divide,
     _DIVIDE_DOCSTRING,
     mkl_fn_to_call="_mkl_div_to_call",
     mkl_impl_fn="_div",
-    binary_inplace_fn=ti_ext._divide_inplace,
+    binary_inplace_fn=ti._divide_inplace,
     acceptance_fn=dtu._acceptance_fn_divide,
 )
 
@@ -2057,8 +2056,8 @@ def ediff1d(ary, to_end=None, to_begin=None):
 
 floor = DPNPUnaryFunc(
     "floor",
-    ti_ext._floor_result_type,
-    ti_ext._floor,
+    ti._floor_result_type,
+    ti._floor,
     _FLOOR_DOCSTRING,
     mkl_fn_to_call="_mkl_floor_to_call",
     mkl_impl_fn="_floor",
@@ -2139,10 +2138,10 @@ def ediff1d(ary, to_end=None, to_begin=None):
 
 floor_divide = DPNPBinaryFunc(
     "floor_divide",
-    ti_ext._floor_divide_result_type,
-    ti_ext._floor_divide,
+    ti._floor_divide_result_type,
+    ti._floor_divide,
     _FLOOR_DIVIDE_DOCSTRING,
-    binary_inplace_fn=ti_ext._floor_divide_inplace,
+    binary_inplace_fn=ti._floor_divide_inplace,
 )
 
 
@@ -2941,8 +2940,8 @@ def gradient(f, *varargs, axis=None, edge_order=1):
 
 imag = DPNPImag(
     "imag",
-    ti_ext._imag_result_type,
-    ti_ext._imag,
+    ti._imag_result_type,
+    ti._imag,
     _IMAG_DOCSTRING,
 )
 
@@ -3852,8 +3851,8 @@ def _check_nan_inf(val, val_dt):
 
 negative = DPNPUnaryFunc(
     "negative",
-    ti_ext._negative_result_type,
-    ti_ext._negative,
+    ti._negative_result_type,
+    ti._negative,
     _NEGATIVE_DOCSTRING,
     acceptance_fn=acceptance_fn_negative,
 )
@@ -3988,8 +3987,8 @@ def _check_nan_inf(val, val_dt):
 
 positive = DPNPUnaryFunc(
     "positive",
-    ti_ext._positive_result_type,
-    ti_ext._positive,
+    ti._positive_result_type,
+    ti._positive,
     _POSITIVE_DOCSTRING,
     acceptance_fn=acceptance_fn_positive,
 )
@@ -4250,8 +4249,8 @@ def prod(
 
 proj = DPNPUnaryFunc(
     "proj",
-    ti_ext._proj_result_type,
-    ti_ext._proj,
+    ti._proj_result_type,
+    ti._proj,
     _PROJ_DOCSTRING,
 )
 
@@ -4313,8 +4312,8 @@ def prod(
 
 real = DPNPReal(
     "real",
-    ti_ext._real_result_type,
-    ti_ext._real,
+    ti._real_result_type,
+    ti._real,
     _REAL_DOCSTRING,
 )
 
@@ -4596,8 +4595,8 @@ def real_if_close(a, tol=100):
 
 round = DPNPRound(
     "round",
-    ti_ext._round_result_type,
-    ti_ext._round,
+    ti._round_result_type,
+    ti._round,
     _ROUND_DOCSTRING,
     mkl_fn_to_call="_mkl_round_to_call",
     mkl_impl_fn="_round",
@@ -4668,8 +4667,8 @@ def real_if_close(a, tol=100):
 
 sign = DPNPUnaryFunc(
     "sign",
-    ti_ext._sign_result_type,
-    ti_ext._sign,
+    ti._sign_result_type,
+    ti._sign,
     _SIGN_DOCSTRING,
     acceptance_fn=acceptance_fn_sign,
 )
@@ -4730,8 +4729,8 @@ def real_if_close(a, tol=100):
 
 signbit = DPNPUnaryFunc(
     "signbit",
-    ti_ext._signbit_result_type,
-    ti_ext._signbit,
+    ti._signbit_result_type,
+    ti._signbit,
     _SIGNBIT_DOCSTRING,
 )
 
@@ -5229,8 +5228,8 @@ def trapezoid(y, x=None, dx=1.0, axis=-1):
 
 trunc = DPNPUnaryFunc(
     "trunc",
-    ti_ext._trunc_result_type,
-    ti_ext._trunc,
+    ti._trunc_result_type,
+    ti._trunc,
     _TRUNC_DOCSTRING,
     mkl_fn_to_call="_mkl_trunc_to_call",
     mkl_impl_fn="_trunc",