diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index d2ac90621aaa..eb66c91dc8c2 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -37,7 +37,7 @@ jobs:
actions: write
runs-on: ${{ matrix.os }}
- timeout-minutes: 60
+ timeout-minutes: 80
defaults:
run:
diff --git a/.gitignore b/.gitignore
index 0cfebe53f623..f8ed987fa0d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,3 +36,4 @@ core
# TODO: revert to `dpctl/`
# when dpnp fully migrates dpctl/tensor
dpctl_ext/**/*.cpython*.so
+dpctl_ext/include/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7bb7f650dac..489283f45a43 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -344,5 +344,14 @@ if(DEFINED SKBUILD)
set(_ignore_me ${SKBUILD})
endif()
-add_subdirectory(dpnp)
+# TODO: Replace `${CMAKE_BINARY_DIR}` with a dedicated public include root
+# for dpctl_ext C-API headers
+# Unlike dpctl which exposes C-API from `dpctl/apis/include`,
+# dpctl_ext currently relies on generated headers in the build tree.
+# `${CMAKE_BINARY_DIR}` is a temporary workaround.
+
+add_library(DpctlExtCAPI INTERFACE)
+target_include_directories(DpctlExtCAPI INTERFACE ${CMAKE_BINARY_DIR})
+
add_subdirectory(dpctl_ext)
+add_subdirectory(dpnp)
diff --git a/dpctl_ext/CMakeLists.txt b/dpctl_ext/CMakeLists.txt
index e58693091422..a5524e8bb3db 100644
--- a/dpctl_ext/CMakeLists.txt
+++ b/dpctl_ext/CMakeLists.txt
@@ -112,8 +112,89 @@ else()
endif()
# at build time create include/ directory and copy header files over
-# set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+set(DPCTL_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
set(CMAKE_INSTALL_RPATH "$ORIGIN")
+function(build_dpctl_ext _trgt _src _dest)
+ set(options SYCL)
+ cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "RELATIVE_PATH" "" ${ARGN})
+ add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src)
+ set(_cythonize_trgt "${_trgt}_cythonize_pyx")
+ python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
+ if(BUILD_DPCTL_EXT_SYCL)
+ add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+ target_compile_options(${_trgt} PRIVATE -fno-sycl-id-queries-fit-in-int)
+ target_link_options(${_trgt} PRIVATE -fsycl-device-code-split=per_kernel)
+ if(DPCTL_OFFLOAD_COMPRESS)
+ target_link_options(${_trgt} PRIVATE --offload-compress)
+ endif()
+ if(_dpctl_sycl_targets)
+ # make fat binary
+ target_compile_options(
+ ${_trgt}
+ PRIVATE ${_dpctl_sycl_target_compile_options}
+ )
+ target_link_options(${_trgt} PRIVATE ${_dpctl_sycl_target_link_options})
+ endif()
+ endif()
+ target_link_libraries(${_trgt} PRIVATE Python::NumPy)
+ if(DPCTL_GENERATE_COVERAGE)
+ target_compile_definitions(${_trgt} PRIVATE CYTHON_TRACE=1 CYTHON_TRACE_NOGIL=1)
+ if(BUILD_DPCTL_EXT_SYCL)
+ target_compile_options(${_trgt} PRIVATE -fno-sycl-use-footer)
+ endif()
+ endif()
+ # Dpctl
+ target_include_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR})
+ target_link_directories(${_trgt} PRIVATE ${Dpctl_INCLUDE_DIR}/..)
+ target_link_libraries(${_trgt} PRIVATE DPCTLSyclInterface)
+ set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
+ target_link_options(${_trgt} PRIVATE ${_linker_options})
+ get_filename_component(_name_wle ${_generated_src} NAME_WLE)
+ get_filename_component(_generated_src_dir ${_generated_src} DIRECTORY)
+ set(_generated_public_h "${_generated_src_dir}/${_name_wle}.h")
+ set(_generated_api_h "${_generated_src_dir}/${_name_wle}_api.h")
+
+ # TODO: create separate folder inside build folder that contains only
+ # headers related to this target and appropriate folder structure to
+ # eliminate shadow dependencies
+ get_filename_component(_generated_src_dir_dir ${_generated_src_dir} DIRECTORY)
+ # TODO: do not set directory if we did not generate header
+ target_include_directories(${_trgt} INTERFACE ${_generated_src_dir_dir})
+ set(_rpath_value "$ORIGIN")
+ if(BUILD_DPCTL_EXT_RELATIVE_PATH)
+ set(_rpath_value "${_rpath_value}/${BUILD_DPCTL_EXT_RELATIVE_PATH}")
+ endif()
+ if(DPCTL_WITH_REDIST)
+ set(_rpath_value "${_rpath_value}:${_rpath_value}/../../..")
+ endif()
+ set_target_properties(${_trgt} PROPERTIES INSTALL_RPATH ${_rpath_value})
+
+ install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest})
+ install(
+ FILES ${_generated_api_h}
+ DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest}
+ OPTIONAL
+ )
+ install(
+ FILES ${_generated_public_h}
+ DESTINATION ${CMAKE_INSTALL_PREFIX}/dpctl_ext/include/${_dest}
+ OPTIONAL
+ )
+ if(DPCTL_GENERATE_COVERAGE)
+ get_filename_component(_original_src_dir ${_src} DIRECTORY)
+ file(RELATIVE_PATH _rel_dir ${CMAKE_SOURCE_DIR} ${_original_src_dir})
+ install(FILES ${_generated_src} DESTINATION ${CMAKE_INSTALL_PREFIX}/${_rel_dir})
+ endif()
+
+ # Create target with headers only, because python is managing all the
+ # library imports at runtime
+ set(_trgt_headers ${_trgt}_headers)
+ add_library(${_trgt_headers} INTERFACE)
+ add_dependencies(${_trgt_headers} ${_trgt})
+ get_target_property(_trgt_headers_dir ${_trgt} INTERFACE_INCLUDE_DIRECTORIES)
+ target_include_directories(${_trgt_headers} INTERFACE ${_trgt_headers_dir})
+endfunction()
+
add_subdirectory(tensor)
diff --git a/dpctl_ext/tensor/CMakeLists.txt b/dpctl_ext/tensor/CMakeLists.txt
index 6f286a8d7198..8df593b0838d 100644
--- a/dpctl_ext/tensor/CMakeLists.txt
+++ b/dpctl_ext/tensor/CMakeLists.txt
@@ -29,6 +29,15 @@
find_package(Python COMPONENTS Development.Module)
+file(GLOB _cython_sources *.pyx)
+foreach(_cy_file ${_cython_sources})
+ get_filename_component(_trgt ${_cy_file} NAME_WLE)
+ build_dpctl_ext(${_trgt} ${_cy_file} "dpctl_ext/tensor" RELATIVE_PATH "..")
+ target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/include)
+ # target_link_libraries(DpctlCAPI INTERFACE ${_trgt}_headers)
+ target_link_libraries(DpctlExtCAPI INTERFACE ${_trgt}_headers)
+endforeach()
+
if(WIN32)
if(${CMAKE_VERSION} VERSION_LESS "3.27")
# this is a work-around for target_link_options inserting option after -link option, cause
@@ -338,6 +347,7 @@ foreach(python_module_name ${_py_trgts})
# dpctl4pybind11.hpp. It will allow to simplify dependency tree
# NOTE: dpctl C-API is resolved at runtime via Python
# target_link_libraries(${python_module_name} PRIVATE DpctlCAPI)
+ target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
if(DPNP_WITH_REDIST)
set_target_properties(
${python_module_name}
diff --git a/dpctl_ext/tensor/__init__.pxd b/dpctl_ext/tensor/__init__.pxd
new file mode 100644
index 000000000000..a4bcecfec1d1
--- /dev/null
+++ b/dpctl_ext/tensor/__init__.pxd
@@ -0,0 +1,36 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+""" This file declares the extension types and functions for the Cython API
+ implemented in _usmarray.pyx file.
+"""
+
+# distutils: language = c++
+# cython: language_level=3
+
+from ._usmarray cimport *
diff --git a/dpctl_ext/tensor/__init__.py b/dpctl_ext/tensor/__init__.py
index 7a6923169c1f..03980e194fd0 100644
--- a/dpctl_ext/tensor/__init__.py
+++ b/dpctl_ext/tensor/__init__.py
@@ -28,7 +28,9 @@
from ._accumulation import cumulative_logsumexp, cumulative_prod, cumulative_sum
+from ._array_api import __array_api_version__, __array_namespace_info__
from ._clip import clip
+from ._constants import e, inf, nan, newaxis, pi
from ._copy_utils import (
asnumpy,
astype,
@@ -53,6 +55,29 @@
zeros,
zeros_like,
)
+from ._data_types import (
+ bool,
+ complex64,
+ complex128,
+ dtype,
+ float16,
+ float32,
+ float64,
+ int8,
+ int16,
+ int32,
+ int64,
+ uint8,
+ uint16,
+ uint32,
+ uint64,
+)
+from ._device import Device
+from ._dldevice_conversions import (
+ dldevice_to_sycl_device,
+ sycl_device_to_dldevice,
+)
+from ._dlpack import from_dlpack
from ._elementwise_funcs import (
abs,
acos,
@@ -157,6 +182,13 @@
tile,
unstack,
)
+from ._print import (
+ get_print_options,
+ print_options,
+ set_print_options,
+ usm_ndarray_repr,
+ usm_ndarray_str,
+)
from ._reduction import (
argmax,
argmin,
@@ -168,6 +200,12 @@
reduce_hypot,
sum,
)
+
+# isort: off
+# placed here to avoid circular import
+from ._usmarray import DLDeviceType, usm_ndarray
+
+# isort: on
from ._reshape import reshape
from ._search_functions import where
from ._searchsorted import searchsorted
@@ -185,6 +223,32 @@
from ._utility_functions import all, any, diff
__all__ = [
+ "Device",
+ "DLDeviceType",
+ "usm_ndarray",
+ # data types
+ "bool",
+ "dtype",
+ "int8",
+ "uint8",
+ "int16",
+ "uint16",
+ "int32",
+ "uint32",
+ "int64",
+ "uint64",
+ "float16",
+ "float32",
+ "float64",
+ "complex64",
+ "complex128",
+ # constants
+ "e",
+ "inf",
+ "nan",
+ "newaxis",
+ "pi",
+ # functions
"abs",
"acos",
"acosh",
@@ -229,6 +293,7 @@
"cumulative_sum",
"diff",
"divide",
+ "dldevice_to_sycl_device",
"empty",
"empty_like",
"equal",
@@ -242,9 +307,11 @@
"flip",
"floor",
"floor_divide",
+ "from_dlpack",
"from_numpy",
"full",
"full_like",
+ "get_print_options",
"greater",
"greater_equal",
"hypot",
@@ -288,6 +355,7 @@
"place",
"positive",
"pow",
+ "print_options",
"prod",
"proj",
"put",
@@ -303,6 +371,7 @@
"round",
"rsqrt",
"searchsorted",
+ "set_print_options",
"sign",
"signbit",
"sin",
@@ -316,6 +385,7 @@
"subtract",
"sum",
"swapaxes",
+ "sycl_device_to_dldevice",
"take",
"take_along_axis",
"tan",
@@ -332,9 +402,13 @@
"unique_inverse",
"unique_values",
"unstack",
+ "usm_ndarray_repr",
+ "usm_ndarray_str",
"var",
"vecdot",
"where",
"zeros",
"zeros_like",
+ "__array_api_version__",
+ "__array_namespace_info__",
]
diff --git a/dpctl_ext/tensor/_accumulation.py b/dpctl_ext/tensor/_accumulation.py
index 2dfe9656e198..8628628f3bf8 100644
--- a/dpctl_ext/tensor/_accumulation.py
+++ b/dpctl_ext/tensor/_accumulation.py
@@ -27,12 +27,11 @@
# *****************************************************************************
import dpctl
-import dpctl.tensor as dpt
from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_accumulation_impl as tai
import dpctl_ext.tensor._tensor_impl as ti
@@ -82,7 +81,7 @@ def _accumulate_common(
perm = [i for i in range(nd) if i != axis] + [
axis,
]
- arr = dpt_ext.permute_dims(x, perm)
+ arr = dpt.permute_dims(x, perm)
q = x.sycl_queue
inp_dt = x.dtype
res_usm_type = x.usm_type
@@ -130,16 +129,16 @@ def _accumulate_common(
)
# permute out array dims if necessary
if a1 != nd:
- out = dpt_ext.permute_dims(out, perm)
+ out = dpt.permute_dims(out, perm)
orig_out = out
if ti._array_overlap(x, out) and implemented_types:
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_sh, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
)
if a1 != nd:
- out = dpt_ext.permute_dims(out, perm)
+ out = dpt.permute_dims(out, perm)
_manager = SequentialOrderManager[q]
depends = _manager.submitted_events
@@ -166,7 +165,7 @@ def _accumulate_common(
out = orig_out
else:
if _dtype_supported(res_dt, res_dt):
- tmp = dpt_ext.empty(
+ tmp = dpt.empty(
arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -191,18 +190,18 @@ def _accumulate_common(
_manager.add_event_pair(ht_e, acc_ev)
else:
buf_dt = _default_accumulation_type_fn(inp_dt, q)
- tmp = dpt_ext.empty(
+ tmp = dpt.empty(
arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
src=arr, dst=tmp, sycl_queue=q, depends=depends
)
_manager.add_event_pair(ht_e_cpy, cpy_e)
- tmp_res = dpt_ext.empty(
+ tmp_res = dpt.empty(
res_sh, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
)
if a1 != nd:
- tmp_res = dpt_ext.permute_dims(tmp_res, perm)
+ tmp_res = dpt.permute_dims(tmp_res, perm)
if not include_initial:
ht_e, acc_ev = _accumulate_fn(
src=tmp,
@@ -225,10 +224,10 @@ def _accumulate_common(
_manager.add_event_pair(ht_e_cpy2, cpy_e2)
if appended_axis:
- out = dpt_ext.squeeze(out)
+ out = dpt.squeeze(out)
if a1 != nd:
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- out = dpt_ext.permute_dims(out, inv_perm)
+ out = dpt.permute_dims(out, inv_perm)
return out
diff --git a/dpctl_ext/tensor/_array_api.py b/dpctl_ext/tensor/_array_api.py
new file mode 100644
index 000000000000..09f71bc1bdd3
--- /dev/null
+++ b/dpctl_ext/tensor/_array_api.py
@@ -0,0 +1,256 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import dpctl
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+
+from ._tensor_impl import (
+ default_device_complex_type,
+ default_device_fp_type,
+ default_device_index_type,
+ default_device_int_type,
+)
+
+
+def _isdtype_impl(dtype, kind):
+ if isinstance(kind, str):
+ if kind == "bool":
+ return dtype.kind == "b"
+ elif kind == "signed integer":
+ return dtype.kind == "i"
+ elif kind == "unsigned integer":
+ return dtype.kind == "u"
+ elif kind == "integral":
+ return dtype.kind in "iu"
+ elif kind == "real floating":
+ return dtype.kind == "f"
+ elif kind == "complex floating":
+ return dtype.kind == "c"
+ elif kind == "numeric":
+ return dtype.kind in "iufc"
+ else:
+ raise ValueError(f"Unrecognized data type kind: {kind}")
+
+ elif isinstance(kind, tuple):
+ return any(_isdtype_impl(dtype, k) for k in kind)
+ else:
+ raise TypeError(f"Unsupported type for dtype kind: {type(kind)}")
+
+
+def _get_device_impl(d):
+ if d is None:
+ return dpctl.select_default_device()
+ elif isinstance(d, dpctl.SyclDevice):
+ return d
+ elif isinstance(d, (dpt.Device, dpctl.SyclQueue)):
+ return d.sycl_device
+ else:
+ try:
+ return dpctl.SyclDevice(d)
+ except TypeError:
+ raise TypeError(f"Unsupported type for device argument: {type(d)}")
+
+
+__array_api_version__ = "2024.12"
+
+
+class Info:
+ """namespace returned by ``__array_namespace_info__()``"""
+
+ def __init__(self):
+ self._capabilities = {
+ "boolean indexing": True,
+ "data-dependent shapes": True,
+ "max dimensions": None,
+ }
+ self._all_dtypes = {
+ "bool": dpt.bool,
+ "float32": dpt.float32,
+ "float64": dpt.float64,
+ "complex64": dpt.complex64,
+ "complex128": dpt.complex128,
+ "int8": dpt.int8,
+ "int16": dpt.int16,
+ "int32": dpt.int32,
+ "int64": dpt.int64,
+ "uint8": dpt.uint8,
+ "uint16": dpt.uint16,
+ "uint32": dpt.uint32,
+ "uint64": dpt.uint64,
+ }
+
+ def capabilities(self):
+ """
+ capabilities()
+
+ Returns a dictionary of ``dpctl``'s capabilities.
+
+ The dictionary contains the following keys:
+ ``"boolean indexing"``:
+ boolean indicating ``dpctl``'s support of boolean indexing.
+ Value: ``True``
+ ``"data-dependent shapes"``:
+ boolean indicating ``dpctl``'s support of data-dependent shapes.
+ Value: ``True``
+ ``max dimensions``:
+ integer indicating the maximum array dimension supported by ``dpctl``.
+ Value: ``None``
+
+ Returns:
+ dict:
+ dictionary of ``dpctl``'s capabilities
+ """
+ return self._capabilities.copy()
+
+ def default_device(self):
+ """
+ default_device()
+
+ Returns the default SYCL device.
+ """
+ return dpctl.select_default_device()
+
+ def default_dtypes(self, *, device=None):
+ """
+ default_dtypes(*, device=None)
+
+ Returns a dictionary of default data types for ``device``.
+
+ Args:
+ device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+ array API concept of device used in getting default data types.
+ ``device`` can be ``None`` (in which case the default device
+ is used), an instance of :class:`dpctl.SyclDevice`, an instance
+ of :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+ object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+ a filter selector string.
+ Default: ``None``.
+
+ Returns:
+ dict:
+ a dictionary of default data types for ``device``:
+
+ - ``"real floating"``: dtype
+ - ``"complex floating"``: dtype
+ - ``"integral"``: dtype
+ - ``"indexing"``: dtype
+ """
+ device = _get_device_impl(device)
+ return {
+ "real floating": dpt.dtype(default_device_fp_type(device)),
+ "complex floating": dpt.dtype(default_device_complex_type(device)),
+ "integral": dpt.dtype(default_device_int_type(device)),
+ "indexing": dpt.dtype(default_device_index_type(device)),
+ }
+
+ def dtypes(self, *, device=None, kind=None):
+ """
+ dtypes(*, device=None, kind=None)
+
+ Returns a dictionary of all Array API data types of a specified
+ ``kind`` supported by ``device``.
+
+ This dictionary only includes data types supported by the
+ `Python Array API <https://data-apis.org/array-api/latest/>`_
+ specification.
+
+ Args:
+ device (Optional[:class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`, :class:`dpctl.tensor.Device`, str]):
+ array API concept of device used in getting default data types.
+ ``device`` can be ``None`` (in which case the default device is
+ used), an instance of :class:`dpctl.SyclDevice`, an instance of
+ :class:`dpctl.SyclQueue`, a :class:`dpctl.tensor.Device`
+ object returned by :attr:`dpctl.tensor.usm_ndarray.device`, or
+ a filter selector string.
+ Default: ``None``.
+
+ kind (Optional[str, Tuple[str, ...]]):
+ data type kind.
+
+ - if ``kind`` is ``None``, returns a dictionary of all data
+ types supported by `device`
+ - if ``kind`` is a string, returns a dictionary containing the
+ data types belonging to the data type kind specified.
+
+ Supports:
+
+ * ``"bool"``
+ * ``"signed integer"``
+ * ``"unsigned integer"``
+ * ``"integral"``
+ * ``"real floating"``
+ * ``"complex floating"``
+ * ``"numeric"``
+
+ - if ``kind`` is a tuple, the tuple represents a union of
+ ``kind`` strings, and returns a dictionary containing data
+ types corresponding to the specified union.
+
+ Default: ``None``.
+
+ Returns:
+ dict:
+ a dictionary of the supported data types of the specified
+ ``kind``
+ """
+ device = _get_device_impl(device)
+ _fp64 = device.has_aspect_fp64
+ if kind is None:
+ return {
+ key: val
+ for key, val in self._all_dtypes.items()
+ if _fp64 or (key != "float64" and key != "complex128")
+ }
+ else:
+ return {
+ key: val
+ for key, val in self._all_dtypes.items()
+ if (_fp64 or (key != "float64" and key != "complex128"))
+ and _isdtype_impl(val, kind)
+ }
+
+ def devices(self):
+ """
+ devices()
+
+ Returns a list of supported devices.
+ """
+ return dpctl.get_devices()
+
+
+def __array_namespace_info__():
+ """
+ __array_namespace_info__()
+
+ Returns a namespace with Array API namespace inspection utilities.
+
+ """
+ return Info()
diff --git a/dpctl_ext/tensor/_clip.py b/dpctl_ext/tensor/_clip.py
index c21d601966bd..8071f13bee19 100644
--- a/dpctl_ext/tensor/_clip.py
+++ b/dpctl_ext/tensor/_clip.py
@@ -27,12 +27,11 @@
# *****************************************************************************
import dpctl
-import dpctl.tensor as dpt
from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_elementwise_impl as tei
import dpctl_ext.tensor._tensor_impl as ti
@@ -163,7 +162,7 @@ def _clip_none(x, val, out, order, _binary_fn):
if ti._array_overlap(x, out):
if not ti._same_logical_tensors(x, out):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(val, dpt.usm_ndarray):
if (
@@ -171,12 +170,12 @@ def _clip_none(x, val, out, order, _binary_fn):
and not ti._same_logical_tensors(val, out)
and val_dtype == res_dt
):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(val, dpt.usm_ndarray):
val_ary = val
else:
- val_ary = dpt_ext.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
+ val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q)
if order == "A":
order = (
@@ -197,7 +196,7 @@ def _clip_none(x, val, out, order, _binary_fn):
x, val_ary, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -205,9 +204,9 @@ def _clip_none(x, val, out, order, _binary_fn):
order=order,
)
if x_shape != res_shape:
- x = dpt_ext.broadcast_to(x, res_shape)
+ x = dpt.broadcast_to(x, res_shape)
if val_ary.shape != res_shape:
- val_ary = dpt_ext.broadcast_to(val_ary, res_shape)
+ val_ary = dpt.broadcast_to(val_ary, res_shape)
_manager = SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
ht_binary_ev, binary_ev = _binary_fn(
@@ -229,7 +228,7 @@ def _clip_none(x, val, out, order, _binary_fn):
if order == "K":
buf = _empty_like_orderK(val_ary, res_dt)
else:
- buf = dpt_ext.empty_like(val_ary, dtype=res_dt, order=order)
+ buf = dpt.empty_like(val_ary, dtype=res_dt, order=order)
_manager = SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -242,7 +241,7 @@ def _clip_none(x, val, out, order, _binary_fn):
x, buf, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -251,8 +250,8 @@ def _clip_none(x, val, out, order, _binary_fn):
)
if x_shape != res_shape:
- x = dpt_ext.broadcast_to(x, res_shape)
- buf = dpt_ext.broadcast_to(buf, res_shape)
+ x = dpt.broadcast_to(x, res_shape)
+ buf = dpt.broadcast_to(buf, res_shape)
ht_binary_ev, binary_ev = _binary_fn(
src1=x,
src2=buf,
@@ -313,9 +312,9 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
if order not in ["K", "C", "F", "A"]:
order = "K"
if x.dtype.kind in "iu":
- if isinstance(min, int) and min <= dpt_ext.iinfo(x.dtype).min:
+ if isinstance(min, int) and min <= dpt.iinfo(x.dtype).min:
min = None
- if isinstance(max, int) and max >= dpt_ext.iinfo(x.dtype).max:
+ if isinstance(max, int) and max >= dpt.iinfo(x.dtype).max:
max = None
if min is None and max is None:
exec_q = x.sycl_queue
@@ -353,14 +352,14 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
if ti._array_overlap(x, out):
if not ti._same_logical_tensors(x, out):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
return out
else:
if order == "K":
out = _empty_like_orderK(x, x.dtype)
else:
- out = dpt_ext.empty_like(x, order=order)
+ out = dpt.empty_like(x, order=order)
_manager = SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
@@ -519,7 +518,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
if ti._array_overlap(x, out):
if not ti._same_logical_tensors(x, out):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(min, dpt.usm_ndarray):
if (
@@ -527,7 +526,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
and not ti._same_logical_tensors(min, out)
and buf1_dt is None
):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(max, dpt.usm_ndarray):
if (
@@ -535,16 +534,16 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
and not ti._same_logical_tensors(max, out)
and buf2_dt is None
):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(min, dpt.usm_ndarray):
a_min = min
else:
- a_min = dpt_ext.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
+ a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
if isinstance(max, dpt.usm_ndarray):
a_max = max
else:
- a_max = dpt_ext.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
+ a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
if order == "A":
order = (
@@ -572,7 +571,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
exec_q,
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -580,11 +579,11 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
order=order,
)
if x_shape != res_shape:
- x = dpt_ext.broadcast_to(x, res_shape)
+ x = dpt.broadcast_to(x, res_shape)
if a_min.shape != res_shape:
- a_min = dpt_ext.broadcast_to(a_min, res_shape)
+ a_min = dpt.broadcast_to(a_min, res_shape)
if a_max.shape != res_shape:
- a_max = dpt_ext.broadcast_to(a_max, res_shape)
+ a_max = dpt.broadcast_to(a_max, res_shape)
_manager = SequentialOrderManager[exec_q]
dep_ev = _manager.submitted_events
ht_binary_ev, binary_ev = ti._clip(
@@ -612,7 +611,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
if order == "K":
buf2 = _empty_like_orderK(a_max, buf2_dt)
else:
- buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order)
+ buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
_manager = SequentialOrderManager[exec_q]
dep_ev = _manager.submitted_events
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -631,7 +630,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
exec_q,
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -639,10 +638,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
order=order,
)
- x = dpt_ext.broadcast_to(x, res_shape)
+ x = dpt.broadcast_to(x, res_shape)
if a_min.shape != res_shape:
- a_min = dpt_ext.broadcast_to(a_min, res_shape)
- buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+ a_min = dpt.broadcast_to(a_min, res_shape)
+ buf2 = dpt.broadcast_to(buf2, res_shape)
ht_binary_ev, binary_ev = ti._clip(
src=x,
min=a_min,
@@ -668,7 +667,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
if order == "K":
buf1 = _empty_like_orderK(a_min, buf1_dt)
else:
- buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order)
+ buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
_manager = SequentialOrderManager[exec_q]
dep_ev = _manager.submitted_events
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -687,7 +686,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
exec_q,
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -695,10 +694,10 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
order=order,
)
- x = dpt_ext.broadcast_to(x, res_shape)
- buf1 = dpt_ext.broadcast_to(buf1, res_shape)
+ x = dpt.broadcast_to(x, res_shape)
+ buf1 = dpt.broadcast_to(buf1, res_shape)
if a_max.shape != res_shape:
- a_max = dpt_ext.broadcast_to(a_max, res_shape)
+ a_max = dpt.broadcast_to(a_max, res_shape)
ht_binary_ev, binary_ev = ti._clip(
src=x,
min=buf1,
@@ -736,7 +735,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
if order == "K":
buf1 = _empty_like_orderK(a_min, buf1_dt)
else:
- buf1 = dpt_ext.empty_like(a_min, dtype=buf1_dt, order=order)
+ buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order)
_manager = SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
@@ -747,7 +746,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
if order == "K":
buf2 = _empty_like_orderK(a_max, buf2_dt)
else:
- buf2 = dpt_ext.empty_like(a_max, dtype=buf2_dt, order=order)
+ buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order)
ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=a_max, dst=buf2, sycl_queue=exec_q, depends=dep_evs
)
@@ -758,7 +757,7 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -766,9 +765,9 @@ def clip(x, /, min=None, max=None, out=None, order="K"):
order=order,
)
- x = dpt_ext.broadcast_to(x, res_shape)
- buf1 = dpt_ext.broadcast_to(buf1, res_shape)
- buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+ x = dpt.broadcast_to(x, res_shape)
+ buf1 = dpt.broadcast_to(buf1, res_shape)
+ buf2 = dpt.broadcast_to(buf2, res_shape)
ht_, clip_ev = ti._clip(
src=x,
min=buf1,
diff --git a/dpctl_ext/tensor/_constants.py b/dpctl_ext/tensor/_constants.py
new file mode 100644
index 000000000000..4c134bd9d375
--- /dev/null
+++ b/dpctl_ext/tensor/_constants.py
@@ -0,0 +1,36 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numpy as np
+
+newaxis = None
+
+pi = np.pi
+e = np.e
+nan = np.nan
+inf = np.inf
diff --git a/dpctl_ext/tensor/_copy_utils.py b/dpctl_ext/tensor/_copy_utils.py
index 37879997b788..b056511ac33b 100644
--- a/dpctl_ext/tensor/_copy_utils.py
+++ b/dpctl_ext/tensor/_copy_utils.py
@@ -32,17 +32,16 @@
import dpctl
import dpctl.memory as dpm
-import dpctl.tensor as dpt
import dpctl.utils
import numpy as np
-from dpctl.tensor._data_types import _get_dtype
-from dpctl.tensor._device import normalize_queue_device
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
+from ._data_types import _get_dtype
+from ._device import normalize_queue_device
from ._numpy_helper import normalize_axis_index
from ._type_utils import _dtype_supported_by_device_impl
@@ -91,7 +90,7 @@ def _copy_from_numpy(np_ary, usm_type="device", sycl_queue=None):
)
else:
Xusm_dtype = dt
- Xusm = dpt_ext.empty(
+ Xusm = dpt.empty(
Xnp.shape, dtype=Xusm_dtype, usm_type=usm_type, sycl_queue=sycl_queue
)
_copy_from_numpy_into(Xusm, Xnp)
@@ -159,7 +158,7 @@ def _extract_impl(ary, ary_mask, axis=0):
elif isinstance(ary_mask, np.ndarray):
dst_usm_type = ary.usm_type
exec_q = ary.sycl_queue
- ary_mask = dpt_ext.asarray(
+ ary_mask = dpt.asarray(
ary_mask, usm_type=dst_usm_type, sycl_queue=exec_q
)
else:
@@ -176,7 +175,7 @@ def _extract_impl(ary, ary_mask, axis=0):
)
mask_nelems = ary_mask.size
cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
- cumsum = dpt_ext.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
+ cumsum = dpt.empty(mask_nelems, dtype=cumsum_dt, device=ary_mask.device)
exec_q = cumsum.sycl_queue
_manager = dpctl.utils.SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
@@ -184,7 +183,7 @@ def _extract_impl(ary, ary_mask, axis=0):
ary_mask, cumsum, sycl_queue=exec_q, depends=dep_evs
)
dst_shape = ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
- dst = dpt_ext.empty(
+ dst = dpt.empty(
dst_shape, dtype=ary.dtype, usm_type=dst_usm_type, device=ary.device
)
if dst.size == 0:
@@ -247,7 +246,7 @@ def _nonzero_impl(ary):
usm_type = ary.usm_type
mask_nelems = ary.size
cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
- cumsum = dpt_ext.empty(
+ cumsum = dpt.empty(
mask_nelems, dtype=cumsum_dt, sycl_queue=exec_q, order="C"
)
_manager = dpctl.utils.SequentialOrderManager[exec_q]
@@ -256,7 +255,7 @@ def _nonzero_impl(ary):
ary, cumsum, sycl_queue=exec_q, depends=dep_evs
)
indexes_dt = ti.default_device_index_type(exec_q.sycl_device)
- indexes = dpt_ext.empty(
+ indexes = dpt.empty(
(ary.ndim, mask_count),
dtype=indexes_dt,
usm_type=usm_type,
@@ -284,14 +283,14 @@ def _prepare_indices_arrays(inds, q, usm_type):
lambda ind: (
ind
if isinstance(ind, dpt.usm_ndarray)
- else dpt_ext.asarray(ind, usm_type=usm_type, sycl_queue=q)
+ else dpt.asarray(ind, usm_type=usm_type, sycl_queue=q)
),
inds,
)
)
# promote to a common integral type if possible
- ind_dt = dpt_ext.result_type(*inds)
+ ind_dt = dpt.result_type(*inds)
if ind_dt.kind not in "ui":
raise ValueError(
"cannot safely promote indices to an integer data type"
@@ -299,18 +298,122 @@ def _prepare_indices_arrays(inds, q, usm_type):
inds = tuple(
map(
lambda ind: (
- ind if ind.dtype == ind_dt else dpt_ext.astype(ind, ind_dt)
+ ind if ind.dtype == ind_dt else dpt.astype(ind, ind_dt)
),
inds,
)
)
# broadcast
- inds = dpt_ext.broadcast_arrays(*inds)
+ inds = dpt.broadcast_arrays(*inds)
return inds
+def _place_impl(ary, ary_mask, vals, axis=0):
+ """
+ Put values of vals into ary by applying mask starting from slot
+ dimension axis.
+ """
+ if not isinstance(ary, dpt.usm_ndarray):
+ raise TypeError(
+ f"Expecting type dpctl.tensor.usm_ndarray, got {type(ary)}"
+ )
+ if isinstance(ary_mask, dpt.usm_ndarray):
+ exec_q = dpctl.utils.get_execution_queue(
+ (
+ ary.sycl_queue,
+ ary_mask.sycl_queue,
+ )
+ )
+ coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+ (
+ ary.usm_type,
+ ary_mask.usm_type,
+ )
+ )
+ if exec_q is None:
+ raise dpctl.utils.ExecutionPlacementError(
+ "arrays have different associated queues. "
+ "Use `y.to_device(x.device)` to migrate."
+ )
+ elif isinstance(ary_mask, np.ndarray):
+ exec_q = ary.sycl_queue
+ coerced_usm_type = ary.usm_type
+ ary_mask = dpt.asarray(
+ ary_mask, usm_type=coerced_usm_type, sycl_queue=exec_q
+ )
+ else:
+ raise TypeError(
+ "Expecting type dpctl.tensor.usm_ndarray or numpy.ndarray, got "
+ f"{type(ary_mask)}"
+ )
+ if exec_q is not None:
+ if not isinstance(vals, dpt.usm_ndarray):
+ vals = dpt.asarray(
+ vals,
+ dtype=ary.dtype,
+ usm_type=coerced_usm_type,
+ sycl_queue=exec_q,
+ )
+ else:
+ exec_q = dpctl.utils.get_execution_queue((exec_q, vals.sycl_queue))
+ coerced_usm_type = dpctl.utils.get_coerced_usm_type(
+ (
+ coerced_usm_type,
+ vals.usm_type,
+ )
+ )
+ if exec_q is None:
+ raise dpctl.utils.ExecutionPlacementError(
+ "arrays have different associated queues. "
+ "Use `Y.to_device(X.device)` to migrate."
+ )
+ ary_nd = ary.ndim
+ pp = normalize_axis_index(operator.index(axis), ary_nd)
+ mask_nd = ary_mask.ndim
+ if pp < 0 or pp + mask_nd > ary_nd:
+ raise ValueError(
+ "Parameter p is inconsistent with input array dimensions"
+ )
+ mask_nelems = ary_mask.size
+ cumsum_dt = dpt.int32 if mask_nelems < int32_t_max else dpt.int64
+ cumsum = dpt.empty(
+ mask_nelems,
+ dtype=cumsum_dt,
+ usm_type=coerced_usm_type,
+ device=ary_mask.device,
+ )
+ exec_q = cumsum.sycl_queue
+ _manager = dpctl.utils.SequentialOrderManager[exec_q]
+ dep_ev = _manager.submitted_events
+ mask_count = ti.mask_positions(
+ ary_mask, cumsum, sycl_queue=exec_q, depends=dep_ev
+ )
+ expected_vals_shape = (
+ ary.shape[:pp] + (mask_count,) + ary.shape[pp + mask_nd :]
+ )
+ if vals.dtype == ary.dtype:
+ rhs = vals
+ else:
+ rhs = dpt.astype(vals, ary.dtype)
+ rhs = dpt.broadcast_to(rhs, expected_vals_shape)
+ if mask_nelems == 0:
+ return
+ dep_ev = _manager.submitted_events
+ hev, pl_ev = ti._place(
+ dst=ary,
+ cumsum=cumsum,
+ axis_start=pp,
+ axis_end=pp + mask_nd,
+ rhs=rhs,
+ sycl_queue=exec_q,
+ depends=dep_ev,
+ )
+ _manager.add_event_pair(hev, pl_ev)
+ return
+
+
def _put_multi_index(ary, inds, p, vals, mode=0):
if not isinstance(ary, dpt.usm_ndarray):
raise TypeError(
@@ -332,7 +435,7 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
if exec_q is not None:
if not isinstance(vals, dpt.usm_ndarray):
- vals = dpt_ext.asarray(
+ vals = dpt.asarray(
vals,
dtype=ary.dtype,
usm_type=coerced_usm_type,
@@ -367,8 +470,8 @@ def _put_multi_index(ary, inds, p, vals, mode=0):
if vals.dtype == ary.dtype:
rhs = vals
else:
- rhs = dpt_ext.astype(vals, ary.dtype)
- rhs = dpt_ext.broadcast_to(rhs, expected_vals_shape)
+ rhs = dpt.astype(vals, ary.dtype)
+ rhs = dpt.broadcast_to(rhs, expected_vals_shape)
_manager = dpctl.utils.SequentialOrderManager[exec_q]
dep_ev = _manager.submitted_events
hev, put_ev = ti._put(
@@ -418,7 +521,7 @@ def _take_multi_index(ary, inds, p, mode=0):
if 0 in ary_sh[p:p_end] and ind0.size != 0:
raise IndexError("cannot take non-empty indices from an empty axis")
res_shape = ary_sh[:p] + ind0.shape + ary_sh[p_end:]
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape, dtype=ary.dtype, usm_type=res_usm_type, sycl_queue=exec_q
)
_manager = dpctl.utils.SequentialOrderManager[exec_q]
@@ -681,9 +784,7 @@ def _make_empty_like_orderK(x, dt, usm_type, dev):
inv_perm = sorted(range(x.ndim), key=lambda i: perm[i])
sh = x.shape
sh_sorted = tuple(sh[i] for i in perm)
- R = dpt_ext.empty(
- sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
- )
+ R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
if min(st) < 0:
st_sorted = [st[i] for i in perm]
sl = tuple(
@@ -695,7 +796,7 @@ def _make_empty_like_orderK(x, dt, usm_type, dev):
for i in range(x.ndim)
)
R = R[sl]
- return dpt_ext.permute_dims(R, inv_perm)
+ return dpt.permute_dims(R, inv_perm)
def _empty_like_orderK(x, dt, usm_type=None, dev=None):
@@ -714,11 +815,11 @@ def _empty_like_orderK(x, dt, usm_type=None, dev=None):
dev = x.device
fl = x.flags
if fl["C"] or x.size <= 1:
- return dpt_ext.empty_like(
+ return dpt.empty_like(
x, dtype=dt, usm_type=usm_type, device=dev, order="C"
)
elif fl["F"]:
- return dpt_ext.empty_like(
+ return dpt.empty_like(
x, dtype=dt, usm_type=usm_type, device=dev, order="F"
)
return _make_empty_like_orderK(x, dt, usm_type, dev)
@@ -736,11 +837,11 @@ def _from_numpy_empty_like_orderK(x, dt, usm_type, dev):
raise TypeError(f"Expected numpy.ndarray, got {type(x)}")
fl = x.flags
if fl["C"] or x.size <= 1:
- return dpt_ext.empty(
+ return dpt.empty(
x.shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
)
elif fl["F"]:
- return dpt_ext.empty(
+ return dpt.empty(
x.shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
)
return _make_empty_like_orderK(x, dt, usm_type, dev)
@@ -760,11 +861,11 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
fl1 = X1.flags
fl2 = X2.flags
if fl1["C"] or fl2["C"]:
- return dpt_ext.empty(
+ return dpt.empty(
res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
)
if fl1["F"] and fl2["F"]:
- return dpt_ext.empty(
+ return dpt.empty(
res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
)
st1 = list(X1.strides)
@@ -787,9 +888,7 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
st2_sorted = [st2[i] for i in perm]
sh = res_shape
sh_sorted = tuple(sh[i] for i in perm)
- R = dpt_ext.empty(
- sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
- )
+ R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
if max(min(st1_sorted), min(st2_sorted)) < 0:
sl = tuple(
(
@@ -800,7 +899,7 @@ def _empty_like_pair_orderK(X1, X2, dt, res_shape, usm_type, dev):
for i in range(nd1)
)
R = R[sl]
- return dpt_ext.permute_dims(R, inv_perm)
+ return dpt.permute_dims(R, inv_perm)
def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
@@ -827,11 +926,11 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
fl2 = X2.flags
fl3 = X3.flags
if fl1["C"] or fl2["C"] or fl3["C"]:
- return dpt_ext.empty(
+ return dpt.empty(
res_shape, dtype=dt, usm_type=usm_type, device=dev, order="C"
)
if fl1["F"] and fl2["F"] and fl3["F"]:
- return dpt_ext.empty(
+ return dpt.empty(
res_shape, dtype=dt, usm_type=usm_type, device=dev, order="F"
)
st1 = list(X1.strides)
@@ -859,9 +958,7 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
st3_sorted = [st3[i] for i in perm]
sh = res_shape
sh_sorted = tuple(sh[i] for i in perm)
- R = dpt_ext.empty(
- sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C"
- )
+ R = dpt.empty(sh_sorted, dtype=dt, usm_type=usm_type, device=dev, order="C")
if max(min(st1_sorted), min(st2_sorted), min(st3_sorted)) < 0:
sl = tuple(
(
@@ -876,7 +973,7 @@ def _empty_like_triple_orderK(X1, X2, X3, dt, res_shape, usm_type, dev):
for i in range(nd1)
)
R = R[sl]
- return dpt_ext.permute_dims(R, inv_perm)
+ return dpt.permute_dims(R, inv_perm)
def copy(usm_ary, /, *, order="K"):
@@ -1019,7 +1116,7 @@ def astype(
else:
target_dtype = _get_dtype(newdtype, usm_ary.sycl_queue)
- if not dpt_ext.can_cast(ary_dtype, target_dtype, casting=casting):
+ if not dpt.can_cast(ary_dtype, target_dtype, casting=casting):
raise TypeError(
f"Can not cast from {ary_dtype} to {newdtype} "
f"according to rule {casting}."
diff --git a/dpctl_ext/tensor/_ctors.py b/dpctl_ext/tensor/_ctors.py
index 21c3d0077189..d249efa8a602 100644
--- a/dpctl_ext/tensor/_ctors.py
+++ b/dpctl_ext/tensor/_ctors.py
@@ -31,17 +31,16 @@
import dpctl
import dpctl.memory as dpm
-import dpctl.tensor as dpt
import dpctl.utils
import numpy as np
-from dpctl.tensor._data_types import _get_dtype
-from dpctl.tensor._device import normalize_queue_device
-from dpctl.tensor._usmarray import _is_object_with_buffer_protocol
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
+from dpctl_ext.tensor._data_types import _get_dtype
+from dpctl_ext.tensor._device import normalize_queue_device
+from dpctl_ext.tensor._usmarray import _is_object_with_buffer_protocol
from ._copy_utils import (
_empty_like_orderK,
@@ -182,7 +181,7 @@ def _asarray_from_seq(
if order in "KA":
order = "C"
if isinstance(exec_q, dpctl.SyclQueue):
- res = dpt_ext.empty(
+ res = dpt.empty(
seq_shape,
dtype=dtype,
usm_type=usm_type,
@@ -193,7 +192,7 @@ def _asarray_from_seq(
_device_copy_walker(seq_obj, res, _manager)
return res
else:
- res = dpt_ext.empty(
+ res = dpt.empty(
seq_shape,
dtype=dtype,
usm_type=usm_type,
@@ -312,7 +311,7 @@ def _asarray_from_usm_ndarray(
)
_manager.add_event_pair(hev, cpy_ev)
else:
- tmp = dpt_ext.asnumpy(usm_ndary)
+ tmp = dpt.asnumpy(usm_ndary)
res[...] = tmp
return res
@@ -361,7 +360,7 @@ def _copy_through_host_walker(seq_o, usm_res):
)
is None
):
- usm_res[...] = dpt_ext.asnumpy(seq_o).copy()
+ usm_res[...] = dpt.asnumpy(seq_o).copy()
return
else:
usm_res[...] = seq_o
@@ -381,7 +380,7 @@ def _copy_through_host_walker(seq_o, usm_res):
)
is None
):
- usm_res[...] = dpt_ext.asnumpy(usm_ar).copy()
+ usm_res[...] = dpt.asnumpy(usm_ar).copy()
else:
usm_res[...] = usm_ar
return
@@ -1092,7 +1091,7 @@ def eye(
n_cols = n_rows if n_cols is None else operator.index(n_cols)
k = operator.index(k)
if k >= n_cols or -k >= n_rows:
- return dpt_ext.zeros(
+ return dpt.zeros(
(n_rows, n_cols),
dtype=dtype,
order=order,
@@ -1194,14 +1193,14 @@ def full(
sycl_queue = normalize_queue_device(
sycl_queue=sycl_queue, device=device
)
- X = dpt_ext.asarray(
+ X = dpt.asarray(
fill_value,
dtype=dtype,
order=order,
usm_type=usm_type,
sycl_queue=sycl_queue,
)
- return dpt_ext.copy(dpt_ext.broadcast_to(X, shape), order=order)
+ return dpt.copy(dpt.broadcast_to(X, shape), order=order)
else:
_validate_fill_value(fill_value)
@@ -1301,14 +1300,14 @@ def full_like(
if order == "K":
_ensure_native_dtype_device_support(dtype, sycl_queue.sycl_device)
if isinstance(fill_value, (dpt.usm_ndarray, np.ndarray, tuple, list)):
- X = dpt_ext.asarray(
+ X = dpt.asarray(
fill_value,
dtype=dtype,
order=order,
usm_type=usm_type,
sycl_queue=sycl_queue,
)
- X = dpt_ext.broadcast_to(X, sh)
+ X = dpt.broadcast_to(X, sh)
res = _empty_like_orderK(x, dtype, usm_type, sycl_queue)
_manager = dpctl.utils.SequentialOrderManager[sycl_queue]
# order copy after tasks populating X
@@ -1434,14 +1433,14 @@ def linspace(
start = float(start)
stop = float(stop)
- res = dpt_ext.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue)
+ res = dpt.empty(num, dtype=dt, usm_type=usm_type, sycl_queue=sycl_queue)
_manager = dpctl.utils.SequentialOrderManager[sycl_queue]
hev, la_ev = ti._linspace_affine(
start, stop, dst=res, include_endpoint=endpoint, sycl_queue=sycl_queue
)
_manager.add_event_pair(hev, la_ev)
- return res if int_dt is None else dpt_ext.astype(res, int_dt)
+ return res if int_dt is None else dpt.astype(res, int_dt)
def meshgrid(*arrays, indexing="xy"):
@@ -1506,15 +1505,15 @@ def meshgrid(*arrays, indexing="xy"):
res = []
if n > 1 and indexing == "xy":
- res.append(dpt_ext.reshape(arrays[0], (1, -1) + sh[2:], copy=True))
- res.append(dpt_ext.reshape(arrays[1], sh, copy=True))
+ res.append(dpt.reshape(arrays[0], (1, -1) + sh[2:], copy=True))
+ res.append(dpt.reshape(arrays[1], sh, copy=True))
arrays, sh = arrays[2:], sh[-2:] + sh[:-2]
for array in arrays:
- res.append(dpt_ext.reshape(array, sh, copy=True))
+ res.append(dpt.reshape(array, sh, copy=True))
sh = sh[-1:] + sh[:-1]
- output = dpt_ext.broadcast_arrays(*res)
+ output = dpt.broadcast_arrays(*res)
return output
@@ -1707,7 +1706,7 @@ def tril(x, /, *, k=0):
q = x.sycl_queue
if k >= shape[nd - 1] - 1:
- res = dpt_ext.empty(
+ res = dpt.empty(
x.shape,
dtype=x.dtype,
order=order,
@@ -1721,7 +1720,7 @@ def tril(x, /, *, k=0):
)
_manager.add_event_pair(hev, cpy_ev)
elif k < -shape[nd - 2]:
- res = dpt_ext.zeros(
+ res = dpt.zeros(
x.shape,
dtype=x.dtype,
order=order,
@@ -1729,7 +1728,7 @@ def tril(x, /, *, k=0):
sycl_queue=q,
)
else:
- res = dpt_ext.empty(
+ res = dpt.empty(
x.shape,
dtype=x.dtype,
order=order,
@@ -1785,7 +1784,7 @@ def triu(x, /, *, k=0):
q = x.sycl_queue
if k > shape[nd - 1]:
- res = dpt_ext.zeros(
+ res = dpt.zeros(
x.shape,
dtype=x.dtype,
order=order,
@@ -1793,7 +1792,7 @@ def triu(x, /, *, k=0):
sycl_queue=q,
)
elif k <= -shape[nd - 2] + 1:
- res = dpt_ext.empty(
+ res = dpt.empty(
x.shape,
dtype=x.dtype,
order=order,
@@ -1807,7 +1806,7 @@ def triu(x, /, *, k=0):
)
_manager.add_event_pair(hev, cpy_ev)
else:
- res = dpt_ext.empty(
+ res = dpt.empty(
x.shape,
dtype=x.dtype,
order=order,
diff --git a/dpctl_ext/tensor/_data_types.py b/dpctl_ext/tensor/_data_types.py
new file mode 100644
index 000000000000..faf30ffdabd0
--- /dev/null
+++ b/dpctl_ext/tensor/_data_types.py
@@ -0,0 +1,104 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from numpy import bool_ as np_bool_
+from numpy import complexfloating as np_complexfloating
+from numpy import dtype
+from numpy import floating as np_floating
+from numpy import integer as np_integer
+from numpy import issubdtype as np_issubdtype
+
+from ._tensor_impl import (
+ default_device_bool_type as ti_default_device_bool_type,
+)
+from ._tensor_impl import (
+ default_device_complex_type as ti_default_device_complex_type,
+)
+from ._tensor_impl import default_device_fp_type as ti_default_device_fp_type
+from ._tensor_impl import default_device_int_type as ti_default_device_int_type
+
+bool = dtype("bool")
+int8 = dtype("int8")
+int16 = dtype("int16")
+int32 = dtype("int32")
+int64 = dtype("int64")
+uint8 = dtype("uint8")
+uint16 = dtype("uint16")
+uint32 = dtype("uint32")
+uint64 = dtype("uint64")
+float16 = dtype("float16")
+float32 = dtype("float32")
+float64 = dtype("float64")
+complex64 = dtype("complex64")
+complex128 = dtype("complex128")
+
+
+def _get_dtype(inp_dt, sycl_obj, ref_type=None):
+ """
+ Type inference utility to construct data type
+ object with defaults based on reference type.
+
+ _get_dtype is used by dpctl.tensor.asarray
+ to infer data type of the output array from the
+ input sequence.
+ """
+ if inp_dt is None:
+ if ref_type in [None, float] or np_issubdtype(ref_type, np_floating):
+ fp_dt = ti_default_device_fp_type(sycl_obj)
+ return dtype(fp_dt)
+ if ref_type in [bool, np_bool_]:
+ bool_dt = ti_default_device_bool_type(sycl_obj)
+ return dtype(bool_dt)
+ if ref_type is int or np_issubdtype(ref_type, np_integer):
+ int_dt = ti_default_device_int_type(sycl_obj)
+ return dtype(int_dt)
+ if ref_type is complex or np_issubdtype(ref_type, np_complexfloating):
+ cfp_dt = ti_default_device_complex_type(sycl_obj)
+ return dtype(cfp_dt)
+ raise TypeError(f"Reference type {ref_type} not recognized.")
+ return dtype(inp_dt)
+
+
+__all__ = [
+ "dtype",
+ "_get_dtype",
+ "bool",
+ "int8",
+ "uint8",
+ "int16",
+ "uint16",
+ "int32",
+ "uint32",
+ "int64",
+ "uint64",
+ "float16",
+ "float32",
+ "float64",
+ "complex64",
+ "complex128",
+]
diff --git a/dpctl_ext/tensor/_device.py b/dpctl_ext/tensor/_device.py
new file mode 100644
index 000000000000..8d763bc721e3
--- /dev/null
+++ b/dpctl_ext/tensor/_device.py
@@ -0,0 +1,195 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+
+import dpctl
+from dpctl._sycl_device_factory import _cached_default_device
+from dpctl._sycl_queue_manager import get_device_cached_queue
+
+__doc__ = "Implementation of array API mandated Device class"
+
+
+class Device:
+ """
+ An object representing Data-API concept of device.
+
+ This is a wrapper around :class:`dpctl.SyclQueue` with custom
+ formatting. The class does not have public constructor,
+ but a class method :meth:`dpctl.tensor.Device.create_device` to construct
+ it from `device` keyword argument in Array-API functions.
+
+ Instance can be queried for ``sycl_queue``, ``sycl_context``,
+ or ``sycl_device``.
+ """
+
+ __device_queue_map__ = {}
+ sycl_queue_ = None
+
+ def __new__(cls, *args, **kwargs):
+ raise TypeError("No public constructor")
+
+ @classmethod
+ def create_device(cls, device=None):
+ """Device.create_device(device=None)
+
+ Creates instance of Device from argument.
+
+ Args:
+ device:
+ Device specification, i.e. `None`, :class:`.Device`,
+ :class:`dpctl.SyclQueue`, or a :class:`dpctl.SyclDevice`
+ corresponding to a root SYCL device.
+ Raises:
+ ValueError: if an instance of :class:`dpctl.SyclDevice` corresponding
+ to a sub-device was specified as the argument
+ SyclQueueCreationError: if :class:`dpctl.SyclQueue` could not be
+ created from the argument
+ """
+ dev = device
+ obj = super().__new__(cls)
+ if isinstance(dev, Device):
+ obj.sycl_queue_ = dev.sycl_queue
+ elif isinstance(dev, dpctl.SyclQueue):
+ obj.sycl_queue_ = dev
+ elif isinstance(dev, dpctl.SyclDevice):
+ par = dev.parent_device
+ if par is None:
+ obj.sycl_queue_ = get_device_cached_queue(dev)
+ else:
+ raise ValueError(
+ f"Using non-root device {dev} to specify offloading "
+ "target is ambiguous. Please use dpctl.SyclQueue "
+ "targeting this device"
+ )
+ else:
+ if dev is None:
+ _dev = _cached_default_device()
+ else:
+ _dev = dpctl.SyclDevice(dev)
+ obj.sycl_queue_ = get_device_cached_queue(_dev)
+ return obj
+
+ @property
+ def sycl_queue(self):
+ """:class:`dpctl.SyclQueue` used to offload to this :class:`.Device`."""
+ return self.sycl_queue_
+
+ @property
+ def sycl_context(self):
+ """:class:`dpctl.SyclContext` associated with this :class:`.Device`."""
+ return self.sycl_queue_.sycl_context
+
+ @property
+ def sycl_device(self):
+ """:class:`dpctl.SyclDevice` targeted by this :class:`.Device`."""
+ return self.sycl_queue_.sycl_device
+
+ def __repr__(self):
+ try:
+ sd = self.sycl_device
+ except AttributeError as exc:
+ raise ValueError(
+ f"Instance of {self.__class__} is not initialized"
+ ) from exc
+ try:
+ fs = sd.filter_string
+ return f"Device({fs})"
+ except TypeError:
+ # This is a sub-device
+ return repr(self.sycl_queue)
+
+ def print_device_info(self):
+ """Outputs information about targeted SYCL device"""
+ self.sycl_device.print_device_info()
+
+ def wait(self):
+ """Call ``wait`` method of the underlying ``sycl_queue``."""
+ self.sycl_queue_.wait()
+
+ def __eq__(self, other):
+ """Equality comparison based on underlying ``sycl_queue``."""
+ if isinstance(other, Device):
+ return self.sycl_queue.__eq__(other.sycl_queue)
+ elif isinstance(other, dpctl.SyclQueue):
+ return self.sycl_queue.__eq__(other)
+ return False
+
+ def __hash__(self):
+ """Compute object's hash value."""
+ return self.sycl_queue.__hash__()
+
+
+def normalize_queue_device(sycl_queue=None, device=None):
+ """normalize_queue_device(sycl_queue=None, device=None)
+
+ Utility to process exclusive keyword arguments 'device'
+ and 'sycl_queue' in functions of `dpctl.tensor`.
+
+ Args:
+ sycl_queue (:class:`dpctl.SyclQueue`, optional):
+ explicitly indicates where USM allocation is done
+ and the population code (if any) is executed.
+ Value `None` is interpreted as get the SYCL queue
+ from `device` keyword, or use default queue.
+ Default: None
+ device (string, :class:`dpctl.SyclDevice`, :class:`dpctl.SyclQueue`,
+ :class:`dpctl.tensor.Device`, optional):
+ array-API keyword indicating non-partitioned SYCL device
+ where array is allocated.
+
+ Returns:
+ :class:`dpctl.SyclQueue` object implied by either of provided
+ keywords. If both are None, `dpctl.SyclQueue()` is returned.
+ If both are specified and imply the same queue, `sycl_queue`
+ is returned.
+
+ Raises:
+ TypeError: if argument is not of the expected type, or keywords
+ imply incompatible queues.
+ """
+ q = sycl_queue
+ d = device
+ if q is None:
+ d = Device.create_device(d)
+ return d.sycl_queue
+ if not isinstance(q, dpctl.SyclQueue):
+ raise TypeError(f"Expected dpctl.SyclQueue, got {type(q)}")
+ if d is None:
+ return q
+ d = Device.create_device(d)
+ qq = dpctl.utils.get_execution_queue(
+ (
+ q,
+ d.sycl_queue,
+ )
+ )
+ if qq is None:
+ raise TypeError(
+ "sycl_queue and device keywords can not be both specified"
+ )
+ return qq
diff --git a/dpctl_ext/tensor/_dldevice_conversions.py b/dpctl_ext/tensor/_dldevice_conversions.py
new file mode 100644
index 000000000000..595a280689a5
--- /dev/null
+++ b/dpctl_ext/tensor/_dldevice_conversions.py
@@ -0,0 +1,52 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+from dpctl._sycl_device import SyclDevice
+
+from ._usmarray import DLDeviceType
+
+
+def dldevice_to_sycl_device(dl_dev: tuple):
+ if isinstance(dl_dev, tuple):
+ if len(dl_dev) != 2:
+ raise ValueError("dldevice tuple must have length 2")
+ else:
+ raise TypeError(
+ f"dl_dev is expected to be a 2-tuple, got " f"{type(dl_dev)}"
+ )
+ if dl_dev[0] != DLDeviceType.kDLOneAPI:
+ raise ValueError("dldevice type must be kDLOneAPI")
+ return SyclDevice(str(dl_dev[1]))
+
+
+def sycl_device_to_dldevice(dev: SyclDevice):
+ if not isinstance(dev, SyclDevice):
+ raise TypeError(
+ "dev is expected to be a SyclDevice, got " f"{type(dev)}"
+ )
+ return (DLDeviceType.kDLOneAPI, dev.get_device_id())
diff --git a/dpctl_ext/tensor/_dlpack.pxd b/dpctl_ext/tensor/_dlpack.pxd
new file mode 100644
index 000000000000..75378bfa7a92
--- /dev/null
+++ b/dpctl_ext/tensor/_dlpack.pxd
@@ -0,0 +1,73 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+cdef extern from "numpy/npy_no_deprecated_api.h":
+ pass
+from dpctl._sycl_device cimport SyclDevice
+from numpy cimport ndarray
+
+from ._usmarray cimport usm_ndarray
+
+
+cdef extern from "dlpack/dlpack.h" nogil:
+ int device_CPU "kDLCPU"
+ int device_CUDA "kDLCUDA"
+ int device_CUDAHost "kDLCUDAHost"
+ int device_CUDAManaged "kDLCUDAManaged"
+ int device_DLROCM "kDLROCM"
+ int device_ROCMHost "kDLROCMHost"
+ int device_OpenCL "kDLOpenCL"
+ int device_Vulkan "kDLVulkan"
+ int device_Metal "kDLMetal"
+ int device_VPI "kDLVPI"
+ int device_OneAPI "kDLOneAPI"
+ int device_WebGPU "kDLWebGPU"
+ int device_Hexagon "kDLHexagon"
+ int device_MAIA "kDLMAIA"
+ int device_Trn "kDLTrn"
+
+cpdef object to_dlpack_capsule(usm_ndarray array) except +
+cpdef object to_dlpack_versioned_capsule(
+ usm_ndarray array, bint copied
+) except +
+cpdef object numpy_to_dlpack_versioned_capsule(
+ ndarray array, bint copied
+) except +
+cpdef object from_dlpack_capsule(object dltensor) except +
+
+cdef class DLPackCreationError(Exception):
+ """
+    A DLPackCreationError exception is raised when constructing
+ DLPack capsule from `usm_ndarray` based on a USM allocation
+ on a partitioned SYCL device.
+ """
+ pass
diff --git a/dpctl_ext/tensor/_dlpack.pyx b/dpctl_ext/tensor/_dlpack.pyx
new file mode 100644
index 000000000000..fde4415b7425
--- /dev/null
+++ b/dpctl_ext/tensor/_dlpack.pyx
@@ -0,0 +1,1245 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+cdef extern from "numpy/npy_no_deprecated_api.h":
+ pass
+
+cimport cpython
+cimport dpctl as c_dpctl
+cimport dpctl.memory as c_dpmem
+from dpctl._backend cimport (
+ DPCTLDevice_Delete,
+ DPCTLDevice_GetParentDevice,
+ DPCTLSyclDeviceRef,
+ DPCTLSyclUSMRef,
+)
+from dpctl._sycl_queue_manager cimport get_device_cached_queue
+from libc cimport stdlib
+from libc.stdint cimport int64_t, uint8_t, uint16_t, uint32_t, uint64_t
+from numpy cimport ndarray
+
+from ._usmarray cimport (
+ USM_ARRAY_C_CONTIGUOUS,
+ USM_ARRAY_F_CONTIGUOUS,
+ USM_ARRAY_WRITABLE,
+ usm_ndarray,
+)
+
+import ctypes
+
+import dpctl
+import dpctl.memory as dpmem
+import numpy as np
+
+from ._device import Device
+
+
+cdef extern from "dlpack/dlpack.h" nogil:
+ cdef int DLPACK_MAJOR_VERSION
+
+ cdef int DLPACK_MINOR_VERSION
+
+ cdef int DLPACK_FLAG_BITMASK_READ_ONLY
+
+ cdef int DLPACK_FLAG_BITMASK_IS_COPIED
+
+ ctypedef struct DLPackVersion:
+ uint32_t major
+ uint32_t minor
+
+ cdef enum DLDeviceType:
+ kDLCPU
+ kDLCUDA
+ kDLCUDAHost
+ kDLCUDAManaged
+ kDLROCM
+ kDLROCMHost
+ kDLOpenCL
+ kDLVulkan
+ kDLMetal
+ kDLVPI
+ kDLOneAPI
+ kDLWebGPU
+ kDLHexagon
+ kDLMAIA
+ kDLTrn
+
+ ctypedef struct DLDevice:
+ DLDeviceType device_type
+ int device_id
+
+ cdef enum DLDataTypeCode:
+ kDLInt
+ kDLUInt
+ kDLFloat
+ kDLBfloat
+ kDLComplex
+ kDLBool
+ kDLFloat8_e3m4
+ kDLFloat8_e4m3
+ kDLFloat8_e4m3b11fnuz
+ kDLFloat8_e4m3fn
+ kDLFloat8_e4m3fnuz
+ kDLFloat8_e5m2
+ kDLFloat8_e5m2fnuz
+ kDLFloat8_e8m0fnu
+ kDLFloat6_e2m3fn
+ kDLFloat6_e3m2fn
+ kDLFloat4_e2m1fn
+
+ ctypedef struct DLDataType:
+ uint8_t code
+ uint8_t bits
+ uint16_t lanes
+
+ ctypedef struct DLTensor:
+ void *data
+ DLDevice device
+ int ndim
+ DLDataType dtype
+ int64_t *shape
+ int64_t *strides
+ uint64_t byte_offset
+
+ ctypedef struct DLManagedTensor:
+ DLTensor dl_tensor
+ void *manager_ctx
+ void (*deleter)(DLManagedTensor *) # noqa: E211
+
+ ctypedef struct DLManagedTensorVersioned:
+ DLPackVersion version
+ void *manager_ctx
+ void (*deleter)(DLManagedTensorVersioned *) # noqa: E211
+ uint64_t flags
+ DLTensor dl_tensor
+
+
+def get_build_dlpack_version():
+ """
+ Returns a tuple of integers representing the `major` and `minor`
+ version of DLPack :module:`dpctl.tensor` was built with.
+ This tuple can be passed as the `max_version` argument to
+    `__dlpack__` to guarantee :module:`dpctl.tensor` can properly
+    consume the capsule.
+
+ Returns:
+ Tuple[int, int]
+ A tuple of integers representing the `major` and `minor`
+ version of DLPack used to build :module:`dpctl.tensor`.
+ """
+ return (DLPACK_MAJOR_VERSION, DLPACK_MINOR_VERSION)
+
+
+cdef void _pycapsule_deleter(object dlt_capsule) noexcept:
+ cdef DLManagedTensor *dlm_tensor = NULL
+ if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor"):
+ dlm_tensor = cpython.PyCapsule_GetPointer(
+ dlt_capsule, "dltensor")
+ dlm_tensor.deleter(dlm_tensor)
+
+
+cdef void _managed_tensor_deleter(
+ DLManagedTensor *dlm_tensor
+) noexcept with gil:
+ if dlm_tensor is not NULL:
+ # we only delete shape, because we make single allocation to
+ # accommodate both shape and strides if strides are needed
+ stdlib.free(dlm_tensor.dl_tensor.shape)
+ cpython.Py_DECREF(dlm_tensor.manager_ctx)
+ dlm_tensor.manager_ctx = NULL
+ stdlib.free(dlm_tensor)
+
+
+cdef void _pycapsule_versioned_deleter(object dlt_capsule) noexcept:
+ cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+ if cpython.PyCapsule_IsValid(dlt_capsule, "dltensor_versioned"):
+ dlmv_tensor = cpython.PyCapsule_GetPointer(
+ dlt_capsule, "dltensor_versioned")
+ dlmv_tensor.deleter(dlmv_tensor)
+
+
+cdef void _managed_tensor_versioned_deleter(
+ DLManagedTensorVersioned *dlmv_tensor
+) noexcept with gil:
+ if dlmv_tensor is not NULL:
+ # we only delete shape, because we make single allocation to
+ # accommodate both shape and strides if strides are needed
+ stdlib.free(dlmv_tensor.dl_tensor.shape)
+ cpython.Py_DECREF(dlmv_tensor.manager_ctx)
+ dlmv_tensor.manager_ctx = NULL
+ stdlib.free(dlmv_tensor)
+
+
+cdef object _get_default_context(c_dpctl.SyclDevice dev):
+ try:
+ default_context = dev.sycl_platform.default_context
+ except RuntimeError:
+ # RT does not support default_context
+ default_context = None
+
+ return default_context
+
+cdef int get_array_dlpack_device_id(
+ usm_ndarray usm_ary
+) except -1:
+ """Finds ordinal number of the parent of device where array
+ was allocated.
+ """
+ cdef c_dpctl.SyclQueue ary_sycl_queue
+ cdef c_dpctl.SyclDevice ary_sycl_device
+ cdef DPCTLSyclDeviceRef pDRef = NULL
+ cdef int device_id = -1
+
+ ary_sycl_queue = usm_ary.get_sycl_queue()
+ ary_sycl_device = ary_sycl_queue.get_sycl_device()
+
+ default_context = _get_default_context(ary_sycl_device)
+ if default_context is None:
+ # check that ary_sycl_device is a non-partitioned device
+ pDRef = DPCTLDevice_GetParentDevice(ary_sycl_device.get_device_ref())
+ if pDRef is not NULL:
+ DPCTLDevice_Delete(pDRef)
+ raise DLPackCreationError(
+ "to_dlpack_capsule: DLPack can only export arrays allocated "
+ "on non-partitioned SYCL devices on platforms where "
+ "default_context oneAPI extension is not supported."
+ )
+ else:
+ if not usm_ary.sycl_context == default_context:
+ raise DLPackCreationError(
+ "to_dlpack_capsule: DLPack can only export arrays based on USM "
+ "allocations bound to a default platform SYCL context"
+ )
+ device_id = ary_sycl_device.get_device_id()
+
+ if device_id < 0:
+ raise DLPackCreationError(
+ "get_array_dlpack_device_id: failed to determine device_id"
+ )
+
+ return device_id
+
+
+cpdef to_dlpack_capsule(usm_ndarray usm_ary):
+ """
+ to_dlpack_capsule(usm_ary)
+
+ Constructs named Python capsule object referencing
+ instance of ``DLManagedTensor`` from
+ :class:`dpctl.tensor.usm_ndarray` instance.
+
+ Args:
+ usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+ Returns:
+ A new capsule with name ``"dltensor"`` that contains
+ a pointer to ``DLManagedTensor`` struct.
+ Raises:
+        DLPackCreationError: when array cannot be represented as
+ DLPack tensor. This may happen when array was allocated
+ on a partitioned sycl device, or its USM allocation is
+ not bound to the platform default SYCL context.
+        MemoryError: when host allocation needed for ``DLManagedTensor``
+ did not succeed.
+ ValueError: when array elements data type could not be represented
+ in ``DLManagedTensor``.
+ """
+ cdef DLManagedTensor *dlm_tensor = NULL
+ cdef DLTensor *dl_tensor = NULL
+ cdef int nd = usm_ary.get_ndim()
+ cdef char *data_ptr = usm_ary.get_data()
+ cdef Py_ssize_t *shape_ptr = NULL
+ cdef Py_ssize_t *strides_ptr = NULL
+ cdef int64_t *shape_strides_ptr = NULL
+ cdef int i = 0
+ cdef int device_id = -1
+ cdef int flags = 0
+ cdef Py_ssize_t element_offset = 0
+ cdef Py_ssize_t byte_offset = 0
+ cdef Py_ssize_t si = 1
+
+ ary_base = usm_ary.get_base()
+
+ device_id = get_array_dlpack_device_id(usm_ary)
+
+ dlm_tensor = stdlib.malloc(
+ sizeof(DLManagedTensor))
+ if dlm_tensor is NULL:
+ raise MemoryError(
+ "to_dlpack_capsule: Could not allocate memory for DLManagedTensor"
+ )
+ if nd > 0:
+ shape_strides_ptr = stdlib.malloc((sizeof(int64_t) * 2) * nd)
+ if shape_strides_ptr is NULL:
+ stdlib.free(dlm_tensor)
+ raise MemoryError(
+ "to_dlpack_capsule: Could not allocate memory for shape/strides"
+ )
+ shape_ptr = usm_ary.get_shape()
+ for i in range(nd):
+ shape_strides_ptr[i] = shape_ptr[i]
+ strides_ptr = usm_ary.get_strides()
+ flags = usm_ary.flags_
+ if strides_ptr:
+ for i in range(nd):
+ shape_strides_ptr[nd + i] = strides_ptr[i]
+ else:
+ if flags & USM_ARRAY_C_CONTIGUOUS:
+ si = 1
+ for i in range(nd - 1, -1, -1):
+ shape_strides_ptr[nd + i] = si
+ si = si * shape_ptr[i]
+ elif flags & USM_ARRAY_F_CONTIGUOUS:
+ si = 1
+ for i in range(0, nd):
+ shape_strides_ptr[nd + i] = si
+ si = si * shape_ptr[i]
+ else:
+ stdlib.free(shape_strides_ptr)
+ stdlib.free(dlm_tensor)
+ raise BufferError(
+ "to_dlpack_capsule: Invalid array encountered "
+ "when building strides"
+ )
+
+ strides_ptr = &shape_strides_ptr[nd]
+
+ ary_dt = usm_ary.dtype
+ ary_dtk = ary_dt.kind
+ element_offset = usm_ary.get_offset()
+ byte_offset = element_offset * (ary_dt.itemsize)
+
+ dl_tensor = &dlm_tensor.dl_tensor
+ dl_tensor.data = (data_ptr - byte_offset)
+ dl_tensor.ndim = nd
+ dl_tensor.byte_offset = byte_offset
+ dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+ dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+ dl_tensor.device.device_type = kDLOneAPI
+ dl_tensor.device.device_id = device_id
+ dl_tensor.dtype.lanes = 1
+ dl_tensor.dtype.bits = (ary_dt.itemsize * 8)
+ if (ary_dtk == "b"):
+ dl_tensor.dtype.code = kDLBool
+ elif (ary_dtk == "u"):
+ dl_tensor.dtype.code = kDLUInt
+ elif (ary_dtk == "i"):
+ dl_tensor.dtype.code = kDLInt
+ elif (ary_dtk == "f"):
+ dl_tensor.dtype.code = kDLFloat
+ elif (ary_dtk == "c"):
+ dl_tensor.dtype.code = kDLComplex
+ else:
+ stdlib.free(shape_strides_ptr)
+ stdlib.free(dlm_tensor)
+ raise ValueError("Unrecognized array data type")
+
+ dlm_tensor.manager_ctx = ary_base
+ cpython.Py_INCREF(ary_base)
+ dlm_tensor.deleter = _managed_tensor_deleter
+
+ return cpython.PyCapsule_New(dlm_tensor, "dltensor", _pycapsule_deleter)
+
+
+cpdef to_dlpack_versioned_capsule(usm_ndarray usm_ary, bint copied):
+ """
+ to_dlpack_versioned_capsule(usm_ary, copied)
+
+ Constructs named Python capsule object referencing
+ instance of ``DLManagedTensorVersioned`` from
+ :class:`dpctl.tensor.usm_ndarray` instance.
+
+ Args:
+ usm_ary: An instance of :class:`dpctl.tensor.usm_ndarray`
+ copied: A bint representing whether the data was previously
+ copied in order to set the flags with the is-copied
+ bitmask.
+ Returns:
+ A new capsule with name ``"dltensor_versioned"`` that
+ contains a pointer to ``DLManagedTensorVersioned`` struct.
+ Raises:
+        DLPackCreationError: when array cannot be represented as
+ DLPack tensor. This may happen when array was allocated
+ on a partitioned sycl device, or its USM allocation is
+ not bound to the platform default SYCL context.
+        MemoryError: when host allocation needed for
+ ``DLManagedTensorVersioned`` did not succeed.
+ ValueError: when array elements data type could not be represented
+ in ``DLManagedTensorVersioned``.
+ """
+ cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+ cdef DLTensor *dl_tensor = NULL
+ cdef uint32_t dlmv_flags = 0
+ cdef int nd = usm_ary.get_ndim()
+ cdef char *data_ptr = usm_ary.get_data()
+ cdef Py_ssize_t *shape_ptr = NULL
+ cdef Py_ssize_t *strides_ptr = NULL
+ cdef int64_t *shape_strides_ptr = NULL
+ cdef int i = 0
+ cdef int device_id = -1
+ cdef int flags = 0
+ cdef Py_ssize_t element_offset = 0
+ cdef Py_ssize_t byte_offset = 0
+ cdef Py_ssize_t si = 1
+
+ ary_base = usm_ary.get_base()
+
+ # Find ordinal number of the parent device
+ device_id = get_array_dlpack_device_id(usm_ary)
+
+ dlmv_tensor = stdlib.malloc(
+ sizeof(DLManagedTensorVersioned))
+ if dlmv_tensor is NULL:
+ raise MemoryError(
+ "to_dlpack_versioned_capsule: Could not allocate memory "
+ "for DLManagedTensorVersioned"
+ )
+ if nd > 0:
+ shape_strides_ptr = stdlib.malloc((sizeof(int64_t) * 2) * nd)
+ if shape_strides_ptr is NULL:
+ stdlib.free(dlmv_tensor)
+ raise MemoryError(
+ "to_dlpack_versioned_capsule: Could not allocate memory "
+ "for shape/strides"
+ )
+ # this can be a separate function for handling shapes and strides
+ shape_ptr = usm_ary.get_shape()
+ for i in range(nd):
+ shape_strides_ptr[i] = shape_ptr[i]
+ strides_ptr = usm_ary.get_strides()
+ flags = usm_ary.flags_
+ if strides_ptr:
+ for i in range(nd):
+ shape_strides_ptr[nd + i] = strides_ptr[i]
+ else:
+ if flags & USM_ARRAY_C_CONTIGUOUS:
+ si = 1
+ for i in range(nd - 1, -1, -1):
+ shape_strides_ptr[nd + i] = si
+ si = si * shape_ptr[i]
+ elif flags & USM_ARRAY_F_CONTIGUOUS:
+ si = 1
+ for i in range(0, nd):
+ shape_strides_ptr[nd + i] = si
+ si = si * shape_ptr[i]
+ else:
+ stdlib.free(shape_strides_ptr)
+ stdlib.free(dlmv_tensor)
+ raise BufferError(
+ "to_dlpack_versioned_capsule: Invalid array encountered "
+ "when building strides"
+ )
+
+ strides_ptr = &shape_strides_ptr[nd]
+
+ # this can all be a function for building the dl_tensor
+ # object (separate from dlm/dlmv)
+ ary_dt = usm_ary.dtype
+ ary_dtk = ary_dt.kind
+ element_offset = usm_ary.get_offset()
+ byte_offset = element_offset * (ary_dt.itemsize)
+
+ dl_tensor = &dlmv_tensor.dl_tensor
+ dl_tensor.data = (data_ptr - byte_offset)
+ dl_tensor.ndim = nd
+ dl_tensor.byte_offset = byte_offset
+ dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+ dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+ dl_tensor.device.device_type = kDLOneAPI
+ dl_tensor.device.device_id = device_id
+ dl_tensor.dtype.lanes = 1
+ dl_tensor.dtype.bits = (ary_dt.itemsize * 8)
+ if (ary_dtk == "b"):
+ dl_tensor.dtype.code = kDLBool
+ elif (ary_dtk == "u"):
+ dl_tensor.dtype.code = kDLUInt
+ elif (ary_dtk == "i"):
+ dl_tensor.dtype.code = kDLInt
+ elif (ary_dtk == "f"):
+ dl_tensor.dtype.code = kDLFloat
+ elif (ary_dtk == "c"):
+ dl_tensor.dtype.code = kDLComplex
+ else:
+ stdlib.free(shape_strides_ptr)
+ stdlib.free(dlmv_tensor)
+ raise ValueError("Unrecognized array data type")
+
+ # set flags down here
+ if copied:
+ dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
+ if not (flags & USM_ARRAY_WRITABLE):
+ dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
+ dlmv_tensor.flags = dlmv_flags
+
+ dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
+ dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
+
+ dlmv_tensor.manager_ctx = ary_base
+ cpython.Py_INCREF(ary_base)
+ dlmv_tensor.deleter = _managed_tensor_versioned_deleter
+
+ return cpython.PyCapsule_New(
+ dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
+ )
+
+
+cpdef numpy_to_dlpack_versioned_capsule(ndarray npy_ary, bint copied):
+ """
+    numpy_to_dlpack_versioned_capsule(npy_ary, copied)
+
+ Constructs named Python capsule object referencing
+ instance of ``DLManagedTensorVersioned`` from
+ :class:`numpy.ndarray` instance.
+
+ Args:
+ npy_ary: An instance of :class:`numpy.ndarray`
+ copied: A bint representing whether the data was previously
+ copied in order to set the flags with the is-copied
+ bitmask.
+ Returns:
+ A new capsule with name ``"dltensor_versioned"`` that
+ contains a pointer to ``DLManagedTensorVersioned`` struct.
+ Raises:
+        DLPackCreationError: when array cannot be represented as
+            a DLPack tensor.
+        MemoryError: when host allocation needed for
+ ``DLManagedTensorVersioned`` did not succeed.
+ ValueError: when array elements data type could not be represented
+ in ``DLManagedTensorVersioned``.
+ """
+ cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+ cdef DLTensor *dl_tensor = NULL
+ cdef uint32_t dlmv_flags = 0
+ cdef int nd = npy_ary.ndim
+ cdef int64_t *shape_strides_ptr = NULL
+ cdef int i = 0
+ cdef Py_ssize_t byte_offset = 0
+ cdef int itemsize = npy_ary.itemsize
+
+ dlmv_tensor = stdlib.malloc(
+ sizeof(DLManagedTensorVersioned))
+ if dlmv_tensor is NULL:
+ raise MemoryError(
+ "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+ "for DLManagedTensorVersioned"
+ )
+
+ shape = npy_ary.ctypes.shape_as(ctypes.c_int64)
+ strides = npy_ary.ctypes.strides_as(ctypes.c_int64)
+ if nd > 0:
+ if npy_ary.size != 1:
+ for i in range(nd):
+ if shape[i] != 1 and strides[i] % itemsize != 0:
+ stdlib.free(dlmv_tensor)
+ raise BufferError(
+ "numpy_to_dlpack_versioned_capsule: DLPack cannot "
+ "encode an array if strides are not a multiple of "
+ "itemsize"
+ )
+ shape_strides_ptr = stdlib.malloc((sizeof(int64_t) * 2) * nd)
+ if shape_strides_ptr is NULL:
+ stdlib.free(dlmv_tensor)
+ raise MemoryError(
+ "numpy_to_dlpack_versioned_capsule: Could not allocate memory "
+ "for shape/strides"
+ )
+ for i in range(nd):
+ shape_strides_ptr[i] = shape[i]
+ shape_strides_ptr[nd + i] = strides[i] // itemsize
+
+ writable_flag = npy_ary.flags["W"]
+
+ ary_dt = npy_ary.dtype
+ ary_dtk = ary_dt.kind
+
+ dl_tensor = &dlmv_tensor.dl_tensor
+ dl_tensor.data = npy_ary.data
+ dl_tensor.ndim = nd
+ dl_tensor.byte_offset = byte_offset
+ dl_tensor.shape = &shape_strides_ptr[0] if nd > 0 else NULL
+ dl_tensor.strides = &shape_strides_ptr[nd] if nd > 0 else NULL
+ dl_tensor.device.device_type = kDLCPU
+ dl_tensor.device.device_id = 0
+ dl_tensor.dtype.lanes = 1
+ dl_tensor.dtype.bits = (ary_dt.itemsize * 8)
+ if (ary_dtk == "b"):
+ dl_tensor.dtype.code = kDLBool
+ elif (ary_dtk == "u"):
+ dl_tensor.dtype.code = kDLUInt
+ elif (ary_dtk == "i"):
+ dl_tensor.dtype.code = kDLInt
+ elif (ary_dtk == "f" and ary_dt.itemsize <= 8):
+ dl_tensor.dtype.code = kDLFloat
+ elif (ary_dtk == "c" and ary_dt.itemsize <= 16):
+ dl_tensor.dtype.code = kDLComplex
+ else:
+ stdlib.free(shape_strides_ptr)
+ stdlib.free(dlmv_tensor)
+ raise ValueError("Unrecognized array data type")
+
+ # set flags down here
+ if copied:
+ dlmv_flags |= DLPACK_FLAG_BITMASK_IS_COPIED
+ if not writable_flag:
+ dlmv_flags |= DLPACK_FLAG_BITMASK_READ_ONLY
+ dlmv_tensor.flags = dlmv_flags
+
+ dlmv_tensor.version.major = DLPACK_MAJOR_VERSION
+ dlmv_tensor.version.minor = DLPACK_MINOR_VERSION
+
+ dlmv_tensor.manager_ctx = npy_ary
+ cpython.Py_INCREF(npy_ary)
+ dlmv_tensor.deleter = _managed_tensor_versioned_deleter
+
+ return cpython.PyCapsule_New(
+ dlmv_tensor, "dltensor_versioned", _pycapsule_versioned_deleter
+ )
+
+
+cdef class _DLManagedTensorOwner:
+ """
+ Helper class managing the lifetime of the DLManagedTensor struct
+ transferred from a 'dlpack' capsule.
+ """
+ cdef DLManagedTensor * dlm_tensor
+
+ def __cinit__(self):
+ self.dlm_tensor = NULL
+
+ def __dealloc__(self):
+ if self.dlm_tensor:
+ self.dlm_tensor.deleter(self.dlm_tensor)
+ self.dlm_tensor = NULL
+
+ @staticmethod
+ cdef _DLManagedTensorOwner _create(DLManagedTensor *dlm_tensor_src):
+ cdef _DLManagedTensorOwner res
+ res = _DLManagedTensorOwner.__new__(_DLManagedTensorOwner)
+ res.dlm_tensor = dlm_tensor_src
+ return res
+
+
+cdef class _DLManagedTensorVersionedOwner:
+ """
+ Helper class managing the lifetime of the DLManagedTensorVersioned
+ struct transferred from a 'dlpack_versioned' capsule.
+ """
+ cdef DLManagedTensorVersioned * dlmv_tensor
+
+ def __cinit__(self):
+ self.dlmv_tensor = NULL
+
+ def __dealloc__(self):
+ if self.dlmv_tensor:
+ self.dlmv_tensor.deleter(self.dlmv_tensor)
+ self.dlmv_tensor = NULL
+
+ @staticmethod
+ cdef _DLManagedTensorVersionedOwner _create(
+ DLManagedTensorVersioned *dlmv_tensor_src
+ ):
+ cdef _DLManagedTensorVersionedOwner res
+ res = _DLManagedTensorVersionedOwner.__new__(
+ _DLManagedTensorVersionedOwner
+ )
+ res.dlmv_tensor = dlmv_tensor_src
+ return res
+
+
+cdef dict _numpy_array_interface_from_dl_tensor(DLTensor *dlt, bint ro_flag):
+ """Constructs a NumPy `__array_interface__` dictionary from a DLTensor."""
+ cdef int itemsize = 0
+
+ if dlt.dtype.lanes != 1:
+ raise BufferError(
+ "Can not import DLPack tensor with lanes != 1"
+ )
+ itemsize = dlt.dtype.bits // 8
+ shape = list()
+ if (dlt.strides is NULL):
+ strides = None
+ for dim in range(dlt.ndim):
+ shape.append(dlt.shape[dim])
+ else:
+ strides = list()
+ for dim in range(dlt.ndim):
+ shape.append(dlt.shape[dim])
+ # convert to byte-strides
+ strides.append(dlt.strides[dim] * itemsize)
+ strides = tuple(strides)
+ shape = tuple(shape)
+ if (dlt.dtype.code == kDLUInt):
+ ary_dt = "u" + str(itemsize)
+ elif (dlt.dtype.code == kDLInt):
+ ary_dt = "i" + str(itemsize)
+ elif (dlt.dtype.code == kDLFloat):
+ ary_dt = "f" + str(itemsize)
+ elif (dlt.dtype.code == kDLComplex):
+ ary_dt = "c" + str(itemsize)
+ elif (dlt.dtype.code == kDLBool):
+ ary_dt = "b" + str(itemsize)
+ else:
+ raise BufferError(
+ "Can not import DLPack tensor with type code {}.".format(
+ dlt.dtype.code
+ )
+ )
+ typestr = "|" + ary_dt
+ return dict(
+ version=3,
+ shape=shape,
+ strides=strides,
+ data=( dlt.data, True if ro_flag else False),
+ offset=dlt.byte_offset,
+ typestr=typestr,
+ )
+
+
+class _numpy_array_interface_wrapper:
+ """
+ Class that wraps a Python capsule and dictionary for consumption by NumPy.
+
+ Implementation taken from
+ https://github.com/dmlc/dlpack/blob/main/apps/numpy_dlpack/dlpack/to_numpy.py
+
+ Args:
+ array_interface:
+ A dictionary describing the underlying memory. Formatted
+ to match `numpy.ndarray.__array_interface__`.
+
+ pycapsule:
+ A Python capsule wrapping the dlpack tensor that will be
+ converted to numpy.
+ """
+
+ def __init__(self, array_interface, memory_owner) -> None:
+ self.__array_interface__ = array_interface
+ self._memory_owner = memory_owner
+
+
+cdef bint _is_kdlcpu_device(DLDevice *dev):
+ "Check if DLTensor.DLDevice denotes (kDLCPU, 0)"
+ return (dev[0].device_type == kDLCPU) and (dev[0].device_id == 0)
+
+
+cpdef object from_dlpack_capsule(object py_caps):
+ """
+ from_dlpack_capsule(py_caps)
+
+ Reconstructs instance of :class:`dpctl.tensor.usm_ndarray` from
+ named Python capsule object referencing instance of ``DLManagedTensor``
+ without copy. The instance forms a view in the memory of the tensor.
+
+ Args:
+ caps:
+ Python capsule with name ``"dltensor"`` expected to reference
+ an instance of ``DLManagedTensor`` struct.
+ Returns:
+ Instance of :class:`dpctl.tensor.usm_ndarray` with a view into
+ memory of the tensor. Capsule is renamed to ``"used_dltensor"``
+ upon success.
+ Raises:
+ TypeError:
+ if argument is not a ``"dltensor"`` capsule.
+ ValueError:
+ if argument is ``"used_dltensor"`` capsule
+ BufferError:
+ if the USM pointer is not bound to the reconstructed
+ sycl context, or the DLPack's device_type is not supported
+ by :mod:`dpctl`.
+ """
+ cdef DLManagedTensorVersioned *dlmv_tensor = NULL
+ cdef DLManagedTensor *dlm_tensor = NULL
+ cdef DLTensor *dl_tensor = NULL
+ cdef int versioned = 0
+ cdef int readonly = 0
+ cdef bytes usm_type
+ cdef size_t sz = 1
+ cdef size_t alloc_sz = 1
+ cdef int i
+ cdef int device_id = -1
+ cdef int element_bytesize = 0
+ cdef Py_ssize_t offset_min = 0
+ cdef Py_ssize_t offset_max = 0
+ cdef char *mem_ptr = NULL
+ cdef Py_ssize_t mem_ptr_delta = 0
+ cdef Py_ssize_t element_offset = 0
+ cdef int64_t stride_i = -1
+ cdef int64_t shape_i = -1
+
+ if cpython.PyCapsule_IsValid(py_caps, "dltensor"):
+ dlm_tensor = cpython.PyCapsule_GetPointer(
+ py_caps, "dltensor")
+ dl_tensor = &dlm_tensor.dl_tensor
+ elif cpython.PyCapsule_IsValid(py_caps, "dltensor_versioned"):
+ dlmv_tensor = cpython.PyCapsule_GetPointer(
+ py_caps, "dltensor_versioned")
+ if dlmv_tensor.version.major > DLPACK_MAJOR_VERSION:
+ raise BufferError(
+ "Can not import DLPack tensor with major version "
+ f"greater than {DLPACK_MAJOR_VERSION}"
+ )
+ versioned = 1
+ readonly = (dlmv_tensor.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0
+ dl_tensor = &dlmv_tensor.dl_tensor
+ elif (
+ cpython.PyCapsule_IsValid(py_caps, "used_dltensor")
+ or cpython.PyCapsule_IsValid(py_caps, "used_dltensor_versioned")
+ ):
+ raise ValueError(
+ "A DLPack tensor object can not be consumed multiple times"
+ )
+ else:
+ raise TypeError(
+ "`from_dlpack_capsule` expects a Python 'dltensor' capsule"
+ )
+
+ # Verify that we can work with this device
+ if dl_tensor.device.device_type == kDLOneAPI:
+ device_id = dl_tensor.device.device_id
+ root_device = dpctl.SyclDevice(str(device_id))
+ try:
+ default_context = root_device.sycl_platform.default_context
+ except RuntimeError:
+ default_context = get_device_cached_queue(root_device).sycl_context
+ if dl_tensor.data is NULL:
+ usm_type = b"device"
+ q = get_device_cached_queue((default_context, root_device,))
+ else:
+ usm_type = c_dpmem._Memory.get_pointer_type(
+ dl_tensor.data,
+ default_context)
+ if usm_type == b"unknown":
+ raise BufferError(
+ "Data pointer in DLPack is not bound to default sycl "
+ f"context of device '{device_id}', translated to "
+ f"{root_device.filter_string}"
+ )
+ alloc_device = c_dpmem._Memory.get_pointer_device(
+ dl_tensor.data,
+ default_context
+ )
+ q = get_device_cached_queue((default_context, alloc_device,))
+ if dl_tensor.dtype.bits % 8:
+ raise BufferError(
+ "Can not import DLPack tensor whose element's "
+ "bitsize is not a multiple of 8"
+ )
+ if dl_tensor.dtype.lanes != 1:
+ raise BufferError(
+ "Can not import DLPack tensor with lanes != 1"
+ )
+ if dl_tensor.ndim > 0:
+ offset_min = 0
+ offset_max = 0
+ for i in range(dl_tensor.ndim):
+ stride_i = dl_tensor.strides[i]
+ shape_i = dl_tensor.shape[i]
+ if shape_i > 1:
+ shape_i -= 1
+ if stride_i > 0:
+ offset_max = offset_max + stride_i * shape_i
+ else:
+ offset_min = offset_min + stride_i * shape_i
+ sz = offset_max - offset_min + 1
+ if sz == 0:
+ sz = 1
+
+ element_bytesize = (dl_tensor.dtype.bits // 8)
+ sz = sz * element_bytesize
+ element_offset = dl_tensor.byte_offset // element_bytesize
+
+ # transfer ownership
+ if not versioned:
+ dlm_holder = _DLManagedTensorOwner._create(dlm_tensor)
+ cpython.PyCapsule_SetName(py_caps, "used_dltensor")
+ else:
+ dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor)
+ cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned")
+
+ if dl_tensor.data is NULL:
+ usm_mem = dpmem.MemoryUSMDevice(sz, q)
+ else:
+ mem_ptr_delta = dl_tensor.byte_offset - (
+ element_offset * element_bytesize
+ )
+ mem_ptr = dl_tensor.data
+ alloc_sz = dl_tensor.byte_offset + (
+ (offset_max + 1) * element_bytesize)
+ tmp = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+ mem_ptr,
+ max(alloc_sz, element_bytesize),
+ (q).get_queue_ref(),
+ memory_owner=dlmv_holder if versioned else dlm_holder
+ )
+ if mem_ptr_delta == 0:
+ usm_mem = tmp
+ else:
+ alloc_sz = dl_tensor.byte_offset + (
+ (offset_max * element_bytesize + mem_ptr_delta))
+ usm_mem = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+ (
+ mem_ptr + (element_bytesize - mem_ptr_delta)
+ ),
+ max(alloc_sz, element_bytesize),
+ (q).get_queue_ref(),
+ memory_owner=tmp
+ )
+
+ py_shape = list()
+ if (dl_tensor.shape is not NULL):
+ for i in range(dl_tensor.ndim):
+ py_shape.append(dl_tensor.shape[i])
+ if (dl_tensor.strides is not NULL):
+ py_strides = list()
+ for i in range(dl_tensor.ndim):
+ py_strides.append(dl_tensor.strides[i])
+ else:
+ py_strides = None
+ if (dl_tensor.dtype.code == kDLUInt):
+ ary_dt = np.dtype("u" + str(element_bytesize))
+ elif (dl_tensor.dtype.code == kDLInt):
+ ary_dt = np.dtype("i" + str(element_bytesize))
+ elif (dl_tensor.dtype.code == kDLFloat):
+ ary_dt = np.dtype("f" + str(element_bytesize))
+ elif (dl_tensor.dtype.code == kDLComplex):
+ ary_dt = np.dtype("c" + str(element_bytesize))
+ elif (dl_tensor.dtype.code == kDLBool):
+ ary_dt = np.dtype("?")
+ else:
+ raise BufferError(
+ "Can not import DLPack tensor with type code {}.".format(
+ dl_tensor.dtype.code
+ )
+ )
+ res_ary = usm_ndarray(
+ py_shape,
+ dtype=ary_dt,
+ buffer=usm_mem,
+ strides=py_strides,
+ offset=element_offset
+ )
+ if readonly:
+ res_ary.flags_ = (res_ary.flags_ & ~USM_ARRAY_WRITABLE)
+ return res_ary
+ elif _is_kdlcpu_device(&dl_tensor.device):
+ ary_iface = _numpy_array_interface_from_dl_tensor(dl_tensor, readonly)
+ if not versioned:
+ dlm_holder = _DLManagedTensorOwner._create(dlm_tensor)
+ cpython.PyCapsule_SetName(py_caps, "used_dltensor")
+ return np.ctypeslib.as_array(
+ _numpy_array_interface_wrapper(ary_iface, dlm_holder)
+ )
+ else:
+ dlmv_holder = _DLManagedTensorVersionedOwner._create(dlmv_tensor)
+ cpython.PyCapsule_SetName(py_caps, "used_dltensor_versioned")
+ return np.ctypeslib.as_array(
+ _numpy_array_interface_wrapper(ary_iface, dlmv_holder)
+ )
+ else:
+ raise BufferError(
+ "The DLPack tensor resides on unsupported device."
+ )
+
+cdef usm_ndarray _to_usm_ary_from_host_blob(object host_blob, dev : Device):
+ q = dev.sycl_queue
+ np_ary = np.asarray(host_blob)
+ dt = np_ary.dtype
+ if dt.char in "dD" and q.sycl_device.has_aspect_fp64 is False:
+ Xusm_dtype = (
+ "float32" if dt.char == "d" else "complex64"
+ )
+ else:
+ Xusm_dtype = dt
+ usm_mem = dpmem.MemoryUSMDevice(np_ary.nbytes, queue=q)
+ usm_ary = usm_ndarray(np_ary.shape, dtype=Xusm_dtype, buffer=usm_mem)
+ usm_mem.copy_from_host(np.reshape(np_ary.view(dtype="u1"), -1))
+ return usm_ary
+
+
+# only cdef to make it private
+cdef object _create_device(object device, object dl_device):
+ if isinstance(device, Device):
+ return device
+ elif isinstance(device, dpctl.SyclDevice):
+ return Device.create_device(device)
+ else:
+ root_device = dpctl.SyclDevice(str(dl_device[1]))
+ return Device.create_device(root_device)
+
+
+def from_dlpack(x, /, *, device=None, copy=None):
+ """from_dlpack(x, /, *, device=None, copy=None)
+
+ Constructs :class:`dpctl.tensor.usm_ndarray` or :class:`numpy.ndarray`
+ instance from a Python object ``x`` that implements ``__dlpack__`` protocol.
+
+ Args:
+ x (object):
+ A Python object representing an array that supports
+ ``__dlpack__`` protocol.
+ device (
+ Optional[str, :class:`dpctl.SyclDevice`,
+ :class:`dpctl.SyclQueue`,
+ :class:`dpctl.tensor.Device`,
+ tuple([:class:`enum.IntEnum`, int])])
+ ):
+ Device where the output array is to be placed. ``device`` keyword
+ values can be:
+
+ * ``None``
+ The data remains on the same device.
+ * oneAPI filter selector string
+            SYCL device selected by :ref:`filter selector string
+            <filter_selector_string>`.
+ * :class:`dpctl.SyclDevice`
+ explicit SYCL device that must correspond to
+ a non-partitioned SYCL device.
+ * :class:`dpctl.SyclQueue`
+ implies SYCL device targeted by the SYCL queue.
+ * :class:`dpctl.tensor.Device`
+ implies SYCL device `device.sycl_queue`. The `Device` object
+ is obtained via :attr:`dpctl.tensor.usm_ndarray.device`.
+ * ``(device_type, device_id)``
+ 2-tuple matching the format of the output of the
+ ``__dlpack_device__`` method: an integer enumerator representing
+ the device type followed by an integer representing the index of
+ the device. The only supported :class:`dpctl.tensor.DLDeviceType`
+ device types are ``"kDLCPU"`` and ``"kDLOneAPI"``.
+
+ Default: ``None``.
+
+ copy (bool, optional)
+ Boolean indicating whether or not to copy the input.
+
+ * If ``copy`` is ``True``, the input will always be
+ copied.
+ * If ``False``, a ``BufferError`` will be raised if a
+ copy is deemed necessary.
+ * If ``None``, a copy will be made only if deemed
+ necessary, otherwise, the existing memory buffer will
+ be reused.
+
+ Default: ``None``.
+
+ Returns:
+ Alternative[usm_ndarray, numpy.ndarray]:
+ An array containing the data in ``x``. When ``copy`` is
+ ``None`` or ``False``, this may be a view into the original
+ memory.
+
+ The type of the returned object
+ depends on where the data backing up input object ``x`` resides.
+ If it resides in a USM allocation on a SYCL device, the
+ type :class:`dpctl.tensor.usm_ndarray` is returned, otherwise if it
+ resides on ``"kDLCPU"`` device the type is :class:`numpy.ndarray`,
+ and otherwise an exception is raised.
+
+ .. note::
+
+ If the return type is :class:`dpctl.tensor.usm_ndarray`, the
+ associated SYCL queue is derived from the ``device`` keyword.
+ When ``device`` keyword value has type :class:`dpctl.SyclQueue`,
+ the explicit queue instance is used, when ``device`` keyword
+ value has type :class:`dpctl.tensor.Device`, the
+ ``device.sycl_queue`` is used. In all other cases, the cached
+ SYCL queue corresponding to the implied SYCL device is used.
+
+ Raises:
+ TypeError:
+ if ``x`` does not implement ``__dlpack__`` method
+ ValueError:
+ if data of the input object resides on an unsupported device
+
+ See https://dmlc.github.io/dlpack/latest/ for more details.
+
+ :Example:
+
+ .. code-block:: python
+
+ import dpctl
+ import dpctl.tensor as dpt
+
+ class Container:
+ "Helper class implementing `__dlpack__` protocol"
+ def __init__(self, array):
+ self._array = array
+
+ def __dlpack__(self, stream=None):
+ return self._array.__dlpack__(stream=stream)
+
+ def __dlpack_device__(self):
+ return self._array.__dlpack_device__()
+
+ C = Container(dpt.linspace(0, 100, num=20, dtype="int16"))
+ # create usm_ndarray view
+ X = dpt.from_dlpack(C)
+ # migrate content of the container to device of type kDLCPU
+ Y = dpt.from_dlpack(C, device=(dpt.DLDeviceType.kDLCPU, 0))
+
+ """
+ dlpack_attr = getattr(x, "__dlpack__", None)
+ dlpack_dev_attr = getattr(x, "__dlpack_device__", None)
+ if not callable(dlpack_attr) or not callable(dlpack_dev_attr):
+ raise TypeError(
+ f"The argument of type {type(x)} does not implement "
+ "`__dlpack__` and `__dlpack_device__` methods."
+ )
+ # device is converted to a dlpack_device if necessary
+ dl_device = None
+ if device:
+ if isinstance(device, tuple):
+ dl_device = device
+ if len(dl_device) != 2:
+ raise ValueError(
+ "Argument `device` specified as a tuple must have length 2"
+ )
+ else:
+ if not isinstance(device, dpctl.SyclDevice):
+ device = Device.create_device(device)
+ d = device.sycl_device
+ else:
+ d = device
+ dl_device = (device_OneAPI, d.get_device_id())
+ if dl_device is not None:
+ if (dl_device[0] not in [device_OneAPI, device_CPU]):
+ raise ValueError(
+ f"Argument `device`={device} is not supported."
+ )
+ got_type_error = False
+ got_buffer_error = False
+ got_other_error = False
+ saved_exception = None
+ # First DLPack version supporting dl_device, and copy
+ requested_ver = (1, 0)
+ cpu_dev = (device_CPU, 0)
+ try:
+ # setting max_version to minimal version that supports
+ # dl_device/copy keywords
+ dlpack_capsule = dlpack_attr(
+ max_version=requested_ver,
+ dl_device=dl_device,
+ copy=copy
+ )
+ except TypeError:
+ # exporter does not support max_version keyword
+ got_type_error = True
+ except (BufferError, NotImplementedError, ValueError) as e:
+ # Either dl_device, or copy cannot be satisfied
+ got_buffer_error = True
+ saved_exception = e
+ except Exception as e:
+ got_other_error = True
+ saved_exception = e
+ else:
+ # execution did not raise exceptions
+ return from_dlpack_capsule(dlpack_capsule)
+ finally:
+ if got_type_error:
+ # max_version/dl_device, copy keywords are not supported
+ # by __dlpack__
+ x_dldev = dlpack_dev_attr()
+ if (dl_device is None) or (dl_device == x_dldev):
+ dlpack_capsule = dlpack_attr()
+ return from_dlpack_capsule(dlpack_capsule)
+ # must copy via host
+ if copy is False:
+ raise BufferError(
+ "Importing data via DLPack requires copying, but "
+ "copy=False was provided"
+ )
+ # when max_version/dl_device/copy are not supported
+ # we can only support importing to OneAPI devices
+ # from host, or from another oneAPI device
+ is_supported_x_dldev = (
+ x_dldev == cpu_dev or
+ (x_dldev[0] == device_OneAPI)
+ )
+ is_supported_dl_device = (
+ dl_device == cpu_dev or
+ dl_device[0] == device_OneAPI
+ )
+ if is_supported_x_dldev and is_supported_dl_device:
+ dlpack_capsule = dlpack_attr()
+ blob = from_dlpack_capsule(dlpack_capsule)
+ else:
+ raise BufferError(
+ f"Can not import to requested device {dl_device}"
+ )
+ dev = _create_device(device, dl_device)
+ if x_dldev == cpu_dev and dl_device == cpu_dev:
+ # both source and destination are CPU
+ return blob
+ elif x_dldev == cpu_dev:
+ # source is CPU, destination is oneAPI
+ return _to_usm_ary_from_host_blob(blob, dev)
+ elif dl_device == cpu_dev:
+ # source is oneAPI, destination is CPU
+ cpu_caps = blob.__dlpack__(
+ max_version=get_build_dlpack_version(),
+ dl_device=cpu_dev
+ )
+ return from_dlpack_capsule(cpu_caps)
+ else:
+ # TODO: revert to `import dpctl.tensor`
+ # when dpnp fully migrates dpctl/tensor
+ import dpctl_ext.tensor as dpt
+ return dpt.asarray(blob, device=dev)
+ elif got_buffer_error:
+ # we are here, because dlpack_attr could not deal with requested
+ # dl_device, or copying was required
+ if copy is False:
+ raise BufferError(
+ "Importing data via DLPack requires copying, but "
+ "copy=False was provided"
+ )
+ if dl_device is None:
+ raise saved_exception
+ # must copy via host
+ if dl_device[0] != device_OneAPI:
+ raise BufferError(
+ f"Can not import to requested device {dl_device}"
+ )
+ x_dldev = dlpack_dev_attr()
+ if x_dldev == cpu_dev:
+ dlpack_capsule = dlpack_attr()
+ host_blob = from_dlpack_capsule(dlpack_capsule)
+ else:
+ dlpack_capsule = dlpack_attr(
+ max_version=requested_ver,
+ dl_device=cpu_dev,
+ copy=copy
+ )
+ host_blob = from_dlpack_capsule(dlpack_capsule)
+ dev = _create_device(device, dl_device)
+ return _to_usm_ary_from_host_blob(host_blob, dev)
+ elif got_other_error:
+ raise saved_exception
diff --git a/dpctl_ext/tensor/_elementwise_common.py b/dpctl_ext/tensor/_elementwise_common.py
index 7fd9dabf9614..ffe849db9cad 100644
--- a/dpctl_ext/tensor/_elementwise_common.py
+++ b/dpctl_ext/tensor/_elementwise_common.py
@@ -27,12 +27,11 @@
# *****************************************************************************
import dpctl
-import dpctl.tensor as dpt
from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
from ._copy_utils import _empty_like_orderK, _empty_like_pair_orderK
@@ -233,7 +232,7 @@ def __call__(self, x, /, *, out=None, order="K"):
# Allocate a temporary buffer to avoid memory overlapping.
# Note if `buf_dt` is not None, a temporary copy of `x` will be
# created, so the array overlap check isn't needed.
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if (
dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
@@ -252,7 +251,7 @@ def __call__(self, x, /, *, out=None, order="K"):
else:
if order == "A":
order = "F" if x.flags.f_contiguous else "C"
- out = dpt_ext.empty_like(x, dtype=res_dt, order=order)
+ out = dpt.empty_like(x, dtype=res_dt, order=order)
dep_evs = _manager.submitted_events
ht_unary_ev, unary_ev = self.unary_fn_(
@@ -275,7 +274,7 @@ def __call__(self, x, /, *, out=None, order="K"):
else:
if order == "A":
order = "F" if x.flags.f_contiguous else "C"
- buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
+ buf = dpt.empty_like(x, dtype=buf_dt, order=order)
dep_evs = _manager.submitted_events
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -286,7 +285,7 @@ def __call__(self, x, /, *, out=None, order="K"):
if order == "K":
out = _empty_like_orderK(buf, res_dt)
else:
- out = dpt_ext.empty_like(buf, dtype=res_dt, order=order)
+ out = dpt.empty_like(buf, dtype=res_dt, order=order)
ht, uf_ev = self.unary_fn_(
buf, out, sycl_queue=exec_q, depends=[copy_ev]
@@ -597,7 +596,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
if isinstance(o1, dpt.usm_ndarray):
if ti._array_overlap(o1, out) and buf1_dt is None:
if not ti._same_logical_tensors(o1, out):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
elif self.binary_inplace_fn_ is not None:
# if there is a dedicated in-place kernel
# it can be called here, otherwise continues
@@ -610,12 +609,12 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
):
buf2_dt = o2_dtype
else:
- src2 = dpt_ext.asarray(
+ src2 = dpt.asarray(
o2, dtype=o2_dtype, sycl_queue=exec_q
)
if buf2_dt is None:
if src2.shape != res_shape:
- src2 = dpt_ext.broadcast_to(src2, res_shape)
+ src2 = dpt.broadcast_to(src2, res_shape)
dep_evs = _manager.submitted_events
ht_, comp_ev = self.binary_inplace_fn_(
lhs=o1,
@@ -625,7 +624,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
)
_manager.add_event_pair(ht_, comp_ev)
else:
- buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt)
+ buf2 = dpt.empty_like(src2, dtype=buf2_dt)
dep_evs = _manager.submitted_events
(
ht_copy_ev,
@@ -638,7 +637,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
)
_manager.add_event_pair(ht_copy_ev, copy_ev)
- buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+ buf2 = dpt.broadcast_to(buf2, res_shape)
ht_, bf_ev = self.binary_inplace_fn_(
lhs=o1,
rhs=buf2,
@@ -657,16 +656,16 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
):
# should not reach if out is reallocated
# after being checked against o1
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(o1, dpt.usm_ndarray):
src1 = o1
else:
- src1 = dpt_ext.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q)
+ src1 = dpt.asarray(o1, dtype=o1_dtype, sycl_queue=exec_q)
if isinstance(o2, dpt.usm_ndarray):
src2 = o2
else:
- src2 = dpt_ext.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+ src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
if order == "A":
order = (
@@ -688,7 +687,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
src1, src2, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -696,9 +695,9 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
order=order,
)
if src1.shape != res_shape:
- src1 = dpt_ext.broadcast_to(src1, res_shape)
+ src1 = dpt.broadcast_to(src1, res_shape)
if src2.shape != res_shape:
- src2 = dpt_ext.broadcast_to(src2, res_shape)
+ src2 = dpt.broadcast_to(src2, res_shape)
deps_ev = _manager.submitted_events
ht_binary_ev, binary_ev = self.binary_fn_(
src1=src1,
@@ -723,7 +722,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
if order == "K":
buf2 = _empty_like_orderK(src2, buf2_dt)
else:
- buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt, order=order)
+ buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order)
dep_evs = _manager.submitted_events
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
@@ -735,7 +734,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
src1, buf2, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -744,8 +743,8 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
)
if src1.shape != res_shape:
- src1 = dpt_ext.broadcast_to(src1, res_shape)
- buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+ src1 = dpt.broadcast_to(src1, res_shape)
+ buf2 = dpt.broadcast_to(buf2, res_shape)
ht_binary_ev, binary_ev = self.binary_fn_(
src1=src1,
src2=buf2,
@@ -769,7 +768,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
if order == "K":
buf1 = _empty_like_orderK(src1, buf1_dt)
else:
- buf1 = dpt_ext.empty_like(src1, dtype=buf1_dt, order=order)
+ buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order)
dep_evs = _manager.submitted_events
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
@@ -781,7 +780,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
buf1, src2, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -789,9 +788,9 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
order=order,
)
- buf1 = dpt_ext.broadcast_to(buf1, res_shape)
+ buf1 = dpt.broadcast_to(buf1, res_shape)
if src2.shape != res_shape:
- src2 = dpt_ext.broadcast_to(src2, res_shape)
+ src2 = dpt.broadcast_to(src2, res_shape)
ht_binary_ev, binary_ev = self.binary_fn_(
src1=buf1,
src2=src2,
@@ -820,7 +819,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
if order == "K":
buf1 = _empty_like_orderK(src1, buf1_dt)
else:
- buf1 = dpt_ext.empty_like(src1, dtype=buf1_dt, order=order)
+ buf1 = dpt.empty_like(src1, dtype=buf1_dt, order=order)
dep_evs = _manager.submitted_events
ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=src1, dst=buf1, sycl_queue=exec_q, depends=dep_evs
@@ -829,7 +828,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
if order == "K":
buf2 = _empty_like_orderK(src2, buf2_dt)
else:
- buf2 = dpt_ext.empty_like(src2, dtype=buf2_dt, order=order)
+ buf2 = dpt.empty_like(src2, dtype=buf2_dt, order=order)
ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=src2, dst=buf2, sycl_queue=exec_q, depends=dep_evs
)
@@ -840,7 +839,7 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
buf1, buf2, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -848,8 +847,8 @@ def __call__(self, o1, o2, /, *, out=None, order="K"):
order=order,
)
- buf1 = dpt_ext.broadcast_to(buf1, res_shape)
- buf2 = dpt_ext.broadcast_to(buf2, res_shape)
+ buf1 = dpt.broadcast_to(buf1, res_shape)
+ buf2 = dpt.broadcast_to(buf2, res_shape)
ht_, bf_ev = self.binary_fn_(
src1=buf1,
src2=buf2,
@@ -960,10 +959,10 @@ def _inplace_op(self, o1, o2):
):
buf_dt = o2_dtype
else:
- src2 = dpt_ext.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
+ src2 = dpt.asarray(o2, dtype=o2_dtype, sycl_queue=exec_q)
if buf_dt is None:
if src2.shape != res_shape:
- src2 = dpt_ext.broadcast_to(src2, res_shape)
+ src2 = dpt.broadcast_to(src2, res_shape)
dep_evs = _manager.submitted_events
ht_, comp_ev = self.binary_inplace_fn_(
lhs=o1,
@@ -973,7 +972,7 @@ def _inplace_op(self, o1, o2):
)
_manager.add_event_pair(ht_, comp_ev)
else:
- buf = dpt_ext.empty_like(src2, dtype=buf_dt)
+ buf = dpt.empty_like(src2, dtype=buf_dt)
dep_evs = _manager.submitted_events
(
ht_copy_ev,
@@ -986,7 +985,7 @@ def _inplace_op(self, o1, o2):
)
_manager.add_event_pair(ht_copy_ev, copy_ev)
- buf = dpt_ext.broadcast_to(buf, res_shape)
+ buf = dpt.broadcast_to(buf, res_shape)
ht_, bf_ev = self.binary_inplace_fn_(
lhs=o1,
rhs=buf,
diff --git a/dpctl_ext/tensor/_flags.pyx b/dpctl_ext/tensor/_flags.pyx
new file mode 100644
index 000000000000..322d52bd56c7
--- /dev/null
+++ b/dpctl_ext/tensor/_flags.pyx
@@ -0,0 +1,175 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+from libcpp cimport bool as cpp_bool
+
+from ._usmarray cimport (
+ USM_ARRAY_C_CONTIGUOUS,
+ USM_ARRAY_F_CONTIGUOUS,
+ USM_ARRAY_WRITABLE,
+ usm_ndarray,
+)
+
+
+cdef cpp_bool _check_bit(int flag, int mask):
+ return (flag & mask) == mask
+
+
+cdef class Flags:
+ """
+ Helper class to query the flags of a :class:`dpctl.tensor.usm_ndarray`
+ instance, which describe how the instance interfaces with its underlying
+ memory.
+ """
+ cdef int flags_
+ cdef usm_ndarray arr_
+
+ def __cinit__(self, usm_ndarray arr, int flags):
+ self.arr_ = arr
+ self.flags_ = flags
+
+ @property
+ def flags(self):
+ """
+ Integer representation of the memory layout flags of
+ :class:`dpctl.tensor.usm_ndarray` instance.
+ """
+ return self.flags_
+
+ @property
+ def c_contiguous(self):
+ """
+ True if the memory layout of the
+ :class:`dpctl.tensor.usm_ndarray` instance is C-contiguous.
+ """
+ return _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+
+ @property
+ def f_contiguous(self):
+ """
+ True if the memory layout of the
+ :class:`dpctl.tensor.usm_ndarray` instance is F-contiguous.
+ """
+ return _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+
+ @property
+ def writable(self):
+ """
+ True if :class:`dpctl.tensor.usm_ndarray` instance is writable.
+ """
+ return _check_bit(self.flags_, USM_ARRAY_WRITABLE)
+
+ @writable.setter
+ def writable(self, new_val):
+ if not isinstance(new_val, bool):
+ raise TypeError("Expecting a boolean value")
+ self.arr_._set_writable_flag(new_val)
+
+ @property
+ def fc(self):
+ """
+ True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+ instance is C-contiguous and F-contiguous.
+ """
+ return (
+ _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+ and _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+ )
+
+ @property
+ def forc(self):
+ """
+ True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+ instance is C-contiguous or F-contiguous.
+ """
+ return (
+ _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+ or _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+ )
+
+ @property
+ def fnc(self):
+ """
+ True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+ instance is F-contiguous and not C-contiguous.
+ """
+ return (
+ _check_bit(self.flags_, USM_ARRAY_F_CONTIGUOUS)
+ and not _check_bit(self.flags_, USM_ARRAY_C_CONTIGUOUS)
+ )
+
+ @property
+ def contiguous(self):
+ """
+ True if the memory layout of the :class:`dpctl.tensor.usm_ndarray`
+        instance is C-contiguous or F-contiguous.
+        Equivalent to `forc`.
+ """
+ return self.forc
+
+ def __getitem__(self, name):
+ if name in ["C_CONTIGUOUS", "C"]:
+ return self.c_contiguous
+ elif name in ["F_CONTIGUOUS", "F"]:
+ return self.f_contiguous
+ elif name in ["WRITABLE", "W"]:
+ return self.writable
+ elif name == "FC":
+ return self.fc
+ elif name == "FNC":
+ return self.fnc
+ elif name in ["FORC", "CONTIGUOUS"]:
+ return self.forc
+
+ def __setitem__(self, name, val):
+ if name in ["WRITABLE", "W"]:
+ self.writable = val
+ else:
+ raise ValueError(
+ "Only writable ('W' or 'WRITABLE') flag can be set"
+ )
+
+ def __repr__(self):
+ out = []
+ for name in "C_CONTIGUOUS", "F_CONTIGUOUS", "WRITABLE":
+ out.append(" {} : {}".format(name, self[name]))
+ return "\n".join(out)
+
+ def __eq__(self, other):
+ cdef Flags other_
+ if isinstance(other, self.__class__):
+ other_ = other
+ return self.flags_ == other_.flags_
+ elif isinstance(other, int):
+ return self.flags_ == other
+ else:
+ return False
diff --git a/dpctl_ext/tensor/_indexing_functions.py b/dpctl_ext/tensor/_indexing_functions.py
index 5b4eb1aaf7a2..08db81c1b166 100644
--- a/dpctl_ext/tensor/_indexing_functions.py
+++ b/dpctl_ext/tensor/_indexing_functions.py
@@ -29,12 +29,11 @@
import operator
import dpctl
-import dpctl.tensor as dpt
import dpctl.utils
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
from ._copy_utils import (
@@ -57,7 +56,7 @@ def _get_indexing_mode(name):
def _range(sh_i, i, nd, q, usm_t, dt):
- ind = dpt_ext.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
+ ind = dpt.arange(sh_i, dtype=dt, usm_type=usm_t, sycl_queue=q)
ind.shape = tuple(sh_i if i == j else 1 for j in range(nd))
return ind
@@ -177,7 +176,7 @@ def place(arr, mask, vals):
raise dpctl.utils.ExecutionPlacementError
if arr.shape != mask.shape or vals.ndim != 1:
raise ValueError("Array sizes are not as required")
- cumsum = dpt_ext.empty(mask.size, dtype="i8", sycl_queue=exec_q)
+ cumsum = dpt.empty(mask.size, dtype="i8", sycl_queue=exec_q)
_manager = dpctl.utils.SequentialOrderManager[exec_q]
deps_ev = _manager.submitted_events
nz_count = ti.mask_positions(
@@ -190,7 +189,7 @@ def place(arr, mask, vals):
if vals.dtype == arr.dtype:
rhs = vals
else:
- rhs = dpt_ext.astype(vals, arr.dtype)
+ rhs = dpt.astype(vals, arr.dtype)
hev, pl_ev = ti._place(
dst=arr,
cumsum=cumsum,
@@ -329,7 +328,7 @@ def put_vec_duplicates(vec, ind, vals):
val_shape = indices.shape
if not isinstance(vals, dpt.usm_ndarray):
- vals = dpt_ext.asarray(
+ vals = dpt.asarray(
vals, dtype=x.dtype, usm_type=vals_usm_type, sycl_queue=exec_q
)
# choose to throw here for consistency with `place`
@@ -340,8 +339,8 @@ def put_vec_duplicates(vec, ind, vals):
if vals.dtype == x.dtype:
rhs = vals
else:
- rhs = dpt_ext.astype(vals, x.dtype)
- rhs = dpt_ext.broadcast_to(rhs, val_shape)
+ rhs = dpt.astype(vals, x.dtype)
+ rhs = dpt.broadcast_to(rhs, val_shape)
_manager = dpctl.utils.SequentialOrderManager[exec_q]
deps_ev = _manager.submitted_events
@@ -540,9 +539,9 @@ def take(x, indices, /, *, axis=None, out=None, mode="wrap"):
"Input and output allocation queues are not compatible"
)
if ti._array_overlap(x, out):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape, dtype=dt, usm_type=res_usm_type, sycl_queue=exec_q
)
diff --git a/dpctl_ext/tensor/_linear_algebra_functions.py b/dpctl_ext/tensor/_linear_algebra_functions.py
index 973050f93ac1..6dfb30e881b2 100644
--- a/dpctl_ext/tensor/_linear_algebra_functions.py
+++ b/dpctl_ext/tensor/_linear_algebra_functions.py
@@ -29,11 +29,11 @@
import operator
import dpctl
-import dpctl.tensor as dpt
from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_elementwise_impl as tei
import dpctl_ext.tensor._tensor_impl as ti
import dpctl_ext.tensor._tensor_linalg_impl as tli
diff --git a/dpctl_ext/tensor/_manipulation_functions.py b/dpctl_ext/tensor/_manipulation_functions.py
index e2d55c533bc0..33817dd0aa2e 100644
--- a/dpctl_ext/tensor/_manipulation_functions.py
+++ b/dpctl_ext/tensor/_manipulation_functions.py
@@ -30,13 +30,12 @@
import operator
import dpctl
-import dpctl.tensor as dpt
import dpctl.utils as dputils
import numpy as np
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
from ._numpy_helper import normalize_axis_index, normalize_axis_tuple
@@ -174,7 +173,7 @@ def _concat_axis_None(arrays):
res_shape = 0
for array in arrays:
res_shape += array.size
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
)
@@ -185,7 +184,7 @@ def _concat_axis_None(arrays):
fill_end = fill_start + array.size
if array.flags.c_contiguous:
hev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
- src=dpt_ext.reshape(array, -1),
+ src=dpt.reshape(array, -1),
dst=res[fill_start:fill_end],
sycl_queue=exec_q,
depends=deps,
@@ -196,7 +195,7 @@ def _concat_axis_None(arrays):
# _copy_usm_ndarray_for_reshape requires src and dst to have
# the same data type
if not array.dtype == res_dtype:
- src2_ = dpt_ext.empty_like(src_, dtype=res_dtype)
+ src2_ = dpt.empty_like(src_, dtype=res_dtype)
ht_copy_ev, cpy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=src_, dst=src2_, sycl_queue=exec_q, depends=deps
)
@@ -334,7 +333,7 @@ def concat(arrays, /, *, axis=0):
X0_shape[i] if i != axis else res_shape_axis for i in range(X0.ndim)
)
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
)
@@ -402,7 +401,7 @@ def expand_dims(X, /, *, axis=0):
shape_it = iter(X.shape)
shape = tuple(1 if ax in axis else next(shape_it) for ax in range(out_ndim))
- return dpt_ext.reshape(X, shape)
+ return dpt.reshape(X, shape)
def flip(X, /, *, axis=None):
@@ -485,7 +484,7 @@ def moveaxis(X, source, destination, /):
for src, dst in sorted(zip(destination, source)):
ind.insert(src, dst)
- return dpt_ext.permute_dims(X, tuple(ind))
+ return dpt.permute_dims(X, tuple(ind))
def permute_dims(X, /, axes):
@@ -602,7 +601,7 @@ def repeat(x, repeats, /, *, axis=None):
)
)
dpctl.utils.validate_usm_type(usm_type, allow_none=False)
- if not dpt_ext.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
+ if not dpt.can_cast(repeats.dtype, dpt.int64, casting="same_kind"):
raise TypeError(
f"'repeats' data type {repeats.dtype} cannot be cast to "
"'int64' according to the casting rule ''safe.''"
@@ -624,7 +623,7 @@ def repeat(x, repeats, /, *, axis=None):
"'repeats' array must be broadcastable to the size of "
"the repeated axis"
)
- if not dpt_ext.all(repeats >= 0):
+ if not dpt.all(repeats >= 0):
raise ValueError("'repeats' elements must be positive")
elif isinstance(repeats, (tuple, list, range)):
@@ -643,10 +642,10 @@ def repeat(x, repeats, /, *, axis=None):
"`repeats` sequence must have the same length as the "
"repeated axis"
)
- repeats = dpt_ext.asarray(
+ repeats = dpt.asarray(
repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q
)
- if not dpt_ext.all(repeats >= 0):
+ if not dpt.all(repeats >= 0):
raise ValueError("`repeats` elements must be positive")
else:
raise TypeError(
@@ -662,7 +661,7 @@ def repeat(x, repeats, /, *, axis=None):
res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :]
else:
res_shape = (res_axis_size,)
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q
)
if res_axis_size > 0:
@@ -677,7 +676,7 @@ def repeat(x, repeats, /, *, axis=None):
_manager.add_event_pair(ht_rep_ev, rep_ev)
else:
if repeats.dtype != dpt.int64:
- rep_buf = dpt_ext.empty(
+ rep_buf = dpt.empty(
repeats.shape,
dtype=dpt.int64,
usm_type=usm_type,
@@ -687,7 +686,7 @@ def repeat(x, repeats, /, *, axis=None):
src=repeats, dst=rep_buf, sycl_queue=exec_q, depends=dep_evs
)
_manager.add_event_pair(ht_copy_ev, copy_ev)
- cumsum = dpt_ext.empty(
+ cumsum = dpt.empty(
(axis_size,),
dtype=dpt.int64,
usm_type=usm_type,
@@ -703,7 +702,7 @@ def repeat(x, repeats, /, *, axis=None):
)
else:
res_shape = (res_axis_size,)
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape,
dtype=x.dtype,
usm_type=usm_type,
@@ -720,7 +719,7 @@ def repeat(x, repeats, /, *, axis=None):
)
_manager.add_event_pair(ht_rep_ev, rep_ev)
else:
- cumsum = dpt_ext.empty(
+ cumsum = dpt.empty(
(axis_size,),
dtype=dpt.int64,
usm_type=usm_type,
@@ -735,7 +734,7 @@ def repeat(x, repeats, /, *, axis=None):
)
else:
res_shape = (res_axis_size,)
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape,
dtype=x.dtype,
usm_type=usm_type,
@@ -792,7 +791,7 @@ def roll(x, /, shift, *, axis=None):
_manager = dputils.SequentialOrderManager[exec_q]
if axis is None:
shift = operator.index(shift)
- res = dpt_ext.empty(
+ res = dpt.empty(
x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
)
sz = operator.index(x.size)
@@ -819,7 +818,7 @@ def roll(x, /, shift, *, axis=None):
n_i = operator.index(shape[ax])
shifted = shifts[ax] + operator.index(sh)
shifts[ax] = (shifted % n_i) if n_i > 0 else 0
- res = dpt_ext.empty(
+ res = dpt.empty(
x.shape, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
)
dep_evs = _manager.submitted_events
@@ -872,7 +871,7 @@ def squeeze(X, /, axis=None):
if new_shape == X.shape:
return X
else:
- return dpt_ext.reshape(X, new_shape)
+ return dpt.reshape(X, new_shape)
def stack(arrays, /, *, axis=0):
@@ -917,7 +916,7 @@ def stack(arrays, /, *, axis=0):
for i in range(res_ndim)
)
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape, dtype=res_dtype, usm_type=res_usm_type, sycl_queue=exec_q
)
@@ -971,7 +970,7 @@ def swapaxes(X, axis1, axis2):
ind = list(range(0, X.ndim))
ind[axis1] = axis2
ind[axis2] = axis1
- return dpt_ext.permute_dims(X, tuple(ind))
+ return dpt.permute_dims(X, tuple(ind))
def unstack(X, /, *, axis=0):
@@ -998,7 +997,7 @@ def unstack(X, /, *, axis=0):
raise TypeError(f"Expected usm_ndarray type, got {type(X)}.")
axis = normalize_axis_index(axis, X.ndim)
- Y = dpt_ext.moveaxis(X, axis, 0)
+ Y = dpt.moveaxis(X, axis, 0)
return tuple(Y[i] for i in range(Y.shape[0]))
@@ -1049,11 +1048,11 @@ def tile(x, repetitions, /):
if rep_dims < x_dims:
repetitions = (x_dims - rep_dims) * (1,) + repetitions
elif x_dims < rep_dims:
- x = dpt_ext.reshape(x, (rep_dims - x_dims) * (1,) + x.shape)
+ x = dpt.reshape(x, (rep_dims - x_dims) * (1,) + x.shape)
res_shape = tuple(map(lambda sh, rep: sh * rep, x.shape, repetitions))
# case of empty input
if x.size == 0:
- return dpt_ext.empty(
+ return dpt.empty(
res_shape,
dtype=x.dtype,
usm_type=x.usm_type,
@@ -1061,7 +1060,7 @@ def tile(x, repetitions, /):
)
in_sh = x.shape
if res_shape == in_sh:
- return dpt_ext.copy(x)
+ return dpt.copy(x)
expanded_sh = []
broadcast_sh = []
out_sz = 1
@@ -1082,12 +1081,12 @@ def tile(x, repetitions, /):
exec_q = x.sycl_queue
xdt = x.dtype
xut = x.usm_type
- res = dpt_ext.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q)
+ res = dpt.empty((out_sz,), dtype=xdt, usm_type=xut, sycl_queue=exec_q)
# no need to copy data for empty output
if out_sz > 0:
- x = dpt_ext.broadcast_to(
+ x = dpt.broadcast_to(
# this reshape should never copy
- dpt_ext.reshape(x, expanded_sh),
+ dpt.reshape(x, expanded_sh),
broadcast_sh,
)
# copy broadcast input into flat array
@@ -1097,4 +1096,4 @@ def tile(x, repetitions, /):
src=x, dst=res, sycl_queue=exec_q, depends=dep_evs
)
_manager.add_event_pair(hev, cp_ev)
- return dpt_ext.reshape(res, res_shape)
+ return dpt.reshape(res, res_shape)
diff --git a/dpctl_ext/tensor/_print.py b/dpctl_ext/tensor/_print.py
new file mode 100644
index 000000000000..5385eadb2537
--- /dev/null
+++ b/dpctl_ext/tensor/_print.py
@@ -0,0 +1,503 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import contextlib
+import itertools
+import operator
+
+import dpctl
+import dpctl.utils
+import numpy as np
+
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+import dpctl_ext.tensor._tensor_impl as ti
+
+__doc__ = "Print functions for :class:`dpctl.tensor.usm_ndarray`."
+
+_print_options = {
+ "linewidth": 75,
+ "edgeitems": 3,
+ "threshold": 1000,
+ "precision": 8,
+ "floatmode": "maxprec",
+ "suppress": False,
+ "nanstr": "nan",
+ "infstr": "inf",
+ "sign": "-",
+}
+
+
+def _move_to_next_line(string, s, line_width, prefix):
+ """Move string to next line if it doesn't fit in the current line."""
+ bottom_len = len(s) - (s.rfind("\n") + 1)
+ next_line = bottom_len + len(string) + 1 > line_width
+ string = ",\n" + " " * len(prefix) + string if next_line else ", " + string
+
+ return string
+
+
+def _options_dict(
+ linewidth=None,
+ edgeitems=None,
+ threshold=None,
+ precision=None,
+ floatmode=None,
+ suppress=None,
+ nanstr=None,
+ infstr=None,
+ sign=None,
+ numpy=False,
+):
+ if numpy:
+ numpy_options = np.get_printoptions()
+ options = {k: numpy_options[k] for k in _print_options.keys()}
+ else:
+ options = _print_options.copy()
+
+ if suppress:
+ options["suppress"] = True
+
+ local = dict(locals().items())
+ for int_arg in ["linewidth", "precision", "threshold", "edgeitems"]:
+ val = local[int_arg]
+ if val is not None:
+ options[int_arg] = operator.index(val)
+
+ for str_arg in ["nanstr", "infstr"]:
+ val = local[str_arg]
+ if val is not None:
+ if not isinstance(val, str):
+ raise TypeError(
+ "`{}` ".format(str_arg) + "must be of `string` type."
+ )
+ options[str_arg] = val
+
+ signs = ["-", "+", " "]
+ if sign is not None:
+ if sign not in signs:
+ raise ValueError(
+ "`sign` must be one of"
+ + ", ".join("`{}`".format(s) for s in signs)
+ )
+ options["sign"] = sign
+
+ floatmodes = ["fixed", "unique", "maxprec", "maxprec_equal"]
+ if floatmode is not None:
+ if floatmode not in floatmodes:
+ raise ValueError(
+ "`floatmode` must be one of"
+ + ", ".join("`{}`".format(m) for m in floatmodes)
+ )
+ options["floatmode"] = floatmode
+
+ return options
+
+
+def set_print_options(
+ linewidth=None,
+ edgeitems=None,
+ threshold=None,
+ precision=None,
+ floatmode=None,
+ suppress=None,
+ nanstr=None,
+ infstr=None,
+ sign=None,
+ numpy=False,
+):
+ """
+ set_print_options(linewidth=None, edgeitems=None, threshold=None,
+ precision=None, floatmode=None, suppress=None,
+ nanstr=None, infstr=None, sign=None, numpy=False)
+
+ Set options for printing :class:`dpctl.tensor.usm_ndarray` class.
+
+ Args:
+ linewidth (int, optional):
+ Number of characters printed per line.
+ Raises `TypeError` if linewidth is not an integer.
+ Default: `75`.
+ edgeitems (int, optional):
+ Number of elements at the beginning and end
+ when the printed array is abbreviated.
+ Raises `TypeError` if edgeitems is not an integer.
+ Default: `3`.
+ threshold (int, optional):
+ Number of elements that triggers array abbreviation.
+ Raises `TypeError` if threshold is not an integer.
+ Default: `1000`.
+ precision (int or None, optional):
+ Number of digits printed for floating point numbers.
+ Raises `TypeError` if precision is not an integer.
+ Default: `8`.
+ floatmode (str, optional):
+ Controls how floating point numbers are interpreted.
+            `"fixed"`:
+ Always prints exactly `precision` digits.
+ `"unique"`:
+ Ignores precision, prints the number of
+ digits necessary to uniquely specify each number.
+ `"maxprec"`:
+ Prints `precision` digits or fewer,
+ if fewer will uniquely represent a number.
+ `"maxprec_equal"`:
+ Prints an equal number of digits
+ for each number. This number is `precision` digits
+ or fewer, if fewer will uniquely represent each number.
+ Raises `ValueError` if floatmode is not one of
+ `fixed`, `unique`, `maxprec`, or `maxprec_equal`.
+            Default: "maxprec"
+ suppress (bool, optional):
+            If `True`, numbers equal to zero in the current precision
+ will print as zero.
+ Default: `False`.
+ nanstr (str, optional):
+ String used to represent nan.
+ Raises `TypeError` if nanstr is not a string.
+ Default: `"nan"`.
+ infstr (str, optional):
+ String used to represent infinity.
+ Raises `TypeError` if infstr is not a string.
+ Default: `"inf"`.
+ sign (str, optional):
+ Controls the sign of floating point numbers.
+ `"-"`:
+ Omit the sign of positive numbers.
+ `"+"`:
+ Always print the sign of positive numbers.
+ `" "`:
+ Always print a whitespace in place of the
+ sign of positive numbers.
+ Raises `ValueError` if sign is not one of
+ `"-"`, `"+"`, or `" "`.
+ Default: `"-"`.
+        numpy (bool, optional): If `True`, then before other specified print
+ options are set, a dictionary of Numpy's print options
+ will be used to initialize dpctl's print options.
+ Default: "False"
+ """
+ options = _options_dict(
+ linewidth=linewidth,
+ edgeitems=edgeitems,
+ threshold=threshold,
+ precision=precision,
+ floatmode=floatmode,
+ suppress=suppress,
+ nanstr=nanstr,
+ infstr=infstr,
+ sign=sign,
+ numpy=numpy,
+ )
+ _print_options.update(options)
+
+
+def get_print_options():
+ """get_print_options()
+
+ Returns a copy of current options for printing
+ :class:`dpctl.tensor.usm_ndarray` class.
+
+ Returns:
+ dict: dictionary with array
+ printing option settings.
+
+ Options:
+ - "linewidth" : int, default 75
+ - "edgeitems" : int, default 3
+ - "threshold" : int, default 1000
+ - "precision" : int, default 8
+        - "floatmode" : str, default "maxprec"
+ - "suppress" : bool, default False
+ - "nanstr" : str, default "nan"
+ - "infstr" : str, default "inf"
+ - "sign" : str, default "-"
+ """
+ return _print_options.copy()
+
+
+@contextlib.contextmanager
+def print_options(*args, **kwargs):
+ """
+ Context manager for print options.
+
+ Set print options for the scope of a `with` block.
+ `as` yields dictionary of print options.
+ """
+ options = dpt.get_print_options()
+ try:
+ dpt.set_print_options(*args, **kwargs)
+ yield dpt.get_print_options()
+ finally:
+ dpt.set_print_options(**options)
+
+
+def _nd_corners(arr_in, edge_items):
+ _shape = arr_in.shape
+ max_shape = 2 * edge_items + 1
+ if max(_shape) <= max_shape:
+ return dpt.asnumpy(arr_in)
+ res_shape = tuple(
+ max_shape if _shape[i] > max_shape else _shape[i]
+ for i in range(arr_in.ndim)
+ )
+
+ exec_q = arr_in.sycl_queue
+ arr_out = dpt.empty(
+ res_shape,
+ dtype=arr_in.dtype,
+ usm_type=arr_in.usm_type,
+ sycl_queue=exec_q,
+ )
+
+ blocks = []
+ for i in range(len(_shape)):
+ if _shape[i] > max_shape:
+ blocks.append(
+ (
+ np.s_[:edge_items],
+ np.s_[-edge_items:],
+ )
+ )
+ else:
+ blocks.append((np.s_[:],))
+
+ _manager = dpctl.utils.SequentialOrderManager[exec_q]
+ dep_evs = _manager.submitted_events
+ hev_list = []
+ for slc in itertools.product(*blocks):
+ hev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+ src=arr_in[slc],
+ dst=arr_out[slc],
+ sycl_queue=exec_q,
+ depends=dep_evs,
+ )
+ hev_list.append(hev)
+
+ dpctl.SyclEvent.wait_for(hev_list)
+ return dpt.asnumpy(arr_out)
+
+
+def usm_ndarray_str(
+ x,
+ line_width=None,
+ edge_items=None,
+ threshold=None,
+ precision=None,
+ floatmode=None,
+ suppress=None,
+ sign=None,
+ numpy=False,
+ separator=" ",
+ prefix="",
+ suffix="",
+):
+ """
+    usm_ndarray_str(x, line_width=None, edge_items=None, threshold=None,
+ precision=None, floatmode=None, suppress=None,
+ sign=None, numpy=False, separator=" ", prefix="",
+ suffix="")
+
+ Returns a string representing the elements of a
+ :class:`dpctl.tensor.usm_ndarray`.
+
+ Args:
+ x (usm_ndarray):
+ Input array.
+ line_width (int, optional):
+ Number of characters printed per line.
+ Raises `TypeError` if line_width is not an integer.
+ Default: `75`.
+        edge_items (int, optional):
+ Number of elements at the beginning and end
+ when the printed array is abbreviated.
+ Raises `TypeError` if edgeitems is not an integer.
+ Default: `3`.
+ threshold (int, optional):
+ Number of elements that triggers array abbreviation.
+ Raises `TypeError` if threshold is not an integer.
+ Default: `1000`.
+ precision (int or None, optional):
+ Number of digits printed for floating point numbers.
+ Raises `TypeError` if precision is not an integer.
+ Default: `8`.
+ floatmode (str, optional):
+ Controls how floating point numbers are interpreted.
+            `"fixed"`:
+ Always prints exactly `precision` digits.
+ `"unique"`:
+ Ignores precision, prints the number of
+ digits necessary to uniquely specify each number.
+ `"maxprec"`:
+ Prints `precision` digits or fewer,
+ if fewer will uniquely represent a number.
+ `"maxprec_equal"`:
+ Prints an equal number of digits for each number.
+ This number is `precision` digits or fewer,
+ if fewer will uniquely represent each number.
+ Raises `ValueError` if floatmode is not one of
+ `fixed`, `unique`, `maxprec`, or `maxprec_equal`.
+            Default: "maxprec"
+ suppress (bool, optional):
+            If `True`, numbers equal to zero in the current precision
+ will print as zero.
+ Default: `False`.
+ sign (str, optional):
+ Controls the sign of floating point numbers.
+ `"-"`:
+ Omit the sign of positive numbers.
+ `"+"`:
+ Always print the sign of positive numbers.
+ `" "`:
+ Always print a whitespace in place of the
+ sign of positive numbers.
+ Raises `ValueError` if sign is not one of
+ `"-"`, `"+"`, or `" "`.
+ Default: `"-"`.
+ numpy (bool, optional):
+            If `True`, then before other specified print
+ options are set, a dictionary of Numpy's print options
+ will be used to initialize dpctl's print options.
+ Default: "False"
+ separator (str, optional):
+ String inserted between elements of the array string.
+ Default: " "
+ prefix (str, optional):
+ String used to determine spacing to the left of the array string.
+ Default: ""
+ suffix (str, optional):
+ String that determines length of the last line of the array string.
+ Default: ""
+
+ Returns:
+ str: string representation of input array.
+ """
+ if not isinstance(x, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+ options = get_print_options()
+ options.update(
+ _options_dict(
+ linewidth=line_width,
+ edgeitems=edge_items,
+ threshold=threshold,
+ precision=precision,
+ floatmode=floatmode,
+ suppress=suppress,
+ sign=sign,
+ numpy=numpy,
+ )
+ )
+
+ threshold = options["threshold"]
+ edge_items = options["edgeitems"]
+
+ if x.size > threshold:
+ data = _nd_corners(x, edge_items)
+ options["threshold"] = 0
+ else:
+ data = dpt.asnumpy(x)
+ with np.printoptions(**options):
+ s = np.array2string(
+ data, separator=separator, prefix=prefix, suffix=suffix
+ )
+ return s
+
+
+def usm_ndarray_repr(
+ x, line_width=None, precision=None, suppress=None, prefix="usm_ndarray"
+):
+ """
+ usm_ndarray_repr(x, line_width=None, precision=None,
+                     suppress=None, prefix="usm_ndarray")
+
+ Returns a formatted string representing the elements
+ of a :class:`dpctl.tensor.usm_ndarray` and its data type,
+ if not a default type.
+
+ Args:
+ x (usm_ndarray): Input array.
+ line_width (int, optional): Number of characters printed per line.
+ Raises `TypeError` if line_width is not an integer.
+ Default: `75`.
+ precision (int or None, optional): Number of digits printed for
+ floating point numbers.
+ Raises `TypeError` if precision is not an integer.
+ Default: `8`.
+        suppress (bool, optional): If `True`, numbers equal to zero
+ in the current precision will print as zero.
+ Default: `False`.
+ prefix (str, optional): String inserted at the start of the array
+ string.
+            Default: "usm_ndarray"
+
+ Returns:
+ str: formatted string representing the input array
+ """
+ if not isinstance(x, dpt.usm_ndarray):
+ raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+ if line_width is None:
+ line_width = _print_options["linewidth"]
+
+ show_dtype = x.dtype not in [
+ dpt.bool,
+ dpt.int64,
+ dpt.float64,
+ dpt.complex128,
+ ]
+
+ prefix = prefix + "("
+ suffix = ")"
+
+ s = usm_ndarray_str(
+ x,
+ line_width=line_width,
+ precision=precision,
+ suppress=suppress,
+ separator=", ",
+ prefix=prefix,
+ suffix=suffix,
+ )
+
+ if show_dtype or x.size == 0:
+ dtype_str = f"dtype={x.dtype.name}"
+ dtype_str = _move_to_next_line(dtype_str, s, line_width, prefix)
+ else:
+ dtype_str = ""
+
+ options = get_print_options()
+ threshold = options["threshold"]
+ if (x.size == 0 and x.shape != (0,)) or x.size > threshold:
+ shape_str = f"shape={x.shape}"
+ shape_str = _move_to_next_line(shape_str, s, line_width, prefix)
+ else:
+ shape_str = ""
+
+ return prefix + s + shape_str + dtype_str + suffix
diff --git a/dpctl_ext/tensor/_reduction.py b/dpctl_ext/tensor/_reduction.py
index 2daf07b81d85..79e620605f07 100644
--- a/dpctl_ext/tensor/_reduction.py
+++ b/dpctl_ext/tensor/_reduction.py
@@ -27,12 +27,11 @@
# *****************************************************************************
import dpctl
-import dpctl.tensor as dpt
from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
import dpctl_ext.tensor._tensor_reductions_impl as tri
@@ -58,7 +57,7 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
axis = (axis,)
axis = normalize_axis_tuple(axis, nd, "axis")
perm = [i for i in range(nd) if i not in axis] + list(axis)
- x_tmp = dpt_ext.permute_dims(x, perm)
+ x_tmp = dpt.permute_dims(x, perm)
red_nd = len(axis)
if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
raise ValueError("reduction cannot be performed over zero-size axes")
@@ -96,12 +95,12 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
"Input and output allocation queues are not compatible"
)
if keepdims:
- out = dpt_ext.squeeze(out, axis=axis)
+ out = dpt.squeeze(out, axis=axis)
orig_out = out
if ti._array_overlap(x, out):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
)
@@ -138,7 +137,7 @@ def _comparison_over_axis(x, axis, keepdims, out, _reduction_fn):
if keepdims:
res_shape = res_shape + (1,) * red_nd
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+ out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
return out
@@ -164,7 +163,7 @@ def _reduction_over_axis(
axis = (axis,)
axis = normalize_axis_tuple(axis, nd, "axis")
perm = [i for i in range(nd) if i not in axis] + list(axis)
- arr = dpt_ext.permute_dims(x, perm)
+ arr = dpt.permute_dims(x, perm)
red_nd = len(axis)
res_shape = arr.shape[: nd - red_nd]
q = x.sycl_queue
@@ -212,12 +211,12 @@ def _reduction_over_axis(
"Input and output allocation queues are not compatible"
)
if keepdims:
- out = dpt_ext.squeeze(out, axis=axis)
+ out = dpt.squeeze(out, axis=axis)
orig_out = out
if ti._array_overlap(x, out) and implemented_types:
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
)
@@ -253,7 +252,7 @@ def _reduction_over_axis(
out = orig_out
else:
if _dtype_supported(res_dt, res_dt, res_usm_type, q):
- tmp = dpt_ext.empty(
+ tmp = dpt.empty(
arr.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
@@ -270,14 +269,14 @@ def _reduction_over_axis(
_manager.add_event_pair(ht_e_red, red_ev)
else:
buf_dt = _default_reduction_type_fn(inp_dt, q)
- tmp = dpt_ext.empty(
+ tmp = dpt.empty(
arr.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
src=arr, dst=tmp, sycl_queue=q, depends=dep_evs
)
_manager.add_event_pair(ht_e_cpy, cpy_e)
- tmp_res = dpt_ext.empty(
+ tmp_res = dpt.empty(
res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e_red, r_e = _reduction_fn(
@@ -296,7 +295,7 @@ def _reduction_over_axis(
if keepdims:
res_shape = res_shape + (1,) * red_nd
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+ out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
return out
@@ -320,7 +319,7 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
)
axis = normalize_axis_tuple(axis, nd, "axis")
perm = [i for i in range(nd) if i not in axis] + list(axis)
- x_tmp = dpt_ext.permute_dims(x, perm)
+ x_tmp = dpt.permute_dims(x, perm)
axis = normalize_axis_tuple(axis, nd, "axis")
red_nd = len(axis)
if any([x_tmp.shape[i] == 0 for i in range(-red_nd, 0)]):
@@ -359,12 +358,12 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
"Input and output allocation queues are not compatible"
)
if keepdims:
- out = dpt_ext.squeeze(out, axis=axis)
+ out = dpt.squeeze(out, axis=axis)
orig_out = out
if ti._array_overlap(x, out) and red_nd > 0:
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
)
@@ -395,7 +394,7 @@ def _search_over_axis(x, axis, keepdims, out, _reduction_fn):
if keepdims:
res_shape = res_shape + (1,) * red_nd
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- out = dpt_ext.permute_dims(dpt_ext.reshape(out, res_shape), inv_perm)
+ out = dpt.permute_dims(dpt.reshape(out, res_shape), inv_perm)
return out
@@ -506,7 +505,7 @@ def count_nonzero(x, /, *, axis=None, keepdims=False, out=None):
type.
"""
if x.dtype != dpt.bool:
- x = dpt_ext.astype(x, dpt.bool, copy=False)
+ x = dpt.astype(x, dpt.bool, copy=False)
return sum(
x,
axis=axis,
diff --git a/dpctl_ext/tensor/_reshape.py b/dpctl_ext/tensor/_reshape.py
index 23cf47a83568..7ecdace4fc42 100644
--- a/dpctl_ext/tensor/_reshape.py
+++ b/dpctl_ext/tensor/_reshape.py
@@ -28,13 +28,12 @@
import operator
-import dpctl.tensor as dpt
import dpctl.utils
import numpy as np
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
from ._tensor_impl import (
_copy_usm_ndarray_for_reshape,
@@ -189,7 +188,7 @@ def reshape(X, /, shape, *, order="C", copy=None):
src=X, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
)
else:
- X_t = dpt_ext.permute_dims(X, range(X.ndim - 1, -1, -1))
+ X_t = dpt.permute_dims(X, range(X.ndim - 1, -1, -1))
hev, r_e = _copy_usm_ndarray_for_reshape(
src=X_t, dst=flat_res, sycl_queue=copy_q, depends=dep_evs
)
diff --git a/dpctl_ext/tensor/_scalar_utils.py b/dpctl_ext/tensor/_scalar_utils.py
index 3ab92b42ad00..832121aea857 100644
--- a/dpctl_ext/tensor/_scalar_utils.py
+++ b/dpctl_ext/tensor/_scalar_utils.py
@@ -29,13 +29,14 @@
import numbers
import dpctl.memory as dpm
-import dpctl.tensor as dpt
import numpy as np
-from dpctl.tensor._usmarray import _is_object_with_buffer_protocol as _is_buffer
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
+from dpctl_ext.tensor._usmarray import (
+ _is_object_with_buffer_protocol as _is_buffer,
+)
from ._type_utils import (
WeakBooleanType,
@@ -63,7 +64,7 @@ def _get_dtype(o, dev):
if isinstance(o, dpt.usm_ndarray):
return o.dtype
if hasattr(o, "__sycl_usm_array_interface__"):
- return dpt_ext.asarray(o).dtype
+ return dpt.asarray(o).dtype
if _is_buffer(o):
host_dt = np.array(o).dtype
dev_dt = _to_device_supported_dtype(host_dt, dev)
diff --git a/dpctl_ext/tensor/_search_functions.py b/dpctl_ext/tensor/_search_functions.py
index 285a02b42bb8..aae185b64e2b 100644
--- a/dpctl_ext/tensor/_search_functions.py
+++ b/dpctl_ext/tensor/_search_functions.py
@@ -27,12 +27,11 @@
# *****************************************************************************
import dpctl
-import dpctl.tensor as dpt
from dpctl.utils import ExecutionPlacementError, SequentialOrderManager
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
from ._copy_utils import _empty_like_orderK, _empty_like_triple_orderK
@@ -111,7 +110,7 @@ def _resolve_two_weak_types(o1_dtype, o2_dtype, dev):
def _where_result_type(dt1, dt2, dev):
- res_dtype = dpt_ext.result_type(dt1, dt2)
+ res_dtype = dpt.result_type(dt1, dt2)
fp16 = dev.has_aspect_fp16
fp64 = dev.has_aspect_fp64
@@ -291,7 +290,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
if ti._array_overlap(condition, out) and not ti._same_logical_tensors(
condition, out
):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(x1, dpt.usm_ndarray):
if (
@@ -299,7 +298,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
and not ti._same_logical_tensors(x1, out)
and x1_dtype == out_dtype
):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if isinstance(x2, dpt.usm_ndarray):
if (
@@ -307,7 +306,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
and not ti._same_logical_tensors(x2, out)
and x2_dtype == out_dtype
):
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
if order == "A":
order = (
@@ -323,9 +322,9 @@ def where(condition, x1, x2, /, *, order="K", out=None):
else "C"
)
if not isinstance(x1, dpt.usm_ndarray):
- x1 = dpt_ext.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
+ x1 = dpt.asarray(x1, dtype=x1_dtype, sycl_queue=exec_q)
if not isinstance(x2, dpt.usm_ndarray):
- x2 = dpt_ext.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
+ x2 = dpt.asarray(x2, dtype=x2_dtype, sycl_queue=exec_q)
if condition.size == 0:
if out is not None:
@@ -342,7 +341,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
exec_q,
)
else:
- return dpt_ext.empty(
+ return dpt.empty(
res_shape,
dtype=out_dtype,
order=order,
@@ -356,7 +355,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
if order == "K":
_x1 = _empty_like_orderK(x1, out_dtype)
else:
- _x1 = dpt_ext.empty_like(x1, dtype=out_dtype, order=order)
+ _x1 = dpt.empty_like(x1, dtype=out_dtype, order=order)
ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=x1, dst=_x1, sycl_queue=exec_q, depends=dep_evs
)
@@ -367,7 +366,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
if order == "K":
_x2 = _empty_like_orderK(x2, out_dtype)
else:
- _x2 = dpt_ext.empty_like(x2, dtype=out_dtype, order=order)
+ _x2 = dpt.empty_like(x2, dtype=out_dtype, order=order)
ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=x2, dst=_x2, sycl_queue=exec_q, depends=dep_evs
)
@@ -380,7 +379,7 @@ def where(condition, x1, x2, /, *, order="K", out=None):
condition, x1, x2, out_dtype, res_shape, out_usm_type, exec_q
)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
res_shape,
dtype=out_dtype,
order=order,
@@ -389,11 +388,11 @@ def where(condition, x1, x2, /, *, order="K", out=None):
)
if condition_shape != res_shape:
- condition = dpt_ext.broadcast_to(condition, res_shape)
+ condition = dpt.broadcast_to(condition, res_shape)
if x1_shape != res_shape:
- x1 = dpt_ext.broadcast_to(x1, res_shape)
+ x1 = dpt.broadcast_to(x1, res_shape)
if x2_shape != res_shape:
- x2 = dpt_ext.broadcast_to(x2, res_shape)
+ x2 = dpt.broadcast_to(x2, res_shape)
dep_evs = _manager.submitted_events
hev, where_ev = ti._where(
diff --git a/dpctl_ext/tensor/_searchsorted.py b/dpctl_ext/tensor/_searchsorted.py
index 2d4807fb0d0c..4c680a49b07b 100644
--- a/dpctl_ext/tensor/_searchsorted.py
+++ b/dpctl_ext/tensor/_searchsorted.py
@@ -32,10 +32,6 @@
import dpctl
import dpctl.utils as du
-# TODO: revert to `from ._usmarray import...`
-# when dpnp fully migrates dpctl/tensor
-from dpctl.tensor._usmarray import usm_ndarray
-
from ._copy_utils import _empty_like_orderK
from ._ctors import empty
from ._tensor_impl import _copy_usm_ndarray_into_usm_ndarray as ti_copy
@@ -46,6 +42,10 @@
from ._tensor_sorting_impl import _searchsorted_left, _searchsorted_right
from ._type_utils import isdtype, result_type
+# TODO: revert to `from ._usmarray import...`
+# when dpnp fully migrates dpctl/tensor
+from ._usmarray import usm_ndarray
+
def searchsorted(
x1: usm_ndarray,
diff --git a/dpctl_ext/tensor/_set_functions.py b/dpctl_ext/tensor/_set_functions.py
index 2672e082d18e..29e4914ad63b 100644
--- a/dpctl_ext/tensor/_set_functions.py
+++ b/dpctl_ext/tensor/_set_functions.py
@@ -28,12 +28,11 @@
from typing import NamedTuple, Optional, Union
-import dpctl.tensor as dpt
import dpctl.utils as du
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
from dpctl_ext.tensor._tensor_elementwise_impl import _not_equal, _subtract
from ._copy_utils import _empty_like_orderK
@@ -112,10 +111,10 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
if x.ndim == 1:
fx = x
else:
- fx = dpt_ext.reshape(x, (x.size,), order="C")
+ fx = dpt.reshape(x, (x.size,), order="C")
if fx.size == 0:
return fx
- s = dpt_ext.empty_like(fx, order="C")
+ s = dpt.empty_like(fx, order="C")
_manager = du.SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
if fx.flags.c_contiguous:
@@ -128,7 +127,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
)
_manager.add_event_pair(ht_ev, sort_ev)
else:
- tmp = dpt_ext.empty_like(fx, order="C")
+ tmp = dpt.empty_like(fx, order="C")
ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
)
@@ -141,7 +140,7 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
depends=[copy_ev],
)
_manager.add_event_pair(ht_ev, sort_ev)
- unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+ unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
ht_ev, uneq_ev = _not_equal(
src1=s[:-1],
src2=s[1:],
@@ -155,14 +154,14 @@ def unique_values(x: dpt.usm_ndarray) -> dpt.usm_ndarray:
fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
)
_manager.add_event_pair(ht_ev, one_ev)
- cumsum = dpt_ext.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q)
+ cumsum = dpt.empty(s.shape, dtype=dpt.int64, sycl_queue=exec_q)
# synchronizing call
n_uniques = mask_positions(
unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
)
if n_uniques == fx.size:
return s
- unique_vals = dpt_ext.empty(
+ unique_vals = dpt.empty(
n_uniques, dtype=x.dtype, usm_type=x.usm_type, sycl_queue=exec_q
)
ht_ev, ex_e = _extract(
@@ -206,11 +205,11 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
if x.ndim == 1:
fx = x
else:
- fx = dpt_ext.reshape(x, (x.size,), order="C")
+ fx = dpt.reshape(x, (x.size,), order="C")
ind_dt = default_device_index_type(exec_q)
if fx.size == 0:
- return UniqueCountsResult(fx, dpt_ext.empty_like(fx, dtype=ind_dt))
- s = dpt_ext.empty_like(fx, order="C")
+ return UniqueCountsResult(fx, dpt.empty_like(fx, dtype=ind_dt))
+ s = dpt.empty_like(fx, order="C")
_manager = du.SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
@@ -224,7 +223,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
)
_manager.add_event_pair(ht_ev, sort_ev)
else:
- tmp = dpt_ext.empty_like(fx, order="C")
+ tmp = dpt.empty_like(fx, order="C")
ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
)
@@ -237,7 +236,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
depends=[copy_ev],
)
_manager.add_event_pair(ht_ev, sort_ev)
- unique_mask = dpt_ext.empty(s.shape, dtype="?", sycl_queue=exec_q)
+ unique_mask = dpt.empty(s.shape, dtype="?", sycl_queue=exec_q)
ht_ev, uneq_ev = _not_equal(
src1=s[:-1],
src2=s[1:],
@@ -251,9 +250,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
)
_manager.add_event_pair(ht_ev, one_ev)
- cumsum = dpt_ext.empty(
- unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
- )
+ cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
# synchronizing call
n_uniques = mask_positions(
unique_mask, cumsum, sycl_queue=exec_q, depends=[one_ev, uneq_ev]
@@ -261,11 +258,11 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
if n_uniques == fx.size:
return UniqueCountsResult(
s,
- dpt_ext.ones(
+ dpt.ones(
n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
),
)
- unique_vals = dpt_ext.empty(
+ unique_vals = dpt.empty(
n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
)
# populate unique values
@@ -278,10 +275,10 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
sycl_queue=exec_q,
)
_manager.add_event_pair(ht_ev, ex_e)
- unique_counts = dpt_ext.empty(
+ unique_counts = dpt.empty(
n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
)
- idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+ idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
# writing into new allocation, no dependency
ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
_manager.add_event_pair(ht_ev, id_ev)
@@ -300,7 +297,7 @@ def unique_counts(x: dpt.usm_ndarray) -> UniqueCountsResult:
x.size, dst=unique_counts[-1], sycl_queue=exec_q
)
_manager.add_event_pair(ht_ev, set_ev)
- _counts = dpt_ext.empty_like(unique_counts[1:])
+ _counts = dpt.empty_like(unique_counts[1:])
ht_ev, sub_ev = _subtract(
src1=unique_counts[1:],
src2=unique_counts[:-1],
@@ -342,11 +339,11 @@ def unique_inverse(x):
if x.ndim == 1:
fx = x
else:
- fx = dpt_ext.reshape(x, (x.size,), order="C")
- sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C")
- unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C")
+ fx = dpt.reshape(x, (x.size,), order="C")
+ sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C")
+ unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C")
if fx.size == 0:
- return UniqueInverseResult(fx, dpt_ext.reshape(unsorting_ids, x.shape))
+ return UniqueInverseResult(fx, dpt.reshape(unsorting_ids, x.shape))
_manager = du.SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
@@ -360,7 +357,7 @@ def unique_inverse(x):
)
_manager.add_event_pair(ht_ev, sort_ev)
else:
- tmp = dpt_ext.empty_like(fx, order="C")
+ tmp = dpt.empty_like(fx, order="C")
ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
)
@@ -381,7 +378,7 @@ def unique_inverse(x):
depends=[sort_ev],
)
_manager.add_event_pair(ht_ev, argsort_ev)
- s = dpt_ext.empty_like(fx)
+ s = dpt.empty_like(fx)
# s = fx[sorting_ids]
ht_ev, take_ev = _take(
src=fx,
@@ -393,7 +390,7 @@ def unique_inverse(x):
depends=[sort_ev],
)
_manager.add_event_pair(ht_ev, take_ev)
- unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+ unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
ht_ev, uneq_ev = _not_equal(
src1=s[:-1],
src2=s[1:],
@@ -407,16 +404,14 @@ def unique_inverse(x):
fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
)
_manager.add_event_pair(ht_ev, one_ev)
- cumsum = dpt_ext.empty(
- unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
- )
+ cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
# synchronizing call
n_uniques = mask_positions(
unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
)
if n_uniques == fx.size:
- return UniqueInverseResult(s, dpt_ext.reshape(unsorting_ids, x.shape))
- unique_vals = dpt_ext.empty(
+ return UniqueInverseResult(s, dpt.reshape(unsorting_ids, x.shape))
+ unique_vals = dpt.empty(
n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
)
ht_ev, uv_ev = _extract(
@@ -428,10 +423,10 @@ def unique_inverse(x):
sycl_queue=exec_q,
)
_manager.add_event_pair(ht_ev, uv_ev)
- cum_unique_counts = dpt_ext.empty(
+ cum_unique_counts = dpt.empty(
n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
)
- idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+ idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
_manager.add_event_pair(ht_ev, id_ev)
ht_ev, extr_ev = _extract(
@@ -448,7 +443,7 @@ def unique_inverse(x):
x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
)
_manager.add_event_pair(ht_ev, set_ev)
- _counts = dpt_ext.empty_like(cum_unique_counts[1:])
+ _counts = dpt.empty_like(cum_unique_counts[1:])
ht_ev, sub_ev = _subtract(
src1=cum_unique_counts[1:],
src2=cum_unique_counts[:-1],
@@ -458,7 +453,7 @@ def unique_inverse(x):
)
_manager.add_event_pair(ht_ev, sub_ev)
- inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C")
+ inv = dpt.empty_like(x, dtype=ind_dt, order="C")
ht_ev, ssl_ev = _searchsorted_left(
hay=unique_vals,
needles=x,
@@ -513,17 +508,17 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
if x.ndim == 1:
fx = x
else:
- fx = dpt_ext.reshape(x, (x.size,), order="C")
- sorting_ids = dpt_ext.empty_like(fx, dtype=ind_dt, order="C")
- unsorting_ids = dpt_ext.empty_like(sorting_ids, dtype=ind_dt, order="C")
+ fx = dpt.reshape(x, (x.size,), order="C")
+ sorting_ids = dpt.empty_like(fx, dtype=ind_dt, order="C")
+ unsorting_ids = dpt.empty_like(sorting_ids, dtype=ind_dt, order="C")
if fx.size == 0:
# original array contains no data
# so it can be safely returned as values
return UniqueAllResult(
fx,
sorting_ids,
- dpt_ext.reshape(unsorting_ids, x.shape),
- dpt_ext.empty_like(fx, dtype=ind_dt),
+ dpt.reshape(unsorting_ids, x.shape),
+ dpt.empty_like(fx, dtype=ind_dt),
)
_manager = du.SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
@@ -537,7 +532,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
)
_manager.add_event_pair(ht_ev, sort_ev)
else:
- tmp = dpt_ext.empty_like(fx, order="C")
+ tmp = dpt.empty_like(fx, order="C")
ht_ev, copy_ev = _copy_usm_ndarray_into_usm_ndarray(
src=fx, dst=tmp, sycl_queue=exec_q, depends=dep_evs
)
@@ -558,7 +553,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
depends=[sort_ev],
)
_manager.add_event_pair(ht_ev, args_ev)
- s = dpt_ext.empty_like(fx)
+ s = dpt.empty_like(fx)
# s = fx[sorting_ids]
ht_ev, take_ev = _take(
src=fx,
@@ -570,7 +565,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
depends=[sort_ev],
)
_manager.add_event_pair(ht_ev, take_ev)
- unique_mask = dpt_ext.empty(fx.shape, dtype="?", sycl_queue=exec_q)
+ unique_mask = dpt.empty(fx.shape, dtype="?", sycl_queue=exec_q)
ht_ev, uneq_ev = _not_equal(
src1=s[:-1],
src2=s[1:],
@@ -583,24 +578,22 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
fill_value=True, dst=unique_mask[0], sycl_queue=exec_q
)
_manager.add_event_pair(ht_ev, one_ev)
- cumsum = dpt_ext.empty(
- unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q
- )
+ cumsum = dpt.empty(unique_mask.shape, dtype=dpt.int64, sycl_queue=exec_q)
# synchronizing call
n_uniques = mask_positions(
unique_mask, cumsum, sycl_queue=exec_q, depends=[uneq_ev, one_ev]
)
if n_uniques == fx.size:
- _counts = dpt_ext.ones(
+ _counts = dpt.ones(
n_uniques, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
)
return UniqueAllResult(
s,
sorting_ids,
- dpt_ext.reshape(unsorting_ids, x.shape),
+ dpt.reshape(unsorting_ids, x.shape),
_counts,
)
- unique_vals = dpt_ext.empty(
+ unique_vals = dpt.empty(
n_uniques, dtype=x.dtype, usm_type=x_usm_type, sycl_queue=exec_q
)
ht_ev, uv_ev = _extract(
@@ -612,10 +605,10 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
sycl_queue=exec_q,
)
_manager.add_event_pair(ht_ev, uv_ev)
- cum_unique_counts = dpt_ext.empty(
+ cum_unique_counts = dpt.empty(
n_uniques + 1, dtype=ind_dt, usm_type=x_usm_type, sycl_queue=exec_q
)
- idx = dpt_ext.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
+ idx = dpt.empty(x.size, dtype=ind_dt, sycl_queue=exec_q)
ht_ev, id_ev = _linspace_step(start=0, dt=1, dst=idx, sycl_queue=exec_q)
_manager.add_event_pair(ht_ev, id_ev)
ht_ev, extr_ev = _extract(
@@ -632,7 +625,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
x.size, dst=cum_unique_counts[-1], sycl_queue=exec_q
)
_manager.add_event_pair(ht_ev, set_ev)
- _counts = dpt_ext.empty_like(cum_unique_counts[1:])
+ _counts = dpt.empty_like(cum_unique_counts[1:])
ht_ev, sub_ev = _subtract(
src1=cum_unique_counts[1:],
src2=cum_unique_counts[:-1],
@@ -642,7 +635,7 @@ def unique_all(x: dpt.usm_ndarray) -> UniqueAllResult:
)
_manager.add_event_pair(ht_ev, sub_ev)
- inv = dpt_ext.empty_like(x, dtype=ind_dt, order="C")
+ inv = dpt.empty_like(x, dtype=ind_dt, order="C")
ht_ev, ssl_ev = _searchsorted_left(
hay=unique_vals,
needles=x,
@@ -734,26 +727,26 @@ def isin(
x_sh = _get_shape(x)
if isinstance(test_elements, dpt.usm_ndarray) and test_elements.size == 0:
if invert:
- return dpt_ext.ones(
+ return dpt.ones(
x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
)
else:
- return dpt_ext.zeros(
+ return dpt.zeros(
x_sh, dtype=dpt.bool, usm_type=res_usm_type, sycl_queue=exec_q
)
dt1, dt2 = _resolve_weak_types_all_py_ints(x_dt, test_dt, sycl_dev)
- dt = _to_device_supported_dtype(dpt_ext.result_type(dt1, dt2), sycl_dev)
+ dt = _to_device_supported_dtype(dpt.result_type(dt1, dt2), sycl_dev)
if not isinstance(x, dpt.usm_ndarray):
- x_arr = dpt_ext.asarray(
+ x_arr = dpt.asarray(
x, dtype=dt1, usm_type=res_usm_type, sycl_queue=exec_q
)
else:
x_arr = x
if not isinstance(test_elements, dpt.usm_ndarray):
- test_arr = dpt_ext.asarray(
+ test_arr = dpt.asarray(
test_elements, dtype=dt2, usm_type=res_usm_type, sycl_queue=exec_q
)
else:
@@ -773,7 +766,7 @@ def isin(
if test_dt != dt:
# copy into C-contiguous memory, because the array will be flattened
- test_buf = dpt_ext.empty_like(
+ test_buf = dpt.empty_like(
test_arr, dtype=dt, order="C", usm_type=res_usm_type
)
ht_ev, ev = _copy_usm_ndarray_into_usm_ndarray(
@@ -783,10 +776,10 @@ def isin(
else:
test_buf = test_arr
- test_buf = dpt_ext.reshape(test_buf, -1)
- test_buf = dpt_ext.sort(test_buf)
+ test_buf = dpt.reshape(test_buf, -1)
+ test_buf = dpt.sort(test_buf)
- dst = dpt_ext.empty_like(
+ dst = dpt.empty_like(
x_buf, dtype=dpt.bool, usm_type=res_usm_type, order="C"
)
diff --git a/dpctl_ext/tensor/_slicing.pxi b/dpctl_ext/tensor/_slicing.pxi
new file mode 100644
index 000000000000..86db56013e23
--- /dev/null
+++ b/dpctl_ext/tensor/_slicing.pxi
@@ -0,0 +1,383 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import numbers
+from operator import index
+from cpython.buffer cimport PyObject_CheckBuffer
+from numpy import ndarray
+
+
+cdef bint _is_buffer(object o):
+ return PyObject_CheckBuffer(o)
+
+
+cdef Py_ssize_t _slice_len(
+ Py_ssize_t sl_start,
+ Py_ssize_t sl_stop,
+ Py_ssize_t sl_step
+):
+ """
+ Compute len(range(sl_start, sl_stop, sl_step))
+ """
+ if sl_start == sl_stop:
+ return 0
+ if sl_step > 0:
+ if sl_start > sl_stop:
+ return 0
+ # 1 + argmax k such that sl_start + sl_step*k < sl_stop
+ return 1 + ((sl_stop - sl_start - 1) // sl_step)
+ else:
+ if sl_start < sl_stop:
+ return 0
+ return 1 + ((sl_stop - sl_start + 1) // sl_step)
+
+
+cdef bint _is_integral(object x) except *:
+ """Gives True if x is an integral slice spec"""
+ if isinstance(x, (ndarray, usm_ndarray)):
+ if x.ndim > 0:
+ return False
+ if x.dtype.kind not in "ui":
+ return False
+ return True
+ if isinstance(x, bool):
+ return False
+ if isinstance(x, int):
+ return True
+ if _is_buffer(x):
+ mbuf = memoryview(x)
+ if mbuf.ndim == 0:
+ f = mbuf.format
+ return f in "bBhHiIlLqQ"
+ else:
+ return False
+ if callable(getattr(x, "__index__", None)):
+ try:
+ index(x)
+ except (TypeError, ValueError):
+ return False
+ return True
+ return False
+
+
+cdef bint _is_boolean(object x) except *:
+ """Gives True if x is a boolean slice spec"""
+ if isinstance(x, (ndarray, usm_ndarray)):
+ if x.ndim > 0:
+ return False
+ if x.dtype.kind not in "b":
+ return False
+ return True
+ if isinstance(x, bool):
+ return True
+ if isinstance(x, (int, float, complex)):
+ return False
+ if _is_buffer(x):
+ mbuf = memoryview(x)
+ if mbuf.ndim == 0:
+ f = mbuf.format
+ return f in "?"
+ else:
+ return False
+ if callable(getattr(x, "__bool__", None)):
+ try:
+ x.__bool__()
+ except (TypeError, ValueError):
+ return False
+ return True
+ return False
+
+
+def _basic_slice_meta(ind, shape : tuple, strides : tuple, offset : int):
+ """
+ Give basic slicing index `ind` and array layout information produce
+ a 5-tuple (resulting_shape, resulting_strides, resulting_offset,
+ advanced_ind, resulting_advanced_ind_pos)
+ used to construct a view into underlying array over which advanced
+ indexing, if any, is to be performed.
+
+ Raises IndexError for invalid index `ind`.
+ """
+ _no_advanced_ind = tuple()
+ _no_advanced_pos = -1
+ if ind is Ellipsis:
+ return (shape, strides, offset, _no_advanced_ind, _no_advanced_pos)
+ elif ind is None:
+ return (
+ (1,) + shape,
+ (0,) + strides,
+ offset,
+ _no_advanced_ind,
+ _no_advanced_pos,
+ )
+ elif isinstance(ind, slice):
+ sl_start, sl_stop, sl_step = ind.indices(shape[0])
+ sh0 = _slice_len(sl_start, sl_stop, sl_step)
+ str0 = sl_step * strides[0]
+ new_strides = (
+ strides if (sl_step == 1 or sh0 == 0) else (str0,) + strides[1:]
+ )
+ new_shape = (sh0, ) + shape[1:]
+ is_empty = any(sh_i == 0 for sh_i in new_shape)
+ new_offset = offset if is_empty else offset + sl_start * strides[0]
+ return (
+ new_shape,
+ new_strides,
+ new_offset,
+ _no_advanced_ind,
+ _no_advanced_pos,
+ )
+ elif _is_boolean(ind):
+ if ind:
+ return (
+ (1,) + shape,
+ (0,) + strides,
+ offset,
+ _no_advanced_ind,
+ _no_advanced_pos,
+ )
+ else:
+ return (
+ (0,) + shape,
+ (0,) + strides,
+ offset,
+ _no_advanced_ind,
+ _no_advanced_pos,
+ )
+ elif _is_integral(ind):
+ ind = index(ind)
+ new_shape = shape[1:]
+ new_strides = strides[1:]
+ is_empty = any(sh_i == 0 for sh_i in new_shape)
+ if 0 <= ind < shape[0]:
+ new_offset = offset if is_empty else offset + ind * strides[0]
+ return (
+ new_shape,
+ new_strides,
+ new_offset,
+ _no_advanced_ind,
+ _no_advanced_pos,
+ )
+ elif -shape[0] <= ind < 0:
+ new_offset = (
+ offset if is_empty else offset + (shape[0] + ind) * strides[0]
+ )
+ return (
+ new_shape,
+ new_strides,
+ new_offset,
+ _no_advanced_ind,
+ _no_advanced_pos,
+ )
+ else:
+ raise IndexError(
+ "Index {0} is out of range for axes 0 with "
+ "size {1}".format(ind, shape[0]))
+ elif isinstance(ind, (ndarray, usm_ndarray)):
+ return (shape, strides, offset, (ind,), 0)
+ elif isinstance(ind, tuple):
+ axes_referenced = 0
+ ellipses_count = 0
+ newaxis_count = 0
+ explicit_index = 0
+ seen_arrays_yet = False
+ array_streak_started = False
+ array_streak_interrupted = False
+ for i in ind:
+ if i is None:
+ newaxis_count += 1
+ if array_streak_started:
+ array_streak_interrupted = True
+ elif i is Ellipsis:
+ ellipses_count += 1
+ if array_streak_started:
+ array_streak_interrupted = True
+ elif isinstance(i, slice):
+ axes_referenced += 1
+ if array_streak_started:
+ array_streak_interrupted = True
+ elif _is_boolean(i):
+ newaxis_count += 1
+ if array_streak_started:
+ array_streak_interrupted = True
+ elif _is_integral(i):
+ axes_referenced += 1
+ if not array_streak_started and array_streak_interrupted:
+ explicit_index += 1
+ elif isinstance(i, (ndarray, usm_ndarray)):
+ if not seen_arrays_yet:
+ seen_arrays_yet = True
+ array_streak_started = True
+ array_streak_interrupted = False
+ if array_streak_interrupted:
+ raise IndexError(
+ "Advanced indexing array specs may not be "
+ "separated by basic slicing specs."
+ )
+ dt_k = i.dtype.kind
+ if dt_k == "b" and i.ndim > 0:
+ axes_referenced += i.ndim
+ elif dt_k in "ui" and i.ndim > 0:
+ axes_referenced += 1
+ else:
+ raise IndexError(
+ "arrays used as indices must be of integer "
+ "(or boolean) type"
+ )
+ else:
+ raise IndexError(
+ "Only integers, slices (`:`), ellipsis (`...`), "
+ "dpctl.tensor.newaxis (`None`) and integer and "
+ "boolean arrays are valid indices."
+ )
+ if ellipses_count > 1:
+ raise IndexError(
+ "an index can only have a single ellipsis ('...')")
+ if axes_referenced > len(shape):
+ raise IndexError(
+ "too many indices for an array, array is "
+ "{0}-dimensional, but {1} were indexed".format(
+ len(shape), axes_referenced))
+ if ellipses_count:
+ ellipses_count = len(shape) - axes_referenced
+ new_shape_len = (newaxis_count + ellipses_count
+ + axes_referenced - explicit_index)
+ new_shape = list()
+ new_strides = list()
+ new_advanced_ind = list()
+ k = 0
+ new_advanced_start_pos = -1
+ advanced_start_pos_set = False
+ new_offset = offset
+ is_empty = False
+ array_streak = False
+ for i in range(len(ind)):
+ ind_i = ind[i]
+ if (ind_i is Ellipsis):
+ k_new = k + ellipses_count
+ new_shape.extend(shape[k:k_new])
+ new_strides.extend(strides[k:k_new])
+ if any(dim == 0 for dim in shape[k:k_new]):
+ is_empty = True
+ new_offset = offset
+ k = k_new
+ if array_streak:
+ array_streak = False
+ elif ind_i is None:
+ new_shape.append(1)
+ new_strides.append(0)
+ if array_streak:
+ array_streak = False
+ elif isinstance(ind_i, slice):
+ k_new = k + 1
+ sl_start, sl_stop, sl_step = ind_i.indices(shape[k])
+ sh_i = _slice_len(sl_start, sl_stop, sl_step)
+ str_i = (1 if sh_i == 0 else sl_step) * strides[k]
+ new_shape.append(sh_i)
+ new_strides.append(str_i)
+ if sh_i > 0 and not is_empty:
+ new_offset = new_offset + sl_start * strides[k]
+ if sh_i == 0:
+ is_empty = True
+ new_offset = offset
+ k = k_new
+ if array_streak:
+ array_streak = False
+ elif _is_boolean(ind_i):
+ new_shape.append(1 if ind_i else 0)
+ new_strides.append(0)
+ if array_streak:
+ array_streak = False
+ elif _is_integral(ind_i):
+ if array_streak:
+ if not isinstance(ind_i, (ndarray, usm_ndarray)):
+ ind_i = index(ind_i)
+ # integer will be converted to an array,
+ # still raise if OOB
+ if not (
+ 0 <= ind_i < shape[k] or -shape[k] <= ind_i < 0
+ ):
+ raise IndexError(
+ "Index {0} is out of range for axes "
+ "{1} with size {2}".format(ind_i, k, shape[k])
+ )
+ new_advanced_ind.append(ind_i)
+ k_new = k + 1
+ new_shape.extend(shape[k:k_new])
+ new_strides.extend(strides[k:k_new])
+ k = k_new
+ else:
+ ind_i = index(ind_i)
+ if 0 <= ind_i < shape[k]:
+ k_new = k + 1
+ if not is_empty:
+ new_offset = new_offset + ind_i * strides[k]
+ k = k_new
+ elif -shape[k] <= ind_i < 0:
+ k_new = k + 1
+ if not is_empty:
+ new_offset = (
+ new_offset + (shape[k] + ind_i) * strides[k]
+ )
+ k = k_new
+ else:
+ raise IndexError(
+ "Index {0} is out of range for axes "
+ "{1} with size {2}".format(ind_i, k, shape[k])
+ )
+ elif isinstance(ind_i, (ndarray, usm_ndarray)):
+ if not array_streak:
+ array_streak = True
+ if not advanced_start_pos_set:
+ new_advanced_start_pos = len(new_shape)
+ advanced_start_pos_set = True
+ new_advanced_ind.append(ind_i)
+ dt_k = ind_i.dtype.kind
+ if dt_k == "b":
+ k_new = k + ind_i.ndim
+ else:
+ k_new = k + 1
+ new_shape.extend(shape[k:k_new])
+ new_strides.extend(strides[k:k_new])
+ k = k_new
+ new_shape.extend(shape[k:])
+ new_strides.extend(strides[k:])
+ new_shape_len += len(shape) - k
+ return (
+ tuple(new_shape),
+ tuple(new_strides),
+ new_offset,
+ tuple(new_advanced_ind),
+ new_advanced_start_pos
+ )
+ else:
+ raise IndexError(
+ "Only integers, slices (`:`), ellipsis (`...`), "
+ "dpctl.tensor.newaxis (`None`) and integer and "
+ "boolean arrays are valid indices."
+ )
diff --git a/dpctl_ext/tensor/_sorting.py b/dpctl_ext/tensor/_sorting.py
index 24693a408889..42cd9e1b44be 100644
--- a/dpctl_ext/tensor/_sorting.py
+++ b/dpctl_ext/tensor/_sorting.py
@@ -29,12 +29,11 @@
import operator
from typing import NamedTuple
-import dpctl.tensor as dpt
import dpctl.utils as du
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
from ._numpy_helper import normalize_axis_index
@@ -98,7 +97,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
nd = x.ndim
if nd == 0:
axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
- return dpt_ext.copy(x, order="C")
+ return dpt.copy(x, order="C")
else:
axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis")
a1 = axis + 1
@@ -109,7 +108,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
perm = [i for i in range(nd) if i != axis] + [
axis,
]
- arr = dpt_ext.permute_dims(x, perm)
+ arr = dpt.permute_dims(x, perm)
if kind is None:
kind = "stable"
if not isinstance(kind, str) or kind not in [
@@ -138,7 +137,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
_manager = du.SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
if arr.flags.c_contiguous:
- res = dpt_ext.empty_like(arr, order="C")
+ res = dpt.empty_like(arr, order="C")
ht_ev, impl_ev = impl_fn(
src=arr,
trailing_dims_to_sort=1,
@@ -148,12 +147,12 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
)
_manager.add_event_pair(ht_ev, impl_ev)
else:
- tmp = dpt_ext.empty_like(arr, order="C")
+ tmp = dpt.empty_like(arr, order="C")
ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
)
_manager.add_event_pair(ht_ev, copy_ev)
- res = dpt_ext.empty_like(arr, order="C")
+ res = dpt.empty_like(arr, order="C")
ht_ev, impl_ev = impl_fn(
src=tmp,
trailing_dims_to_sort=1,
@@ -164,7 +163,7 @@ def sort(x, /, *, axis=-1, descending=False, stable=True, kind=None):
_manager.add_event_pair(ht_ev, impl_ev)
if a1 != nd:
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- res = dpt_ext.permute_dims(res, inv_perm)
+ res = dpt.permute_dims(res, inv_perm)
return res
@@ -214,7 +213,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
nd = x.ndim
if nd == 0:
axis = normalize_axis_index(axis, ndim=1, msg_prefix="axis")
- return dpt_ext.zeros_like(
+ return dpt.zeros_like(
x, dtype=ti.default_device_index_type(x.sycl_queue), order="C"
)
else:
@@ -227,7 +226,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
perm = [i for i in range(nd) if i != axis] + [
axis,
]
- arr = dpt_ext.permute_dims(x, perm)
+ arr = dpt.permute_dims(x, perm)
if kind is None:
kind = "stable"
if not isinstance(kind, str) or kind not in [
@@ -257,7 +256,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
dep_evs = _manager.submitted_events
index_dt = ti.default_device_index_type(exec_q)
if arr.flags.c_contiguous:
- res = dpt_ext.empty_like(arr, dtype=index_dt, order="C")
+ res = dpt.empty_like(arr, dtype=index_dt, order="C")
ht_ev, impl_ev = impl_fn(
src=arr,
trailing_dims_to_sort=1,
@@ -267,12 +266,12 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
)
_manager.add_event_pair(ht_ev, impl_ev)
else:
- tmp = dpt_ext.empty_like(arr, order="C")
+ tmp = dpt.empty_like(arr, order="C")
ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
)
_manager.add_event_pair(ht_ev, copy_ev)
- res = dpt_ext.empty_like(arr, dtype=index_dt, order="C")
+ res = dpt.empty_like(arr, dtype=index_dt, order="C")
ht_ev, impl_ev = impl_fn(
src=tmp,
trailing_dims_to_sort=1,
@@ -283,7 +282,7 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None):
_manager.add_event_pair(ht_ev, impl_ev)
if a1 != nd:
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- res = dpt_ext.permute_dims(res, inv_perm)
+ res = dpt.permute_dims(res, inv_perm)
return res
@@ -354,8 +353,8 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
if k > 1:
raise ValueError(f"`k`={k} is out of bounds 1")
return TopKResult(
- dpt_ext.copy(x, order="C"),
- dpt_ext.zeros_like(
+ dpt.copy(x, order="C"),
+ dpt.zeros_like(
x, dtype=ti.default_device_index_type(x.sycl_queue)
),
)
@@ -373,7 +372,7 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
perm = [i for i in range(nd) if i != axis] + [
axis,
]
- arr = dpt_ext.permute_dims(x, perm)
+ arr = dpt.permute_dims(x, perm)
n_search_dims = 1
res_sh = arr.shape[: nd - 1] + (k,)
@@ -386,14 +385,14 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
res_usm_type = arr.usm_type
if arr.flags.c_contiguous:
- vals = dpt_ext.empty(
+ vals = dpt.empty(
res_sh,
dtype=arr.dtype,
usm_type=res_usm_type,
order="C",
sycl_queue=exec_q,
)
- inds = dpt_ext.empty(
+ inds = dpt.empty(
res_sh,
dtype=ti.default_device_index_type(exec_q),
usm_type=res_usm_type,
@@ -412,19 +411,19 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
)
_manager.add_event_pair(ht_ev, impl_ev)
else:
- tmp = dpt_ext.empty_like(arr, order="C")
+ tmp = dpt.empty_like(arr, order="C")
ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs
)
_manager.add_event_pair(ht_ev, copy_ev)
- vals = dpt_ext.empty(
+ vals = dpt.empty(
res_sh,
dtype=arr.dtype,
usm_type=res_usm_type,
order="C",
sycl_queue=exec_q,
)
- inds = dpt_ext.empty(
+ inds = dpt.empty(
res_sh,
dtype=ti.default_device_index_type(exec_q),
usm_type=res_usm_type,
@@ -444,7 +443,7 @@ def top_k(x, k, /, *, axis=None, mode="largest"):
_manager.add_event_pair(ht_ev, impl_ev)
if axis is not None and a1 != nd:
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- vals = dpt_ext.permute_dims(vals, inv_perm)
- inds = dpt_ext.permute_dims(inds, inv_perm)
+ vals = dpt.permute_dims(vals, inv_perm)
+ inds = dpt.permute_dims(inds, inv_perm)
return TopKResult(vals, inds)
diff --git a/dpctl_ext/tensor/_statistical_functions.py b/dpctl_ext/tensor/_statistical_functions.py
index 5513dfa7a65f..c1544b84c6a7 100644
--- a/dpctl_ext/tensor/_statistical_functions.py
+++ b/dpctl_ext/tensor/_statistical_functions.py
@@ -25,12 +25,11 @@
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
-import dpctl.tensor as dpt
import dpctl.utils as du
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_elementwise_impl as tei
import dpctl_ext.tensor._tensor_impl as ti
import dpctl_ext.tensor._tensor_reductions_impl as tri
@@ -66,7 +65,7 @@ def _var_impl(x, axis, correction, keepdims):
_manager = du.SequentialOrderManager[q]
dep_evs = _manager.submitted_events
if inp_dt != res_dt:
- buf = dpt_ext.empty_like(x, dtype=res_dt)
+ buf = dpt.empty_like(x, dtype=res_dt)
ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray(
src=x, dst=buf, sycl_queue=q, depends=dep_evs
)
@@ -74,18 +73,18 @@ def _var_impl(x, axis, correction, keepdims):
else:
buf = x
# calculate mean
- buf2 = dpt_ext.permute_dims(buf, perm)
+ buf2 = dpt.permute_dims(buf, perm)
res_shape = buf2.shape[: nd - red_nd]
# use keepdims=True path for later broadcasting
if red_nd == 0:
- mean_ary = dpt_ext.empty_like(buf)
+ mean_ary = dpt.empty_like(buf)
dep_evs = _manager.submitted_events
ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray(
src=buf, dst=mean_ary, sycl_queue=q, depends=dep_evs
)
_manager.add_event_pair(ht_e1, c_e2)
else:
- mean_ary = dpt_ext.empty(
+ mean_ary = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -103,8 +102,8 @@ def _var_impl(x, axis, correction, keepdims):
mean_ary_shape = res_shape + (1,) * red_nd
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- mean_ary = dpt_ext.permute_dims(
- dpt_ext.reshape(mean_ary, mean_ary_shape), inv_perm
+ mean_ary = dpt.permute_dims(
+ dpt.reshape(mean_ary, mean_ary_shape), inv_perm
)
# divide in-place to get mean
mean_ary_shape = mean_ary.shape
@@ -116,9 +115,9 @@ def _var_impl(x, axis, correction, keepdims):
_manager.add_event_pair(ht_e2, d_e1)
# subtract mean from original array to get deviations
- dev_ary = dpt_ext.empty_like(buf)
+ dev_ary = dpt.empty_like(buf)
if mean_ary_shape != buf.shape:
- mean_ary = dpt_ext.broadcast_to(mean_ary, buf.shape)
+ mean_ary = dpt.broadcast_to(mean_ary, buf.shape)
ht_e4, su_e = tei._subtract(
src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1]
)
@@ -130,11 +129,11 @@ def _var_impl(x, axis, correction, keepdims):
_manager.add_event_pair(ht_e5, sq_e)
# take sum of squared deviations
- dev_ary2 = dpt_ext.permute_dims(dev_ary, perm)
+ dev_ary2 = dpt.permute_dims(dev_ary, perm)
if red_nd == 0:
res = dev_ary
else:
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape,
dtype=res_dt,
usm_type=res_usm_type,
@@ -152,9 +151,7 @@ def _var_impl(x, axis, correction, keepdims):
if keepdims:
res_shape = res_shape + (1,) * red_nd
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- res = dpt_ext.permute_dims(
- dpt_ext.reshape(res, res_shape), inv_perm
- )
+ res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
res_shape = res.shape
# when nelems - correction <= 0, yield nans
div = max(nelems - correction, 0)
@@ -215,7 +212,7 @@ def mean(x, axis=None, keepdims=False):
nelems *= x.shape[i]
sum_nd = len(axis)
perm = perm + list(axis)
- arr2 = dpt_ext.permute_dims(x, perm)
+ arr2 = dpt.permute_dims(x, perm)
res_shape = arr2.shape[: nd - sum_nd]
q = x.sycl_queue
inp_dt = x.dtype
@@ -226,12 +223,12 @@ def mean(x, axis=None, keepdims=False):
)
res_usm_type = x.usm_type
if sum_nd == 0:
- return dpt_ext.astype(x, res_dt, copy=True)
+ return dpt.astype(x, res_dt, copy=True)
_manager = du.SequentialOrderManager[q]
dep_evs = _manager.submitted_events
if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q):
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e1, r_e = tri._sum_over_axis(
@@ -243,14 +240,14 @@ def mean(x, axis=None, keepdims=False):
)
_manager.add_event_pair(ht_e1, r_e)
else:
- tmp = dpt_ext.empty(
+ tmp = dpt.empty(
arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray(
src=arr2, dst=tmp, sycl_queue=q, depends=dep_evs
)
_manager.add_event_pair(ht_e_cpy, cpy_e)
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
)
ht_e_red, r_e = tri._sum_over_axis(
@@ -265,7 +262,7 @@ def mean(x, axis=None, keepdims=False):
if keepdims:
res_shape = res_shape + (1,) * sum_nd
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm)
+ res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
dep_evs = _manager.submitted_events
ht_e2, div_e = tei._divide_by_scalar(
diff --git a/dpctl_ext/tensor/_stride_utils.pxi b/dpctl_ext/tensor/_stride_utils.pxi
new file mode 100644
index 000000000000..3caf8dd8fd1f
--- /dev/null
+++ b/dpctl_ext/tensor/_stride_utils.pxi
@@ -0,0 +1,314 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+
+from cpython.mem cimport PyMem_Malloc
+from cpython.ref cimport Py_INCREF
+from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
+
+
+cdef int ERROR_MALLOC = 1
+cdef int ERROR_INTERNAL = -1
+cdef int ERROR_INCORRECT_ORDER = 2
+cdef int ERROR_UNEXPECTED_STRIDES = 3
+
+cdef int USM_ARRAY_C_CONTIGUOUS = 1
+cdef int USM_ARRAY_F_CONTIGUOUS = 2
+cdef int USM_ARRAY_WRITABLE = 4
+
+
+cdef Py_ssize_t shape_to_elem_count(int nd, Py_ssize_t *shape_arr):
+ """
+ Computes number of elements in an array.
+ """
+ cdef Py_ssize_t count = 1
+ for i in range(nd):
+ count *= shape_arr[i]
+ return count
+
+
+cdef int _from_input_shape_strides(
+ int nd, object shape, object strides, int itemsize, char order,
+ Py_ssize_t **shape_ptr, Py_ssize_t **strides_ptr,
+ Py_ssize_t *nelems, Py_ssize_t *min_disp, Py_ssize_t *max_disp,
+ int *contig
+):
+ """
+ Arguments: nd, shape, strides, itemsize, order
+ Modifies:
+ shape_ptr - pointer to C array for shape values
+ stride_ptr - pointer to C array for strides values
+ nelems - Number of elements in array
+ min_disp = min( dot(strides, index), index for shape)
+ max_disp = max( dot(strides, index), index for shape)
+ contig = enumeration for array contiguity
+ Returns: 0 on success, error code otherwise.
+ On success pointers point to allocated arrays,
+ Otherwise they are set to NULL
+ """
+ cdef int i
+ cdef int j
+ cdef bint all_incr = 1
+ cdef bint all_decr = 1
+ cdef bint strides_inspected = 0
+ cdef Py_ssize_t elem_count = 1
+ cdef Py_ssize_t min_shift = 0
+ cdef Py_ssize_t max_shift = 0
+ cdef Py_ssize_t str_i
+ cdef Py_ssize_t* shape_arr
+ cdef Py_ssize_t* strides_arr
+
+ if (int(order) not in [ord("C"), ord("F"), ord("c"), ord("f")]):
+ return ERROR_INCORRECT_ORDER
+
+ # 0-d array
+ if (nd == 0):
+ contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+ nelems[0] = 1
+ min_disp[0] = 0
+ max_disp[0] = 0
+ shape_ptr[0] = (0)
+ strides_ptr[0] = (0)
+ return 0
+
+ shape_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t))
+ if (not shape_arr):
+ return ERROR_MALLOC
+ shape_ptr[0] = shape_arr
+ for i in range(0, nd):
+ shape_arr[i] = shape[i]
+ elem_count *= shape_arr[i]
+ if elem_count == 0:
+ contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+ nelems[0] = 1
+ min_disp[0] = 0
+ max_disp[0] = 0
+ if strides is None:
+ strides_ptr[0] = (0)
+ else:
+ strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t))
+ if (not strides_arr):
+ PyMem_Free(shape_ptr[0])
+ shape_ptr[0] = (0)
+ return ERROR_MALLOC
+ strides_ptr[0] = strides_arr
+ for i in range(0, nd):
+ strides_arr[i] = strides[i]
+ return 0
+ nelems[0] = elem_count
+ if (strides is None):
+ # no need to allocate and populate strides
+ if order == ord("C") or order == ord("c"):
+ contig[0] = USM_ARRAY_C_CONTIGUOUS
+ else:
+ contig[0] = USM_ARRAY_F_CONTIGUOUS
+ if nd == 1:
+ contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
+ else:
+ j = 0
+ for i in range(nd):
+ if shape_arr[i] > 1:
+ j = j + 1
+ if j < 2:
+ contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
+ min_disp[0] = 0
+ max_disp[0] = (elem_count - 1)
+ strides_ptr[0] = (0)
+ return 0
+ elif ((isinstance(strides, (list, tuple)) or hasattr(strides, "tolist"))
+ and len(strides) == nd):
+ strides_arr = PyMem_Malloc(nd * sizeof(Py_ssize_t))
+ if (not strides_arr):
+ PyMem_Free(shape_ptr[0])
+ shape_ptr[0] = (0)
+ return ERROR_MALLOC
+ strides_ptr[0] = strides_arr
+ for i in range(0, nd):
+ str_i = strides[i]
+ strides_arr[i] = str_i
+ if str_i > 0:
+ max_shift += str_i * (shape_arr[i] - 1)
+ else:
+ min_shift += str_i * (shape_arr[i] - 1)
+ min_disp[0] = min_shift
+ max_disp[0] = max_shift
+ if max_shift == min_shift + (elem_count - 1):
+ if elem_count == 1:
+ contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+ return 0
+ if nd == 1:
+ if strides_arr[0] == 1:
+ contig[0] = USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS
+ else:
+ contig[0] = 0
+ return 0
+ i = 0
+ while i < nd:
+ if shape_arr[i] == 1:
+ i = i + 1
+ continue
+ j = i + 1
+ while (j < nd and shape_arr[j] == 1):
+ j = j + 1
+ if j < nd:
+ strides_inspected = 1
+ if all_incr:
+ all_incr = (
+ (strides_arr[i] > 0) and
+ (strides_arr[j] > 0) and
+ (strides_arr[i] <= strides_arr[j])
+ )
+ if all_decr:
+ all_decr = (
+ (strides_arr[i] > 0) and
+ (strides_arr[j] > 0) and
+ (strides_arr[i] >= strides_arr[j])
+ )
+ i = j
+ else:
+ if not strides_inspected:
+ # all dimensions have size 1 except
+ # dimension 'i'. Array is both C and F
+ # contiguous
+ strides_inspected = 1
+ all_incr = (strides_arr[i] == 1)
+ all_decr = all_incr
+ break
+ # should only set contig flags on actually obtained
+ # values, rather than default values
+ all_incr = all_incr and strides_inspected
+ all_decr = all_decr and strides_inspected
+ if all_incr and all_decr:
+ contig[0] = (USM_ARRAY_C_CONTIGUOUS | USM_ARRAY_F_CONTIGUOUS)
+ elif all_incr:
+ contig[0] = USM_ARRAY_F_CONTIGUOUS
+ elif all_decr:
+ contig[0] = USM_ARRAY_C_CONTIGUOUS
+ else:
+ contig[0] = 0
+ return 0
+ else:
+ contig[0] = 0 # non-contiguous
+ return 0
+ else:
+ PyMem_Free(shape_ptr[0])
+ shape_ptr[0] = (0)
+ return ERROR_UNEXPECTED_STRIDES
+ # return ERROR_INTERNAL
+
+
+cdef object _make_int_tuple(int nd, const Py_ssize_t *ary):
+ """
+ Makes Python tuple from C array
+ """
+ cdef tuple res
+ cdef object tmp
+ if (ary):
+ res = PyTuple_New(nd)
+ for i in range(nd):
+ tmp = ary[i]
+ Py_INCREF(tmp) # SetItem steals the reference
+ PyTuple_SetItem(res, i, tmp)
+ return res
+ else:
+ return None
+
+
+cdef object _make_reversed_int_tuple(int nd, const Py_ssize_t *ary):
+ """
+ Makes Python reversed tuple from C array
+ """
+ cdef tuple res
+ cdef object tmp
+ cdef int i
+ cdef int nd_1
+ if (ary):
+ res = PyTuple_New(nd)
+ nd_1 = nd - 1
+ for i in range(nd):
+ tmp = ary[i]
+ Py_INCREF(tmp) # SetItem steals the reference
+ PyTuple_SetItem(res, nd_1 - i, tmp)
+ return res
+ else:
+ return None
+
+
+cdef object _c_contig_strides(int nd, Py_ssize_t *shape):
+ """
+ Makes Python tuple for strides of C-contiguous array
+ """
+ cdef tuple cc_strides = PyTuple_New(nd)
+ cdef object si = 1
+ cdef int i
+ cdef int nd_1 = nd - 1
+ for i in range(0, nd):
+ Py_INCREF(si) # SetItem steals the reference
+ PyTuple_SetItem(cc_strides, nd_1 - i, si)
+ si = si * shape[nd_1 - i]
+ return cc_strides
+
+
+cdef object _f_contig_strides(int nd, Py_ssize_t *shape):
+ """
+ Makes Python tuple for strides of F-contiguous array
+ """
+ cdef tuple fc_strides = PyTuple_New(nd)
+ cdef object si = 1
+ for i in range(0, nd):
+ Py_INCREF(si) # SetItem steals the reference
+ PyTuple_SetItem(fc_strides, i, si)
+ si = si * shape[i]
+ return fc_strides
+
+cdef object _swap_last_two(tuple t):
+ """
+ Swap last two elements of a tuple
+ """
+ cdef int nd = len(t)
+ cdef tuple res
+ cdef int i
+ cdef object tmp
+ if (nd < 2):
+ return t
+ res = PyTuple_New(nd)
+ # copy all elements except the last two
+ for i in range(0, nd-2):
+ tmp = t[i]
+ Py_INCREF(tmp) # SetItem steals the reference
+ PyTuple_SetItem(res, i, tmp)
+ # swap the last two elements
+ tmp = t[nd-1]
+ Py_INCREF(tmp) # SetItem steals
+ PyTuple_SetItem(res, nd - 2, tmp)
+ tmp = t[nd-2]
+ Py_INCREF(tmp) # SetItem steals
+ PyTuple_SetItem(res, nd - 1, tmp)
+ return res
diff --git a/dpctl_ext/tensor/_testing.py b/dpctl_ext/tensor/_testing.py
index 5c7e9be0e2e3..4c9f5ebac9a4 100644
--- a/dpctl_ext/tensor/_testing.py
+++ b/dpctl_ext/tensor/_testing.py
@@ -26,13 +26,12 @@
# THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************
-import dpctl.tensor as dpt
import dpctl.utils as du
import numpy as np
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
from ._manipulation_functions import _broadcast_shape_impl
from ._type_utils import _to_device_supported_dtype
@@ -44,82 +43,74 @@ def _allclose_complex_fp(z1, z2, atol, rtol, equal_nan):
z2r = dpt.real(z2)
z2i = dpt.imag(z2)
if equal_nan:
- check1 = dpt_ext.all(
- dpt_ext.isnan(z1r) == dpt_ext.isnan(z2r)
- ) and dpt_ext.all(dpt_ext.isnan(z1i) == dpt_ext.isnan(z2i))
+ check1 = dpt.all(dpt.isnan(z1r) == dpt.isnan(z2r)) and dpt.all(
+ dpt.isnan(z1i) == dpt.isnan(z2i)
+ )
else:
check1 = (
- dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z1r)))
- and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z1i)))
+ dpt.logical_not(dpt.any(dpt.isnan(z1r)))
+ and dpt.logical_not(dpt.any(dpt.isnan(z1i)))
) and (
- dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z2r)))
- and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(z2i)))
+ dpt.logical_not(dpt.any(dpt.isnan(z2r)))
+ and dpt.logical_not(dpt.any(dpt.isnan(z2i)))
)
if not check1:
return check1
- mr = dpt_ext.isinf(z1r)
- mi = dpt_ext.isinf(z1i)
- check2 = dpt_ext.all(mr == dpt_ext.isinf(z2r)) and dpt_ext.all(
- mi == dpt_ext.isinf(z2i)
- )
+ mr = dpt.isinf(z1r)
+ mi = dpt.isinf(z1i)
+ check2 = dpt.all(mr == dpt.isinf(z2r)) and dpt.all(mi == dpt.isinf(z2i))
if not check2:
return check2
- check3 = dpt_ext.all(z1r[mr] == z2r[mr]) and dpt_ext.all(z1i[mi] == z2i[mi])
+ check3 = dpt.all(z1r[mr] == z2r[mr]) and dpt.all(z1i[mi] == z2i[mi])
if not check3:
return check3
- mr = dpt_ext.isfinite(z1r)
- mi = dpt_ext.isfinite(z1i)
+ mr = dpt.isfinite(z1r)
+ mi = dpt.isfinite(z1i)
mv1 = z1r[mr]
mv2 = z2r[mr]
- check4 = dpt_ext.all(
- dpt_ext.abs(mv1 - mv2)
- < dpt_ext.maximum(
- atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
- )
+ check4 = dpt.all(
+ dpt.abs(mv1 - mv2)
+ < dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
)
if not check4:
return check4
mv1 = z1i[mi]
mv2 = z2i[mi]
- check5 = dpt_ext.all(
- dpt_ext.abs(mv1 - mv2)
- <= dpt_ext.maximum(
- atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
- )
+ check5 = dpt.all(
+ dpt.abs(mv1 - mv2)
+ <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
)
return check5
def _allclose_real_fp(r1, r2, atol, rtol, equal_nan):
if equal_nan:
- check1 = dpt_ext.all(dpt_ext.isnan(r1) == dpt_ext.isnan(r2))
+ check1 = dpt.all(dpt.isnan(r1) == dpt.isnan(r2))
else:
- check1 = dpt_ext.logical_not(
- dpt_ext.any(dpt_ext.isnan(r1))
- ) and dpt_ext.logical_not(dpt_ext.any(dpt_ext.isnan(r2)))
+ check1 = dpt.logical_not(dpt.any(dpt.isnan(r1))) and dpt.logical_not(
+ dpt.any(dpt.isnan(r2))
+ )
if not check1:
return check1
- mr = dpt_ext.isinf(r1)
- check2 = dpt_ext.all(mr == dpt_ext.isinf(r2))
+ mr = dpt.isinf(r1)
+ check2 = dpt.all(mr == dpt.isinf(r2))
if not check2:
return check2
- check3 = dpt_ext.all(r1[mr] == r2[mr])
+ check3 = dpt.all(r1[mr] == r2[mr])
if not check3:
return check3
- m = dpt_ext.isfinite(r1)
+ m = dpt.isfinite(r1)
mv1 = r1[m]
mv2 = r2[m]
- check4 = dpt_ext.all(
- dpt_ext.abs(mv1 - mv2)
- <= dpt_ext.maximum(
- atol, rtol * dpt_ext.maximum(dpt_ext.abs(mv1), dpt_ext.abs(mv2))
- )
+ check4 = dpt.all(
+ dpt.abs(mv1 - mv2)
+ <= dpt.maximum(atol, rtol * dpt.maximum(dpt.abs(mv1), dpt.abs(mv2)))
)
return check4
def _allclose_others(r1, r2):
- return dpt_ext.all(r1 == r2)
+ return dpt.all(r1 == r2)
def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
@@ -160,11 +151,11 @@ def allclose(a1, a2, atol=1e-8, rtol=1e-5, equal_nan=False):
else:
res_dt = np.promote_types(b1.dtype, b2.dtype)
res_dt = _to_device_supported_dtype(res_dt, exec_q.sycl_device)
- b1 = dpt_ext.astype(b1, res_dt)
- b2 = dpt_ext.astype(b2, res_dt)
+ b1 = dpt.astype(b1, res_dt)
+ b2 = dpt.astype(b2, res_dt)
- b1 = dpt_ext.broadcast_to(b1, res_sh)
- b2 = dpt_ext.broadcast_to(b2, res_sh)
+ b1 = dpt.broadcast_to(b1, res_sh)
+ b2 = dpt.broadcast_to(b2, res_sh)
k = b1.dtype.kind
if k == "c":
diff --git a/dpctl_ext/tensor/_type_utils.py b/dpctl_ext/tensor/_type_utils.py
index 1e386e15dfa3..8c15053cb4c1 100644
--- a/dpctl_ext/tensor/_type_utils.py
+++ b/dpctl_ext/tensor/_type_utils.py
@@ -28,12 +28,11 @@
from __future__ import annotations
-import dpctl.tensor as dpt
import numpy as np
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
@@ -450,7 +449,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev):
o1_dtype, WeakIntegralType
):
o1_val = o1_dtype.get()
- o2_iinfo = dpt_ext.iinfo(o2_dtype)
+ o2_iinfo = dpt.iinfo(o2_dtype)
if (o1_val < o2_iinfo.min) or (o1_val > o2_iinfo.max):
return dpt.dtype(np.min_scalar_type(o1_val)), o2_dtype
return o2_dtype, o2_dtype
@@ -473,7 +472,7 @@ def _resolve_weak_types_all_py_ints(o1_dtype, o2_dtype, dev):
o2_dtype, WeakIntegralType
):
o2_val = o2_dtype.get()
- o1_iinfo = dpt_ext.iinfo(o1_dtype)
+ o1_iinfo = dpt.iinfo(o1_dtype)
if (o2_val < o1_iinfo.min) or (o2_val > o1_iinfo.max):
return o1_dtype, dpt.dtype(np.min_scalar_type(o2_val))
return o1_dtype, o1_dtype
@@ -936,8 +935,8 @@ def _default_accumulation_dtype(inp_dt, q):
res_dt = inp_dt
elif inp_kind in "u":
res_dt = dpt.dtype(ti.default_device_uint_type(q))
- res_ii = dpt_ext.iinfo(res_dt)
- inp_ii = dpt_ext.iinfo(inp_dt)
+ res_ii = dpt.iinfo(res_dt)
+ inp_ii = dpt.iinfo(inp_dt)
if inp_ii.min >= res_ii.min and inp_ii.max <= res_ii.max:
pass
else:
@@ -956,7 +955,7 @@ def _default_accumulation_dtype_fp_types(inp_dt, q):
inp_kind = inp_dt.kind
if inp_kind in "biu":
res_dt = dpt.dtype(ti.default_device_fp_type(q))
- can_cast_v = dpt_ext.can_cast(inp_dt, res_dt)
+ can_cast_v = dpt.can_cast(inp_dt, res_dt)
if not can_cast_v:
_fp64 = q.sycl_device.has_aspect_fp64
res_dt = dpt.float64 if _fp64 else dpt.float32
diff --git a/dpctl_ext/tensor/_types.pxi b/dpctl_ext/tensor/_types.pxi
new file mode 100644
index 000000000000..090750658f4b
--- /dev/null
+++ b/dpctl_ext/tensor/_types.pxi
@@ -0,0 +1,169 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# these typenum values are aligned to values in NumPy
+cdef:
+ int UAR_BOOL = 0 # pragma: no cover
+ int UAR_BYTE = 1 # pragma: no cover
+ int UAR_UBYTE = 2 # pragma: no cover
+ int UAR_SHORT = 3 # pragma: no cover
+ int UAR_USHORT = 4 # pragma: no cover
+ int UAR_INT = 5 # pragma: no cover
+ int UAR_UINT = 6 # pragma: no cover
+ int UAR_LONG = 7 # pragma: no cover
+ int UAR_ULONG = 8 # pragma: no cover
+ int UAR_LONGLONG = 9 # pragma: no cover
+ int UAR_ULONGLONG = 10 # pragma: no cover
+ int UAR_FLOAT = 11 # pragma: no cover
+ int UAR_DOUBLE = 12 # pragma: no cover
+ int UAR_CFLOAT = 14 # pragma: no cover
+ int UAR_CDOUBLE = 15 # pragma: no cover
+ int UAR_TYPE_SENTINEL = 17 # pragma: no cover
+ int UAR_HALF = 23 # pragma: no cover
+
+cdef int type_bytesize(int typenum):
+ """
+ NPY_BOOL=0 : 1
+ NPY_BYTE=1 : 1
+ NPY_UBYTE=2 : 1
+ NPY_SHORT=3 : 2
+ NPY_USHORT=4 : 2
+ NPY_INT=5 : sizeof(int)
+ NPY_UINT=6 : sizeof(unsigned int)
+ NPY_LONG=7 : sizeof(long)
+ NPY_ULONG=8 : sizeof(unsigned long)
+ NPY_LONGLONG=9 : 8
+ NPY_ULONGLONG=10 : 8
+ NPY_FLOAT=11 : 4
+ NPY_DOUBLE=12 : 8
+ NPY_LONGDOUBLE=13 : N/A
+ NPY_CFLOAT=14 : 8
+ NPY_CDOUBLE=15 : 16
+ NPY_CLONGDOUBLE=16 : N/A
+ NPY_HALF=23 : 2
+ """
+ cdef int *type_to_bytesize = [
+ 1,
+ sizeof(char),
+ sizeof(unsigned char),
+ sizeof(short),
+ sizeof(unsigned short),
+ sizeof(int),
+ sizeof(unsigned int),
+ sizeof(long),
+ sizeof(unsigned long),
+ sizeof(long long),
+ sizeof(unsigned long long),
+ sizeof(float),
+ sizeof(double), -1,
+ sizeof(float complex),
+ sizeof(double complex), -1]
+
+ if typenum < 0: # pragma: no cover
+ return -1
+ if typenum > 16:
+ if typenum == 23:
+ return 2
+ return -1
+
+ return type_to_bytesize[typenum]
+
+
+cdef str _make_typestr(int typenum):
+ """
+ Make typestring from type number
+ """
+ cdef type_to_str = ["|b", "|i", "|u", "|i", "|u",
+ "|i", "|u", "|i", "|u", "|i", "|u",
+ "|f", "|f", "", "|c", "|c", ""]
+
+ if (typenum < 0): # pragma: no cover
+ return ""
+ if (typenum > 16):
+ if (typenum == 23):
+ return "|f2"
+ return "" # pragma: no cover
+
+ return type_to_str[typenum] + str(type_bytesize(typenum))
+
+
+cdef int typenum_from_format(str s):
+ """
+ Internal utility to convert string describing type format
+
+ Format is [<|=>][biufc]#
+ Shortcuts for formats are i, u, d, D
+ """
+ if not s:
+ return -1
+ try:
+ dt = np.dtype(s)
+ except Exception:
+ return -1
+ if (dt.byteorder == ">"):
+ return -2
+ return dt.num
+
+
+cdef int descr_to_typenum(object dtype):
+ """
+ Returns typenum for argument dtype that has attribute descr,
+ assumed numpy.dtype
+ """
+ obj = getattr(dtype, "descr")
+ if (not isinstance(obj, list) or len(obj) != 1):
+ return -1 # token for ValueError
+ obj = obj[0]
+ if (
+ not isinstance(obj, tuple) or len(obj) != 2 or obj[0]
+ ): # pragma: no cover
+ return -1
+ obj = obj[1]
+ if not isinstance(obj, str): # pragma: no cover
+ return -1
+ return typenum_from_format(obj)
+
+
+cdef int dtype_to_typenum(dtype):
+ if isinstance(dtype, str):
+ return typenum_from_format(dtype)
+ elif isinstance(dtype, bytes):
+ return typenum_from_format(dtype.decode("UTF-8"))
+ elif hasattr(dtype, "descr"):
+ return descr_to_typenum(dtype)
+ else:
+ try:
+ dt = np.dtype(dtype)
+ except TypeError:
+ return -3
+ except Exception: # pragma: no cover
+ return -1
+ if hasattr(dt, "descr"):
+ return descr_to_typenum(dt)
+ else: # pragma: no cover
+ return -3 # token for TypeError
diff --git a/dpctl_ext/tensor/_usmarray.pxd b/dpctl_ext/tensor/_usmarray.pxd
new file mode 100644
index 000000000000..ccb8f4c796b7
--- /dev/null
+++ b/dpctl_ext/tensor/_usmarray.pxd
@@ -0,0 +1,88 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+
+cimport dpctl
+
+
+cdef public api int USM_ARRAY_C_CONTIGUOUS
+cdef public api int USM_ARRAY_F_CONTIGUOUS
+cdef public api int USM_ARRAY_WRITABLE
+
+cdef public api int UAR_BOOL
+cdef public api int UAR_BYTE
+cdef public api int UAR_UBYTE
+cdef public api int UAR_SHORT
+cdef public api int UAR_USHORT
+cdef public api int UAR_INT
+cdef public api int UAR_UINT
+cdef public api int UAR_LONG
+cdef public api int UAR_ULONG
+cdef public api int UAR_LONGLONG
+cdef public api int UAR_ULONGLONG
+cdef public api int UAR_FLOAT
+cdef public api int UAR_DOUBLE
+cdef public api int UAR_CFLOAT
+cdef public api int UAR_CDOUBLE
+cdef public api int UAR_TYPE_SENTINEL
+cdef public api int UAR_HALF
+
+
+cdef api class usm_ndarray [object PyUSMArrayObject, type PyUSMArrayType]:
+ # data fields
+ cdef char* data_
+ cdef int nd_
+ cdef Py_ssize_t *shape_
+ cdef Py_ssize_t *strides_
+ cdef int typenum_
+ cdef int flags_
+ cdef object base_
+ cdef object array_namespace_
+ # make usm_ndarray weak-referenceable
+ cdef object __weakref__
+
+ cdef void _reset(usm_ndarray self)
+ cdef void _cleanup(usm_ndarray self)
+ cdef Py_ssize_t get_offset(usm_ndarray self) except *
+
+ cdef char* get_data(self)
+ cdef int get_ndim(self)
+ cdef Py_ssize_t * get_shape(self)
+ cdef Py_ssize_t * get_strides(self)
+ cdef int get_typenum(self)
+ cdef int get_itemsize(self)
+ cdef int get_flags(self)
+ cdef object get_base(self)
+ cdef dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *
+ cdef dpctl.SyclQueue get_sycl_queue(self)
+
+ cdef _set_writable_flag(self, int)
+
+ cdef __cythonbufferdefaults__ = {"mode": "strided"}
diff --git a/dpctl_ext/tensor/_usmarray.pyx b/dpctl_ext/tensor/_usmarray.pyx
new file mode 100644
index 000000000000..f5bca9b1635d
--- /dev/null
+++ b/dpctl_ext/tensor/_usmarray.pyx
@@ -0,0 +1,1986 @@
+# *****************************************************************************
+# Copyright (c) 2026, Intel Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# - Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+# - Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# - Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+import dpctl
+import dpctl.memory as dpmem
+import numpy as np
+
+from dpctl._backend cimport DPCTLSyclUSMRef
+from dpctl._sycl_device_factory cimport _cached_default_device
+
+# TODO: remove it when dpnp fully migrates dpctl/tensor
+import dpctl_ext
+
+from ._data_types import bool as dpt_bool
+from ._device import Device
+from ._print import usm_ndarray_repr, usm_ndarray_str
+
+cimport dpctl as c_dpctl
+cimport dpctl.memory as c_dpmem
+from cpython.mem cimport PyMem_Free
+from cpython.tuple cimport PyTuple_New, PyTuple_SetItem
+
+from . cimport _dlpack as c_dlpack
+
+from enum import IntEnum
+
+from . import _flags
+from ._dlpack import get_build_dlpack_version
+from ._tensor_impl import default_device_fp_type
+
+include "_stride_utils.pxi"
+include "_types.pxi"
+include "_slicing.pxi"
+
+
+class DLDeviceType(IntEnum):
+ """
+ An :class:`enum.IntEnum` for the types of DLDevices supported by the DLPack
+ protocol.
+
+ ``kDLCPU``:
+ CPU (host) device
+ ``kDLCUDA``:
+ CUDA GPU device
+ ``kDLCUDAHost``:
+ Pinned CUDA CPU memory by cudaMallocHost
+ ``kDLOpenCL``:
+ OpenCL device
+ ``kDLVulkan``:
+ Vulkan buffer
+ ``kDLMetal``:
+ Metal for Apple GPU
+ ``kDLVPI``:
+ Verilog simulator buffer
+ ``kDLROCM``:
+ ROCm GPU device
+ ``kDLROCMHost``:
+ Pinned ROCm CPU memory allocated by hipMallocHost
+ ``kDLExtDev``:
+ Reserved extension device type used to test new devices
+ ``kDLCUDAManaged``:
+ CUDA managed/unified memory allocated by cudaMallocManaged
+ ``kDLOneAPI``:
+ Unified shared memory allocated on a oneAPI non-partitioned device
+ ``kDLWebGPU``:
+ Device support for WebGPU standard
+ ``kDLHexagon``:
+ Qualcomm Hexagon DSP
+ ``kDLMAIA``:
+ Microsoft MAIA device
+ ``kDLTrn``:
+ AWS Trainium device
+ """
+ kDLCPU = c_dlpack.device_CPU
+ kDLCUDA = c_dlpack.device_CUDA
+ kDLCUDAHost = c_dlpack.device_CUDAHost
+ kDLCUDAManaged = c_dlpack.device_CUDAManaged
+ kDLROCM = c_dlpack.device_DLROCM
+ kDLROCMHost = c_dlpack.device_ROCMHost
+ kDLOpenCL = c_dlpack.device_OpenCL
+ kDLVulkan = c_dlpack.device_Vulkan
+ kDLMetal = c_dlpack.device_Metal
+ kDLVPI = c_dlpack.device_VPI
+ kDLOneAPI = c_dlpack.device_OneAPI
+ kDLWebGPU = c_dlpack.device_WebGPU
+ kDLHexagon = c_dlpack.device_Hexagon
+ kDLMAIA = c_dlpack.device_MAIA
+ kDLTrn = c_dlpack.device_Trn
+
+
+cdef class InternalUSMArrayError(Exception):
+ """
+ An InternalUSMArrayError exception is raised when internal
+ inconsistency has been detected in :class:`.usm_ndarray`.
+ """
+ pass
+
+
+cdef object _as_zero_dim_ndarray(object usm_ary):
+ "Convert size-1 array to NumPy 0d array"
+ mem_view = dpmem.as_usm_memory(usm_ary)
+ usm_ary.sycl_queue.wait()
+ host_buf = mem_view.copy_to_host()
+ view = host_buf.view(usm_ary.dtype)
+ view.shape = tuple()
+ return view
+
+
+cdef inline void _check_0d_scalar_conversion(object usm_ary) except *:
+ "Raise TypeError if array cannot be converted to a Python scalar"
+ if (usm_ary.ndim != 0):
+ raise TypeError(
+ "only 0-dimensional arrays can be converted to Python scalars"
+ )
+
+
+cdef int _copy_writable(int lhs_flags, int rhs_flags):
+ "Copy the WRITABLE flag to lhs_flags from rhs_flags"
+ return (lhs_flags & ~USM_ARRAY_WRITABLE) | (rhs_flags & USM_ARRAY_WRITABLE)
+
+
+cdef bint _is_host_cpu(object dl_device):
+ "Check if dl_device denotes (kDLCPU, 0)"
+ cdef object dl_type
+ cdef object dl_id
+ cdef Py_ssize_t n_elems = -1
+
+ try:
+ n_elems = len(dl_device)
+ except TypeError:
+ pass
+
+ if n_elems != 2:
+ return False
+
+ dl_type = dl_device[0]
+ dl_id = dl_device[1]
+ if isinstance(dl_type, str):
+ return (dl_type == "kDLCPU" and dl_id == 0)
+
+ return (dl_type == DLDeviceType.kDLCPU) and (dl_id == 0)
+
+
+cdef void _validate_and_use_stream(
+ object stream, c_dpctl.SyclQueue self_queue
+) except *:
+ if (stream is None or stream == self_queue):
+ pass
+ else:
+ if not isinstance(stream, dpctl.SyclQueue):
+ raise TypeError(
+ "stream argument type was expected to be dpctl.SyclQueue,"
+ f" got {type(stream)} instead"
+ )
+ ev = self_queue.submit_barrier()
+ stream.submit_barrier(dependent_events=[ev])
+
+cdef class usm_ndarray:
+ """ usm_ndarray(shape, dtype=None, strides=None, buffer="device", \
+ offset=0, order="C", buffer_ctor_kwargs=dict(), \
+ array_namespace=None)
+
+ An array object represents a multidimensional tensor of numeric
+ elements stored in a USM allocation on a SYCL device.
+
+ Args:
+ shape (int, tuple):
+ Shape of the array to be created.
+ dtype (str, dtype):
+ Array data type, i.e. the type of array elements.
+ If ``dtype`` has the value ``None``, it is determined by default
+ floating point type supported by target device.
+ The supported types are
+
+ ``bool``:
+ boolean type
+ ``int8``, ``int16``, ``int32``, ``int64``:
+ signed integer types
+ ``uint8``, ``uint16``, ``uint32``, ``uint64``:
+ unsigned integer types
+ ``float16``:
+ half-precision floating type,
+ supported if target device's property
+ ``has_aspect_fp16`` is ``True``
+ ``float32``, ``complex64``:
+ single-precision real and complex floating types
+ ``float64``, ``complex128``:
+ double-precision real and complex floating
+ types, supported if target device's property
+ ``has_aspect_fp64`` is ``True``.
+
+ Default: ``None``.
+ strides (tuple, optional):
+ Strides of the array to be created in elements.
+ If ``strides`` has the value ``None``, it is determined by the
+ ``shape`` of the array and the requested ``order``.
+ Default: ``None``.
+ buffer (str, object, optional):
+ A string corresponding to the type of USM allocation to make,
+ or a Python object representing a USM memory allocation, i.e.
+ :class:`dpctl.memory.MemoryUSMDevice`,
+ :class:`dpctl.memory.MemoryUSMShared`, or
+ :class:`dpctl.memory.MemoryUSMHost`. Recognized strings are
+ ``"device"``, ``"shared"``, or ``"host"``. Additional arguments to
+ the USM memory allocators can be passed in a dictionary specified
+ via ``buffer_ctor_kwrds`` keyword parameter.
+ Default: ``"device"``.
+ offset (int, optional):
+ Offset of the array element with all zero indexes relative to the
+ start of the provided `buffer` in elements. The argument is ignored
+ if the ``buffer`` value is a string and the memory is allocated by
+ the constructor. Default: ``0``.
+ order ({"C", "F"}, optional):
+ The memory layout of the array when constructing using a new
+ allocation. Value ``"C"`` corresponds to C-contiguous, or row-major
+ memory layout, while value ``"F"`` corresponds to F-contiguous, or
+ column-major layout. Default: ``"C"``.
+ buffer_ctor_kwargs (dict, optional):
+ Dictionary with keyword parameters to use when creating a new USM
+ memory allocation. See :class:`dpctl.memory.MemoryUSMShared` for
+ supported keyword arguments.
+ array_namespace (module, optional):
+ Array namespace module associated with this array.
+ Default: ``None``.
+
+ ``buffer`` can be ``"shared"``, ``"host"``, ``"device"`` to allocate
+ new device memory by calling respective constructor with
+ the specified ``buffer_ctor_kwrds``; ``buffer`` can be an
+ instance of :class:`dpctl.memory.MemoryUSMShared`,
+ :class:`dpctl.memory.MemoryUSMDevice`, or
+ :class:`dpctl.memory.MemoryUSMHost`; ``buffer`` can also be
+ another :class:`dpctl.tensor.usm_ndarray` instance, in which case its
+ underlying ``MemoryUSM*`` buffer is used.
+ """
+
    cdef void _reset(usm_ndarray self):
        """
        Initializes member fields to a consistent "empty" state.
        """
        self.base_ = None             # owner of the USM allocation
        self.array_namespace_ = None
        self.nd_ = -1                 # -1 marks an uninitialized array
        self.data_ = 0                # NULL data pointer
        self.shape_ = 0               # NULL shape C-array
        self.strides_ = 0             # NULL strides C-array
        self.flags_ = 0
+
    cdef void _cleanup(usm_ndarray self):
        """Frees heap-allocated shape/strides C-arrays and resets fields."""
        if (self.shape_):
            PyMem_Free(self.shape_)
        if (self.strides_):
            PyMem_Free(self.strides_)
        # reset so a second call (e.g. from __dealloc__) is a no-op
        self._reset()
+
+ def __cinit__(self, shape, dtype=None, strides=None, buffer="device",
+ Py_ssize_t offset=0, order="C",
+ buffer_ctor_kwargs=dict(),
+ array_namespace=None):
+ """
+ strides and offset must be given in units of array elements.
+ buffer can be strings ('device'|'shared'|'host' to allocate new memory)
+ or ``dpctl.memory.MemoryUSM*`` buffers, or ``usm_ndarray`` instances.
+ """
+ cdef int nd = 0
+ cdef int typenum = 0
+ cdef int itemsize = 0
+ cdef int err = 0
+ cdef int contig_flag = 0
+ cdef int writable_flag = USM_ARRAY_WRITABLE
+ cdef Py_ssize_t *shape_ptr = NULL
+ cdef Py_ssize_t ary_nelems = 0
+ cdef Py_ssize_t ary_nbytes = 0
+ cdef Py_ssize_t *strides_ptr = NULL
+ cdef Py_ssize_t _offset = offset
+ cdef Py_ssize_t ary_min_displacement = 0
+ cdef Py_ssize_t ary_max_displacement = 0
+ cdef bint is_fp64 = False
+ cdef bint is_fp16 = False
+
+ self._reset()
+ if not isinstance(shape, (list, tuple)):
+ if hasattr(shape, "tolist"):
+ fn = getattr(shape, "tolist")
+ if callable(fn):
+ shape = shape.tolist()
+ if not isinstance(shape, (list, tuple)):
+ try:
+ shape
+ shape = [shape, ]
+ except Exception as e:
+ raise TypeError(
+ "Argument shape must a non-negative integer, "
+ "or a list/tuple of such integers."
+ ) from e
+ nd = len(shape)
+ if dtype is None:
+ if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)):
+ q = buffer.sycl_queue
+ else:
+ q = buffer_ctor_kwargs.get("queue")
+ if q is not None:
+ dtype = default_device_fp_type(q)
+ else:
+ dev = _cached_default_device()
+ dtype = "f8" if dev.has_aspect_fp64 else "f4"
+ typenum = dtype_to_typenum(dtype)
+ if (typenum < 0):
+ if typenum == -2:
+ raise ValueError(
+ "Data type '" + str(dtype) +
+ "' can only have native byteorder."
+ )
+ elif typenum == -1:
+ raise ValueError(
+ "Data type '" + str(dtype) + "' is not understood."
+ )
+ raise TypeError(
+ f"Expected string or a dtype object, got {type(dtype)}"
+ )
+ itemsize = type_bytesize(typenum)
+ if (itemsize < 1):
+ raise TypeError(
+ "dtype=" + np.dtype(dtype).name + " is not supported."
+ )
+ # allocate host C-arrays for shape, strides
+ err = _from_input_shape_strides(
+ nd, shape, strides, itemsize, ord(order),
+ &shape_ptr, &strides_ptr, &ary_nelems,
+ &ary_min_displacement, &ary_max_displacement, &contig_flag
+ )
+ if (err):
+ self._cleanup()
+ if err == ERROR_MALLOC:
+ raise MemoryError("Memory allocation for shape/strides "
+ "array failed.")
+ elif err == ERROR_INCORRECT_ORDER:
+ raise ValueError(
+ "Unsupported order='{}' given. "
+ "Supported values are 'C' or 'F'.".format(order))
+ elif err == ERROR_UNEXPECTED_STRIDES:
+ raise ValueError(
+ "strides={} is not understood".format(strides))
+ else:
+ raise InternalUSMArrayError(
+ " .. while processing shape and strides.")
+ ary_nbytes = (ary_max_displacement -
+ ary_min_displacement + 1) * itemsize
+ if isinstance(buffer, dpmem._memory._Memory):
+ _buffer = buffer
+ elif isinstance(buffer, (str, bytes)):
+ if isinstance(buffer, bytes):
+ buffer = buffer.decode("UTF-8")
+ _offset = -ary_min_displacement
+ if (buffer == "shared"):
+ _buffer = dpmem.MemoryUSMShared(ary_nbytes,
+ **buffer_ctor_kwargs)
+ elif (buffer == "device"):
+ _buffer = dpmem.MemoryUSMDevice(ary_nbytes,
+ **buffer_ctor_kwargs)
+ elif (buffer == "host"):
+ _buffer = dpmem.MemoryUSMHost(ary_nbytes,
+ **buffer_ctor_kwargs)
+ else:
+ self._cleanup()
+ raise ValueError(
+ "buffer='{}' is not understood. "
+ "Recognized values are 'device', 'shared', 'host', "
+ "an instance of `MemoryUSM*` object, or a usm_ndarray"
+ "".format(buffer)
+ )
+ elif isinstance(buffer, usm_ndarray):
+ if not buffer.flags.writable:
+ writable_flag = 0
+ _buffer = buffer.usm_data
+ else:
+ self._cleanup()
+ raise ValueError("buffer='{}' was not understood.".format(buffer))
+ if (shape_to_elem_count(nd, shape_ptr) > 0 and
+ (_offset + ary_min_displacement < 0 or
+ (_offset + ary_max_displacement + 1) * itemsize > _buffer.nbytes)):
+ self._cleanup()
+ raise ValueError(("buffer='{}' can not accommodate "
+ "the requested array.").format(buffer))
+ is_fp64 = (typenum == UAR_DOUBLE or typenum == UAR_CDOUBLE)
+ is_fp16 = (typenum == UAR_HALF)
+ if (is_fp64 or is_fp16):
+ if (
+ (is_fp64 and not _buffer.sycl_device.has_aspect_fp64) or
+ (is_fp16 and not _buffer.sycl_device.has_aspect_fp16)
+ ):
+ raise ValueError(
+ f"Device {_buffer.sycl_device.name} does"
+ f" not support {dtype} natively."
+ )
+ self.base_ = _buffer
+ self.data_ = ( ( _buffer._pointer)) + itemsize * _offset
+ self.shape_ = shape_ptr
+ self.strides_ = strides_ptr
+ self.typenum_ = typenum
+ self.flags_ = (contig_flag | writable_flag)
+ self.nd_ = nd
+ self.array_namespace_ = array_namespace
+
    def __dealloc__(self):
        # free shape_/strides_ C-arrays and drop reference to base_
        self._cleanup()
+
+ @property
+ def _pointer(self):
+ """
+ Returns USM pointer to the start of array (element with zero
+ multi-index) encoded as integer.
+ """
+ return self.get_data()
+
+ cdef Py_ssize_t get_offset(self) except *:
+ cdef char *mem_ptr = NULL
+ cdef char *ary_ptr = self.get_data()
+ mem_ptr = ( self.base_._pointer)
+ byte_offset = ary_ptr - mem_ptr
+ item_size = self.get_itemsize()
+ if (byte_offset % item_size):
+ raise InternalUSMArrayError(
+ "byte_offset is not a multiple of item_size.")
+ return byte_offset // item_size
+
    @property
    def _element_offset(self):
        """Returns the offset of the zero-index element of the array, in
        elements, relative to the start of memory allocation"""
        return self.get_offset()
+
    @property
    def _byte_bounds(self):
        """Returns a 2-tuple with pointers to the end-points of the array

        :Example:

        .. code-block:: python

            from dpctl import tensor

            x = tensor.ones((3, 10, 7))
            y = tensor.flip(x[:, 1::2], axis=1)

            beg_p, end_p = y._byte_bounds
            # Bytes taken to store this array
            bytes_extent = end_p - beg_p

            # C-contiguous copy is more compact:
            # the strided view spans more bytes than its copy
            yc = tensor.copy(y, order="C")
            beg_pc, end_pc = yc._byte_bounds
            assert bytes_extent > end_pc - beg_pc
        """
        cdef Py_ssize_t min_disp = 0
        cdef Py_ssize_t max_disp = 0
        cdef Py_ssize_t step_ = 0
        cdef Py_ssize_t dim_ = 0
        cdef int it = 0
        cdef Py_ssize_t _itemsize = self.get_itemsize()

        # contiguous case: bounds are simply [start, start + size * itemsize)
        if (
            (self.flags_ & USM_ARRAY_C_CONTIGUOUS)
            or (self.flags_ & USM_ARRAY_F_CONTIGUOUS)
        ):
            return (
                self._pointer,
                self._pointer + shape_to_elem_count(
                    self.nd_, self.shape_
                ) * _itemsize
            )

        # strided case: accumulate positive strides into max displacement,
        # negative strides into min displacement
        for it in range(self.nd_):
            dim_ = self.shape[it]
            if dim_ > 0:
                step_ = self.strides[it]
                if step_ > 0:
                    max_disp += step_ * (dim_ - 1)
                else:
                    min_disp += step_ * (dim_ - 1)

        return (
            self._pointer + min_disp * _itemsize,
            self._pointer + (max_disp + 1) * _itemsize
        )
+
    cdef char* get_data(self):
        """Returns the USM pointer for this array."""
        return self.data_

    cdef int get_ndim(self):
        """
        Returns the number of indices needed to address
        an element of this array.
        """
        return self.nd_

    cdef Py_ssize_t* get_shape(self):
        """
        Returns pointer to shape C-array for this array.

        C-array has at least ``ndim`` non-negative elements,
        which determine the range of permissible indices
        addressing individual elements of this array.
        """
        return self.shape_

    cdef Py_ssize_t* get_strides(self):
        """
        Returns pointer to strides C-array for this array.

        The pointer can be NULL (contiguous array), or the
        array size is at least ``ndim`` elements
        """
        return self.strides_

    cdef int get_typenum(self):
        """Returns typenum corresponding to values of this array"""
        return self.typenum_

    cdef int get_itemsize(self):
        """
        Returns itemsize of this arrays in bytes
        """
        return type_bytesize(self.typenum_)

    cdef int get_flags(self):
        """Returns flags of this array"""
        return self.flags_

    cdef object get_base(self):
        """Returns the object owning the USM data addressed by this array"""
        return self.base_
+
+ cdef c_dpctl.SyclQueue get_sycl_queue(self):
+ cdef c_dpmem._Memory mem
+ if not isinstance(self.base_, dpctl.memory._Memory):
+ raise InternalUSMArrayError(
+ "This array has unexpected memory owner"
+ )
+ mem = self.base_
+ return mem.queue
+
    cdef c_dpctl.DPCTLSyclQueueRef get_queue_ref(self) except *:
        """
        Returns a copy of DPCTLSyclQueueRef associated with array.

        The caller owns the returned reference and is responsible
        for releasing it.
        """
        cdef c_dpctl.SyclQueue q = self.get_sycl_queue()
        cdef c_dpctl.DPCTLSyclQueueRef QRef = q.get_queue_ref()
        cdef c_dpctl.DPCTLSyclQueueRef QRefCopy = NULL
        if QRef is not NULL:
            QRefCopy = c_dpctl.DPCTLQueue_Copy(QRef)
            return QRefCopy
        else:
            raise InternalUSMArrayError(
                "Memory owner of this array is corrupted"
            )
+
+ @property
+ def __sycl_usm_array_interface__(self):
+ """
+ Gives ``__sycl_usm_array_interface__`` dictionary describing
+ the array.
+ """
+ cdef Py_ssize_t byte_offset = -1
+ cdef int item_size = -1
+ cdef Py_ssize_t elem_offset = -1
+ cdef char *mem_ptr = NULL
+ cdef char *ary_ptr = NULL
+ if (not isinstance(self.base_, dpmem._memory._Memory)):
+ raise InternalUSMArrayError(
+ "Invalid instance of usm_ndarray encountered. "
+ "Private field base_ has an unexpected type {}.".format(
+ type(self.base_)
+ )
+ )
+ ary_iface = self.base_.__sycl_usm_array_interface__
+ mem_ptr = ( ary_iface["data"][0])
+ ary_ptr = ( self.data_)
+ ro_flag = False if (self.flags_ & USM_ARRAY_WRITABLE) else True
+ ary_iface["data"] = ( mem_ptr, ro_flag)
+ ary_iface["shape"] = self.shape
+ if (self.strides_):
+ ary_iface["strides"] = _make_int_tuple(self.nd_, self.strides_)
+ else:
+ if (self.flags_ & USM_ARRAY_C_CONTIGUOUS):
+ ary_iface["strides"] = None
+ elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS):
+ ary_iface["strides"] = _f_contig_strides(self.nd_, self.shape_)
+ else:
+ raise InternalUSMArrayError(
+ "USM Array is not contiguous and has empty strides"
+ )
+ ary_iface["typestr"] = _make_typestr(self.typenum_)
+ byte_offset = ary_ptr - mem_ptr
+ item_size = self.get_itemsize()
+ if (byte_offset % item_size):
+ raise InternalUSMArrayError(
+ "byte_offset is not a multiple of item_size.")
+ elem_offset = byte_offset // item_size
+ ary_iface["offset"] = elem_offset
+ # must wait for content of the memory to finalize
+ self.sycl_queue.wait()
+ return ary_iface
+
    @property
    def ndim(self):
        """
        Gives the number of indices needed to address elements of this array.
        """
        return self.nd_

    @property
    def usm_data(self):
        """
        Gives USM memory object underlying :class:`.usm_ndarray` instance.
        """
        return self.get_base()
+
    @property
    def shape(self):
        """
        Elements of the shape tuple give the lengths of the
        respective array dimensions.

        Setting shape is allowed only when reshaping to the requested
        dimensions can be returned as view, otherwise :exc:`AttributeError`
        is raised. Use :func:`dpctl.tensor.reshape` to reshape the array
        in all cases.

        :Example:

        .. code-block:: python

            from dpctl import tensor

            x = tensor.arange(899)
            x.shape = (29, 31)
        """
        if self.nd_ > 0:
            return _make_int_tuple(self.nd_, self.shape_)
        else:
            # 0-dimensional array
            return tuple()
+
+ @shape.setter
+ def shape(self, new_shape):
+ """
+ Modifies usm_ndarray instance in-place by changing its metadata
+ about the shape and the strides of the array, or raises
+ `AttributeError` exception if in-place change is not possible.
+
+ Args:
+ new_shape: (tuple, int)
+ New shape. Only non-negative values are supported.
+ The new shape may not lead to the change in the
+ number of elements in the array.
+
+ Whether the array can be reshape in-place depends on its
+ strides. Use :func:`dpctl.tensor.reshape` function which
+ always succeeds to reshape the array by performing a copy
+ if necessary.
+ """
+ cdef int new_nd = -1
+ cdef Py_ssize_t nelems = -1
+ cdef int err = 0
+ cdef Py_ssize_t min_disp = 0
+ cdef Py_ssize_t max_disp = 0
+ cdef int contig_flag = 0
+ cdef Py_ssize_t *shape_ptr = NULL
+ cdef Py_ssize_t *strides_ptr = NULL
+ cdef Py_ssize_t size = -1
+ import operator
+
+ from ._reshape import reshaped_strides
+
+ try:
+ new_nd = len(new_shape)
+ except TypeError:
+ new_nd = 1
+ new_shape = (new_shape,)
+ try:
+ new_shape = tuple(operator.index(dim) for dim in new_shape)
+ except TypeError:
+ raise TypeError(
+ "Target shape must be a finite iterable of integers"
+ )
+ size = shape_to_elem_count(self.nd_, self.shape_)
+ if not np.prod(new_shape) == size:
+ raise TypeError(
+ f"Can not reshape array of size {self.size} into {new_shape}"
+ )
+ if size > 0:
+ new_strides = reshaped_strides(
+ self.shape,
+ self.strides,
+ new_shape
+ )
+ else:
+ new_strides = (1,) * len(new_shape)
+ if new_strides is None:
+ raise AttributeError(
+ "Incompatible shape for in-place modification. "
+ "Use `reshape()` to make a copy with the desired shape."
+ )
+ err = _from_input_shape_strides(
+ new_nd, new_shape, new_strides,
+ self.get_itemsize(),
+ b"C",
+ &shape_ptr, &strides_ptr,
+ &nelems, &min_disp, &max_disp, &contig_flag
+ )
+ if (err == 0):
+ if (self.shape_):
+ PyMem_Free(self.shape_)
+ if (self.strides_):
+ PyMem_Free(self.strides_)
+ self.flags_ = (contig_flag | (self.flags_ & USM_ARRAY_WRITABLE))
+ self.nd_ = new_nd
+ self.shape_ = shape_ptr
+ self.strides_ = strides_ptr
+ else:
+ raise InternalUSMArrayError(
+ "Encountered in shape setter, error code {err}".format(err)
+ )
+
    @property
    def strides(self):
        """
        Returns memory displacement in array elements, upon unit
        change of respective index.

        For example, for strides ``(s1, s2, s3)`` and multi-index
        ``(i1, i2, i3)`` position of the respective element relative
        to zero multi-index element is ``s1*i1 + s2*i2 + s3*i3``.

        :Example:

        .. code-block:: python

            from dpctl import tensor

            x = tensor.zeros((20, 30))
            xv = x[10:, :15]

            multi_id = (3, 5)
            byte_displacement = xv[multi_id]._pointer - xv[0, 0]._pointer
            element_displacement = sum(
                i * s for i, s in zip(multi_id, xv.strides)
            )
            assert byte_displacement == element_displacement * xv.itemsize
        """
        if (self.strides_):
            return _make_int_tuple(self.nd_, self.strides_)
        else:
            # NULL strides: derive from the contiguity flags
            if (self.flags_ & USM_ARRAY_C_CONTIGUOUS):
                return _c_contig_strides(self.nd_, self.shape_)
            elif (self.flags_ & USM_ARRAY_F_CONTIGUOUS):
                return _f_contig_strides(self.nd_, self.shape_)
            else:
                raise ValueError("Inconsistent usm_ndarray data")
+
    @property
    def flags(self):
        """
        Returns :class:`dpctl.tensor._flags.Flags` object.
        """
        return _flags.Flags(self, self.flags_)

    cdef _set_writable_flag(self, int flag):
        # set or clear USM_ARRAY_WRITABLE while preserving other flag bits
        cdef int mask = (USM_ARRAY_WRITABLE if flag else 0)
        self.flags_ = _copy_writable(self.flags_, mask)
+
    @property
    def usm_type(self):
        """
        USM type of underlying memory. Possible values are:

        * ``"device"``
            USM-device allocation in device memory, only accessible
            to kernels executed on the device
        * ``"shared"``
            USM-shared allocation in device memory, accessible both
            from the device and from host
        * ``"host"``
            USM-host allocation in host memory, accessible both
            from the device and from host

        See: https://docs.oneapi.com/versions/latest/dpcpp/iface/usm.html
        """
        return self.base_.get_usm_type()
+
    @property
    def itemsize(self):
        """
        Size of array element in bytes.
        """
        return self.get_itemsize()

    @property
    def nbytes(self):
        """
        Total bytes consumed by the elements of the array.
        """
        return (
            shape_to_elem_count(self.nd_, self.shape_) *
            self.get_itemsize())

    @property
    def size(self):
        """
        Number of elements in the array.
        """
        return shape_to_elem_count(self.nd_, self.shape_)

    @property
    def dtype(self):
        """
        Returns NumPy's dtype corresponding to the type of the array elements.
        """
        return np.dtype(_make_typestr(self.typenum_))

    @property
    def sycl_queue(self):
        """
        Returns :class:`dpctl.SyclQueue` object associated with USM data.
        """
        return self.get_sycl_queue()

    @property
    def sycl_device(self):
        """
        Returns :class:`dpctl.SyclDevice` object on which USM data
        was allocated.
        """
        q = self.sycl_queue
        return q.sycl_device
+
    @property
    def device(self):
        """
        Returns :class:`dpctl.tensor.Device` object representing
        residence of the array data.

        The ``Device`` object represents Array API notion of the
        device, and contains :class:`dpctl.SyclQueue` associated
        with this array. Hence, ``.device`` property provides
        information distinct from ``.sycl_device`` property.

        :Example:

        .. code-block:: python

            >>> from dpctl import tensor
            >>> x = tensor.ones(10)
            >>> x.device
            Device(level_zero:gpu:0)
        """
        return Device.create_device(self.sycl_queue)

    @property
    def sycl_context(self):
        """
        Returns :class:`dpctl.SyclContext` object to which USM data is bound.
        """
        q = self.sycl_queue
        return q.sycl_context
+
    @property
    def T(self):
        """Returns transposed array for 2D array, raises ``ValueError``
        otherwise.
        """
        if self.nd_ == 2:
            return _transpose(self)
        else:
            raise ValueError(
                "array.T requires array to have 2 dimensions. "
                "Use array.mT to transpose stacks of matrices and "
                "dpctl.tensor.permute_dims() to permute dimensions."
            )

    @property
    def mT(self):
        """ Returns array (a view) where the last two dimensions are
        transposed.

        Raises ``ValueError`` for arrays with fewer than 2 dimensions.
        """
        if self.nd_ < 2:
            raise ValueError(
                "array.mT requires array to have at least 2 dimensions."
            )
        return _m_transpose(self)
+
    @property
    def real(self):
        """
        Returns view into real component for arrays with
        complex data-types and returns itself for all other
        data-types.

        :Example:

        .. code-block:: python

            from dpctl import tensor

            # Create complex array from
            # arrays of real and imaginary parts

            re = tensor.linspace(-1, 1, num=100, dtype="f4")
            im = tensor.full_like(re, fill_value=tensor.pi)

            z = tensor.empty_like(re, dtype="c8")
            z.real[:] = re
            z.imag[:] = im
        """
        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
            # elements are real
            return self
        if (self.typenum_ < UAR_TYPE_SENTINEL):
            return _real_view(self)
        # NOTE(review): falls through to implicit None for typenum_ at or
        # beyond UAR_TYPE_SENTINEL — confirm this is unreachable
+
    @property
    def imag(self):
        """ Returns view into imaginary component for arrays with
        complex data-types and returns new zero array for all other
        data-types.

        :Example:

        .. code-block:: python

            from dpctl import tensor

            # Reset imaginary part of complex array

            z = tensor.ones(100, dtype="c8")
            z.imag[:] = tensor.pi/2
        """
        # explicitly check for UAR_HALF, which is greater than UAR_CFLOAT
        if (self.typenum_ < UAR_CFLOAT or self.typenum_ == UAR_HALF):
            # elements are real
            return _zero_like(self)
        if (self.typenum_ < UAR_TYPE_SENTINEL):
            return _imag_view(self)
        # NOTE(review): falls through to implicit None for typenum_ at or
        # beyond UAR_TYPE_SENTINEL — confirm this is unreachable
+
+ def __getitem__(self, ind):
+ cdef tuple _meta = _basic_slice_meta(
+ ind, (self).shape, ( self).strides,
+ self.get_offset())
+ cdef usm_ndarray res
+ cdef int i = 0
+ cdef bint matching = 1
+
+ if len(_meta) < 5:
+ raise RuntimeError
+
+ res = usm_ndarray.__new__(
+ usm_ndarray,
+ _meta[0],
+ dtype=_make_typestr(self.typenum_),
+ strides=_meta[1],
+ buffer=self.base_,
+ offset=_meta[2]
+ )
+ res.array_namespace_ = self.array_namespace_
+
+ adv_ind = _meta[3]
+ adv_ind_start_p = _meta[4]
+
+ if adv_ind_start_p < 0:
+ res.flags_ = _copy_writable(res.flags_, self.flags_)
+ return res
+
+ from ._copy_utils import _extract_impl, _nonzero_impl, _take_multi_index
+
+ # if len(adv_ind == 1), the (only) element is always an array
+ if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+ key_ = adv_ind[0]
+ adv_ind_end_p = key_.ndim + adv_ind_start_p
+ if adv_ind_end_p > res.ndim:
+ raise IndexError("too many indices for the array")
+ key_shape = key_.shape
+ arr_shape = res.shape[adv_ind_start_p:adv_ind_end_p]
+ for i in range(key_.ndim):
+ if matching:
+ if not key_shape[i] == arr_shape[i] and key_shape[i] > 0:
+ matching = 0
+ if not matching:
+ raise IndexError(
+ "boolean index did not match indexed array in dimensions"
+ )
+ res = _extract_impl(res, key_, axis=adv_ind_start_p)
+ res.flags_ = _copy_writable(res.flags_, self.flags_)
+ return res
+
+ if any(
+ (
+ isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+ ) for ind in adv_ind
+ ):
+ adv_ind_int = list()
+ for ind in adv_ind:
+ if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+ adv_ind_int.extend(_nonzero_impl(ind))
+ else:
+ adv_ind_int.append(ind)
+ res = _take_multi_index(res, tuple(adv_ind_int), adv_ind_start_p)
+ res.flags_ = _copy_writable(res.flags_, self.flags_)
+ return res
+
+ res = _take_multi_index(res, adv_ind, adv_ind_start_p)
+ res.flags_ = _copy_writable(res.flags_, self.flags_)
+ return res
+
+ def to_device(self, target_device, /, *, stream=None):
+ """ to_device(target_device, /, *, stream=None)
+
+ Transfers this array to specified target device.
+
+ :Example:
+ .. code-block:: python
+
+ import dpctl
+ import dpctl.tensor as dpt
+
+ x = dpt.full(10**6, 2, dtype="int64")
+ q_prof = dpctl.SyclQueue(
+ x.sycl_device, property="enable_profiling")
+ # return a view with profile-enabled queue
+ y = x.to_device(q_prof)
+ timer = dpctl.SyclTimer()
+ with timer(q_prof):
+ z = y * y
+ print(timer.dt)
+
+ Args:
+ target_device (object):
+ Array API concept of target device.
+ It can be a oneAPI filter selector string,
+ an instance of :class:`dpctl.SyclDevice` corresponding to a
+ non-partitioned SYCL device, an instance of
+ :class:`dpctl.SyclQueue`, or a :class:`dpctl.tensor.Device`
+ object returned by :attr:`dpctl.tensor.usm_ndarray.device`.
+ stream (:class:`dpctl.SyclQueue`, optional):
+ Execution queue to synchronize with. If ``None``,
+ synchronization is not performed.
+
+ Returns:
+ usm_ndarray:
+ A view if data copy is not required, and a copy otherwise.
+ If copying is required, it is done by copying from the original
+ allocation device to the host, followed by copying from host
+ to the target device.
+ """
+ cdef c_dpctl.DPCTLSyclQueueRef QRef = NULL
+ cdef c_dpmem._Memory arr_buf
+ d = Device.create_device(target_device)
+
+ _validate_and_use_stream(stream, self.sycl_queue)
+
+ if (d.sycl_context == self.sycl_context):
+ arr_buf = self.usm_data
+ QRef = ( d.sycl_queue).get_queue_ref()
+ view_buffer = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+ arr_buf.get_data_ptr(),
+ arr_buf.nbytes,
+ QRef,
+ memory_owner=arr_buf
+ )
+ res = usm_ndarray(
+ self.shape,
+ self.dtype,
+ buffer=view_buffer,
+ strides=self.strides,
+ offset=self.get_offset()
+ )
+ res.flags_ = self.flags_
+ return res
+ else:
+ nbytes = self.usm_data.nbytes
+ copy_buffer = type(self.usm_data)(
+ nbytes, queue=d.sycl_queue
+ )
+ copy_buffer.copy_from_device(self.usm_data)
+ res = usm_ndarray(
+ self.shape,
+ self.dtype,
+ buffer=copy_buffer,
+ strides=self.strides,
+ offset=self.get_offset()
+ )
+ res.flags_ = self.flags_
+ return res
+
    def _set_namespace(self, mod):
        """ Sets array namespace to given module `mod`. """
        self.array_namespace_ = mod

    def __array_namespace__(self, api_version=None):
        """
        Returns array namespace, member functions of which
        implement data API.

        Args:
            api_version (str, optional)
                Request namespace compliant with given version of
                array API. If ``None``, namespace for the most
                recent supported version is returned.
                Default: ``None``.
        """
        if api_version is not None:
            from ._array_api import __array_api_version__
            if not isinstance(api_version, str):
                raise TypeError(f"Expected type str, got {type(api_version)}")
            if api_version != __array_api_version__:
                raise ValueError(f"Only {__array_api_version__} is supported")
        return (
            self.array_namespace_
            if self.array_namespace_ is not None
            # TODO: revert to `else dpctl.tensor`
            # when dpnp fully migrates dpctl/tensor
            else dpctl_ext.tensor
        )
+
    def __bool__(self):
        # only size-1 arrays can be converted to a Python bool
        if self.size == 1:
            _check_0d_scalar_conversion(self)
            view = _as_zero_dim_ndarray(self)
            return view.__bool__()

        if self.size == 0:
            raise ValueError(
                "The truth value of an empty array is ambiguous"
            )

        raise ValueError(
            "The truth value of an array with more than one element is "
            "ambiguous. Use dpctl.tensor.any() or dpctl.tensor.all()"
        )

    def __float__(self):
        # only size-1 arrays can be converted to a Python float
        if self.size == 1:
            _check_0d_scalar_conversion(self)
            view = _as_zero_dim_ndarray(self)
            return view.__float__()

        raise ValueError(
            "only size-1 arrays can be converted to Python scalars"
        )

    def __complex__(self):
        # only size-1 arrays can be converted to a Python complex
        if self.size == 1:
            _check_0d_scalar_conversion(self)
            view = _as_zero_dim_ndarray(self)
            return view.__complex__()

        raise ValueError(
            "only size-1 arrays can be converted to Python scalars"
        )

    def __int__(self):
        # only size-1 arrays can be converted to a Python int
        if self.size == 1:
            _check_0d_scalar_conversion(self)
            view = _as_zero_dim_ndarray(self)
            return view.__int__()

        raise ValueError(
            "only size-1 arrays can be converted to Python scalars"
        )

    def __index__(self):
        # integer-typed size-1 arrays may be used as indices
        if np.issubdtype(self.dtype, np.integer):
            return int(self)

        raise IndexError("only integer arrays are valid indices")
+
    def __abs__(self):
        # TODO: revert to `return dpctl.tensor...`
        # when dpnp fully migrates dpctl/tensor
        return dpctl_ext.tensor.abs(self)

    def __add__(self, other):
        """
        Implementation for operator.add
        """
        return dpctl_ext.tensor.add(self, other)

    def __and__(self, other):
        "Implementation for operator.and"
        return dpctl_ext.tensor.bitwise_and(self, other)
+
    def __dlpack__(
        self, *, stream=None, max_version=None, dl_device=None, copy=None
    ):
        """
        Produces DLPack capsule.

        Args:
            stream (:class:`dpctl.SyclQueue`, optional):
                Execution queue to synchronize with.
                If ``None``, synchronization is not performed.
                Default: ``None``.
            max_version (tuple[int, int], optional):
                The maximum DLPack version the consumer (caller of
                ``__dlpack__``) supports. As ``__dlpack__`` may not
                always return a DLPack capsule with version
                `max_version`, the consumer must verify the version
                even if this argument is passed.
                Default: ``None``.
            dl_device (tuple[enum.Enum, int], optional):
                The device the returned DLPack capsule will be
                placed on.
                The device must be a 2-tuple matching the format of
                ``__dlpack_device__`` method, an integer enumerator
                representing the device type followed by an integer
                representing the index of the device.
                Default: ``None``.
            copy (bool, optional):
                Boolean indicating whether or not to copy the input.

                * If ``copy`` is ``True``, the input will always be
                  copied.
                * If ``False``, a ``BufferError`` will be raised if a
                  copy is deemed necessary.
                * If ``None``, a copy will be made only if deemed
                  necessary, otherwise, the existing memory buffer will
                  be reused.

                Default: ``None``.

        Raises:
            MemoryError:
                when host memory can not be allocated.
            DLPackCreationError:
                when array is allocated on a partitioned
                SYCL device, or with a non-default context.
            BufferError:
                when a copy is deemed necessary but ``copy``
                is ``False`` or when the provided ``dl_device``
                cannot be handled.
        """
        if max_version is None:
            # legacy path for DLManagedTensor
            # copy kwarg ignored because copy flag can't be set
            _caps = c_dlpack.to_dlpack_capsule(self)
            _validate_and_use_stream(stream, self.sycl_queue)
            return _caps
        else:
            if not isinstance(max_version, tuple) or len(max_version) != 2:
                raise TypeError(
                    "`__dlpack__` expects `max_version` to be a "
                    "2-tuple of integers `(major, minor)`, instead "
                    f"got {max_version}"
                )
            dpctl_dlpack_version = get_build_dlpack_version()
            if max_version[0] >= dpctl_dlpack_version[0]:
                # DLManagedTensorVersioned path
                if dl_device is not None:
                    if not isinstance(dl_device, tuple) or len(dl_device) != 2:
                        raise TypeError(
                            "`__dlpack__` expects `dl_device` to be a 2-tuple "
                            "of `(device_type, device_id)`, instead "
                            f"got {dl_device}"
                        )
                    if dl_device != self.__dlpack_device__():
                        if copy is False:
                            raise BufferError(
                                "array cannot be placed on the requested "
                                "device without a copy"
                            )
                        if _is_host_cpu(dl_device):
                            # host-CPU target: export a NumPy-backed capsule
                            if stream is not None:
                                raise ValueError(
                                    "`stream` must be `None` when `dl_device` "
                                    "is of type `kDLCPU`"
                                )
                            from ._copy_utils import _copy_to_numpy
                            _arr = _copy_to_numpy(self)
                            _arr.flags["W"] = self.flags["W"]
                            return c_dlpack.numpy_to_dlpack_versioned_capsule(
                                _arr, True
                            )
                        else:
                            raise BufferError(
                                f"targeting `dl_device` {dl_device} with "
                                "`__dlpack__` is not yet implemented"
                            )
                if copy is None:
                    copy = False
                # TODO: strategy for handling stream on different device
                # from dl_device
                if copy:
                    # synchronize before copying, then export the copy
                    _validate_and_use_stream(stream, self.sycl_queue)
                    nbytes = self.usm_data.nbytes
                    copy_buffer = type(self.usm_data)(
                        nbytes, queue=self.sycl_queue
                    )
                    copy_buffer.copy_from_device(self.usm_data)
                    _copied_arr = usm_ndarray(
                        self.shape,
                        self.dtype,
                        buffer=copy_buffer,
                        strides=self.strides,
                        offset=self.get_offset()
                    )
                    _copied_arr.flags_ = self.flags_
                    _caps = c_dlpack.to_dlpack_versioned_capsule(
                        _copied_arr, copy
                    )
                else:
                    _caps = c_dlpack.to_dlpack_versioned_capsule(self, copy)
                _validate_and_use_stream(stream, self.sycl_queue)
                return _caps
            else:
                # legacy path for DLManagedTensor
                _caps = c_dlpack.to_dlpack_capsule(self)
                _validate_and_use_stream(stream, self.sycl_queue)
                return _caps
+
+ def __dlpack_device__(self):
+ """
+ Gives a tuple (``device_type``, ``device_id``) corresponding to
+ ``DLDevice`` entry in ``DLTensor`` in DLPack protocol.
+
+ The tuple describes the non-partitioned device where the array has been
+ allocated, or the non-partitioned parent device of the allocation
+ device.
+
+ See :class:`dpctl.tensor.DLDeviceType` for a list of devices supported
+ by the DLPack protocol.
+
+ Raises:
+ DLPackCreationError:
+ when the ``device_id`` could not be determined.
+ """
+ try:
+ dev_id = self.sycl_device.get_device_id()
+ except ValueError as e:
+ raise c_dlpack.DLPackCreationError(
+ "Could not determine id of the device where array was "
+ "allocated."
+ )
+ return (
+ DLDeviceType.kDLOneAPI,
+ dev_id,
+ )
+
+ def __eq__(self, other):
+ # TODO: revert to `return dpctl.tensor...`
+ # when dpnp fully migrates dpctl/tensor
+ return dpctl_ext.tensor.equal(self, other)
+
+ def __floordiv__(self, other):
+ return dpctl_ext.tensor.floor_divide(self, other)
+
+ def __ge__(self, other):
+ return dpctl_ext.tensor.greater_equal(self, other)
+
+ def __gt__(self, other):
+ return dpctl_ext.tensor.greater(self, other)
+
+ def __invert__(self):
+ return dpctl_ext.tensor.bitwise_invert(self)
+
+ def __le__(self, other):
+ return dpctl_ext.tensor.less_equal(self, other)
+
+ def __len__(self):
+ if (self.nd_):
+ return self.shape[0]
+ else:
+ raise TypeError("len() of unsized object")
+
+ def __lshift__(self, other):
+ return dpctl_ext.tensor.bitwise_left_shift(self, other)
+
+ def __lt__(self, other):
+ return dpctl_ext.tensor.less(self, other)
+
+ def __matmul__(self, other):
+ return dpctl_ext.tensor.matmul(self, other)
+
+ def __mod__(self, other):
+ return dpctl_ext.tensor.remainder(self, other)
+
+ def __mul__(self, other):
+ return dpctl_ext.tensor.multiply(self, other)
+
+ def __ne__(self, other):
+ return dpctl_ext.tensor.not_equal(self, other)
+
+ def __neg__(self):
+ return dpctl_ext.tensor.negative(self)
+
+ def __or__(self, other):
+ return dpctl_ext.tensor.bitwise_or(self, other)
+
+ def __pos__(self):
+ return dpctl_ext.tensor.positive(self)
+
+ def __pow__(self, other):
+ return dpctl_ext.tensor.pow(self, other)
+
+ def __rshift__(self, other):
+ return dpctl_ext.tensor.bitwise_right_shift(self, other)
+
+ def __setitem__(self, key, rhs):
+ cdef tuple _meta
+ cdef usm_ndarray Xv
+
+ if (self.flags_ & USM_ARRAY_WRITABLE) == 0:
+ raise ValueError("Can not modify read-only array.")
+
+ _meta = _basic_slice_meta(
+            key, (<object>self).shape, (<object> self).strides,
+ self.get_offset()
+ )
+
+ if len(_meta) < 5:
+ raise RuntimeError
+
+ Xv = usm_ndarray.__new__(
+ usm_ndarray,
+ _meta[0],
+ dtype=_make_typestr(self.typenum_),
+ strides=_meta[1],
+ buffer=self.base_,
+ offset=_meta[2],
+ )
+ # set namespace
+ Xv.array_namespace_ = self.array_namespace_
+
+ from ._copy_utils import (
+ _copy_from_numpy_into,
+ _copy_from_usm_ndarray_to_usm_ndarray,
+ _nonzero_impl,
+ _place_impl,
+ _put_multi_index,
+ )
+
+ adv_ind = _meta[3]
+ adv_ind_start_p = _meta[4]
+
+ if adv_ind_start_p < 0:
+ # basic slicing
+ if isinstance(rhs, usm_ndarray):
+ _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs)
+ else:
+ if hasattr(rhs, "__sycl_usm_array_interface__"):
+ from dpctl_ext.tensor import asarray
+ try:
+ rhs_ar = asarray(rhs)
+ _copy_from_usm_ndarray_to_usm_ndarray(Xv, rhs_ar)
+ except Exception:
+ raise ValueError(
+ f"Input of type {type(rhs)} could not be "
+ "converted to usm_ndarray"
+ )
+ else:
+ rhs_np = np.asarray(rhs)
+ if type_bytesize(rhs_np.dtype.num) < 0:
+ raise ValueError(
+ f"Input of type {type(rhs)} can not be "
+ "assigned to usm_ndarray because of "
+ f"unsupported data type '{rhs_np.dtype}'"
+ )
+ try:
+ _copy_from_numpy_into(Xv, rhs_np)
+ except Exception:
+ raise ValueError(
+ f"Input of type {type(rhs)} could not be "
+ "copied into dpctl.tensor.usm_ndarray"
+ )
+ return
+
+ if len(adv_ind) == 1 and adv_ind[0].dtype == dpt_bool:
+ _place_impl(Xv, adv_ind[0], rhs, axis=adv_ind_start_p)
+ return
+
+ if any(
+ (
+ isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool
+ ) for ind in adv_ind
+ ):
+ adv_ind_int = list()
+ for ind in adv_ind:
+ if isinstance(ind, usm_ndarray) and ind.dtype == dpt_bool:
+ adv_ind_int.extend(_nonzero_impl(ind))
+ else:
+ adv_ind_int.append(ind)
+ _put_multi_index(Xv, tuple(adv_ind_int), adv_ind_start_p, rhs)
+ return
+
+ _put_multi_index(Xv, adv_ind, adv_ind_start_p, rhs)
+ return
+
+ def __sub__(self, other):
+ # TODO: revert to `return dpctl.tensor...`
+ # when dpnp fully migrates dpctl/tensor
+ return dpctl_ext.tensor.subtract(self, other)
+
+ def __truediv__(self, other):
+ return dpctl_ext.tensor.divide(self, other)
+
+ def __xor__(self, other):
+ return dpctl_ext.tensor.bitwise_xor(self, other)
+
+ def __radd__(self, other):
+ return dpctl_ext.tensor.add(other, self)
+
+ def __rand__(self, other):
+ return dpctl_ext.tensor.bitwise_and(other, self)
+
+ def __rfloordiv__(self, other):
+ return dpctl_ext.tensor.floor_divide(other, self)
+
+ def __rlshift__(self, other):
+ return dpctl_ext.tensor.bitwise_left_shift(other, self)
+
+ def __rmatmul__(self, other):
+ return dpctl_ext.tensor.matmul(other, self)
+
+ def __rmod__(self, other):
+ return dpctl_ext.tensor.remainder(other, self)
+
+ def __rmul__(self, other):
+ return dpctl_ext.tensor.multiply(other, self)
+
+ def __ror__(self, other):
+ return dpctl_ext.tensor.bitwise_or(other, self)
+
+ def __rpow__(self, other):
+ return dpctl_ext.tensor.pow(other, self)
+
+ def __rrshift__(self, other):
+ return dpctl_ext.tensor.bitwise_right_shift(other, self)
+
+ def __rsub__(self, other):
+ return dpctl_ext.tensor.subtract(other, self)
+
+ def __rtruediv__(self, other):
+ return dpctl_ext.tensor.divide(other, self)
+
+ def __rxor__(self, other):
+ return dpctl_ext.tensor.bitwise_xor(other, self)
+
+ def __iadd__(self, other):
+ return dpctl_ext.tensor.add._inplace_op(self, other)
+
+ def __iand__(self, other):
+ return dpctl_ext.tensor.bitwise_and._inplace_op(self, other)
+
+ def __ifloordiv__(self, other):
+ return dpctl_ext.tensor.floor_divide._inplace_op(self, other)
+
+ def __ilshift__(self, other):
+ return dpctl_ext.tensor.bitwise_left_shift._inplace_op(self, other)
+
+ def __imatmul__(self, other):
+ return dpctl_ext.tensor.matmul(self, other, out=self, dtype=self.dtype)
+
+ def __imod__(self, other):
+ return dpctl_ext.tensor.remainder._inplace_op(self, other)
+
+ def __imul__(self, other):
+ return dpctl_ext.tensor.multiply._inplace_op(self, other)
+
+ def __ior__(self, other):
+ return dpctl_ext.tensor.bitwise_or._inplace_op(self, other)
+
+ def __ipow__(self, other):
+ return dpctl_ext.tensor.pow._inplace_op(self, other)
+
+ def __irshift__(self, other):
+ return dpctl_ext.tensor.bitwise_right_shift._inplace_op(self, other)
+
+ def __isub__(self, other):
+ return dpctl_ext.tensor.subtract._inplace_op(self, other)
+
+ def __itruediv__(self, other):
+ return dpctl_ext.tensor.divide._inplace_op(self, other)
+
+ def __ixor__(self, other):
+ return dpctl_ext.tensor.bitwise_xor._inplace_op(self, other)
+
+ def __str__(self):
+ return usm_ndarray_str(self)
+
+ def __repr__(self):
+ return usm_ndarray_repr(self)
+
+ def __array__(self, dtype=None, /, *, copy=None):
+ """NumPy's array protocol method to disallow implicit conversion.
+
+ Without this definition, `numpy.asarray(usm_ar)` converts
+ usm_ndarray instance into NumPy array with data type `object`
+ and every element being 0d usm_ndarray.
+
+ https://github.com/IntelPython/dpctl/pull/1384#issuecomment-1707212972
+ """
+ raise TypeError(
+ "Implicit conversion to a NumPy array is not allowed. "
+ "Use `dpctl.tensor.asnumpy` to copy data from this "
+ "`dpctl.tensor.usm_ndarray` instance to NumPy array"
+ )
+
+
+cdef usm_ndarray _real_view(usm_ndarray ary):
+ """
+ View into real parts of a complex type array
+ """
+ cdef int r_typenum_ = -1
+ cdef usm_ndarray r = None
+ cdef Py_ssize_t offset_elems = 0
+
+ if (ary.typenum_ == UAR_CFLOAT):
+ r_typenum_ = UAR_FLOAT
+ elif (ary.typenum_ == UAR_CDOUBLE):
+ r_typenum_ = UAR_DOUBLE
+ else:
+ raise InternalUSMArrayError(
+ "_real_view call on array of non-complex type.")
+
+ offset_elems = ary.get_offset() * 2
+ r = usm_ndarray.__new__(
+ usm_ndarray,
+ _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
+ dtype=_make_typestr(r_typenum_),
+ strides=tuple(2 * si for si in ary.strides),
+ buffer=ary.base_,
+ offset=offset_elems,
+ order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F")
+ )
+ r.flags_ = _copy_writable(r.flags_, ary.flags_)
+ r.array_namespace_ = ary.array_namespace_
+ return r
+
+
+cdef usm_ndarray _imag_view(usm_ndarray ary):
+ """
+ View into imaginary parts of a complex type array
+ """
+ cdef int r_typenum_ = -1
+ cdef usm_ndarray r = None
+ cdef Py_ssize_t offset_elems = 0
+
+ if (ary.typenum_ == UAR_CFLOAT):
+ r_typenum_ = UAR_FLOAT
+ elif (ary.typenum_ == UAR_CDOUBLE):
+ r_typenum_ = UAR_DOUBLE
+ else:
+ raise InternalUSMArrayError(
+ "_imag_view call on array of non-complex type.")
+
+ # displace pointer to imaginary part
+ offset_elems = 2 * ary.get_offset() + 1
+ r = usm_ndarray.__new__(
+ usm_ndarray,
+ _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
+ dtype=_make_typestr(r_typenum_),
+ strides=tuple(2 * si for si in ary.strides),
+ buffer=ary.base_,
+ offset=offset_elems,
+ order=("C" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "F")
+ )
+ r.flags_ = _copy_writable(r.flags_, ary.flags_)
+ r.array_namespace_ = ary.array_namespace_
+ return r
+
+
+cdef usm_ndarray _transpose(usm_ndarray ary):
+ """
+ Construct transposed array without copying the data
+ """
+ cdef usm_ndarray r = usm_ndarray.__new__(
+ usm_ndarray,
+ _make_reversed_int_tuple(ary.nd_, ary.shape_),
+ dtype=_make_typestr(ary.typenum_),
+ strides=(
+ _make_reversed_int_tuple(ary.nd_, ary.strides_)
+ if (ary.strides_) else None),
+ buffer=ary.base_,
+ order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"),
+ offset=ary.get_offset()
+ )
+ r.flags_ = _copy_writable(r.flags_, ary.flags_)
+ return r
+
+
+cdef usm_ndarray _m_transpose(usm_ndarray ary):
+ """
+ Construct matrix transposed array
+ """
+ cdef usm_ndarray r = usm_ndarray.__new__(
+ usm_ndarray,
+ _swap_last_two(_make_int_tuple(ary.nd_, ary.shape_)),
+ dtype=_make_typestr(ary.typenum_),
+ strides=_swap_last_two(ary.strides),
+ buffer=ary.base_,
+ order=("F" if (ary.flags_ & USM_ARRAY_C_CONTIGUOUS) else "C"),
+ offset=ary.get_offset()
+ )
+ r.flags_ = _copy_writable(r.flags_, ary.flags_)
+ return r
+
+
+cdef usm_ndarray _zero_like(usm_ndarray ary):
+ """
+ Make C-contiguous array of zero elements with same shape,
+ type, device, and sycl_queue as ary.
+ """
+ cdef dt = _make_typestr(ary.typenum_)
+ cdef usm_ndarray r = usm_ndarray(
+ _make_int_tuple(ary.nd_, ary.shape_) if ary.nd_ > 0 else tuple(),
+ dtype=dt,
+ buffer=ary.base_.get_usm_type(),
+ buffer_ctor_kwargs={"queue": ary.get_sycl_queue()},
+ )
+ r.base_.memset()
+ return r
+
+
+cdef api char* UsmNDArray_GetData(usm_ndarray arr):
+ """Get allocation pointer of zero index element of array """
+ return arr.get_data()
+
+
+cdef api int UsmNDArray_GetNDim(usm_ndarray arr):
+ """Get array rank: length of its shape"""
+ return arr.get_ndim()
+
+
+cdef api Py_ssize_t* UsmNDArray_GetShape(usm_ndarray arr):
+ """Get host pointer to shape vector"""
+ return arr.get_shape()
+
+
+cdef api Py_ssize_t* UsmNDArray_GetStrides(usm_ndarray arr):
+ """Get host pointer to strides vector"""
+ return arr.get_strides()
+
+
+cdef api int UsmNDArray_GetTypenum(usm_ndarray arr):
+ """Get type number for data type of array elements"""
+ return arr.get_typenum()
+
+
+cdef api int UsmNDArray_GetElementSize(usm_ndarray arr):
+ """Get array element size in bytes"""
+ return arr.get_itemsize()
+
+
+cdef api int UsmNDArray_GetFlags(usm_ndarray arr):
+ """Get flags of array"""
+ return arr.get_flags()
+
+
+cdef api c_dpctl.DPCTLSyclQueueRef UsmNDArray_GetQueueRef(usm_ndarray arr):
+ """Get DPCTLSyclQueueRef for queue associated with the array"""
+ return arr.get_queue_ref()
+
+
+cdef api Py_ssize_t UsmNDArray_GetOffset(usm_ndarray arr):
+ """Get offset of zero-index array element from the beginning of the USM
+ allocation"""
+ return arr.get_offset()
+
+
+cdef api object UsmNDArray_GetUSMData(usm_ndarray arr):
+ """Get USM data object underlying the array"""
+ return arr.get_base()
+
+
+cdef api void UsmNDArray_SetWritableFlag(usm_ndarray arr, int flag):
+ """Set/unset USM_ARRAY_WRITABLE in the given array `arr`."""
+ arr._set_writable_flag(flag)
+
+
+cdef api object UsmNDArray_MakeSimpleFromMemory(
+ int nd, const Py_ssize_t *shape, int typenum,
+ c_dpmem._Memory mobj, Py_ssize_t offset, char order
+):
+ """Create contiguous usm_ndarray.
+
+ Args:
+ nd: number of dimensions (non-negative)
+ shape: array of nd non-negative array's sizes along each dimension
+ typenum: array elemental type number
+        mobj: _Memory object wrapping the USM allocation the array
+            is built upon (carries the associated SYCL queue)
+ offset: distance between element with zero multi-index and the
+ start of allocation
+ order: Memory layout of the array. Use 'C' for C-contiguous or
+ row-major layout; 'F' for F-contiguous or column-major layout
+ Returns:
+ Created usm_ndarray instance
+ """
+ cdef object shape_tuple = _make_int_tuple(nd, shape)
+ cdef usm_ndarray arr = usm_ndarray(
+ shape_tuple,
+ dtype=_make_typestr(typenum),
+ buffer=mobj,
+ offset=offset,
+ order=(order)
+ )
+ return arr
+
+
+cdef api object UsmNDArray_MakeSimpleFromPtr(
+ size_t nelems,
+ int typenum,
+ c_dpctl.DPCTLSyclUSMRef ptr,
+ c_dpctl.DPCTLSyclQueueRef QRef,
+ object owner
+):
+ """Create 1D contiguous usm_ndarray from pointer.
+
+ Args:
+ nelems: number of elements in array
+ typenum: array elemental type number
+ ptr: pointer to the start of allocation
+ QRef: DPCTLSyclQueueRef associated with the allocation
+ owner: Python object managing lifetime of USM allocation.
+ Value None implies transfer of USM allocation ownership
+ to the created array object.
+ Returns:
+ Created usm_ndarray instance
+ """
+ cdef int itemsize = type_bytesize(typenum)
+ if (itemsize < 1):
+ raise ValueError(
+ "dtype with typenum=" + str(typenum) + " is not supported."
+ )
+    cdef size_t nbytes = (<size_t> itemsize) * nelems
+ cdef c_dpmem._Memory mobj
+ mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+ ptr, nbytes, QRef, memory_owner=owner
+ )
+ cdef usm_ndarray arr = usm_ndarray(
+ (nelems,),
+ dtype=_make_typestr(typenum),
+ buffer=mobj
+ )
+ return arr
+
+cdef api object UsmNDArray_MakeFromPtr(
+ int nd,
+ const Py_ssize_t *shape,
+ int typenum,
+ const Py_ssize_t *strides,
+ c_dpctl.DPCTLSyclUSMRef ptr,
+ c_dpctl.DPCTLSyclQueueRef QRef,
+ Py_ssize_t offset,
+ object owner
+):
+ """
+ General usm_ndarray constructor from externally made USM-allocation.
+
+ Args:
+ nd: number of dimensions (non-negative)
+ shape: array of nd non-negative array's sizes along each dimension
+ typenum: array elemental type number
+ strides: array of nd strides along each dimension in elements
+ ptr: pointer to the start of allocation
+ QRef: DPCTLSyclQueueRef associated with the allocation
+ offset: distance between element with zero multi-index and the
+ start of allocation
+ owner: Python object managing lifetime of USM allocation.
+ Value None implies transfer of USM allocation ownership
+ to the created array object.
+ Returns:
+ Created usm_ndarray instance
+ """
+ cdef int itemsize = type_bytesize(typenum)
+ cdef size_t nelems = 1
+ cdef Py_ssize_t min_disp = 0
+ cdef Py_ssize_t max_disp = 0
+ cdef Py_ssize_t step_ = 0
+ cdef Py_ssize_t dim_ = 0
+ cdef it = 0
+ cdef c_dpmem._Memory mobj
+ cdef usm_ndarray arr
+ cdef object obj_shape
+ cdef object obj_strides
+
+ if (itemsize < 1):
+ raise ValueError(
+ "dtype with typenum=" + str(typenum) + " is not supported."
+ )
+ if (nd < 0):
+ raise ValueError("Dimensionality must be non-negative")
+ if (ptr is NULL or QRef is NULL):
+ raise ValueError(
+ "Non-null USM allocation pointer and QRef are expected"
+ )
+ if (nd == 0):
+ # case of 0d scalars
+ mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+ ptr, itemsize, QRef, memory_owner=owner
+ )
+ arr = usm_ndarray(
+ tuple(),
+ dtype=_make_typestr(typenum),
+ buffer=mobj
+ )
+ return arr
+ if (shape is NULL or strides is NULL):
+ raise ValueError("Both shape and stride vectors are required")
+ for it in range(nd):
+ dim_ = shape[it]
+ if dim_ < 0:
+ raise ValueError(
+ f"Dimension along axis {it} must be non-negative"
+ )
+ nelems *= dim_
+ if dim_ > 0:
+ step_ = strides[it]
+ if step_ > 0:
+ max_disp += step_ * (dim_ - 1)
+ else:
+ min_disp += step_ * (dim_ - 1)
+
+ obj_shape = _make_int_tuple(nd, shape)
+ obj_strides = _make_int_tuple(nd, strides)
+ if nelems == 0:
+ mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+ ptr, itemsize, QRef, memory_owner=owner
+ )
+ arr = usm_ndarray(
+ obj_shape,
+ dtype=_make_typestr(typenum),
+ strides=obj_strides,
+ buffer=mobj,
+ offset=0
+ )
+ return arr
+ if offset + min_disp < 0:
+ raise ValueError(
+ "Given shape, strides and offset reference out-of-bound memory"
+ )
+    nbytes = (<size_t> itemsize) * (offset + max_disp + 1)
+ mobj = c_dpmem._Memory.create_from_usm_pointer_size_qref(
+ ptr, nbytes, QRef, memory_owner=owner
+ )
+ arr = usm_ndarray(
+ obj_shape,
+ dtype=_make_typestr(typenum),
+ strides=obj_strides,
+ buffer=mobj,
+ offset=offset
+ )
+ return arr
+
+
+def _is_object_with_buffer_protocol(o):
+ "Returns True if object supports Python buffer protocol"
+ return _is_buffer(o)
diff --git a/dpctl_ext/tensor/_utility_functions.py b/dpctl_ext/tensor/_utility_functions.py
index 821f0954017a..c892d777102d 100644
--- a/dpctl_ext/tensor/_utility_functions.py
+++ b/dpctl_ext/tensor/_utility_functions.py
@@ -29,12 +29,11 @@
import builtins
import operator
-import dpctl.tensor as dpt
import dpctl.utils as du
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
import dpctl_ext.tensor._tensor_reductions_impl as tri
@@ -60,7 +59,7 @@ def _boolean_reduction(x, axis, keepdims, func):
red_nd = nd
# case of a scalar
if red_nd == 0:
- return dpt_ext.astype(x, dpt.bool)
+ return dpt.astype(x, dpt.bool)
x_tmp = x
res_shape = ()
perm = list(range(nd))
@@ -72,9 +71,9 @@ def _boolean_reduction(x, axis, keepdims, func):
red_nd = len(axis)
# check for axis=()
if red_nd == 0:
- return dpt_ext.astype(x, dpt.bool)
+ return dpt.astype(x, dpt.bool)
perm = [i for i in range(nd) if i not in axis] + list(axis)
- x_tmp = dpt_ext.permute_dims(x, perm)
+ x_tmp = dpt.permute_dims(x, perm)
res_shape = x_tmp.shape[: nd - red_nd]
exec_q = x.sycl_queue
@@ -85,7 +84,7 @@ def _boolean_reduction(x, axis, keepdims, func):
# always allocate the temporary as
# int32 and usm-device to ensure that atomic updates
# are supported
- res_tmp = dpt_ext.empty(
+ res_tmp = dpt.empty(
res_shape,
dtype=dpt.int32,
usm_type="device",
@@ -101,7 +100,7 @@ def _boolean_reduction(x, axis, keepdims, func):
_manager.add_event_pair(hev0, ev0)
# copy to boolean result array
- res = dpt_ext.empty(
+ res = dpt.empty(
res_shape,
dtype=dpt.bool,
usm_type=res_usm_type,
@@ -115,7 +114,7 @@ def _boolean_reduction(x, axis, keepdims, func):
if keepdims:
res_shape = res_shape + (1,) * red_nd
inv_perm = sorted(range(nd), key=lambda d: perm[d])
- res = dpt_ext.permute_dims(dpt_ext.reshape(res, res_shape), inv_perm)
+ res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
return res
@@ -292,7 +291,7 @@ def _concat_diff_input(arr, axis, prepend, append):
if isinstance(prepend, dpt.usm_ndarray):
a_prepend = prepend
else:
- a_prepend = dpt_ext.asarray(
+ a_prepend = dpt.asarray(
prepend,
dtype=prepend_dtype,
usm_type=coerced_usm_type,
@@ -301,7 +300,7 @@ def _concat_diff_input(arr, axis, prepend, append):
if isinstance(append, dpt.usm_ndarray):
a_append = append
else:
- a_append = dpt_ext.asarray(
+ a_append = dpt.asarray(
append,
dtype=append_dtype,
usm_type=coerced_usm_type,
@@ -309,11 +308,11 @@ def _concat_diff_input(arr, axis, prepend, append):
)
if not prepend_shape:
prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
- a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape)
+ a_prepend = dpt.broadcast_to(a_prepend, prepend_shape)
if not append_shape:
append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
- a_append = dpt_ext.broadcast_to(a_append, append_shape)
- return dpt_ext.concat((a_prepend, arr, a_append), axis=axis)
+ a_append = dpt.broadcast_to(a_append, append_shape)
+ return dpt.concat((a_prepend, arr, a_append), axis=axis)
elif prepend is not None:
q1, x_usm_type = arr.sycl_queue, arr.usm_type
q2, prepend_usm_type = _get_queue_usm_type(prepend)
@@ -361,7 +360,7 @@ def _concat_diff_input(arr, axis, prepend, append):
if isinstance(prepend, dpt.usm_ndarray):
a_prepend = prepend
else:
- a_prepend = dpt_ext.asarray(
+ a_prepend = dpt.asarray(
prepend,
dtype=prepend_dtype,
usm_type=coerced_usm_type,
@@ -369,8 +368,8 @@ def _concat_diff_input(arr, axis, prepend, append):
)
if not prepend_shape:
prepend_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
- a_prepend = dpt_ext.broadcast_to(a_prepend, prepend_shape)
- return dpt_ext.concat((a_prepend, arr), axis=axis)
+ a_prepend = dpt.broadcast_to(a_prepend, prepend_shape)
+ return dpt.concat((a_prepend, arr), axis=axis)
elif append is not None:
q1, x_usm_type = arr.sycl_queue, arr.usm_type
q2, append_usm_type = _get_queue_usm_type(append)
@@ -416,7 +415,7 @@ def _concat_diff_input(arr, axis, prepend, append):
if isinstance(append, dpt.usm_ndarray):
a_append = append
else:
- a_append = dpt_ext.asarray(
+ a_append = dpt.asarray(
append,
dtype=append_dtype,
usm_type=coerced_usm_type,
@@ -424,8 +423,8 @@ def _concat_diff_input(arr, axis, prepend, append):
)
if not append_shape:
append_shape = arr_shape[:axis] + (1,) + arr_shape[axis + 1 :]
- a_append = dpt_ext.broadcast_to(a_append, append_shape)
- return dpt_ext.concat((arr, a_append), axis=axis)
+ a_append = dpt.broadcast_to(a_append, append_shape)
+ return dpt.concat((arr, a_append), axis=axis)
else:
arr1 = arr
return arr1
@@ -489,7 +488,7 @@ def diff(x, /, *, axis=-1, n=1, prepend=None, append=None):
slice(None) if i != axis else slice(None, -1) for i in range(x_nd)
)
- diff_op = dpt_ext.not_equal if x.dtype == dpt.bool else dpt_ext.subtract
+ diff_op = dpt.not_equal if x.dtype == dpt.bool else dpt.subtract
if n > 1:
arr_tmp0 = diff_op(arr[sl0], arr[sl1])
arr_tmp1 = diff_op(arr_tmp0[sl0], arr_tmp0[sl1])
diff --git a/dpctl_ext/tensor/include/dlpack/LICENSE.third-party b/dpctl_ext/tensor/include/dlpack/LICENSE.third-party
new file mode 100644
index 000000000000..20a9c8a7b4dc
--- /dev/null
+++ b/dpctl_ext/tensor/include/dlpack/LICENSE.third-party
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2017 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/dpctl_ext/tensor/include/dlpack/README.md b/dpctl_ext/tensor/include/dlpack/README.md
new file mode 100644
index 000000000000..3a7bc6d422cd
--- /dev/null
+++ b/dpctl_ext/tensor/include/dlpack/README.md
@@ -0,0 +1,7 @@
+# DLPack header
+
+The header `dlpack.h` downloaded from `https://github.com/dmlc/dlpack.git` remote at tag v1.0rc commit [`62100c1`](https://github.com/dmlc/dlpack/commit/62100c123144ae7a80061f4220be2dbd3cbaefc7).
+
+The file can also be viewed using github web interface at https://github.com/dmlc/dlpack/blob/62100c123144ae7a80061f4220be2dbd3cbaefc7/include/dlpack/dlpack.h
+
+License file was retrieved from https://github.com/dmlc/dlpack/blob/main/LICENSE
diff --git a/dpctl_ext/tensor/include/dlpack/dlpack.h b/dpctl_ext/tensor/include/dlpack/dlpack.h
new file mode 100644
index 000000000000..cd71e799be3c
--- /dev/null
+++ b/dpctl_ext/tensor/include/dlpack/dlpack.h
@@ -0,0 +1,675 @@
+/*!
+ * Copyright (c) 2017 - by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current major version of dlpack */
+#define DLPACK_MAJOR_VERSION 1
+
+/*! \brief The current minor version of dlpack */
+#define DLPACK_MINOR_VERSION 2
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+ /*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ */
+ typedef struct
+ {
+ /*! \brief DLPack major version. */
+ uint32_t major;
+ /*! \brief DLPack minor version. */
+ uint32_t minor;
+ } DLPackVersion;
+
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+ typedef enum : int32_t
+ {
+#else
+typedef enum
+{
+#endif
+ /*! \brief CPU device */
+ kDLCPU = 1,
+ /*! \brief CUDA GPU device */
+ kDLCUDA = 2,
+ /*!
+ * \brief Pinned CUDA CPU memory by cudaMallocHost
+ */
+ kDLCUDAHost = 3,
+ /*! \brief OpenCL devices. */
+ kDLOpenCL = 4,
+ /*! \brief Vulkan buffer for next generation graphics. */
+ kDLVulkan = 7,
+ /*! \brief Metal for Apple GPU. */
+ kDLMetal = 8,
+ /*! \brief Verilog simulator buffer */
+ kDLVPI = 9,
+ /*! \brief ROCm GPUs for AMD GPUs */
+ kDLROCM = 10,
+ /*!
+ * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+ */
+ kDLROCMHost = 11,
+ /*!
+ * \brief Reserved extension device type,
+ * used for quickly test extension device
+ * The semantics can differ depending on the implementation.
+ */
+ kDLExtDev = 12,
+ /*!
+ * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+ */
+ kDLCUDAManaged = 13,
+ /*!
+ * \brief Unified shared memory allocated on a oneAPI non-partititioned
+ * device. Call to oneAPI runtime is required to determine the device
+ * type, the USM allocation type and the sycl context it is bound to.
+ *
+ */
+ kDLOneAPI = 14,
+ /*! \brief GPU support for next generation WebGPU standard. */
+ kDLWebGPU = 15,
+ /*! \brief Qualcomm Hexagon DSP */
+ kDLHexagon = 16,
+ /*! \brief Microsoft MAIA devices */
+ kDLMAIA = 17,
+ /*! \brief AWS Trainium */
+ kDLTrn = 18,
+ } DLDeviceType;
+
+ /*!
+ * \brief A Device for Tensor and operator.
+ */
+ typedef struct
+ {
+ /*! \brief The device type used in the device. */
+ DLDeviceType device_type;
+ /*!
+ * \brief The device index.
+ * For vanilla CPU memory, pinned memory, or managed memory, this is set
+ * to 0.
+ */
+ int32_t device_id;
+ } DLDevice;
+
+ /*!
+ * \brief The type code options DLDataType.
+ */
+ typedef enum
+ {
+ /*! \brief signed integer */
+ kDLInt = 0U,
+ /*! \brief unsigned integer */
+ kDLUInt = 1U,
+ /*! \brief IEEE floating point */
+ kDLFloat = 2U,
+ /*!
+ * \brief Opaque handle type, reserved for testing purposes.
+ * Frameworks need to agree on the handle data type for the exchange to
+ * be well-defined.
+ */
+ kDLOpaqueHandle = 3U,
+ /*! \brief bfloat16 */
+ kDLBfloat = 4U,
+ /*!
+ * \brief complex number
+ * (C/C++/Python layout: compact struct per complex number)
+ */
+ kDLComplex = 5U,
+ /*! \brief boolean */
+ kDLBool = 6U,
+ /*! \brief FP8 data types */
+ kDLFloat8_e3m4 = 7U,
+ kDLFloat8_e4m3 = 8U,
+ kDLFloat8_e4m3b11fnuz = 9U,
+ kDLFloat8_e4m3fn = 10U,
+ kDLFloat8_e4m3fnuz = 11U,
+ kDLFloat8_e5m2 = 12U,
+ kDLFloat8_e5m2fnuz = 13U,
+ kDLFloat8_e8m0fnu = 14U,
+ /*! \brief FP6 data types
+ * Setting bits != 6 is currently unspecified, and the producer must
+ * ensure it is set while the consumer must stop importing if the value
+ * is unexpected.
+ */
+ kDLFloat6_e2m3fn = 15U,
+ kDLFloat6_e3m2fn = 16U,
+ /*! \brief FP4 data types
+ * Setting bits != 4 is currently unspecified, and the producer must
+ * ensure it is set while the consumer must stop importing if the value
+ * is unexpected.
+ */
+ kDLFloat4_e2m1fn = 17U,
+ } DLDataTypeCode;
+
+ /*!
+ * \brief The data type the tensor can hold. The data type is assumed to
+ * follow the native endian-ness. An explicit error message should be raised
+ * when attempting to export an array with non-native endianness
+ *
+ * Examples
+ * - float: type_code = 2, bits = 32, lanes = 1
+ * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ * - int8: type_code = 0, bits = 8, lanes = 1
+ * - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ * - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library
+ * convention, the underlying storage size of bool is 8 bits)
+ * - float8_e4m3: type_code = 8, bits = 8, lanes = 1 (packed in memory)
+ * - float6_e3m2fn: type_code = 16, bits = 6, lanes = 1 (packed in memory)
+ * - float4_e2m1fn: type_code = 17, bits = 4, lanes = 1 (packed in memory)
+ *
+ * When a sub-byte type is packed, DLPack requires the data to be in little
+ * bit-endian, i.e., for a packed data set D ((D >> (i * bits)) && bit_mask)
+ * stores the i-th element.
+ */
+ typedef struct
+ {
+ /*!
+ * \brief Type code of base types.
+ * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+ * footprint, but the value should be one of DLDataTypeCode enum values.
+ * */
+ uint8_t code;
+ /*!
+ * \brief Number of bits, common choices are 8, 16, 32.
+ */
+ uint8_t bits;
+ /*! \brief Number of lanes in the type, used for vector types. */
+ uint16_t lanes;
+ } DLDataType;
+
+ /*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+ typedef struct
+ {
+ /*!
+ * \brief The data pointer points to the allocated data. This will be
+ * CUDA device pointer or cl_mem handle in OpenCL. It may be opaque on
+ * some device types. This pointer is always aligned to 256 bytes as in
+ * CUDA. The `byte_offset` field should be used to point to the
+ * beginning of the data.
+ *
+ * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch,
+ * TensorFlow, TVM, perhaps others) do not adhere to this 256 byte
+ * alignment requirement on CPU/CUDA/ROCm, and always use
+ * `byte_offset=0`. This must be fixed (after which this note will be
+ * updated); at the moment it is recommended to not rely on the data
+ * pointer being correctly aligned.
+ *
+ * For given DLTensor, the size of memory required to store the contents
+ * of data is calculated as follows:
+ *
+ * \code{.c}
+ * static inline size_t GetDataSize(const DLTensor* t) {
+ * size_t size = 1;
+ * for (tvm_index_t i = 0; i < t->ndim; ++i) {
+ * size *= t->shape[i];
+ * }
+ * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+ * return size;
+ * }
+ * \endcode
+ *
+ * Note that if the tensor is of size zero, then the data pointer should
+ * be set to `NULL`.
+ */
+ void *data;
+ /*! \brief The device of the tensor */
+ DLDevice device;
+ /*! \brief Number of dimensions */
+ int32_t ndim;
+ /*! \brief The data type of the pointer*/
+ DLDataType dtype;
+ /*!
+ * \brief The shape of the tensor
+ *
+ * When ndim == 0, shape can be set to NULL.
+ */
+ int64_t *shape;
+ /*!
+ * \brief strides of the tensor (in number of elements, not bytes),
+ * can not be NULL if ndim != 0, must point to
+ * an array of ndim elements that specifies the strides,
+ * so consumer can always rely on strides[dim] being valid for 0 <= dim
+ * < ndim.
+ *
+ * When ndim == 0, strides can be set to NULL.
+ *
+ * \note Before DLPack v1.2, strides can be NULL to indicate contiguous
+ * data. This is not allowed in DLPack v1.2 and later. The rationale is
+ * to simplify the consumer handling.
+ */
+ int64_t *strides;
+ /*! \brief The offset in bytes to the beginning pointer to data */
+ uint64_t byte_offset;
+ } DLTensor;
+
+ /*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ * intended to facilitate the borrowing of DLTensor by another framework.
+ * It is not meant to transfer the tensor. When the borrowing framework
+ * doesn't need the tensor, it should call the deleter to notify the host
+ * that the resource is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ * in DLPack exchange and is deprecated after DLPack v0.8
+ * Use DLManagedTensorVersioned instead.
+ * This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
+ */
+ typedef struct DLManagedTensor
+ {
+ /*! \brief DLTensor which is being memory managed */
+ DLTensor dl_tensor;
+ /*! \brief the context of the original host framework of DLManagedTensor
+ * in which DLManagedTensor is used in the framework. It can also be
+ * NULL.
+ */
+ void *manager_ctx;
+ /*!
+ * \brief Destructor - this should be called
+ * to destruct the manager_ctx which backs the DLManagedTensor. It can
+ * be NULL if there is no way for the caller to provide a reasonable
+ * destructor. The destructor deletes the argument self as well.
+ */
+ void (*deleter)(struct DLManagedTensor *self);
+ } DLManagedTensor;
+
+// bit masks used in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief bit mask to indicate that the tensor is a copy made by the producer.
+ *
+ * If set, the tensor is considered solely owned throughout its lifetime by the
+ * consumer, until the producer-provided deleter is invoked.
+ */
+#define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL)
+
+/*!
+ * \brief bit mask to indicate that whether a sub-byte type is packed or padded.
+ *
+ * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can
+ * be set by the producer to signal that a tensor of sub-byte type is padded.
+ */
+#define DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED (1UL << 2UL)
+
+ /*!
+ * \brief A versioned and managed C Tensor object, manage memory of
+ * DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor
+ * by another framework. It is not meant to transfer the tensor. When the
+ * borrowing framework doesn't need the tensor, it should call the deleter
+ * to notify the host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+ typedef struct DLManagedTensorVersioned
+ {
+ /*!
+ * \brief The API and ABI version of the current managed Tensor
+ */
+ DLPackVersion version;
+ /*!
+ * \brief the context of the original host framework.
+ *
+ * Stores DLManagedTensorVersioned is used in the
+ * framework. It can also be NULL.
+ */
+ void *manager_ctx;
+ /*!
+ * \brief Destructor.
+ *
+ * This should be called to destruct manager_ctx which holds the
+ * DLManagedTensorVersioned. It can be NULL if there is no way for the
+ * caller to provide a reasonable destructor. The destructor deletes the
+ * argument self as well.
+ */
+ void (*deleter)(struct DLManagedTensorVersioned *self);
+ /*!
+ * \brief Additional bitmask flags information about the tensor.
+ *
+ * By default the flags should be set to 0.
+ *
+ * \note Future ABI changes should keep everything until this field
+ * stable, to ensure that deleter can be correctly called.
+ *
+ * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+ * \sa DLPACK_FLAG_BITMASK_IS_COPIED
+ */
+ uint64_t flags;
+ /*! \brief DLTensor which is being memory managed */
+ DLTensor dl_tensor;
+ } DLManagedTensorVersioned;
+
+ //----------------------------------------------------------------------
+ // DLPack `__c_dlpack_exchange_api__` fast exchange protocol definitions
+ //----------------------------------------------------------------------
+ /*!
+ * \brief Request a producer library to create a new tensor.
+ *
+ * Create a new `DLManagedTensorVersioned` within the context of the
+ * producer library. The allocation is defined via the prototype DLTensor.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
+ * and device fields are used.
+ * \param out The output DLManagedTensorVersioned.
+ * \param error_ctx Context for `SetError`.
+ * \param SetError The function to set the error.
+ * \return The owning DLManagedTensorVersioned* or NULL on failure.
+ * SetError is called exactly when NULL is returned (the implementer
+ * must ensure this).
+ * \note - As a C function, must not throw C++ exceptions.
+ * - Error propagation via SetError to avoid any direct need
+ * of Python API. Due to this `SetError` may have to ensure the GIL
+ * is held since it will presumably set a Python error.
+ *
+ * \sa DLPackExchangeAPI
+ */
+ typedef int (*DLPackManagedTensorAllocator)( //
+ DLTensor *prototype,
+ DLManagedTensorVersioned **out,
+ void *error_ctx, //
+ void (*SetError)(void *error_ctx,
+ const char *kind,
+ const char *message) //
+ );
+
+ /*!
+ * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
+ *
+ * This function does not perform any stream synchronization. The consumer
+ * should query DLPackCurrentWorkStream to get the current work stream and
+ * launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ * as the one the `DLPackExchangeAPI` was discovered from.
+ * \return The owning DLManagedTensorVersioned* or NULL on failure with a
+ * Python exception set. If the data cannot be described using
+ * DLPack this should be a BufferError if possible. \note - As a C function,
+ * must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+ typedef int (*DLPackManagedTensorFromPyObjectNoSync)( //
+ void *py_object, //
+ DLManagedTensorVersioned **out //
+ );
+
+ /*!
+ * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
+ *
+ * This function provides a faster interface for temporary, non-owning,
+ * exchange. The producer (implementer) still owns the memory of data,
+ * strides, shape. The liveness of the DLTensor and the data it views is
+ * only guaranteed until control is returned.
+ *
+ * This function currently assumes that the producer (implementer) can fill
+ * in the DLTensor shape and strides without the need for temporary
+ * allocations.
+ *
+ * This function does not perform any stream synchronization. The consumer
+ * should query DLPackCurrentWorkStream to get the current work stream and
+ * launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ * as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLTensor, whose space is pre-allocated on stack.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+ typedef int (*DLPackDLTensorFromPyObjectNoSync)( //
+ void *py_object, //
+ DLTensor *out //
+ );
+
+ /*!
+ * \brief Obtain the current work stream of a device.
+ *
+ * Obtain the current work stream of a device from the producer framework.
+ * For example, it should map to torch.cuda.current_stream in PyTorch.
+ *
+ * When device_type is kDLCPU, the consumer does not have to query the stream
+ * and the producer can simply return NULL when queried.
+ * The consumer does not have to do anything on stream sync or setting.
+ * So CPU only framework can just provide a dummy implementation that
+ * always set out_current_stream[0] to NULL.
+ *
+ * \param device_type The device type.
+ * \param device_id The device id.
+ * \param out_current_stream The output current work stream.
+ *
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI
+ */
+ typedef int (*DLPackCurrentWorkStream)( //
+ DLDeviceType device_type, //
+ int32_t device_id, //
+ void **out_current_stream //
+ );
+
+ /*!
+ * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
+ *
+ * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
+ * producer (implementer) library with the correct type.
+ *
+ * This function does not perform any stream synchronization.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param tensor The DLManagedTensorVersioned to convert; its ownership is stolen.
+ * \param out_py_object The output Python object.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ *
+ * \sa DLPackExchangeAPI
+ */
+ typedef int (*DLPackManagedTensorToPyObjectNoSync)( //
+ DLManagedTensorVersioned *tensor, //
+ void **out_py_object //
+ );
+
+ /*!
+ * \brief DLPackExchangeAPI stable header.
+ * \sa DLPackExchangeAPI
+ */
+ typedef struct DLPackExchangeAPIHeader
+ {
+ /*!
+ * \brief The provided DLPack version the consumer must check major
+ * version compatibility before using this struct.
+ */
+ DLPackVersion version;
+ /*!
+ * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
+ *
+ * It must be NULL if the framework does not support older versions.
+ * If the current major version is larger than the one supported by the
+ * consumer, the consumer may walk this to find an earlier supported
+ * version.
+ *
+ * \sa DLPackExchangeAPI
+ */
+ struct DLPackExchangeAPIHeader *prev_api;
+ } DLPackExchangeAPIHeader;
+
+ /*!
+ * \brief Framework-specific function pointers table for DLPack exchange.
+ *
+ * Additionally to `__dlpack__()` we define a C function table sharable by
+ * Python implementations via `__c_dlpack_exchange_api__`.
+ * This attribute must be set on the type as a Python integer compatible
+ * with `PyLong_FromVoidPtr`/`PyLong_AsVoidPtr`.
+ *
+ * A consumer library may use a pattern such as:
+ *
+ * \code
+ *
+ * PyObject *api_obj = type(tensor_obj).__c_dlpack_exchange_api__; // as C-code
+ * MyDLPackExchangeAPI *api = PyLong_AsVoidPtr(api_obj);
+ * if (api == NULL && PyErr_Occurred()) { goto handle_error; }
+ *
+ * \endcode
+ *
+ * Note that this must be defined on the type. The consumer should look up
+ * the attribute on the type and may cache the result for each unique type.
+ *
+ * The precise API table is given by:
+ * \code
+ * struct MyDLPackExchangeAPI : public DLPackExchangeAPI {
+ * MyDLPackExchangeAPI() {
+ * header.version.major = DLPACK_MAJOR_VERSION;
+ * header.version.minor = DLPACK_MINOR_VERSION;
+ * header.prev_version_api = nullptr;
+ *
+ * managed_tensor_allocator = MyDLPackManagedTensorAllocator;
+ * managed_tensor_from_py_object_no_sync =
+ * MyDLPackManagedTensorFromPyObjectNoSync;
+ * managed_tensor_to_py_object_no_sync =
+ * MyDLPackManagedTensorToPyObjectNoSync; dltensor_from_py_object_no_sync =
+ * MyDLPackDLTensorFromPyObjectNoSync; current_work_stream =
+ * MyDLPackCurrentWorkStream;
+ * }
+ *
+ * static const DLPackExchangeAPI* Global() {
+ * static MyDLPackExchangeAPI inst;
+ * return &inst;
+ * }
+ * };
+ * \endcode
+ *
+ * Guidelines for leveraging DLPackExchangeAPI:
+ *
+ * There are generally two kinds of consumer needs for DLPack exchange:
+ * - N0: library support, where consumer.kernel(x, y, z) would like to run a
+ * kernel with the data from x, y, z. The consumer is also expected to run
+ * the kernel with the same stream context as the producer. For example,
+ * when x, y, z is torch.Tensor, consumer should query
+ * exchange_api->current_work_stream to get the current stream and launch
+ * the kernel with the same stream. This setup is necessary for no
+ * synchronization in kernel launch and maximum compatibility with CUDA
+ * graph capture in the producer. This is the desirable behavior for library
+ * extension support for frameworks like PyTorch.
+ * - N1: data ingestion and retention
+ *
+ * Note that obj.__dlpack__() API should provide useful ways for N1.
+ * The primary focus of the current DLPackExchangeAPI is to enable faster
+ * exchange N0 with the support of the function pointer current_work_stream.
+ *
+ * Array/Tensor libraries should statically create and initialize this
+ * structure then return a pointer to DLPackExchangeAPI as an int value in
+ * Tensor/Array. The DLPackExchangeAPI* must stay alive throughout the
+ * lifetime of the process.
+ *
+ * One simple way to do so is to create a static instance of
+ * DLPackExchangeAPI within the framework and return a pointer to it. The
+ * following code shows an example to do so in C++. It should also be
+ * reasonably easy to do so in other languages.
+ */
+ typedef struct DLPackExchangeAPI
+ {
+ /*!
+ * \brief The header that remains stable across versions.
+ */
+ DLPackExchangeAPIHeader header;
+ /*!
+ * \brief Producer function pointer for DLPackManagedTensorAllocator
+ * This function must not be NULL.
+ * \sa DLPackManagedTensorAllocator
+ */
+ DLPackManagedTensorAllocator managed_tensor_allocator;
+ /*!
+ * \brief Producer function pointer for DLPackManagedTensorFromPyObject
+ * This function must be not NULL.
+ * \sa DLPackManagedTensorFromPyObject
+ */
+ DLPackManagedTensorFromPyObjectNoSync
+ managed_tensor_from_py_object_no_sync;
+ /*!
+ * \brief Producer function pointer for DLPackManagedTensorToPyObject
+ * This function must be not NULL.
+ * \sa DLPackManagedTensorToPyObject
+ */
+ DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
+ /*!
+ * \brief Producer function pointer for DLPackDLTensorFromPyObject
+ * This function can be NULL when the producer does not support
+ * this function. \sa DLPackDLTensorFromPyObjectNoSync
+ */
+ DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
+ /*!
+ * \brief Producer function pointer for DLPackCurrentWorkStream
+ * This function must be not NULL.
+ * \sa DLPackCurrentWorkStream
+ */
+ DLPackCurrentWorkStream current_work_stream;
+ } DLPackExchangeAPI;
+
+#ifdef __cplusplus
+} // DLPACK_EXTERN_C
+#endif
+#endif // DLPACK_DLPACK_H_
diff --git a/dpnp/__init__.py b/dpnp/__init__.py
index 02420107972f..0d5c79b9a671 100644
--- a/dpnp/__init__.py
+++ b/dpnp/__init__.py
@@ -64,7 +64,7 @@
# Borrowed from DPCTL
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
- from dpctl.tensor import __array_api_version__, DLDeviceType
+ from dpctl_ext.tensor import __array_api_version__, DLDeviceType
from .dpnp_array import dpnp_array as ndarray
from .dpnp_array_api_info import __array_namespace_info__
diff --git a/dpnp/backend/extensions/blas/CMakeLists.txt b/dpnp/backend/extensions/blas/CMakeLists.txt
index 69a99b996d97..2dce27001bbd 100644
--- a/dpnp/backend/extensions/blas/CMakeLists.txt
+++ b/dpnp/backend/extensions/blas/CMakeLists.txt
@@ -39,6 +39,8 @@ set(_module_src
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
if(_dpnp_sycl_targets)
# make fat binary
target_compile_options(
diff --git a/dpnp/backend/extensions/fft/CMakeLists.txt b/dpnp/backend/extensions/fft/CMakeLists.txt
index 8a96d8cbd25a..bfebe1ed4226 100644
--- a/dpnp/backend/extensions/fft/CMakeLists.txt
+++ b/dpnp/backend/extensions/fft/CMakeLists.txt
@@ -33,6 +33,8 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/fft_py.cpp)
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
if(_dpnp_sycl_targets)
# make fat binary
target_compile_options(
diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 373c6152f662..7729e2807a4d 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -36,6 +36,8 @@ set(_module_src
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
if(_dpnp_sycl_targets)
# make fat binary
target_compile_options(
diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt
index 2bac0932a673..a3ee4bae8ee5 100644
--- a/dpnp/backend/extensions/lapack/CMakeLists.txt
+++ b/dpnp/backend/extensions/lapack/CMakeLists.txt
@@ -55,6 +55,7 @@ set(_module_src
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
if(_dpnp_sycl_targets)
# make fat binary
diff --git a/dpnp/backend/extensions/statistics/CMakeLists.txt b/dpnp/backend/extensions/statistics/CMakeLists.txt
index 60d26295acf8..88b3f185e6f6 100644
--- a/dpnp/backend/extensions/statistics/CMakeLists.txt
+++ b/dpnp/backend/extensions/statistics/CMakeLists.txt
@@ -41,6 +41,8 @@ set(_module_src
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
if(_dpnp_sycl_targets)
# make fat binary
target_compile_options(
diff --git a/dpnp/backend/extensions/ufunc/CMakeLists.txt b/dpnp/backend/extensions/ufunc/CMakeLists.txt
index 45d2706fb48d..d954316dcb2a 100644
--- a/dpnp/backend/extensions/ufunc/CMakeLists.txt
+++ b/dpnp/backend/extensions/ufunc/CMakeLists.txt
@@ -67,6 +67,8 @@ set(_module_src
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
if(WIN32)
if(${CMAKE_VERSION} VERSION_LESS "3.27")
# this is a work-around for target_link_options inserting option after -link option, cause
diff --git a/dpnp/backend/extensions/vm/CMakeLists.txt b/dpnp/backend/extensions/vm/CMakeLists.txt
index 32f7d4281c2f..0d69c4e79c03 100644
--- a/dpnp/backend/extensions/vm/CMakeLists.txt
+++ b/dpnp/backend/extensions/vm/CMakeLists.txt
@@ -90,6 +90,8 @@ set(python_module_name _vm_impl)
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
if(WIN32)
if(${CMAKE_VERSION} VERSION_LESS "3.27")
# this is a work-around for target_link_options inserting option after -link option, cause
diff --git a/dpnp/backend/extensions/window/CMakeLists.txt b/dpnp/backend/extensions/window/CMakeLists.txt
index 5b7921ad324c..c8cbd7c03bbc 100644
--- a/dpnp/backend/extensions/window/CMakeLists.txt
+++ b/dpnp/backend/extensions/window/CMakeLists.txt
@@ -36,6 +36,8 @@ set(_module_src
pybind11_add_module(${python_module_name} MODULE ${_module_src})
add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+target_link_libraries(${python_module_name} PRIVATE DpctlExtCAPI)
+
if(_dpnp_sycl_targets)
# make fat binary
target_compile_options(
diff --git a/dpnp/backend/include/dpnp4pybind11.hpp b/dpnp/backend/include/dpnp4pybind11.hpp
index cd287989bef2..af2f5f866eba 100644
--- a/dpnp/backend/include/dpnp4pybind11.hpp
+++ b/dpnp/backend/include/dpnp4pybind11.hpp
@@ -28,7 +28,66 @@
#pragma once
-#include "dpctl_capi.h"
+// TODO: Enable dpctl_capi.h once dpctl.tensor is removed.
+// Also call `import_dpctl_ext__tensor___usmarray();` right after
+// `import_dpctl()` (line 334) to initialize the dpctl_ext tensor C-API.
+//
+// Now we include dpctl C-API headers explicitly in order to
+// integrate dpctl_ext tensor C-API.
+
+// #include "dpctl_capi.h"
+
+// clang-format off
+// Ordering of includes is important here. dpctl_sycl_types and
+// dpctl_sycl_extension_interface define types used by dpctl's Python
+// C-API headers.
+#include "syclinterface/dpctl_sycl_types.h"
+#include "syclinterface/dpctl_sycl_extension_interface.h"
+#ifdef __cplusplus
+#define CYTHON_EXTERN_C extern "C"
+#else
+#define CYTHON_EXTERN_C
+#endif
+#include "dpctl/_sycl_device.h"
+#include "dpctl/_sycl_device_api.h"
+#include "dpctl/_sycl_context.h"
+#include "dpctl/_sycl_context_api.h"
+#include "dpctl/_sycl_event.h"
+#include "dpctl/_sycl_event_api.h"
+#include "dpctl/_sycl_queue.h"
+#include "dpctl/_sycl_queue_api.h"
+#include "dpctl/memory/_memory.h"
+#include "dpctl/memory/_memory_api.h"
+#include "dpctl/program/_program.h"
+#include "dpctl/program/_program_api.h"
+
+// clang-format on
+
+// TODO: Keep these includes once `dpctl.tensor` is removed from dpctl,
+// but replace the hardcoded relative path with a proper include path
+#include
+#include
+
+/*
+ * Function to import dpctl and make C-API functions available.
+ * C functions can use dpctl's C-API functions without linking to
+ * shared objects defining this symbols, if they call `import_dpctl()`
+ * prior to using those symbols.
+ *
+ * It is declared inline to allow multiple definitions in
+ * different translation units
+ */
+static inline void import_dpctl(void)
+{
+ import_dpctl___sycl_device();
+ import_dpctl___sycl_context();
+ import_dpctl___sycl_event();
+ import_dpctl___sycl_queue();
+ import_dpctl__memory___memory();
+ import_dpctl_ext__tensor___usmarray();
+ import_dpctl__program___program();
+ return;
+}
#include
#include <cstddef> // for std::size_t for C++ linkage
@@ -410,8 +469,10 @@ class dpctl_capi
default_usm_memory_ = std::shared_ptr(
new py::object{py_default_usm_memory}, Deleter{});
+ // TODO: revert to `py::module_::import("dpctl.tensor._usmarray");`
+ // when dpnp fully migrates dpctl/tensor
py::module_ mod_usmarray =
- py::module_::import("dpctl.tensor._usmarray");
+ py::module_::import("dpctl_ext.tensor._usmarray");
auto tensor_kl = mod_usmarray.attr("usm_ndarray");
const py::object &py_default_usm_ndarray =
diff --git a/dpnp/dpnp_algo/dpnp_arraycreation.py b/dpnp/dpnp_algo/dpnp_arraycreation.py
index 4e2ee8531a18..fb277dd4d310 100644
--- a/dpnp/dpnp_algo/dpnp_arraycreation.py
+++ b/dpnp/dpnp_algo/dpnp_arraycreation.py
@@ -29,13 +29,12 @@
import math
import operator
-import dpctl.tensor as dpt
import dpctl.utils as dpu
import numpy
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpnp
from dpnp.dpnp_array import dpnp_array
from dpnp.dpnp_utils import get_usm_allocations, map_dtype_to_device
@@ -53,7 +52,7 @@ def _as_usm_ndarray(a, usm_type, sycl_queue):
if isinstance(a, dpnp_array):
a = a.get_array()
- return dpt_ext.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue)
+ return dpt.asarray(a, usm_type=usm_type, sycl_queue=sycl_queue)
def _check_has_zero_val(a):
@@ -196,7 +195,7 @@ def dpnp_linspace(
if dpnp.isscalar(start) and dpnp.isscalar(stop):
# Call linspace() function for scalars.
- usm_res = dpt_ext.linspace(
+ usm_res = dpt.linspace(
start,
stop,
num,
@@ -213,19 +212,19 @@ def dpnp_linspace(
else:
step = dpnp.nan
else:
- usm_start = dpt_ext.asarray(
+ usm_start = dpt.asarray(
start,
dtype=dt,
usm_type=_usm_type,
sycl_queue=sycl_queue_normalized,
)
- usm_stop = dpt_ext.asarray(
+ usm_stop = dpt.asarray(
stop, dtype=dt, usm_type=_usm_type, sycl_queue=sycl_queue_normalized
)
delta = usm_stop - usm_start
- usm_res = dpt_ext.arange(
+ usm_res = dpt.arange(
0,
stop=num,
step=1,
@@ -233,9 +232,7 @@ def dpnp_linspace(
usm_type=_usm_type,
sycl_queue=sycl_queue_normalized,
)
- usm_res = dpt_ext.reshape(
- usm_res, (-1,) + (1,) * delta.ndim, copy=False
- )
+ usm_res = dpt.reshape(usm_res, (-1,) + (1,) * delta.ndim, copy=False)
if step_num > 0:
step = delta / step_num
@@ -243,7 +240,7 @@ def dpnp_linspace(
# Needed a special handling for denormal numbers (when step == 0),
# see numpy#5437 for more details.
# Note, dpt.where() is used to avoid a synchronization branch.
- usm_res = dpt_ext.where(
+ usm_res = dpt.where(
step == 0, (usm_res / step_num) * delta, usm_res * step
)
else:
@@ -256,17 +253,17 @@ def dpnp_linspace(
usm_res[-1, ...] = usm_stop
if axis != 0:
- usm_res = dpt_ext.moveaxis(usm_res, 0, axis)
+ usm_res = dpt.moveaxis(usm_res, 0, axis)
if dpnp.issubdtype(dtype, dpnp.integer):
dpt.floor(usm_res, out=usm_res)
- res = dpt_ext.astype(usm_res, dtype, copy=False)
+ res = dpt.astype(usm_res, dtype, copy=False)
res = dpnp_array._create_from_usm_ndarray(res)
if retstep is True:
if dpnp.isscalar(step):
- step = dpt_ext.asarray(
+ step = dpt.asarray(
step, usm_type=res.usm_type, sycl_queue=res.sycl_queue
)
return res, dpnp_array._create_from_usm_ndarray(step)
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
index d7eeccf78489..271013b58090 100644
--- a/dpnp/dpnp_algo/dpnp_elementwise_common.py
+++ b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -29,28 +29,27 @@
import warnings
from functools import wraps
-import dpctl.tensor as dpt
-import dpctl.tensor._type_utils as dtu
import dpctl.utils as dpu
import numpy
-from dpctl.tensor._elementwise_common import (
- BinaryElementwiseFunc,
- UnaryElementwiseFunc,
-)
-from dpctl.tensor._scalar_utils import (
- _get_dtype,
- _get_shape,
- _validate_dtype,
-)
# pylint: disable=no-name-in-module
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._copy_utils as dtc
import dpctl_ext.tensor._tensor_impl as dti
+import dpctl_ext.tensor._type_utils as dtu
import dpnp
import dpnp.backend.extensions.vm._vm_impl as vmi
+from dpctl_ext.tensor._elementwise_common import (
+ BinaryElementwiseFunc,
+ UnaryElementwiseFunc,
+)
+from dpctl_ext.tensor._scalar_utils import (
+ _get_dtype,
+ _get_shape,
+ _validate_dtype,
+)
from dpnp.dpnp_array import dpnp_array
from dpnp.dpnp_utils import get_usm_allocations
from dpnp.dpnp_utils.dpnp_utils_common import (
@@ -213,7 +212,7 @@ def __call__(
x_usm = dpnp.get_usm_ndarray(x)
if dtype is not None:
- x_usm = dpt_ext.astype(x_usm, dtype, copy=False)
+ x_usm = dpt.astype(x_usm, dtype, copy=False)
out = self._unpack_out_kw(out)
out_usm = None if out is None else dpnp.get_usm_ndarray(out)
@@ -467,7 +466,7 @@ def __call__(
)
# Allocate a temporary buffer with the required dtype
- out[i] = dpt_ext.empty_like(res, dtype=res_dt)
+ out[i] = dpt.empty_like(res, dtype=res_dt)
elif (
buf_dt is None
and dti._array_overlap(x, res)
@@ -476,7 +475,7 @@ def __call__(
# Allocate a temporary buffer to avoid memory overlapping.
# Note if `buf_dt` is not None, a temporary copy of `x` will be
# created, so the array overlap check isn't needed.
- out[i] = dpt_ext.empty_like(res)
+ out[i] = dpt.empty_like(res)
_manager = dpu.SequentialOrderManager[exec_q]
dep_evs = _manager.submitted_events
@@ -486,7 +485,7 @@ def __call__(
if order == "K":
buf = dtc._empty_like_orderK(x, buf_dt)
else:
- buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
+ buf = dpt.empty_like(x, dtype=buf_dt, order=order)
ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray(
src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
@@ -503,7 +502,7 @@ def __call__(
if order == "K":
out[i] = dtc._empty_like_orderK(x, res_dt)
else:
- out[i] = dpt_ext.empty_like(x, dtype=res_dt, order=order)
+ out[i] = dpt.empty_like(x, dtype=res_dt, order=order)
# Call the unary function with input and output arrays
ht_unary_ev, unary_ev = self.get_implementation_function()(
@@ -713,24 +712,24 @@ def __call__(
if dtype is not None:
if dpnp.isscalar(x1):
- x1_usm = dpt_ext.asarray(
+ x1_usm = dpt.asarray(
x1,
dtype=dtype,
sycl_queue=x2.sycl_queue,
usm_type=x2.usm_type,
)
- x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False)
+ x2_usm = dpt.astype(x2_usm, dtype, copy=False)
elif dpnp.isscalar(x2):
- x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False)
- x2_usm = dpt_ext.asarray(
+ x1_usm = dpt.astype(x1_usm, dtype, copy=False)
+ x2_usm = dpt.asarray(
x2,
dtype=dtype,
sycl_queue=x1.sycl_queue,
usm_type=x1.usm_type,
)
else:
- x1_usm = dpt_ext.astype(x1_usm, dtype, copy=False)
- x2_usm = dpt_ext.astype(x2_usm, dtype, copy=False)
+ x1_usm = dpt.astype(x1_usm, dtype, copy=False)
+ x2_usm = dpt.astype(x2_usm, dtype, copy=False)
res_usm = super().__call__(x1_usm, x2_usm, out=out_usm, order=order)
@@ -1078,7 +1077,7 @@ def __call__(
)
# Allocate a temporary buffer with the required dtype
- out[i] = dpt_ext.empty_like(res, dtype=res_dt)
+ out[i] = dpt.empty_like(res, dtype=res_dt)
else:
# If `dt` is not None, a temporary copy of `x` will be created,
# so the array overlap check isn't needed.
@@ -1094,7 +1093,7 @@ def __call__(
for x in x_to_check
):
# allocate a temporary buffer to avoid memory overlapping
- out[i] = dpt_ext.empty_like(res)
+ out[i] = dpt.empty_like(res)
x1 = dpnp.as_usm_ndarray(x1, dtype=x1_dt, sycl_queue=exec_q)
x2 = dpnp.as_usm_ndarray(x2, dtype=x2_dt, sycl_queue=exec_q)
@@ -1127,7 +1126,7 @@ def __call__(
if order == "K":
buf = dtc._empty_like_orderK(x, buf_dt)
else:
- buf = dpt_ext.empty_like(x, dtype=buf_dt, order=order)
+ buf = dpt.empty_like(x, dtype=buf_dt, order=order)
ht_copy_ev, copy_ev = dti._copy_usm_ndarray_into_usm_ndarray(
src=x, dst=buf, sycl_queue=exec_q, depends=dep_evs
@@ -1146,7 +1145,7 @@ def __call__(
x1, x2, res_dt, res_shape, res_usm_type, exec_q
)
else:
- out[i] = dpt_ext.empty(
+ out[i] = dpt.empty(
res_shape,
dtype=res_dt,
order=order,
@@ -1156,9 +1155,9 @@ def __call__(
# Broadcast shapes of input arrays
if x1.shape != res_shape:
- x1 = dpt_ext.broadcast_to(x1, res_shape)
+ x1 = dpt.broadcast_to(x1, res_shape)
if x2.shape != res_shape:
- x2 = dpt_ext.broadcast_to(x2, res_shape)
+ x2 = dpt.broadcast_to(x2, res_shape)
# Call the binary function with input and output arrays
ht_binary_ev, binary_ev = self.get_implementation_function()(
@@ -1326,7 +1325,7 @@ def __call__(self, x, /, decimals=0, out=None, *, dtype=None):
res_usm = dpt.divide(x_usm, 10**decimals, out=out_usm)
if dtype is not None:
- res_usm = dpt_ext.astype(res_usm, dtype, copy=False)
+ res_usm = dpt.astype(res_usm, dtype, copy=False)
if out is not None and isinstance(out, dpnp_array):
return out
diff --git a/dpnp/dpnp_array.py b/dpnp/dpnp_array.py
index 6418302d6e7b..cbb5835bbfc4 100644
--- a/dpnp/dpnp_array.py
+++ b/dpnp/dpnp_array.py
@@ -37,11 +37,9 @@
import warnings
-import dpctl.tensor as dpt
-
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._type_utils as dtu
import dpnp
from dpctl_ext.tensor._numpy_helper import AxisError
@@ -777,7 +775,7 @@ def asnumpy(self):
"""
- return dpt_ext.asnumpy(self._array_obj)
+ return dpt.asnumpy(self._array_obj)
def astype(
self,
@@ -2283,7 +2281,7 @@ def transpose(self, *axes):
# self.transpose(None).shape == self.shape[::-1]
axes = tuple((ndim - x - 1) for x in range(ndim))
- usm_res = dpt_ext.permute_dims(self._array_obj, axes)
+ usm_res = dpt.permute_dims(self._array_obj, axes)
return dpnp_array._create_from_usm_ndarray(usm_res)
def var(
diff --git a/dpnp/dpnp_array_api_info.py b/dpnp/dpnp_array_api_info.py
index 6a3939d046b0..f792600cbb66 100644
--- a/dpnp/dpnp_array_api_info.py
+++ b/dpnp/dpnp_array_api_info.py
@@ -36,7 +36,9 @@
"""
-import dpctl.tensor as dpt
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
def __array_namespace_info__():
diff --git a/dpnp/dpnp_iface.py b/dpnp/dpnp_iface.py
index 9fca083a6413..13b957ffff8f 100644
--- a/dpnp/dpnp_iface.py
+++ b/dpnp/dpnp_iface.py
@@ -45,17 +45,16 @@
import os
import dpctl
-import dpctl.tensor as dpt
import dpctl.utils as dpu
import numpy
-from dpctl.tensor._device import normalize_queue_device
# pylint: disable=no-name-in-module
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
import dpnp
+from dpctl_ext.tensor._device import normalize_queue_device
from .dpnp_array import dpnp_array
from .dpnp_utils import (
@@ -137,7 +136,7 @@ def asnumpy(a, order="C"):
return a.asnumpy()
if isinstance(a, dpt.usm_ndarray):
- return dpt_ext.asnumpy(a)
+ return dpt.asnumpy(a)
return numpy.asarray(a, order=order)
@@ -191,7 +190,7 @@ def as_usm_ndarray(a, dtype=None, device=None, usm_type=None, sycl_queue=None):
if is_supported_array_type(a):
return get_usm_ndarray(a)
- return dpt_ext.asarray(
+ return dpt.asarray(
a, dtype=dtype, device=device, usm_type=usm_type, sycl_queue=sycl_queue
)
diff --git a/dpnp/dpnp_iface_arraycreation.py b/dpnp/dpnp_iface_arraycreation.py
index d09cc17bde79..2800df0b2ac8 100644
--- a/dpnp/dpnp_iface_arraycreation.py
+++ b/dpnp/dpnp_iface_arraycreation.py
@@ -43,12 +43,11 @@
import operator
-import dpctl.tensor as dpt
import numpy
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpnp
from dpnp import dpnp_container
@@ -937,7 +936,7 @@ def astype(x, dtype, /, *, order="K", casting="unsafe", copy=True, device=None):
order = "K"
usm_x = dpnp.get_usm_ndarray(x)
- usm_res = dpt_ext.astype(
+ usm_res = dpt.astype(
usm_x, dtype, order=order, casting=casting, copy=copy, device=device
)
@@ -3119,7 +3118,7 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
s0 = (1,) * ndim
output = [
- dpt_ext.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :])
+ dpt.reshape(dpnp.get_usm_ndarray(x), s0[:i] + (-1,) + s0[i + 1 :])
for i, x in enumerate(xi)
]
@@ -3127,14 +3126,14 @@ def meshgrid(*xi, copy=True, sparse=False, indexing="xy"):
_, _ = get_usm_allocations(output)
if indexing == "xy" and ndim > 1:
- output[0] = dpt_ext.reshape(output[0], (1, -1) + s0[2:])
- output[1] = dpt_ext.reshape(output[1], (-1, 1) + s0[2:])
+ output[0] = dpt.reshape(output[0], (1, -1) + s0[2:])
+ output[1] = dpt.reshape(output[1], (-1, 1) + s0[2:])
if not sparse:
- output = dpt_ext.broadcast_arrays(*output)
+ output = dpt.broadcast_arrays(*output)
if copy:
- output = [dpt_ext.copy(x) for x in output]
+ output = [dpt.copy(x) for x in output]
return [dpnp_array._create_from_usm_ndarray(x) for x in output]
@@ -3696,7 +3695,7 @@ def tri(
if usm_type is None:
usm_type = "device"
- m = dpt_ext.ones(
+ m = dpt.ones(
(N, M),
dtype=_dtype,
device=device,
@@ -3912,7 +3911,7 @@ def vander(
if dpnp.is_supported_array_type(x):
x = dpnp.get_usm_ndarray(x)
- usm_x = dpt_ext.asarray(
+ usm_x = dpt.asarray(
x, device=device, usm_type=usm_type, sycl_queue=sycl_queue
)
@@ -3934,8 +3933,8 @@ def vander(
tmp = m[:, ::-1] if not increasing else m
dpnp.power(
- dpt_ext.reshape(usm_x, (-1, 1)),
- dpt_ext.arange(
+ dpt.reshape(usm_x, (-1, 1)),
+ dpt.arange(
N, dtype=_dtype, usm_type=x_usm_type, sycl_queue=x_sycl_queue
),
out=tmp,
diff --git a/dpnp/dpnp_iface_indexing.py b/dpnp/dpnp_iface_indexing.py
index a52196e9e4db..4b8fb7bb6a38 100644
--- a/dpnp/dpnp_iface_indexing.py
+++ b/dpnp/dpnp_iface_indexing.py
@@ -44,14 +44,13 @@
import operator
from collections.abc import Iterable
-import dpctl.tensor as dpt
import dpctl.utils as dpu
import numpy
# pylint: disable=no-name-in-module
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpctl_ext.tensor._tensor_impl as ti
import dpnp
@@ -141,9 +140,9 @@ def _choose_run(inds, chcs, q, usm_type, out=None, mode=0):
ti._array_overlap(out, chc) for chc in chcs
):
# Allocate a temporary buffer to avoid memory overlapping.
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
- out = dpt_ext.empty(
+ out = dpt.empty(
inds.shape, dtype=chcs[0].dtype, usm_type=usm_type, sycl_queue=q
)
@@ -242,7 +241,7 @@ def choose(a, choices, out=None, mode="wrap"):
# NumPy will cast up to int64 in general but
# int32 is more than safe for bool
if ind_dt == dpnp.bool:
- inds = dpt_ext.astype(inds, dpt.int32)
+ inds = dpt.astype(inds, dpt.int32)
else:
raise TypeError("input index array must be of integer data type")
@@ -250,17 +249,17 @@ def choose(a, choices, out=None, mode="wrap"):
res_usm_type, exec_q = get_usm_allocations(choices + [inds])
# apply type promotion to input choices
- res_dt = dpt_ext.result_type(*choices)
+ res_dt = dpt.result_type(*choices)
if len(choices) > 1:
choices = tuple(
map(
lambda chc: (
- chc if chc.dtype == res_dt else dpt_ext.astype(chc, res_dt)
+ chc if chc.dtype == res_dt else dpt.astype(chc, res_dt)
),
choices,
)
)
- arrs_broadcast = dpt_ext.broadcast_arrays(inds, *choices)
+ arrs_broadcast = dpt.broadcast_arrays(inds, *choices)
inds = arrs_broadcast[0]
choices = tuple(arrs_broadcast[1:])
@@ -301,11 +300,9 @@ def _take_index(x, inds, axis, q, usm_type, out=None, mode=0):
if ti._array_overlap(x, out):
# Allocate a temporary buffer to avoid memory overlapping.
- out = dpt_ext.empty_like(out)
+ out = dpt.empty_like(out)
else:
- out = dpt_ext.empty(
- res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q
- )
+ out = dpt.empty(res_sh, dtype=x.dtype, usm_type=usm_type, sycl_queue=q)
_manager = dpu.SequentialOrderManager[q]
dep_evs = _manager.submitted_events
@@ -816,16 +813,16 @@ def extract(condition, a):
)
if usm_cond.size != usm_a.size:
- usm_a = dpt_ext.reshape(usm_a, -1)
- usm_cond = dpt_ext.reshape(usm_cond, -1)
+ usm_a = dpt.reshape(usm_a, -1)
+ usm_cond = dpt.reshape(usm_cond, -1)
- usm_res = dpt_ext.take(usm_a, dpt_ext.nonzero(usm_cond)[0])
+ usm_res = dpt.take(usm_a, dpt.nonzero(usm_cond)[0])
else:
if usm_cond.shape != usm_a.shape:
- usm_a = dpt_ext.reshape(usm_a, -1)
- usm_cond = dpt_ext.reshape(usm_cond, -1)
+ usm_a = dpt.reshape(usm_a, -1)
+ usm_cond = dpt.reshape(usm_cond, -1)
- usm_res = dpt_ext.extract(usm_cond, usm_a)
+ usm_res = dpt.extract(usm_cond, usm_a)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -960,18 +957,18 @@ def fill_diagonal(a, val, wrap=False):
# a.flat[:end:step] = val
# but need to consider use case when `a` is usm_ndarray also
a_sh = a.shape
- tmp_a = dpt_ext.reshape(usm_a, -1)
+ tmp_a = dpt.reshape(usm_a, -1)
if dpnp.isscalar(usm_val):
tmp_a[:end:step] = usm_val
else:
- usm_val = dpt_ext.reshape(usm_val, -1)
+ usm_val = dpt.reshape(usm_val, -1)
# Setitem can work only if index size equal val size.
# Using loop for general case without dependencies of val size.
for i in range(0, usm_val.size):
tmp_a[step * i : end : step * (i + 1)] = usm_val[i]
- tmp_a = dpt_ext.reshape(tmp_a, a_sh)
+ tmp_a = dpt.reshape(tmp_a, a_sh)
usm_a[:] = tmp_a
@@ -1548,7 +1545,7 @@ def nonzero(a):
usm_a = dpnp.get_usm_ndarray(a)
return tuple(
- dpnp_array._create_from_usm_ndarray(y) for y in dpt_ext.nonzero(usm_a)
+ dpnp_array._create_from_usm_ndarray(y) for y in dpt.nonzero(usm_a)
)
@@ -1612,16 +1609,14 @@ def place(a, mask, vals):
if usm_vals.ndim != 1:
# dpt.place supports only 1-D array of values
- usm_vals = dpt_ext.reshape(usm_vals, -1)
+ usm_vals = dpt.reshape(usm_vals, -1)
if usm_vals.dtype != usm_a.dtype:
# dpt.place casts values to a.dtype with "unsafe" rule,
# while numpy.place does that with "safe" casting rule
- usm_vals = dpt_ext.astype(
- usm_vals, usm_a.dtype, casting="safe", copy=False
- )
+ usm_vals = dpt.astype(usm_vals, usm_a.dtype, casting="safe", copy=False)
- dpt_ext.place(usm_a, usm_mask, usm_vals)
+ dpt.place(usm_a, usm_mask, usm_vals)
def put(a, ind, v, /, *, axis=None, mode="wrap"):
@@ -1711,19 +1706,19 @@ def put(a, ind, v, /, *, axis=None, mode="wrap"):
if usm_ind.ndim != 1:
# dpt.put supports only 1-D array of indices
- usm_ind = dpt_ext.reshape(usm_ind, -1, copy=False)
+ usm_ind = dpt.reshape(usm_ind, -1, copy=False)
if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer):
# dpt.put supports only integer dtype for array of indices
- usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, casting="safe")
+ usm_ind = dpt.astype(usm_ind, dpnp.intp, casting="safe")
in_usm_a = usm_a
if axis is None and usm_a.ndim > 1:
- usm_a = dpt_ext.reshape(usm_a, -1)
+ usm_a = dpt.reshape(usm_a, -1)
- dpt_ext.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode)
+ dpt.put(usm_a, usm_ind, usm_v, axis=axis, mode=mode)
if in_usm_a._pointer != usm_a._pointer: # pylint: disable=protected-access
- in_usm_a[:] = dpt_ext.reshape(usm_a, in_usm_a.shape, copy=False)
+ in_usm_a[:] = dpt.reshape(usm_a, in_usm_a.shape, copy=False)
def put_along_axis(a, ind, values, axis, mode="wrap"):
@@ -1805,11 +1800,11 @@ def put_along_axis(a, ind, values, axis, mode="wrap"):
if dpnp.is_supported_array_type(values):
usm_vals = dpnp.get_usm_ndarray(values)
else:
- usm_vals = dpt_ext.asarray(
+ usm_vals = dpt.asarray(
values, usm_type=a.usm_type, sycl_queue=a.sycl_queue
)
- dpt_ext.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode)
+ dpt.put_along_axis(usm_a, usm_ind, usm_vals, axis=axis, mode=mode)
def putmask(x1, mask, values):
@@ -2153,7 +2148,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
usm_a = dpnp.get_usm_ndarray(a)
if not dpnp.is_supported_array_type(indices):
- usm_ind = dpt_ext.asarray(
+ usm_ind = dpt.asarray(
indices, usm_type=a.usm_type, sycl_queue=a.sycl_queue
)
else:
@@ -2165,7 +2160,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
if axis is None:
if a_ndim > 1:
# flatten input array
- usm_a = dpt_ext.reshape(usm_a, -1)
+ usm_a = dpt.reshape(usm_a, -1)
axis = 0
elif a_ndim == 0:
axis = normalize_axis_index(operator.index(axis), 1)
@@ -2174,7 +2169,7 @@ def take(a, indices, /, *, axis=None, out=None, mode="wrap"):
if not dpnp.issubdtype(usm_ind.dtype, dpnp.integer):
# dpt.take supports only integer dtype for array of indices
- usm_ind = dpt_ext.astype(usm_ind, dpnp.intp, copy=False, casting="safe")
+ usm_ind = dpt.astype(usm_ind, dpnp.intp, copy=False, casting="safe")
usm_res = _take_index(
usm_a, usm_ind, axis, exec_q, res_usm_type, out=out, mode=mode
@@ -2297,7 +2292,7 @@ def take_along_axis(a, indices, axis=-1, mode="wrap"):
usm_a = dpnp.get_usm_ndarray(a)
usm_ind = dpnp.get_usm_ndarray(indices)
- usm_res = dpt_ext.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode)
+ usm_res = dpt.take_along_axis(usm_a, usm_ind, axis=axis, mode=mode)
return dpnp_array._create_from_usm_ndarray(usm_res)
diff --git a/dpnp/dpnp_iface_manipulation.py b/dpnp/dpnp_iface_manipulation.py
index 2ff08cc6ec8b..0fc2c3f80fde 100644
--- a/dpnp/dpnp_iface_manipulation.py
+++ b/dpnp/dpnp_iface_manipulation.py
@@ -45,12 +45,11 @@
from typing import NamedTuple
import dpctl
-import dpctl.tensor as dpt
import numpy
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpnp
from dpctl_ext.tensor._numpy_helper import (
AxisError,
@@ -375,27 +374,25 @@ def _get_first_nan_index(usm_a):
):
if dpnp.issubdtype(usm_a.dtype, dpnp.complexfloating):
# for complex all NaNs are considered equivalent
- true_val = dpt_ext.asarray(
+ true_val = dpt.asarray(
True, sycl_queue=usm_a.sycl_queue, usm_type=usm_a.usm_type
)
- return dpt_ext.searchsorted(
- dpt.isnan(usm_a), true_val, side="left"
- )
- return dpt_ext.searchsorted(usm_a, usm_a[-1], side="left")
+ return dpt.searchsorted(dpt.isnan(usm_a), true_val, side="left")
+ return dpt.searchsorted(usm_a, usm_a[-1], side="left")
return None
usm_ar = dpnp.get_usm_ndarray(ar)
num_of_flags = (return_index, return_inverse, return_counts).count(True)
if num_of_flags == 0:
- usm_res = dpt_ext.unique_values(usm_ar)
+ usm_res = dpt.unique_values(usm_ar)
usm_res = (usm_res,) # cast to a tuple to align with other cases
elif num_of_flags == 1 and return_inverse:
- usm_res = dpt_ext.unique_inverse(usm_ar)
+ usm_res = dpt.unique_inverse(usm_ar)
elif num_of_flags == 1 and return_counts:
- usm_res = dpt_ext.unique_counts(usm_ar)
+ usm_res = dpt.unique_counts(usm_ar)
else:
- usm_res = dpt_ext.unique_all(usm_ar)
+ usm_res = dpt.unique_all(usm_ar)
first_nan = None
if equal_nan:
@@ -417,10 +414,10 @@ def _get_first_nan_index(usm_a):
if first_nan is not None:
# all NaNs are collapsed, so need to replace the indices with
# the index of the first NaN value in result array of unique values
- dpt_ext.place(
+ dpt.place(
usm_res.inverse_indices,
usm_res.inverse_indices > first_nan,
- dpt_ext.reshape(first_nan, 1),
+ dpt.reshape(first_nan, 1),
)
result += (usm_res.inverse_indices,)
@@ -428,9 +425,7 @@ def _get_first_nan_index(usm_a):
if first_nan is not None:
# all NaNs are collapsed, so need to put a count of all NaNs
# at the last index
- dpt_ext.sum(
- usm_res.counts[first_nan:], out=usm_res.counts[first_nan]
- )
+ dpt.sum(usm_res.counts[first_nan:], out=usm_res.counts[first_nan])
result += (usm_res.counts[: first_nan + 1],)
else:
result += (usm_res.counts,)
@@ -1097,9 +1092,7 @@ def broadcast_arrays(*args, subok=False):
if len(args) == 0:
return []
- usm_arrays = dpt_ext.broadcast_arrays(
- *[dpnp.get_usm_ndarray(a) for a in args]
- )
+ usm_arrays = dpt.broadcast_arrays(*[dpnp.get_usm_ndarray(a) for a in args])
return [dpnp_array._create_from_usm_ndarray(a) for a in usm_arrays]
@@ -1184,7 +1177,7 @@ def broadcast_to(array, /, shape, subok=False):
raise NotImplementedError(f"subok={subok} is currently not supported")
usm_array = dpnp.get_usm_ndarray(array)
- new_array = dpt_ext.broadcast_to(usm_array, shape)
+ new_array = dpt.broadcast_to(usm_array, shape)
return dpnp_array._create_from_usm_ndarray(new_array)
@@ -1276,7 +1269,7 @@ def can_cast(from_, to, casting="safe"):
if dpnp.is_supported_array_type(from_)
else dpnp.dtype(from_)
)
- return dpt_ext.can_cast(dtype_from, to, casting=casting)
+ return dpt.can_cast(dtype_from, to, casting=casting)
def column_stack(tup):
@@ -1422,7 +1415,7 @@ def concatenate(
)
usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays]
- usm_res = dpt_ext.concat(usm_arrays, axis=axis)
+ usm_res = dpt.concat(usm_arrays, axis=axis)
res = dpnp_array._create_from_usm_ndarray(usm_res)
if dtype is not None:
@@ -1527,7 +1520,7 @@ def copyto(dst, src, casting="same_kind", where=True):
f"but got {where.dtype}"
)
- dst_usm, src_usm, mask_usm = dpt_ext.broadcast_arrays(
+ dst_usm, src_usm, mask_usm = dpt.broadcast_arrays(
dpnp.get_usm_ndarray(dst),
dpnp.get_usm_ndarray(src),
dpnp.get_usm_ndarray(where),
@@ -1855,7 +1848,7 @@ def expand_dims(a, axis):
"""
usm_a = dpnp.get_usm_ndarray(a)
- usm_res = dpt_ext.expand_dims(usm_a, axis=axis)
+ usm_res = dpt.expand_dims(usm_a, axis=axis)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -1926,7 +1919,7 @@ def flip(m, axis=None):
"""
m_usm = dpnp.get_usm_ndarray(m)
- return dpnp_array._create_from_usm_ndarray(dpt_ext.flip(m_usm, axis=axis))
+ return dpnp_array._create_from_usm_ndarray(dpt.flip(m_usm, axis=axis))
def fliplr(m):
@@ -2370,7 +2363,7 @@ def matrix_transpose(x, /):
f"but it is {usm_x.ndim}"
)
- usm_res = dpt_ext.matrix_transpose(usm_x)
+ usm_res = dpt.matrix_transpose(usm_x)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -2414,7 +2407,7 @@ def moveaxis(a, source, destination):
usm_array = dpnp.get_usm_ndarray(a)
return dpnp_array._create_from_usm_ndarray(
- dpt_ext.moveaxis(usm_array, source, destination)
+ dpt.moveaxis(usm_array, source, destination)
)
@@ -2843,7 +2836,7 @@ def repeat(a, repeats, axis=None):
a = dpnp.ravel(a)
usm_arr = dpnp.get_usm_ndarray(a)
- usm_res = dpt_ext.repeat(usm_arr, repeats, axis=axis)
+ usm_res = dpt.repeat(usm_arr, repeats, axis=axis)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -3066,7 +3059,7 @@ def reshape(a, /, shape, order="C", *, copy=None):
)
usm_a = dpnp.get_usm_ndarray(a)
- usm_res = dpt_ext.reshape(usm_a, shape=shape, order=order, copy=copy)
+ usm_res = dpt.reshape(usm_a, shape=shape, order=order, copy=copy)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -3201,7 +3194,7 @@ def result_type(*arrays_and_dtypes):
)
for X in arrays_and_dtypes
]
- return dpt_ext.result_type(*usm_arrays_and_dtypes)
+ return dpt.result_type(*usm_arrays_and_dtypes)
def roll(x, shift, axis=None):
@@ -3268,9 +3261,9 @@ def roll(x, shift, axis=None):
shift = dpnp.asnumpy(shift)
if axis is None:
- return roll(dpt_ext.reshape(usm_x, -1), shift, 0).reshape(x.shape)
+ return roll(dpt.reshape(usm_x, -1), shift, 0).reshape(x.shape)
- usm_res = dpt_ext.roll(usm_x, shift=shift, axis=axis)
+ usm_res = dpt.roll(usm_x, shift=shift, axis=axis)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -3669,7 +3662,7 @@ def squeeze(a, /, axis=None):
"""
usm_a = dpnp.get_usm_ndarray(a)
- usm_res = dpt_ext.squeeze(usm_a, axis=axis)
+ usm_res = dpt.squeeze(usm_a, axis=axis)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -3757,7 +3750,7 @@ def stack(arrays, /, *, axis=0, out=None, dtype=None, casting="same_kind"):
)
usm_arrays = [dpnp.get_usm_ndarray(x) for x in arrays]
- usm_res = dpt_ext.stack(usm_arrays, axis=axis)
+ usm_res = dpt.stack(usm_arrays, axis=axis)
res = dpnp_array._create_from_usm_ndarray(usm_res)
if dtype is not None:
@@ -3818,7 +3811,7 @@ def swapaxes(a, axis1, axis2):
"""
usm_a = dpnp.get_usm_ndarray(a)
- usm_res = dpt_ext.swapaxes(usm_a, axis1=axis1, axis2=axis2)
+ usm_res = dpt.swapaxes(usm_a, axis1=axis1, axis2=axis2)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -3898,7 +3891,7 @@ def tile(A, reps):
"""
usm_a = dpnp.get_usm_ndarray(A)
- usm_res = dpt_ext.tile(usm_a, reps)
+ usm_res = dpt.tile(usm_a, reps)
return dpnp_array._create_from_usm_ndarray(usm_res)
@@ -4528,7 +4521,7 @@ def unstack(x, /, *, axis=0):
if usm_x.ndim == 0:
raise ValueError("Input array must be at least 1-d.")
- res = dpt_ext.unstack(usm_x, axis=axis)
+ res = dpt.unstack(usm_x, axis=axis)
return tuple(dpnp_array._create_from_usm_ndarray(a) for a in res)
diff --git a/dpnp/dpnp_iface_types.py b/dpnp/dpnp_iface_types.py
index f133333d6b83..7d2d60089d98 100644
--- a/dpnp/dpnp_iface_types.py
+++ b/dpnp/dpnp_iface_types.py
@@ -37,12 +37,11 @@
import functools
import dpctl
-import dpctl.tensor as dpt
import numpy
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpnp
from .dpnp_array import dpnp_array
@@ -214,7 +213,7 @@ def finfo(dtype):
"""
if isinstance(dtype, dpnp_array):
dtype = dtype.dtype
- return dpt_ext.finfo(dtype)
+ return dpt.finfo(dtype)
# pylint: disable=redefined-outer-name
@@ -247,7 +246,7 @@ def iinfo(dtype):
if isinstance(dtype, dpnp_array):
dtype = dtype.dtype
- return dpt_ext.iinfo(dtype)
+ return dpt.iinfo(dtype)
def isdtype(dtype, kind):
@@ -301,7 +300,7 @@ def isdtype(dtype, kind):
elif isinstance(kind, tuple):
kind = tuple(dpt.dtype(k) if isinstance(k, type) else k for k in kind)
- return dpt_ext.isdtype(dtype, kind)
+ return dpt.isdtype(dtype, kind)
def issubdtype(arg1, arg2):
diff --git a/dpnp/dpnp_utils/dpnp_utils_statistics.py b/dpnp/dpnp_utils/dpnp_utils_statistics.py
index ec67b619a13f..cd9932cb7153 100644
--- a/dpnp/dpnp_utils/dpnp_utils_statistics.py
+++ b/dpnp/dpnp_utils/dpnp_utils_statistics.py
@@ -29,13 +29,12 @@
import warnings
import dpctl
-import dpctl.tensor as dpt
from dpctl.utils import ExecutionPlacementError
-import dpnp
-
# TODO: revert to `from dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+import dpnp
from dpctl_ext.tensor._numpy_helper import normalize_axis_tuple
from dpnp.dpnp_array import dpnp_array
diff --git a/dpnp/exceptions/__init__.py b/dpnp/exceptions/__init__.py
index 26d78a853f41..7abcdbf0553f 100644
--- a/dpnp/exceptions/__init__.py
+++ b/dpnp/exceptions/__init__.py
@@ -32,10 +32,13 @@
SyclQueueCreationError,
)
from dpctl.memory import USMAllocationError
-from dpctl.tensor._dlpack import DLPackCreationError
from dpctl.utils import ExecutionPlacementError
from numpy.exceptions import AxisError
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+from dpctl_ext.tensor._dlpack import DLPackCreationError
+
__all__ = [
"AxisError",
"DLPackCreationError",
diff --git a/dpnp/memory/_memory.py b/dpnp/memory/_memory.py
index f978c5e50db2..3e95baacd424 100644
--- a/dpnp/memory/_memory.py
+++ b/dpnp/memory/_memory.py
@@ -26,11 +26,14 @@
# THE POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************
-import dpctl.tensor as dpt
from dpctl.memory import MemoryUSMDevice as DPCTLMemoryUSMDevice
from dpctl.memory import MemoryUSMHost as DPCTLMemoryUSMHost
from dpctl.memory import MemoryUSMShared as DPCTLMemoryUSMShared
+# TODO: revert to `from dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor as dpt
+
def _add_ptr_property(cls):
_storage_attr = "_ptr"
diff --git a/dpnp/tests/test_mathematical.py b/dpnp/tests/test_mathematical.py
index c03787790280..155f4cdb06fb 100644
--- a/dpnp/tests/test_mathematical.py
+++ b/dpnp/tests/test_mathematical.py
@@ -1,5 +1,4 @@
import dpctl
-import dpctl.tensor as dpt
import numpy
import pytest
from dpctl.utils import ExecutionPlacementError
@@ -13,7 +12,7 @@
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpnp
# TODO: revert to `from dpctl.tensor...`
@@ -672,15 +671,15 @@ def test_to_begin_to_end(self, to_begin, to_end):
"to_begin, to_end",
[
(-20, 20),
- (dpt_ext.asarray([-20, -30]), dpt_ext.asarray([20, 15])),
- (dpt_ext.asarray([[-20, -30]]), dpt_ext.asarray([[20, 15]])),
+ (dpt.asarray([-20, -30]), dpt.asarray([20, 15])),
+ (dpt.asarray([[-20, -30]]), dpt.asarray([[20, 15]])),
([1, 2], [3, 4]),
((1, 2), (3, 4)),
],
)
def test_usm_ndarray(self, to_begin, to_end):
a = numpy.array([[1, 2, 0]])
- dpt_a = dpt_ext.asarray(a)
+ dpt_a = dpt.asarray(a)
if isinstance(to_begin, dpt.usm_ndarray):
np_to_begin = dpt.asnumpy(to_begin)
@@ -1581,7 +1580,7 @@ def test_out(self):
assert_allclose(result, expected)
# output is usm_ndarray
- dpt_out = dpt_ext.empty(expected.shape, dtype=expected.dtype)
+ dpt_out = dpt.empty(expected.shape, dtype=expected.dtype)
result = dpnp.prod(ia, axis=0, out=dpt_out)
assert dpt_out is result.get_array()
assert_allclose(result, expected)
@@ -2634,7 +2633,7 @@ def test_out_float16(self, func):
def test_out_usm_ndarray(self, func, dt):
a = generate_random_numpy_array(10, dt)
out = numpy.empty(a.shape, dtype=dt)
- ia, usm_out = dpnp.array(a), dpt_ext.asarray(out)
+ ia, usm_out = dpnp.array(a), dpt.asarray(out)
expected = getattr(numpy, func)(a, out=out)
result = getattr(dpnp, func)(ia, out=usm_out)
diff --git a/dpnp/tests/test_memory.py b/dpnp/tests/test_memory.py
index 94aeda33f505..dd87a993e1dc 100644
--- a/dpnp/tests/test_memory.py
+++ b/dpnp/tests/test_memory.py
@@ -1,10 +1,9 @@
-import dpctl.tensor as dpt
import numpy
import pytest
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpnp
import dpnp.memory as dpm
@@ -24,7 +23,7 @@ def test_wrong_input_type(self, x):
dpm.create_data(x)
def test_wrong_usm_data(self):
- a = dpt_ext.ones(10)
+ a = dpt.ones(10)
d = IntUsmData(a.shape, buffer=a)
with pytest.raises(TypeError):
diff --git a/dpnp/tests/test_ndarray.py b/dpnp/tests/test_ndarray.py
index a27f0fe6aa14..8944043d90a0 100644
--- a/dpnp/tests/test_ndarray.py
+++ b/dpnp/tests/test_ndarray.py
@@ -1,4 +1,3 @@
-import dpctl.tensor as dpt
import numpy
import pytest
from numpy.testing import (
@@ -11,7 +10,7 @@
# TODO: revert to `import dpctl.tensor...`
# when dpnp fully migrates dpctl/tensor
-import dpctl_ext.tensor as dpt_ext
+import dpctl_ext.tensor as dpt
import dpnp
from .helper import (
@@ -410,7 +409,7 @@ def test_error(self):
class TestUsmNdarrayProtocol:
def test_basic(self):
a = dpnp.arange(256, dtype=dpnp.int64)
- usm_a = dpt_ext.asarray(a)
+ usm_a = dpt.asarray(a)
assert a.sycl_queue == usm_a.sycl_queue
assert a.usm_type == usm_a.usm_type
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
index 41df0a82e0a0..e44f51f09b20 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -1,10 +1,12 @@
from __future__ import annotations
import dpctl
-import dpctl.tensor._dlpack as dlp
import numpy
import pytest
+# TODO: revert to `import dpctl.tensor...`
+# when dpnp fully migrates dpctl/tensor
+import dpctl_ext.tensor._dlpack as dlp
import dpnp as cupy
from dpnp.tests.third_party.cupy import testing