From 0b5e65d11949625976a8fae497fad42beb1867dc Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Mon, 1 Jun 2026 21:51:56 -0400
Subject: [PATCH] [CI] Add cibuildwheel-based wheel publishing to PyPI

Add a workflow_dispatch pipeline that builds, tests, and publishes manylinux/
macOS/Windows wheels via cibuildwheel with OIDC trusted publishing.
- publish_wheel.yml + build-wheel-for-publish action: per-OS/arch matrix,
  static-LLVM host builds, a separate build_cuda_runtime stage that compiles
  the libtvm_runtime_cuda sidecar and injects it via -DTVM_PACKAGE_EXTRA_LIBS,
  auditwheel/delocate/delvewheel repair, and post-upload install+import verify.
- pyproject.toml [tool.cibuildwheel]: manylinux_2_28 image pins, skips,
  test-command, per-platform repair commands.
- cmake: FindLLVM prefers static zstd; tvm_compiler links --no-relax +
  --as-needed on Linux; Library.cmake rpath/install helper reused by backends.
- tests/python/wheel: assert LLVM enabled and CUDA runtime bundled.
---
 .../build-wheel-for-publish/action.yml        | 136 ++++++++
 .github/actions/setup/action.yml              |  14 +-
 .github/workflows/publish_wheel.yml           | 307 ++++++++++++++++++
 .gitignore                                    |   2 +
 CMakeLists.txt                                |  65 ++--
 ci/scripts/package/README.md                  |  29 ++
 .../scripts/package}/build-environment.yaml   |   2 +
 .../manylinux_build_libtvm_runtime_cuda.sh    |  70 ++++
 .../windows_build_libtvm_runtime_cuda.sh      | 101 ++++++
 cmake/modules/CUDA.cmake                      |  16 +-
 cmake/modules/Hexagon.cmake                   |  10 +-
 cmake/modules/Metal.cmake                     |  10 +-
 cmake/modules/OpenCL.cmake                    |  10 +-
 cmake/modules/ROCM.cmake                      |  10 +-
 cmake/modules/Vulkan.cmake                    |  10 +-
 cmake/utils/FindLLVM.cmake                    |   9 +-
 cmake/utils/Library.cmake                     |  65 ++++
 pyproject.toml                                | 149 +++------
 .../wheel/test_validate_runtime_library.py    |  50 +++
 19 files changed, 877 insertions(+), 188 deletions(-)
 create mode 100644 .github/actions/build-wheel-for-publish/action.yml
 create mode 100644 .github/workflows/publish_wheel.yml
 create mode 100644 ci/scripts/package/README.md
 rename {tests/conda => ci/scripts/package}/build-environment.yaml (98%)
 create mode 100755 ci/scripts/package/manylinux_build_libtvm_runtime_cuda.sh
 create mode 100755 ci/scripts/package/windows_build_libtvm_runtime_cuda.sh
 create mode 100644 cmake/utils/Library.cmake
 create mode 100644 tests/python/wheel/test_validate_runtime_library.py

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
new file mode 100644
index 000000000000..3fa01372aec0
--- /dev/null
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -0,0 +1,136 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Build TVM Wheel
+description: >
+  Build and test the LLVM-enabled TVM wheel for a given OS/architecture
+  combination using cibuildwheel.
+
+inputs:
+  arch:
+    description: "Target architecture for cibuildwheel (e.g., x86_64, aarch64, arm64, AMD64)"
+    required: true
+  build:
+    description: "cibuildwheel build selector (e.g., cp310-manylinux_x86_64)"
+    required: true
+  include_cuda_runtime:
+    description: "Set to 1 to bundle the prebuilt CUDA runtime sidecar into the wheel (Linux and Windows)"
+    required: false
+    default: "0"
+
+runs:
+  using: "composite"
+  steps:
+    # Single source of truth for the LLVM toolchain version, shared by the cache
+    # key and the conda install steps below.
+    - name: Set LLVM version
+      shell: bash
+      run: echo "LLVM_VERSION=22.1.0" >> "$GITHUB_ENV"
+
+    - name: Prepare LLVM cache path (Unix)
+      if: runner.os != 'Windows'
+      shell: bash
+      run: |
+        set -eux
+        sudo mkdir -p /opt/llvm
+        sudo chown -R "$(whoami)" /opt/llvm
+
+    # ---- Cache LLVM prefix ----
+    - name: Cache LLVM
+      uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
+      id: llvm-cache
+      with:
+        path: ${{ runner.os == 'Windows' && 'C:/opt/llvm' || '/opt/llvm' }}
+        key: tvm-wheel-llvm-${{ env.LLVM_VERSION }}-${{ runner.os }}-${{ inputs.arch }}-v6
+
+    # ---- Install LLVM via conda (cache miss only) ----
+    - name: Setup conda
+      if: steps.llvm-cache.outputs.cache-hit != 'true'
+      uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
+      continue-on-error: true
+      id: conda1
+      with:
+        miniforge-version: latest
+        conda-remove-defaults: true
+
+    - name: Setup conda (retry with tar.bz2)
+      if: steps.llvm-cache.outputs.cache-hit != 'true' && steps.conda1.outcome == 'failure'
+      uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
+      with:
+        miniforge-version: latest
+        use-only-tar-bz2: true
+        conda-remove-defaults: true
+
+    - name: Install LLVM (Unix)
+      if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os != 'Windows'
+      shell: bash -l {0}
+      run: |
+        set -eux
+        if [[ "${RUNNER_OS}" == "Linux" ]]; then
+          sudo mkdir -p /opt/llvm
+          sudo chown -R "$(whoami)" /opt/llvm
+        fi
+        conda create -q -p /opt/llvm -c conda-forge \
+          "llvmdev=${LLVM_VERSION}" "clangdev=${LLVM_VERSION}" "compiler-rt=${LLVM_VERSION}" zlib zstd-static libxml2-devel \
+          -y
+
+    - name: Install LLVM (Windows)
+      if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os == 'Windows'
+      shell: cmd /C call {0}
+      run: |
+        call conda create -q -p C:\opt\llvm -c conda-forge llvmdev=%LLVM_VERSION% zlib zstd-static libxml2-devel -y
+
+    # The {project} placeholder is not expanded inside CIBW_ENVIRONMENT values, so
+    # compute the forward-slash host path here (the Windows analog of the Linux
+    # /project hardcode).
+    - name: Compute CUDA sidecar path (Windows)
+      if: runner.os == 'Windows' && inputs.include_cuda_runtime == '1'
+      shell: bash
+      run: echo "TVM_CUDA_EXTRA_LIB=$(cygpath -m "$(pwd)")/build-wheel-cuda/lib/tvm_runtime_cuda.dll" >> "$GITHUB_ENV"
+
+    # ---- Build and test wheels ----
+    - name: Build and test wheels
+      uses: pypa/cibuildwheel@298ed2fb2c105540f5ed055e8a6ad78d82dd3a7e # v3.3.1
+      with:
+        package-dir: .
+        output-dir: wheelhouse
+      env:
+        CIBW_BUILD: ${{ inputs.build }}
+        CIBW_ARCHS_LINUX: ${{ inputs.arch }}
+        CIBW_ARCHS_MACOS: ${{ inputs.arch }}
+        CIBW_ARCHS_WINDOWS: ${{ inputs.arch }}
+        # Linux builds run in cibuildwheel's default manylinux_2_28 container;
+        # bind-mount the cached LLVM prefix into it. Ignored on macOS/Windows,
+        # which build without a container.
+        CIBW_CONTAINER_ENGINE: "docker; create_args: --volume /opt/llvm:/opt/llvm:ro"
+        # Bundle the prebuilt CUDA sidecar (downloaded into build-wheel-cuda/lib/)
+        # via -DTVM_PACKAGE_EXTRA_LIBS; no CUDA toolkit needed here.
+        CIBW_ENVIRONMENT: >-
+          CMAKE_PREFIX_PATH="/opt/llvm"
+          CMAKE_ARGS="-DUSE_LLVM='/opt/llvm/bin/llvm-config --link-static' -DZLIB_USE_STATIC_LIBS=ON -DCMAKE_PREFIX_PATH=/opt/llvm ${{ inputs.include_cuda_runtime == '1' && '-DTVM_PACKAGE_EXTRA_LIBS=/project/build-wheel-cuda/lib/libtvm_runtime_cuda.so' || '' }}"
+        CIBW_ENVIRONMENT_LINUX: >-
+          CMAKE_PREFIX_PATH="/opt/llvm"
+          LIBRARY_PATH="/opt/llvm/lib"
+          CMAKE_ARGS="-DUSE_LLVM='/opt/llvm/bin/llvm-config --link-static' -DZLIB_USE_STATIC_LIBS=ON -DCMAKE_PREFIX_PATH=/opt/llvm ${{ inputs.include_cuda_runtime == '1' && '-DTVM_PACKAGE_EXTRA_LIBS=/project/build-wheel-cuda/lib/libtvm_runtime_cuda.so' || '' }}"
+        CIBW_ENVIRONMENT_WINDOWS: >-
+          CMAKE_PREFIX_PATH="C:/opt/llvm/Library"
+          PATH="C:/opt/llvm/Library/bin;$PATH"
+          CMAKE_ARGS="-DUSE_LLVM='C:/opt/llvm/Library/bin/llvm-config.exe --link-static' -DZLIB_USE_STATIC_LIBS=ON -DCMAKE_PREFIX_PATH=C:/opt/llvm/Library ${{ inputs.include_cuda_runtime == '1' && format('-DTVM_PACKAGE_EXTRA_LIBS={0}', env.TVM_CUDA_EXTRA_LIB) || '' }}"
+        # Tells tests/python/wheel to assert the CUDA runtime is bundled
+        # (only on the CUDA wheels; the value is "0" for CPU wheels).
+        CIBW_TEST_ENVIRONMENT: >-
+          TVM_WHEEL_EXPECT_CUDA_RUNTIME="${{ inputs.include_cuda_runtime }}"
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index e78ce2f66d7a..842dbd03b092 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -1,34 +1,36 @@
 runs:
  using: "composite"
  steps:
-  - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+  - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
     env:
       CACHE_NUMBER: 2
     with:
       path: ~/conda_pkgs_dir
-      key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('tests/conda/build-environment.yaml') }}
-  - uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('ci/scripts/package/build-environment.yaml') }}
+  - uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
     continue-on-error: true
     id: conda1
     with:
       activate-environment: tvm-build
       channel-priority: strict
-      environment-file: tests/conda/build-environment.yaml
+      environment-file: ci/scripts/package/build-environment.yaml
       auto-activate-base: false
       miniforge-version: latest
       python-version: "3.10"
       condarc-file: tests/conda/condarc
-  - uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      conda-remove-defaults: true
+  - uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
     if: steps.conda1.outcome == 'failure'
     with:
       activate-environment: tvm-build
       channel-priority: strict
-      environment-file: tests/conda/build-environment.yaml
+      environment-file: ci/scripts/package/build-environment.yaml
       auto-activate-base: false
       miniforge-version: latest
       use-only-tar-bz2: true
       python-version: "3.10"
       condarc-file: tests/conda/condarc
+      conda-remove-defaults: true
   - name: Conda info
     shell: pwsh
     run: |
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
new file mode 100644
index 000000000000..4b950997c3ae
--- /dev/null
+++ b/.github/workflows/publish_wheel.yml
@@ -0,0 +1,307 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Publish TVM wheels
+
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Tag, branch, or SHA to build; PyPI publishes require refs/tags/<tag>"
+        required: true
+        type: string
+      publish_repository:
+        description: "Where to publish after the wheel build succeeds"
+        required: true
+        default: "none"
+        type: choice
+        options:
+          - none
+          - testpypi
+          - pypi
+      verify_from_repository:
+        description: "Install the uploaded package from the selected repository and import-test it"
+        required: true
+        default: true
+        type: boolean
+
+permissions:
+  contents: read
+
+jobs:
+  # Build the CUDA runtime sidecar once per arch and upload it as an artifact that
+  # build_wheels bundles. Linux legs use the pinned manylinux image (see
+  # pyproject.toml) so the sidecar is ABI-identical to the wheel's libtvm_runtime.so.
+  build_cuda_runtime:
+    name: ${{ matrix.name }}
+    runs-on: ${{ matrix.os }}
+    container: ${{ matrix.container }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: "CUDA runtime sidecar (Linux x86_64, manylinux_2_28)"
+            os: ubuntu-latest
+            container: quay.io/pypa/manylinux_2_28_x86_64:2026.01.04-1
+            arch: x86_64
+            script: ci/scripts/package/manylinux_build_libtvm_runtime_cuda.sh
+            lib: build-wheel-cuda/lib/libtvm_runtime_cuda.so
+          - name: "CUDA runtime sidecar (Linux aarch64, manylinux_2_28)"
+            os: ubuntu-24.04-arm
+            container: quay.io/pypa/manylinux_2_28_aarch64:2026.01.04-1
+            arch: aarch64
+            script: ci/scripts/package/manylinux_build_libtvm_runtime_cuda.sh
+            lib: build-wheel-cuda/lib/libtvm_runtime_cuda.so
+          - name: "CUDA runtime sidecar (Windows AMD64)"
+            os: windows-2022
+            container: ""
+            arch: AMD64
+            script: ci/scripts/package/windows_build_libtvm_runtime_cuda.sh
+            lib: build-wheel-cuda/lib/tvm_runtime_cuda.dll
+    steps:
+      # The containerized Linux legs check out as root; mark the tree safe so git
+      # (submodule init in checkout) does not bail on "dubious ownership".
+      - name: Mark workspace safe for git
+        if: runner.os == 'Linux'
+        shell: bash
+        run: git config --global --add safe.directory '*'
+
+      - name: Checkout source
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ inputs.tag }}
+          submodules: recursive
+          fetch-depth: 1
+          fetch-tags: true
+
+      # Windows has no manylinux interpreter; the script's pip install needs one.
+      - name: Set up Python (Windows host)
+        if: runner.os == 'Windows'
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: "3.10"
+
+      - name: Build CUDA runtime sidecar
+        shell: bash
+        run: bash ${{ matrix.script }}
+
+      - name: Upload CUDA runtime sidecar
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: tvm-cuda-runtime-${{ matrix.arch }}
+          path: ${{ matrix.lib }}
+          if-no-files-found: error
+
+  build_wheels:
+    name: ${{ matrix.name }}
+    # All-or-nothing: a failed sidecar leg blocks the whole wheel matrix (a publish
+    # needs the complete 4-wheel set anyway).
+    needs: [build_cuda_runtime]
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: "Linux x86_64 wheel with CUDA runtime (manylinux_2_28)"
+            os: ubuntu-latest
+            arch: x86_64
+            build: cp310-manylinux_x86_64
+            include_cuda_runtime: "1"
+            artifact_suffix: linux-x86_64-manylinux_2_28
+          - name: "Linux aarch64 wheel with CUDA runtime (manylinux_2_28)"
+            os: ubuntu-24.04-arm
+            arch: aarch64
+            build: cp310-manylinux_aarch64
+            include_cuda_runtime: "1"
+            artifact_suffix: linux-aarch64-manylinux_2_28
+          - name: "macOS arm64 CPU wheel"
+            os: macos-14
+            arch: arm64
+            build: cp310-macosx_arm64
+            include_cuda_runtime: "0"
+            artifact_suffix: macos-arm64
+          - name: "Windows AMD64 wheel with CUDA runtime"
+            os: windows-2022
+            arch: AMD64
+            build: cp310-win_amd64
+            include_cuda_runtime: "1"
+            artifact_suffix: windows-amd64
+    steps:
+      - name: Validate publish inputs
+        shell: bash
+        env:
+          TVM_PUBLISH_REPOSITORY: ${{ inputs.publish_repository }}
+          TVM_PUBLISH_REF: ${{ inputs.tag }}
+          TVM_VERIFY_FROM_REPOSITORY: ${{ inputs.verify_from_repository }}
+        run: |
+          set -eux
+          if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && "${TVM_PUBLISH_REF}" != refs/tags/* ]]; then
+            echo "PyPI publishes must use an immutable refs/tags/<tag> ref" >&2
+            exit 1
+          fi
+          if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && "${TVM_VERIFY_FROM_REPOSITORY}" != "true" ]]; then
+            echo "verify_from_repository must be enabled when publishing to PyPI" >&2
+            exit 1
+          fi
+
+      - name: Checkout source
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ inputs.tag }}
+          submodules: recursive
+          fetch-depth: 1
+          fetch-tags: true
+
+      # Land the sidecar where -DTVM_PACKAGE_EXTRA_LIBS / cibuildwheel's /project
+      # mount expects it. Skipped on CPU-only rows (macOS).
+      - name: Download CUDA runtime sidecar
+        if: ${{ matrix.include_cuda_runtime == '1' }}
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: tvm-cuda-runtime-${{ matrix.arch }}
+          path: build-wheel-cuda/lib
+
+      - name: Build TVM wheel
+        uses: ./.github/actions/build-wheel-for-publish
+        with:
+          arch: ${{ matrix.arch }}
+          build: ${{ matrix.build }}
+          include_cuda_runtime: ${{ matrix.include_cuda_runtime }}
+
+      - name: Upload wheel artifact
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: tvm-wheel-${{ matrix.artifact_suffix }}
+          path: wheelhouse/*.whl
+          if-no-files-found: error
+
+  upload_pypi:
+    name: Upload package distributions
+    needs: [build_wheels]
+    if: ${{ inputs.publish_repository != 'none' }}
+    runs-on: ubuntu-latest
+    environment: ${{ inputs.publish_repository }}
+    permissions:
+      actions: read
+      contents: read
+      id-token: write
+      attestations: write
+    steps:
+      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          pattern: tvm-wheel-*
+          path: dist
+          merge-multiple: true
+
+      - name: Check wheel sizes
+        shell: bash
+        run: |
+          set -euo pipefail
+          limit_bytes=100000000
+          shopt -s nullglob
+          wheels=(dist/*.whl)
+          if [[ "${#wheels[@]}" -eq 0 ]]; then
+            echo "No wheel artifacts found under dist/" >&2
+            exit 1
+          fi
+          if [[ "${#wheels[@]}" -ne 4 ]]; then
+            echo "Expected 4 wheel artifacts, found ${#wheels[@]}" >&2
+            printf '%s\n' "${wheels[@]}" >&2
+            exit 1
+          fi
+          failed=0
+          for wheel in "${wheels[@]}"; do
+            size="$(stat -c '%s' "$wheel")"
+            printf '%s %s bytes\n' "$wheel" "$size"
+            if (( size > limit_bytes )); then
+              echo "Wheel exceeds 100 MB PyPI/TestPyPI upload limit: ${wheel}" >&2
+              failed=1
+            fi
+          done
+          exit "$failed"
+
+      - name: Generate artifact attestation for wheels
+        uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4.1.0
+        with:
+          subject-path: dist/*
+
+      - name: Publish package distributions to TestPyPI
+        if: ${{ inputs.publish_repository == 'testpypi' }}
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
+        with:
+          attestations: true
+          verbose: true
+          repository-url: https://test.pypi.org/legacy/
+
+      - name: Publish package distributions to PyPI
+        if: ${{ inputs.publish_repository == 'pypi' }}
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
+        with:
+          attestations: true
+          verbose: true
+
+  verify_pypi:
+    name: Verify uploaded package
+    needs: [upload_pypi]
+    if: ${{ inputs.publish_repository != 'none' && inputs.verify_from_repository }}
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+    steps:
+      - name: Check out source
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ inputs.tag }}
+          submodules: recursive
+          fetch-depth: 0
+          fetch-tags: true
+
+      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
+        with:
+          name: tvm-wheel-linux-x86_64-manylinux_2_28
+          path: wheelhouse
+
+      - name: Set up Python
+        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+        with:
+          python-version: "3.10"
+
+      - name: Verify the published package
+        env:
+          TVM_WHEEL_EXPECT_CUDA_RUNTIME: "1"
+          TVM_PUBLISH_REPOSITORY: ${{ inputs.publish_repository }}
+        run: |
+          set -eux
+          if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" ]]; then
+            index_url="https://pypi.org/simple/"
+          else
+            index_url="https://test.pypi.org/simple/"
+          fi
+          # Name/version come from the wheel filename (PEP 427: {distribution}-{version}-...).
+          wheel="$(ls wheelhouse/*.whl | head -n1)"
+          base="$(basename "${wheel}")"
+          name="${base%%-*}"
+          version="$(printf '%s' "${base#*-}" | cut -d- -f1)"
+          python -m venv /tmp/verify-venv
+          /tmp/verify-venv/bin/python -m pip install --upgrade pip
+          /tmp/verify-venv/bin/python -m pip install \
+            --index-url "${index_url}" --extra-index-url https://pypi.org/simple \
+            "${name}==${version}"
+          /tmp/verify-venv/bin/python -m pip install pytest numpy
+          /tmp/verify-venv/bin/python -m pytest -p no:tvm.testing.plugin -vvs tests/python/wheel
+          /tmp/verify-venv/bin/python -m pytest -vvs tests/python/all-platform-minimal-test
diff --git a/.gitignore b/.gitignore
index 9e734b0be06d..0ee1eb241807 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@ __pycache__/
 env/
 build/
 build-*/
+!.github/actions/build-*/
+!.github/actions/build-*/action.yml
 develop-eggs/
 dist/
 downloads/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a11a8729700f..138cfd22b02f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,6 +5,7 @@ project(tvm C CXX)
 include(cmake/utils/Utils.cmake)
 include(cmake/utils/Summary.cmake)
 include(cmake/utils/Linker.cmake)
+include(cmake/utils/Library.cmake)
 include(cmake/utils/FindCUDA.cmake)
 include(cmake/utils/FindNCCL.cmake)
 include(cmake/utils/FindOpenCL.cmake)
@@ -526,16 +527,7 @@ if(TVM_VISIBILITY_FLAG)
   set_property(TARGET tvm_runtime_extra APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 endif()
 
-set_target_properties(tvm_runtime_extra PROPERTIES
-  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-)
-
-install(TARGETS tvm_runtime_extra DESTINATION lib${LIB_SUFFIX})
-if(TVM_BUILD_PYTHON_MODULE)
-  install(TARGETS tvm_runtime_extra DESTINATION "lib")
-endif()
+tvm_configure_target_library(tvm_runtime_extra RUNTIME_MODULE)
 
 add_library(tvm_objs OBJECT ${COMPILER_SRCS})
 add_library(tvm_runtime_objs OBJECT ${RUNTIME_SRCS})
@@ -592,6 +584,25 @@ target_include_directories(tvm_compiler PUBLIC "$<INSTALL_INTERFACE:${CMAKE_INST
 set_property(TARGET tvm_compiler APPEND PROPERTY LINK_OPTIONS "${TVM_NO_UNDEFINED_SYMBOLS}")
 set_property(TARGET tvm_compiler APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 
+# Work around a GNU ld (binutils) relaxation bug that miscompiles
+# R_X86_64_GOTPCRELX relocations inside very large statically-linked archives.
+# When the full LLVM static libraries are linked into libtvm_compiler.so, the
+# library is large enough that ld can relax an indirect GOT call (LLVM built
+# with -fno-plt emits these) into a direct call with an incorrect displacement.
+# The call then targets read-only data instead of the intended function and
+# crashes at runtime with a SIGSEGV inside llvm::X86Subtarget during code
+# generation. Disabling linker relaxation keeps the GOT-indirect sequences and
+# avoids the miscompilation; it is harmless when LLVM is linked dynamically.
+# See binutils bug ld/25754.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND NOT ${USE_LLVM} MATCHES ${IS_FALSE_PATTERN})
+  set_property(TARGET tvm_compiler APPEND PROPERTY LINK_OPTIONS "-Wl,--no-relax")
+  # LLVM's --system-libs reports -lxml2, but TVM calls no libxml2 symbols. The
+  # manylinux toolchain does not default to --as-needed, so set it explicitly:
+  # the unused libxml2 is then not recorded as a NEEDED dependency, so auditwheel
+  # does not vendor it (and its deps) into the wheel.
+  set_property(TARGET tvm_compiler APPEND PROPERTY LINK_OPTIONS "-Wl,--as-needed")
+endif()
+
 # Place runtime/compiler/allvisible artifacts under build/lib/ to mirror the
 # tvm-ffi layout and make tvm_ffi.libinfo.load_lib_ctypes(package="tvm") able
 # to discover them in dev / editable builds.
@@ -840,23 +851,19 @@ endif()
 # Note: NCCL, NVSHMEM, RCCL target_link_libraries are handled in the inline
 # libtvm_runtime_extra assembly block above.
 
+# Keep the core shared libraries relocatable. A relative rpath
+# ($ORIGIN / @loader_path) is correct in any build because the sibling runtime
+# DSOs are installed next to each other, so apply it unconditionally rather than
+# only when building the Python wheel. (tvm_runtime_extra already gets its rpath
+# where it is defined above.)
+tvm_configure_target_library(tvm_compiler)
+tvm_configure_target_library(tvm_runtime)
+
 # Python package installation configuration
 # This section ensures that all necessary files are installed for the Python wheel
 if(TVM_BUILD_PYTHON_MODULE)
   message(STATUS "Configuring Python package installation")
 
-  # Set RPATH for tvm_compiler and tvm_runtime to find each other relatively
-  # (libtvm_compiler.so links against libtvm_runtime.so).
-  if(APPLE)
-    # macOS uses @loader_path
-    set_target_properties(tvm_compiler PROPERTIES INSTALL_RPATH "@loader_path")
-    set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "@loader_path")
-  elseif(LINUX)
-    # Linux uses $ORIGIN
-    set_target_properties(tvm_compiler PROPERTIES INSTALL_RPATH "\$ORIGIN")
-    set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "\$ORIGIN")
-  endif()
-
   # Install compiled shared libraries into <project>/lib so that
   # tvm_ffi.libinfo.load_lib_ctypes(package="tvm", target_name=...) can find
   # them via the package RECORD or the project's lib/ fallback dir.
@@ -865,12 +872,26 @@ if(TVM_BUILD_PYTHON_MODULE)
 
   # Install third-party compiled dependencies into the same lib/ dir.
   if(TARGET fpA_intB_gemm)
+    tvm_configure_target_library(fpA_intB_gemm)
     install(TARGETS fpA_intB_gemm DESTINATION "lib")
   endif()
   if(TARGET flash_attn)
+    tvm_configure_target_library(flash_attn)
     install(TARGETS flash_attn DESTINATION "lib")
   endif()
 
+  # Install prebuilt extra runtime libraries into the same lib/ dir. This is how
+  # the separately-built CUDA runtime (libtvm_runtime_cuda.so) is bundled: the
+  # publishing flow builds it in a CUDA-enabled environment and passes its path
+  # via TVM_PACKAGE_EXTRA_LIBS, so it ships through the normal CMake install
+  # rather than a post-build wheel rewrite.
+  foreach(_extra_lib IN LISTS TVM_PACKAGE_EXTRA_LIBS)
+    if(NOT EXISTS "${_extra_lib}")
+      message(FATAL_ERROR "TVM_PACKAGE_EXTRA_LIBS entry does not exist: ${_extra_lib}")
+    endif()
+    install(FILES "${_extra_lib}" DESTINATION "lib")
+  endforeach()
+
   # Install minimal header files needed by Python extensions
   install(
     DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/include/tvm/runtime/"
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
new file mode 100644
index 000000000000..a7dace9d4e23
--- /dev/null
+++ b/ci/scripts/package/README.md
@@ -0,0 +1,29 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# TVM wheel packaging
+
+The wheels are built by a standard `cibuildwheel` flow, configured in
+`.github/workflows/publish_wheel.yml` and `pyproject.toml` (`[tool.cibuildwheel]`
+and `[tool.scikit-build]`). This directory holds the few helper scripts that flow
+invokes:
+
+- `manylinux_build_libtvm_runtime_cuda.sh` — run by the `build_cuda_runtime` CI
+  stage; builds the `libtvm_runtime_cuda.so` sidecar inside the manylinux container.
+- `windows_build_libtvm_runtime_cuda.sh` — the Windows equivalent, building
+  `tvm_runtime_cuda.dll`.
+- `build-environment.yaml` — conda environment file for the build environment, loaded by the `.github/actions/setup` action.
diff --git a/tests/conda/build-environment.yaml b/ci/scripts/package/build-environment.yaml
similarity index 98%
rename from tests/conda/build-environment.yaml
rename to ci/scripts/package/build-environment.yaml
index ebd45ff4c422..3b2c4dd16751 100644
--- a/tests/conda/build-environment.yaml
+++ b/ci/scripts/package/build-environment.yaml
@@ -33,6 +33,8 @@ dependencies:
   - pip
   - git
   - bzip2
+  - zlib
+  - zstd-static
   - pytest
   - numpy
   - scipy
diff --git a/ci/scripts/package/manylinux_build_libtvm_runtime_cuda.sh b/ci/scripts/package/manylinux_build_libtvm_runtime_cuda.sh
new file mode 100755
index 000000000000..fe721a18f4d0
--- /dev/null
+++ b/ci/scripts/package/manylinux_build_libtvm_runtime_cuda.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Build libtvm_runtime_cuda.so inside a manylinux container, run by the
+# build_cuda_runtime CI job. Installs the pinned CUDA toolkit and builds the
+# sidecar into build-wheel-cuda/lib/ for the wheel build to bundle.
+#
+# Usage: manylinux_build_libtvm_runtime_cuda.sh
+set -euxo pipefail
+
+repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+build_dir="${repo_root}/build-wheel-cuda"
+python_bin="/opt/python/cp310-cp310/bin/python"
+parallel="$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)"
+
+# Install the pinned CUDA toolkit (RHEL8 local-repo RPM, compatible with manylinux_2_28
+# for x86_64 and aarch64). The download is large, so retry transient failures.
+arch="$(uname -m)"
+cuda_rpm="cuda-repo-rhel8-13-0-local-13.0.2_580.95.05-1.${arch}.rpm"
+curl -fsSL --retry 3 --retry-delay 5 -o "/tmp/${cuda_rpm}" \
+  "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/${cuda_rpm}"
+rpm -i "/tmp/${cuda_rpm}"
+dnf clean all
+dnf -y --disablerepo=epel install cuda-toolkit-13-0
+rm -f "/tmp/${cuda_rpm}"
+dnf clean all
+
+# Build the CUDA runtime sidecar with CUDA on and LLVM off, so it does not need
+# the LLVM prefix; the main CPU wheel links LLVM statically. The manylinux image
+# ships no cmake/ninja, so install the build tools here.
+export PATH="/opt/python/cp310-cp310/bin:/usr/local/cuda/bin:${PATH}"
+"${python_bin}" -m pip install -U pip cmake ninja
+nvcc --version
+
+rm -rf "${build_dir}"
+# CMAKE_CUDA_COMPILER only tells CMake which nvcc to use; it does not affect the
+# resulting libtvm_runtime_cuda.so, which is built only from .cc host sources (no
+# .cu device code, so nvcc is never invoked for it). CMAKE_CUDA_ARCHITECTURES is
+# intentionally not set: it would be a no-op here for the same reason (verified --
+# the .so is byte-identical across arch values and carries no device code), and
+# modern CMake fills in a default so configure does not fail without it.
+cmake -S "${repo_root}" -B "${build_dir}" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DBUILD_TESTING=OFF \
+  -DTVM_BUILD_PYTHON_MODULE=ON \
+  -DUSE_CUDA=/usr/local/cuda \
+  -DUSE_LLVM=OFF \
+  -DUSE_CUBLAS=OFF -DUSE_CUDNN=OFF -DUSE_CUTLASS=OFF -DUSE_NCCL=OFF -DUSE_NVTX=OFF \
+  -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
+cmake --build "${build_dir}" --target tvm_runtime tvm_runtime_cuda --parallel "${parallel}"
+
+cuda_lib="${build_dir}/lib/libtvm_runtime_cuda.so"
+test -f "${cuda_lib}"
+patchelf --set-rpath '$ORIGIN' "${cuda_lib}"
+echo "CUDA runtime: ${cuda_lib}"
diff --git a/ci/scripts/package/windows_build_libtvm_runtime_cuda.sh b/ci/scripts/package/windows_build_libtvm_runtime_cuda.sh
new file mode 100755
index 000000000000..5e28a1ef0122
--- /dev/null
+++ b/ci/scripts/package/windows_build_libtvm_runtime_cuda.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Build tvm_runtime_cuda.dll on a Windows runner, run by the build_cuda_runtime CI
+# job (on the host; unlike Linux there is no build container on Windows). Installs
+# the pinned CUDA toolkit via conda and builds the sidecar into build-wheel-cuda/lib/
+# for the wheel build to bundle. Windows mirror of manylinux_build_libtvm_runtime_cuda.sh.
+#
+# Usage: windows_build_libtvm_runtime_cuda.sh
+set -euxo pipefail
+
+# Keep a unix-style path for bash file operations and a mixed (forward-slash)
+# path for CMake/cmd, which dislike the /c/... msys form.
+repo_root_unix="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+repo_root="$(cygpath -m "${repo_root_unix}")"
+build_dir_unix="${repo_root_unix}/build-wheel-cuda"
+build_dir="$(cygpath -m "${build_dir_unix}")"
+cuda_prefix_unix="/c/opt/cuda"
+cuda_prefix="C:/opt/cuda"
+
+# Locate conda: this script runs under a non-login bash, so conda may not be on
+# PATH even though the runner ships Miniconda (exposed via $CONDA).
+conda_exe="$(command -v conda || true)"
+if [[ -z "${conda_exe}" ]]; then
+  conda_exe="${CONDA:-/c/Miniconda}/Scripts/conda.exe"
+fi
+
+# Install the pinned CUDA toolkit via conda from the nvidia channel, mirroring the
+# LLVM-via-conda install used elsewhere in the publish action. The win-64 channel
+# caps at 13.0.x, matching the Linux script's CUDA 13.0.2.
+if [[ ! -e "${cuda_prefix_unix}/Library/bin/nvcc.exe" ]]; then
+  "${conda_exe}" create -q -p "${cuda_prefix}" -c nvidia/label/cuda-13.0.2 cuda-toolkit -y \
+    || "${conda_exe}" create -q -p "${cuda_prefix}" -c nvidia/label/cuda-13.0.2 cuda-toolkit --use-only-tar-bz2 -y
+fi
+
+# conda lays the Windows toolkit out under <prefix>/Library (bin/nvcc.exe,
+# lib/x64/cudart.lib, include/...). Discover the root from nvcc.exe so TVM's
+# FindCUDA MSVC branch resolves against the real layout instead of a hardcode.
+nvcc_unix="$(find "${cuda_prefix_unix}" -iname nvcc.exe -print -quit)"
+test -n "${nvcc_unix}"
+nvcc_exe="$(cygpath -m "${nvcc_unix}")"
+cuda_root="$(cygpath -m "$(dirname "$(dirname "${nvcc_unix}")")")"   # <prefix>/Library
+export CUDA_PATH="${cuda_root}"
+
+python -m pip install -U pip cmake ninja
+"${nvcc_exe}" --version
+
+# nvcc needs the MSVC host compiler (cl.exe) on PATH, but this bash is not a VS
+# Developer shell. Locate VS via vswhere and run the cmake configure+build inside
+# vcvars64.
+vswhere="C:/Program Files (x86)/Microsoft Visual Studio/Installer/vswhere.exe"
+vs_path="$("${vswhere}" -latest -products '*' \
+  -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 \
+  -property installationPath | tr -d '\r')"
+test -n "${vs_path}"
+vcvars="${vs_path}\\VC\\Auxiliary\\Build\\vcvars64.bat"
+
+rm -rf "${build_dir_unix}"
+
+# CMAKE_CUDA_COMPILER only tells CMake which nvcc to use (load-bearing here: the
+# conda nvcc is not on PATH); it does not affect the resulting tvm_runtime_cuda.dll,
+# which is built only from .cc host sources (no .cu device code). CMAKE_CUDA_ARCHITECTURES
+# is intentionally not set -- a no-op for the same reason, and modern CMake fills a
+# default. -allow-unsupported-compiler guards against the runner's MSVC being newer
+# than the CUDA toolkit officially supports.
+cmd_script="$(mktemp --suffix=.bat)"
+trap 'rm -f "${cmd_script}"' EXIT   # also clean up when the build aborts under set -e
+cat > "${cmd_script}" <<EOF
+call "${vcvars}" || exit /b 1
+cmake -S "${repo_root}" -B "${build_dir}" -G Ninja ^
+  -DCMAKE_BUILD_TYPE=Release ^
+  -DBUILD_TESTING=OFF ^
+  -DTVM_BUILD_PYTHON_MODULE=ON ^
+  -DUSE_CUDA="${cuda_root}" ^
+  -DUSE_LLVM=OFF ^
+  -DUSE_CUBLAS=OFF -DUSE_CUDNN=OFF -DUSE_CUTLASS=OFF -DUSE_NCCL=OFF -DUSE_NVTX=OFF ^
+  -DCMAKE_CUDA_COMPILER="${nvcc_exe}" ^
+  -DCMAKE_CUDA_FLAGS="-allow-unsupported-compiler" || exit /b 1
+cmake --build "${build_dir}" --target tvm_runtime tvm_runtime_cuda --config Release || exit /b 1
+EOF
+cmd //C "$(cygpath -w "${cmd_script}")"
+
+cuda_lib_unix="${build_dir_unix}/lib/tvm_runtime_cuda.dll"
+test -f "${cuda_lib_unix}"
+# No patchelf/rpath step on Windows; delvewheel vendors dependencies at repair time.
+echo "CUDA runtime: ${build_dir}/lib/tvm_runtime_cuda.dll"
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index ec6160e7afaf..0028e04fcc5d 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -67,6 +67,10 @@ if(USE_CUDA)
 
   add_library(tvm_runtime_cuda_objs OBJECT ${RUNTIME_CUDA_SRCS} ${VM_CUDA_BUILTIN_SRC_CC})
   target_link_libraries(tvm_runtime_cuda_objs PUBLIC tvm_ffi_header)
+  # These sources compile into tvm_runtime_cuda.dll, so their TVM_RUNTIME_DLL /
+  # TVM_FFI_DLL symbols must be dllexport on MSVC (e.g. GetCudaDeviceCount in
+  # cuda_device_api.cc). Mirror tvm_runtime_objs; a no-op on non-MSVC platforms.
+  target_compile_definitions(tvm_runtime_cuda_objs PRIVATE TVM_RUNTIME_EXPORTS TVM_FFI_EXPORTS)
   set_target_properties(tvm_runtime_cuda_objs PROPERTIES POSITION_INDEPENDENT_CODE ON)
   if(TVM_VISIBILITY_FLAG)
     target_compile_options(tvm_runtime_cuda_objs PRIVATE "${TVM_VISIBILITY_FLAG}")
@@ -74,15 +78,7 @@ if(USE_CUDA)
   add_library(tvm_runtime_cuda SHARED $<TARGET_OBJECTS:tvm_runtime_cuda_objs>)
   list(APPEND TVM_RUNTIME_BACKEND_LIBS tvm_runtime_cuda)
   target_link_libraries(tvm_runtime_cuda PUBLIC tvm_runtime ${CUDA_CUDART_LIBRARY} ${CUDA_CUDA_LIBRARY})
-  set_target_properties(tvm_runtime_cuda PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  )
-  install(TARGETS tvm_runtime_cuda DESTINATION lib${LIB_SUFFIX})
-  if(TVM_BUILD_PYTHON_MODULE)
-    install(TARGETS tvm_runtime_cuda DESTINATION "lib")
-  endif()
+  tvm_configure_target_library(tvm_runtime_cuda RUNTIME_MODULE)
 
   if(USE_NVTX)
     message(STATUS "Build with NVTX support")
@@ -102,7 +98,7 @@ if(USE_CUDA AND USE_CUDNN)
   add_library(tvm_cudnn_objs OBJECT ${CONTRIB_CUDNN_SRCS})
   target_link_libraries(tvm_cudnn_objs PRIVATE tvm_runtime_extra_defs)
   target_link_libraries(tvm_runtime_extra PRIVATE tvm_cudnn_objs ${CUDA_CUDNN_LIBRARY})
-endif(USE_CUDNN)
+endif(USE_CUDA AND USE_CUDNN)
 
 if(USE_CUDA AND USE_CUDNN_FRONTEND)
   message(STATUS "Build with cuDNN Frontend support")
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 431b15b13ac6..c92fc7079949 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -346,13 +346,5 @@ elseif(USE_HEXAGON)
   add_library(tvm_runtime_hexagon SHARED $<TARGET_OBJECTS:tvm_runtime_hexagon_objs>)
   list(APPEND TVM_RUNTIME_BACKEND_LIBS tvm_runtime_hexagon)
   target_link_libraries(tvm_runtime_hexagon PUBLIC tvm_runtime)
-  set_target_properties(tvm_runtime_hexagon PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  )
-  install(TARGETS tvm_runtime_hexagon DESTINATION lib${LIB_SUFFIX})
-  if(TVM_BUILD_PYTHON_MODULE)
-    install(TARGETS tvm_runtime_hexagon DESTINATION "lib")
-  endif()
+  tvm_configure_target_library(tvm_runtime_hexagon RUNTIME_MODULE)
 endif()
diff --git a/cmake/modules/Metal.cmake b/cmake/modules/Metal.cmake
index 72e7585534bb..c593d0d420cb 100644
--- a/cmake/modules/Metal.cmake
+++ b/cmake/modules/Metal.cmake
@@ -30,15 +30,7 @@ if(USE_METAL)
   add_library(tvm_runtime_metal SHARED $<TARGET_OBJECTS:tvm_runtime_metal_objs>)
   list(APPEND TVM_RUNTIME_BACKEND_LIBS tvm_runtime_metal)
   target_link_libraries(tvm_runtime_metal PUBLIC tvm_runtime ${METAL_LIB} ${FOUNDATION_LIB})
-  set_target_properties(tvm_runtime_metal PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  )
-  install(TARGETS tvm_runtime_metal DESTINATION lib${LIB_SUFFIX})
-  if(TVM_BUILD_PYTHON_MODULE)
-    install(TARGETS tvm_runtime_metal DESTINATION "lib")
-  endif()
+  tvm_configure_target_library(tvm_runtime_metal RUNTIME_MODULE)
 endif(USE_METAL)
 # When USE_METAL=OFF the codegen-side fallback in
 # src/target/metal/metal_fallback_module.cc handles construction; no opt
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index 9a1c20a5a5ab..f833832d4cde 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -46,15 +46,7 @@ if(USE_OPENCL)
   add_library(tvm_runtime_opencl SHARED $<TARGET_OBJECTS:tvm_runtime_opencl_objs>)
   list(APPEND TVM_RUNTIME_BACKEND_LIBS tvm_runtime_opencl)
   target_link_libraries(tvm_runtime_opencl PUBLIC tvm_runtime ${_opencl_libs})
-  set_target_properties(tvm_runtime_opencl PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  )
-  install(TARGETS tvm_runtime_opencl DESTINATION lib${LIB_SUFFIX})
-  if(TVM_BUILD_PYTHON_MODULE)
-    install(TARGETS tvm_runtime_opencl DESTINATION "lib")
-  endif()
+  tvm_configure_target_library(tvm_runtime_opencl RUNTIME_MODULE)
 
   if(USE_OPENCL_ENABLE_HOST_PTR)
     add_definitions(-DOPENCL_ENABLE_HOST_PTR)
diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake
index b974aa412959..a2d1516558ba 100644
--- a/cmake/modules/ROCM.cmake
+++ b/cmake/modules/ROCM.cmake
@@ -48,15 +48,7 @@ if(USE_ROCM)
   add_library(tvm_runtime_rocm SHARED $<TARGET_OBJECTS:tvm_runtime_rocm_objs>)
   list(APPEND TVM_RUNTIME_BACKEND_LIBS tvm_runtime_rocm)
   target_link_libraries(tvm_runtime_rocm PUBLIC tvm_runtime ${_rocm_libs})
-  set_target_properties(tvm_runtime_rocm PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  )
-  install(TARGETS tvm_runtime_rocm DESTINATION lib${LIB_SUFFIX})
-  if(TVM_BUILD_PYTHON_MODULE)
-    install(TARGETS tvm_runtime_rocm DESTINATION "lib")
-  endif()
+  tvm_configure_target_library(tvm_runtime_rocm RUNTIME_MODULE)
 endif(USE_ROCM)
 
 # HIPBLAS contrib goes into libtvm_runtime_extra.
diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake
index 6821b4419b1a..ba51e4b84206 100644
--- a/cmake/modules/Vulkan.cmake
+++ b/cmake/modules/Vulkan.cmake
@@ -55,13 +55,5 @@ if(USE_VULKAN)
   add_library(tvm_runtime_vulkan SHARED $<TARGET_OBJECTS:tvm_runtime_vulkan_objs>)
   list(APPEND TVM_RUNTIME_BACKEND_LIBS tvm_runtime_vulkan)
   target_link_libraries(tvm_runtime_vulkan PUBLIC tvm_runtime ${Vulkan_LIBRARY})
-  set_target_properties(tvm_runtime_vulkan PROPERTIES
-    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
-  )
-  install(TARGETS tvm_runtime_vulkan DESTINATION lib${LIB_SUFFIX})
-  if(TVM_BUILD_PYTHON_MODULE)
-    install(TARGETS tvm_runtime_vulkan DESTINATION "lib")
-  endif()
+  tvm_configure_target_library(tvm_runtime_vulkan RUNTIME_MODULE)
 endif(USE_VULKAN)
diff --git a/cmake/utils/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake
index 8aa9c8b1b959..2bf229eca756 100644
--- a/cmake/utils/FindLLVM.cmake
+++ b/cmake/utils/FindLLVM.cmake
@@ -210,8 +210,13 @@ macro(find_llvm use_llvm)
         message(STATUS "LLVM links against xml2")
         list(APPEND LLVM_LIBS "-lxml2")
       elseif("${__flag}" STREQUAL "zstd.dll.lib")
-        message(STATUS "LLVM linker flag under LLVM libdir: ${__llvm_libdir}/zstd.lib")
-        list(APPEND LLVM_LIBS "${__llvm_libdir}/zstd.lib")
+        if (EXISTS "${__llvm_libdir}/zstd_static.lib")
+          message(STATUS "LLVM links against static zstd")
+          list(APPEND LLVM_LIBS "${__llvm_libdir}/zstd_static.lib")
+        else()
+          message(STATUS "LLVM linker flag under LLVM libdir: ${__llvm_libdir}/zstd.lib")
+          list(APPEND LLVM_LIBS "${__llvm_libdir}/zstd.lib")
+        endif()
       elseif((__flag MATCHES ".lib$") AND (EXISTS "${__llvm_libdir}/${__flag}"))
         # If the library file ends in .lib try to also search the llvm_libdir
         message(STATUS "LLVM linker flag under LLVM libdir: ${__llvm_libdir}/${__flag}")
diff --git a/cmake/utils/Library.cmake b/cmake/utils/Library.cmake
new file mode 100644
index 000000000000..93cc748ae48b
--- /dev/null
+++ b/cmake/utils/Library.cmake
@@ -0,0 +1,65 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Helpers for configuring library targets.
+
+#######################################################
+# tvm_configure_target_library(target_name [RUNTIME_MODULE])
+#
+# Configure a TVM library target. The target always gets a relative rpath
+# ($ORIGIN / @loader_path) so that sibling shared libraries in the same
+# directory resolve each other regardless of the install location (e.g. inside a
+# Python wheel).
+#
+# With the RUNTIME_MODULE option -- used for the optional runtime backend
+# libraries (tvm_runtime_cuda, tvm_runtime_vulkan, ...) -- the target is also
+# emitted into the build "lib" directory and installed; when building the Python
+# module it is additionally installed into the package "lib" directory. Targets
+# that manage their own output directory / install rules (tvm_compiler,
+# tvm_runtime, ...) omit the option and take only the rpath.
+#
+# No-op if the target does not exist.
+function(tvm_configure_target_library target_name)
+  if(NOT TARGET ${target_name})
+    return()
+  endif()
+  cmake_parse_arguments(ARG "RUNTIME_MODULE" "" "" ${ARGN})
+
+  # Loader-relative rpath token for this platform; left empty elsewhere.
+  set(_tvm_rpath "")
+  if(APPLE)
+    set(_tvm_rpath "@loader_path")
+  elseif(UNIX)
+    set(_tvm_rpath "\$ORIGIN")
+  endif()
+  if(NOT _tvm_rpath STREQUAL "")
+    set_target_properties(${target_name} PROPERTIES
+      BUILD_RPATH "${_tvm_rpath}" INSTALL_RPATH "${_tvm_rpath}")
+  endif()
+
+  if(ARG_RUNTIME_MODULE)
+    # Emit every artifact kind into <build>/lib, then register install rules.
+    foreach(_kind LIBRARY RUNTIME ARCHIVE)
+      set_target_properties(${target_name} PROPERTIES
+        ${_kind}_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib")
+    endforeach()
+    install(TARGETS ${target_name} DESTINATION lib${LIB_SUFFIX})
+    if(TVM_BUILD_PYTHON_MODULE)
+      install(TARGETS ${target_name} DESTINATION "lib")
+    endif()
+  endif()
+endfunction()
diff --git a/pyproject.toml b/pyproject.toml
index 2e400f0609e6..1dd4a27df802 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,7 @@
 # under the License.
 
 [build-system]
-requires = ["scikit-build-core>=0.10.0"]
+requires = ["scikit-build-core>=0.11"]
 build-backend = "scikit_build_core.build"
 
 [project]
@@ -25,7 +25,8 @@ name = "tvm"
 version = "0.25.dev0"
 description = "Apache TVM: An End-to-End Deep Learning Compiler Stack"
 readme = "README.md"
-license = { text = "Apache-2.0" }
+license = "Apache-2.0"
+license-files = ["LICENSE"]
 requires-python = ">=3.10"
 authors = [{ name = "Apache TVM Community", email = "dev@tvm.apache.org" }]
 keywords = ["machine learning", "compiler", "deep learning", "inference"]
@@ -34,55 +35,39 @@ classifiers = [
   "Intended Audience :: Developers",
   "Intended Audience :: Education",
   "Intended Audience :: Science/Research",
-  "License :: OSI Approved :: Apache Software License",
   "Programming Language :: Python :: 3",
-  "Programming Language :: Python :: 3.9",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
   "Topic :: Scientific/Engineering :: Artificial Intelligence",
   "Topic :: Software Development :: Libraries :: Python Modules",
 ]
-# Core dependencies - these are the minimum required for basic TVM functionality
 dependencies = [
-  "apache-tvm-ffi",
-  "cloudpickle",
+  "apache-tvm-ffi>=0.1.11",
   "ml_dtypes",
   "numpy",
-  "packaging",
   "psutil",
-  "scipy",
-  "tornado",
+  "pytest",
   "typing_extensions",
 ]
 
-# Optional dependencies for different features
 [project.optional-dependencies]
-# Model importers
-importer-coreml = ["coremltools"]
-importer-keras = ["tensorflow", "tensorflow-estimator"]
-importer-onnx = [
-  "future",
-  "onnx",
-  "onnxoptimizer",
-  "onnxruntime",
-  "torch",
-  "torchvision",
-]
+importer-onnx = ["onnx", "onnxoptimizer", "onnxruntime"]
 importer-pytorch = ["torch", "torchvision"]
-importer-tensorflow = ["tensorflow", "tensorflow-estimator"]
 importer-tflite = ["tflite"]
-importer-paddle = ["paddlepaddle"]
+coreml = ["coremltools"]
+meta-schedule = ["xgboost"]
+all = ["xgboost"]
 
-# AutoTVM and autoscheduler
-autotvm = ["xgboost"]
-autoscheduler = ["xgboost"]
+[project.urls]
+Homepage = "https://tvm.apache.org/"
+Documentation = "https://tvm.apache.org/docs/"
+Repository = "https://github.com/apache/tvm"
+"Bug Tracker" = "https://github.com/apache/tvm/issues"
 
-# Development and testing
-dev = [
-  "ruff",
-  "mypy",
-  "pre-commit",
+[dependency-groups]
+test = [
   "pytest",
   "pytest-xdist",
   "pytest-cov",
@@ -92,42 +77,13 @@ dev = [
   "pytest-rerunfailures",
   "pytest-repeat",
 ]
-
-# All optional dependencies (excluding dev)
-all = [
-  "coremltools",
-  "tensorflow",
-  "tensorflow-estimator",
-  "future",
-  "onnx",
-  "onnxoptimizer",
-  "onnxruntime",
-  "torch",
-  "torchvision",
-  "tflite",
-  "paddlepaddle",
-  "xgboost",
-]
-
-[project.urls]
-Homepage = "https://tvm.apache.org/"
-Documentation = "https://tvm.apache.org/docs/"
-Repository = "https://github.com/apache/tvm"
-"Bug Tracker" = "https://github.com/apache/tvm/issues"
+lint = ["ruff", "pre-commit"]
+dev = [{ include-group = "test" }, { include-group = "lint" }]
 
 [tool.scikit-build]
-# Point to the root CMakeLists.txt
-cmake.source-dir = "."
 cmake.build-type = "Release"
-
-# Configure the wheel to be Python version-agnostic
 wheel.py-api = "py3"
-
-# Build configuration
-build-dir = "build"
-
-# CMake configuration - ensure proper installation paths
-cmake.args = ["-DTVM_BUILD_PYTHON_MODULE=ON"]
+build-dir = "build/{wheel_tag}"
 
 # Wheel configuration
 wheel.packages = ["python/tvm"]
@@ -139,7 +95,6 @@ sdist.include = [
   "/CMakeLists.txt",
   "/pyproject.toml",
   "/cmake/**/*",
-  "/  */*",
 
   # Source code
   "/src/**/*.cc",
@@ -176,6 +131,11 @@ sdist.exclude = [
 # Logging
 logging.level = "INFO"
 
+[tool.scikit-build.cmake.define]
+TVM_BUILD_PYTHON_MODULE = "ON"
+USE_CUDA = "OFF"
+BUILD_TESTING = "OFF"
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-v --tb=short"
@@ -205,15 +165,7 @@ include = [
 line-length = 100
 indent-width = 4
 target-version = "py310"
-exclude = [
-  "3rdparty",
-  "build",
-  "dist",
-  ".venv",
-  ".mypy_cache",
-  ".ruff_cache",
-  "node_modules",
-]
+exclude = ["3rdparty", "build", "dist", ".venv", ".ruff_cache", "node_modules"]
 
 [tool.ruff.lint]
 select = [
@@ -257,32 +209,23 @@ line-ending = "auto"
 docstring-code-format = false
 docstring-code-line-length = "dynamic"
 
-[tool.mypy]
-python_version = "3.9"
-show_error_codes = true
-mypy_path = ["python"]
-files = ["python/tvm"]
-namespace_packages = true
-explicit_package_bases = true
-allow_redefinition = true
-ignore_missing_imports = true
-follow_imports = "skip"
-strict_optional = false
-exclude = '''(?x)(
-    ^\.venv/|
-    ^build/|
-    ^dist/|
-    ^\.mypy_cache/|
-    ^3rdparty/
-)'''
-
-[[tool.mypy.overrides]]
-module = ["python.tvm.auto_scheduler.*"]
-ignore_errors = true
-
-[[tool.mypy.overrides]]
-module = ["python.tvm.runtime.*"]
-ignore_errors = true
-
-[dependency-groups]
-lint = ["pre-commit"]
+[tool.cibuildwheel]
+# Skip win32, i686 (32-bit), and musllinux wheels.
+skip = "*-win32 *-manylinux_i686 *-musllinux*"
+build-verbosity = 1
+# Pin the manylinux image so the build_cuda_runtime sidecar (built in this same
+# image) stays ABI-compatible if a cibuildwheel bump changes the default.
+manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:2026.01.04-1"
+manylinux-aarch64-image = "quay.io/pypa/manylinux_2_28_aarch64:2026.01.04-1"
+test-requires = ["pytest", "numpy"]
+test-command = "pytest -p no:tvm.testing.plugin -vvs {project}/tests/python/wheel && pytest -vvs {project}/tests/python/all-platform-minimal-test"
+
+[tool.cibuildwheel.linux]
+repair-wheel-command = "auditwheel repair --exclude libtvm_ffi.so --exclude libtvm_runtime_cuda.so --exclude 'libcuda.so.*' --exclude 'libcudart.so.*' --exclude 'libnvrtc.so.*' --exclude 'libnvrtc-builtins.so.*' -w {dest_dir} {wheel}"
+
+[tool.cibuildwheel.macos]
+repair-wheel-command = 'delocate-wheel --ignore-missing-dependencies --exclude libtvm_ffi.dylib --require-archs {delocate_archs} -w {dest_dir} -v {wheel}'
+
+[tool.cibuildwheel.windows]
+before-build = 'python -m pip install delvewheel'
+repair-wheel-command = "delvewheel repair --analyze-existing --ignore-existing --exclude tvm_ffi.dll --exclude libtvm_ffi.dll --exclude tvm_runtime_cuda.dll --exclude nvcuda.dll --exclude cudart64_13.dll --exclude nvrtc64_130_0.dll -w {dest_dir} {wheel}"
diff --git a/tests/python/wheel/test_validate_runtime_library.py b/tests/python/wheel/test_validate_runtime_library.py
new file mode 100644
index 000000000000..10a455f2a917
--- /dev/null
+++ b/tests/python/wheel/test_validate_runtime_library.py
@@ -0,0 +1,50 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Post-install checks for a built TVM wheel.
+
+Run by cibuildwheel against the installed wheel (``test-command`` in
+``[tool.cibuildwheel]``). These assert the two wheel-specific things the standard
+``tests/python/all-platform-minimal-test`` suite cannot: that LLVM is enabled (its
+LLVM test merely *skips* when LLVM is absent), and that the CUDA runtime library
+got bundled (when ``TVM_WHEEL_EXPECT_CUDA_RUNTIME=1``). The functional LLVM
+compile / ndarray ops are covered by that all-platform suite.
+"""
+
+import glob
+import os
+from pathlib import Path
+
+import pytest
+
+import tvm
+
+
+def test_llvm_enabled():
+    """Fail, rather than skip, when the installed wheel lacks the LLVM backend."""
+    llvm_available = tvm.runtime.enabled("llvm")
+    assert llvm_available, "wheel was not built with LLVM enabled"
+
+
+def test_cuda_runtime_present():
+    """The bundled CUDA runtime shared library must be present in tvm/lib."""
+    if os.environ.get("TVM_WHEEL_EXPECT_CUDA_RUNTIME") != "1":
+        pytest.skip("CUDA runtime not expected in this wheel")
+    libdir = Path(tvm.__file__).resolve().parent / "lib"
+    # Count only loadable libraries; an MSVC import library (.lib) is not the runtime.
+    candidates = glob.glob(str(libdir / "*tvm_runtime_cuda.*"))
+    present = [p for p in candidates if Path(p).suffix in (".so", ".dylib", ".dll")]
+    assert present, "CUDA runtime expected but not bundled in tvm/lib"