From 365f5888777777d5086ebedd9ae87340cdac6b1a Mon Sep 17 00:00:00 2001
From: Reuben Dunn <reubend@meta.com>
Date: Mon, 8 Jun 2026 18:46:24 -0700
Subject: [PATCH 1/2] Vulkan: support Linux/Windows desktop GPUs and opt-in
 wheel builds

The Vulkan backend was developed for Android GPUs. This makes it build and run
on Linux/Windows desktop discrete GPUs (NVIDIA/AMD/Intel) and adds opt-in
pre-built Vulkan wheels, with no change to Android behavior (build divergence is
behind compile-time guards; runtime changes key off queried capabilities).

Covers build portability, discrete-GPU correctness fixes (real-GPU device/ICD
selection, shaderInt16/Int64/Float64 enablement, a blit queue guard, and
texel-rounded buffer allocations to avoid out-of-bounds vec4 reads), and
EXECUTORCH_BUILD_VULKAN-gated CI/packaging (a new vulkan.yml runs the real-GPU
NVIDIA and Windows MSVC jobs, plus opt-in wheel plumbing). Tested on an NVIDIA
A100; the SwiftShader CI path is unchanged.

This change was authored with Claude.
---
 .ci/scripts/setup-vulkan-linux-deps.sh        | 125 ++++++++++++++-
 .ci/scripts/setup-vulkan-windows-deps.ps1     |  37 +++++
 .ci/scripts/setup-windows-msvc-vulkan.ps1     |  51 ++++++
 .ci/scripts/test_backend.sh                   |  11 +-
 .ci/scripts/wheel/pre_build_script.sh         |  31 ++++
 .ci/scripts/wheel/test_linux.py               |   7 +
 .ci/scripts/wheel/test_windows.py             |  11 ++
 .github/workflows/test-backend-vulkan.yml     |   5 +
 .github/workflows/vulkan.yml                  | 151 ++++++++++++++++++
 backends/vulkan/CMakeLists.txt                |  32 +++-
 backends/vulkan/cmake/ShaderLibrary.cmake     |  34 ++--
 .../vulkan/partitioner/vulkan_partitioner.py  |  12 +-
 backends/vulkan/runtime/api/Context.cpp       |   9 ++
 .../vulkan/runtime/api/containers/Tensor.cpp  |  15 +-
 backends/vulkan/runtime/gen_vulkan_spv.py     |   9 +-
 .../runtime/graph/ops/glsl/coopmat_mm.yaml    |   4 +
 backends/vulkan/runtime/vk_api/Adapter.cpp    |  76 ++++-----
 backends/vulkan/runtime/vk_api/Runtime.cpp    |  64 ++++++--
 backends/vulkan/runtime/vk_api/Runtime.h      |   4 +-
 .../vulkan/runtime/vk_api/memory/vma_api.h    |  38 ++++-
 .../vulkan/test/custom_ops/build_and_run.sh   |   2 +-
 .../test/test_vulkan_compile_options.py       |  48 ++++++
 backends/vulkan/utils.py                      |   4 +
 backends/vulkan/vulkan_preprocess.py          |  20 ++-
 setup.py                                      |   4 +
 tools/cmake/preset/pybind.cmake               |  28 ++++
 26 files changed, 746 insertions(+), 86 deletions(-)
 create mode 100644 .ci/scripts/setup-vulkan-windows-deps.ps1
 create mode 100644 .ci/scripts/setup-windows-msvc-vulkan.ps1
 create mode 100644 .github/workflows/vulkan.yml
 create mode 100644 backends/vulkan/test/test_vulkan_compile_options.py

diff --git a/.ci/scripts/setup-vulkan-linux-deps.sh b/.ci/scripts/setup-vulkan-linux-deps.sh
index a0dcb75ad4a..debe610a18a 100755
--- a/.ci/scripts/setup-vulkan-linux-deps.sh
+++ b/.ci/scripts/setup-vulkan-linux-deps.sh
@@ -1,4 +1,3 @@
-
 #!/bin/bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
@@ -22,7 +21,7 @@ install_swiftshader() {
   tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"
 
   export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
-  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/"
+  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH:-}"
   export ETVK_USING_SWIFTSHADER=1
 }
 
@@ -43,7 +42,125 @@ install_vulkan_sdk() {
   export PATH="${PATH}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/bin/"
 }
 
+_maybe_sudo() {
+  if [ "$(id -u)" -eq 0 ]; then
+    "$@"
+  else
+    sudo "$@"
+  fi
+}
+
+install_glslc() {
+  # The glslc shipped in the LunarG SDK is dynamically linked against a newer
+  # glibc/libstdc++ than the manylinux_2_28 / AlmaLinux 8 CUDA runner image
+  # provides (glibc 2.28), where it fails to load with "GLIBC_2.29 not found".
+  # conda-forge's shaderc is built against an old sysroot, runs there, and is
+  # recent enough for the GL_EXT_integer_dot_product / GL_KHR_cooperative_matrix
+  # extensions the Vulkan shaders use. Install it into an isolated prefix so the
+  # base conda env that builds ExecuTorch is left untouched, then put it on PATH.
+  _glslc_prefix=/tmp/shaderc
+  conda create -y -p "${_glslc_prefix}" -c conda-forge shaderc
+  export PATH="${_glslc_prefix}/bin:${PATH}"
+}
+
+install_vulkan_loader() {
+  # libvulkan.so.1 (the Khronos loader that volk dlopen()s at runtime) is not part
+  # of the NVIDIA driver and is absent from the CUDA builder image; vulkan-tools
+  # provides vulkaninfo for the device sanity check. Both ship as native el8 RPMs.
+  if command -v dnf >/dev/null 2>&1; then
+    _maybe_sudo dnf install -y vulkan-loader vulkan-tools
+  fi
+}
+
+_find_nvidia_vulkan_library() {
+  # NVIDIA implements its Vulkan ICD inside libGLX_nvidia.so.0. The NVIDIA
+  # container runtime mounts this library into the container (it is pulled from
+  # the driver's ldcache when NVIDIA_DRIVER_CAPABILITIES includes graphics/all),
+  # so prefer ldconfig and fall back to the usual mount locations.
+  local lib cand
+  lib="$(ldconfig -p 2>/dev/null | awk '/libGLX_nvidia\.so\.0/ {print $NF; exit}')"
+  if [ -z "${lib}" ]; then
+    for cand in /usr/lib64/libGLX_nvidia.so.0 \
+        /usr/lib/x86_64-linux-gnu/libGLX_nvidia.so.0 \
+        /usr/lib/libGLX_nvidia.so.0; do
+      if [ -e "${cand}" ]; then
+        lib="${cand}"
+        break
+      fi
+    done
+  fi
+  printf '%s' "${lib}"
+}
+
+_vulkan_has_real_device() {
+  # True if vulkaninfo reports a hardware GPU, or if vulkaninfo is not installed
+  # (nothing to check). Its exit status is ignored, even under "set -o pipefail".
+  command -v vulkaninfo >/dev/null 2>&1 || return 0
+  { vulkaninfo --summary 2>/dev/null || true; } |
+    grep -qE 'PHYSICAL_DEVICE_TYPE_(DISCRETE|INTEGRATED|VIRTUAL)_GPU'
+}
+
+setup_real_gpu_icd() {
+  # Select a Vulkan ICD so the runtime exercises the real GPU when one is usable.
+  # Two quirks of the CUDA CI image make this non-trivial:
+  #   1. The NVIDIA container runtime mounts the driver's Vulkan library but does
+  #      not register its ICD manifest, so the loader never discovers the GPU on
+  #      its own. We synthesize the manifest and pin the loader to it.
+  #   2. Installing vulkan-loader/vulkan-tools pulls in mesa-vulkan-drivers,
+  #      which drop Intel/AMD/lavapipe manifests for absent hardware. lavapipe
+  #      fails vkCreateInstance on this image and, because the loader walks every
+  #      manifest in icd.d, that poisons device enumeration for the whole
+  #      process. Pinning VK_ICD_FILENAMES makes the loader ignore icd.d, so the
+  #      broken stubs cannot interfere.
+  local nvidia_lib
+  nvidia_lib="$(_find_nvidia_vulkan_library)"
+  if [ -n "${nvidia_lib}" ]; then
+    local icd=/tmp/nvidia_icd.json
+    cat >"${icd}" <<JSON
+{
+    "file_format_version": "1.0.0",
+    "ICD": {
+        "library_path": "${nvidia_lib}",
+        "api_version": "1.3.0"
+    }
+}
+JSON
+    export VK_ICD_FILENAMES="${icd}"
+    unset ETVK_USING_SWIFTSHADER || true
+    if _vulkan_has_real_device; then
+      echo "Real NVIDIA GPU selected; pinned Vulkan ICD to ${nvidia_lib}"
+      return
+    fi
+    echo "WARNING: ${nvidia_lib} present but no GPU enumerated; using SwiftShader."
+    # Surface why the NVIDIA driver did not enumerate (e.g. a missing dependency
+    # of libGLX_nvidia, or no render node) so the fallback is diagnosable in CI.
+    if command -v vulkaninfo >/dev/null 2>&1; then
+      echo "--- NVIDIA Vulkan ICD diagnostic ---"
+      VK_LOADER_DEBUG=warn vulkaninfo --summary 2>&1 | head -40 || true
+      echo "--- end diagnostic ---"
+    fi
+    unset VK_ICD_FILENAMES
+  else
+    echo "WARNING: no NVIDIA Vulkan driver library found; using SwiftShader."
+  fi
+  install_swiftshader
+}
+
 VULKAN_SDK_VERSION="1.4.321.1"
 
-install_swiftshader
-install_vulkan_sdk "${VULKAN_SDK_VERSION}"
+# Default (no or unrecognized argument) installs SwiftShader, so the existing
+# CPU-runner CI is unchanged. Pass "real-gpu" to prefer a real system ICD.
+case "${1:-swiftshader}" in
+  real-gpu)
+    # Do not download the LunarG SDK here: its prebuilt glslc cannot run on the
+    # old-glibc CUDA image. glslc comes from conda-forge and the loader from the
+    # system package manager instead.
+    install_vulkan_loader
+    install_glslc
+    setup_real_gpu_icd
+    ;;
+  swiftshader | *)
+    install_swiftshader
+    install_vulkan_sdk "${VULKAN_SDK_VERSION}"
+    ;;
+esac
diff --git a/.ci/scripts/setup-vulkan-windows-deps.ps1 b/.ci/scripts/setup-vulkan-windows-deps.ps1
new file mode 100644
index 00000000000..335f457714f
--- /dev/null
+++ b/.ci/scripts/setup-vulkan-windows-deps.ps1
@@ -0,0 +1,37 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Install glslc (the Vulkan shader compiler) on Windows via conda-forge's
+# shaderc package, and make sure it is on PATH. glslc is the only build-time
+# Vulkan dependency -- the Vulkan headers and the volk loader come from the
+# in-tree submodules -- so this avoids depending on the heavyweight LunarG SDK
+# installer. Requires conda to be available (the callers create/activate an env).
+
+$ErrorActionPreference = "Stop"
+
+Write-Host "Installing shaderc (provides glslc) from conda-forge..."
+conda install -y -c conda-forge shaderc
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "Failed to install shaderc from conda-forge (exit ${LASTEXITCODE})"
+    exit 1
+}
+
+$glslc = Get-Command glslc -ErrorAction SilentlyContinue
+if (-not $glslc) {
+    Write-Error "glslc not found on PATH after installing shaderc"
+    exit 1
+}
+
+# Expose glslc to the current process and, when running as a GitHub Actions step,
+# to subsequent steps.
+$glslcDir = Split-Path -Parent $glslc.Source
+$env:PATH = "${glslcDir};${env:PATH}"
+if ($env:GITHUB_PATH) {
+    Add-Content -Path $env:GITHUB_PATH -Value $glslcDir
+}
+
+Write-Host "glslc available at $($glslc.Source)"
+& glslc --version
diff --git a/.ci/scripts/setup-windows-msvc-vulkan.ps1 b/.ci/scripts/setup-windows-msvc-vulkan.ps1
new file mode 100644
index 00000000000..7fa2006e83f
--- /dev/null
+++ b/.ci/scripts/setup-windows-msvc-vulkan.ps1
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Build-validation for the Vulkan backend under MSVC on Windows. Mirrors
+# setup-windows-msvc.ps1 but installs glslc (the Vulkan shader compiler) and
+# configures/builds the vulkan_backend target. This is a bring-up job: it exists
+# to surface MSVC portability issues in the Vulkan/volk/VMA code, so it may need
+# iteration.
+
+conda create --yes --quiet -n et python=3.12
+conda activate et
+
+# Install cmake
+conda install -y cmake
+
+# Activate the VS environment - this is required for MSVC to work.
+& "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64
+
+# Install glslc (via conda-forge shaderc) and put it on PATH in this process.
+.ci/scripts/setup-vulkan-windows-deps.ps1
+
+# Install CI requirements
+pip install -r .ci/docker/requirements-ci.txt
+
+$buildDir = "cmake-out-vulkan"
+if (Test-Path -Path $buildDir) {
+    Remove-Item -Path $buildDir -Recurse -Force
+}
+New-Item -Path $buildDir -ItemType Directory
+
+cmake -S . -B $buildDir `
+    -DCMAKE_BUILD_TYPE=Release `
+    -DEXECUTORCH_BUILD_VULKAN=ON `
+    -DPYTHON_EXECUTABLE=python
+
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "CMake configuration failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
+
+cmake --build $buildDir --config Release --target vulkan_backend -j16
+
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Vulkan backend MSVC build failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
+
+Write-Host "Vulkan backend MSVC build completed successfully!"
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
index fe9b564a18f..80352fe1393 100755
--- a/.ci/scripts/test_backend.sh
+++ b/.ci/scripts/test_backend.sh
@@ -51,8 +51,15 @@ if [[ "$FLOW" == *qnn* ]]; then
 fi
 
 if [[ "$FLOW" == *vulkan* ]]; then
-    # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate.
-    source .ci/scripts/setup-vulkan-linux-deps.sh
+    # Set up the Vulkan build deps and select an ICD: use the real system GPU ICD
+    # when a manifest exists in EITHER standard ICD directory, else SwiftShader.
+    # compgen -G tests each glob alone; "ls a b" fails if either has no match.
+    if compgen -G "/etc/vulkan/icd.d/*.json" >/dev/null 2>&1 ||
+        compgen -G "/usr/share/vulkan/icd.d/*.json" >/dev/null 2>&1; then
+        source .ci/scripts/setup-vulkan-linux-deps.sh "real-gpu"
+    else
+        source .ci/scripts/setup-vulkan-linux-deps.sh "swiftshader"
+    fi
 
     EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
diff --git a/.ci/scripts/wheel/pre_build_script.sh b/.ci/scripts/wheel/pre_build_script.sh
index 365398d27a4..2c10e75fafd 100755
--- a/.ci/scripts/wheel/pre_build_script.sh
+++ b/.ci/scripts/wheel/pre_build_script.sh
@@ -69,3 +69,34 @@ if [[ "$(uname -s)" == "Linux" && "$(uname -m)" == "x86_64" ]]; then
   echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}" >> "${GITHUB_ENV}"
   echo "QNN SDK downloaded to ${QNN_SDK_ROOT}"
 fi
+
+# Provision the Vulkan SDK (glslc) and submodules ONLY when explicitly requested
+# via EXECUTORCH_BUILD_VULKAN. The default wheel build leaves this unset, so it
+# does no extra work (no submodule fetch, no SDK download) and is unaffected.
+if [[ "${EXECUTORCH_BUILD_VULKAN:-0}" != "0" \
+      && "${EXECUTORCH_BUILD_VULKAN:-OFF}" != "OFF" ]]; then
+  echo "Initializing Vulkan backend third-party submodules..."
+  VULKAN_SUBMODULES=(
+    backends/vulkan/third-party/Vulkan-Headers
+    backends/vulkan/third-party/volk
+    backends/vulkan/third-party/VulkanMemoryAllocator
+  )
+  if [[ $UNAME_S == *"MINGW"* || $UNAME_S == *"MSYS"* ]]; then
+    git -c http.sslBackend=openssl submodule update --init "${VULKAN_SUBMODULES[@]}"
+    echo "Installing Vulkan SDK for Windows wheel build..."
+    powershell -ExecutionPolicy Bypass -File .ci/scripts/setup-vulkan-windows-deps.ps1
+  else
+    git submodule update --init "${VULKAN_SUBMODULES[@]}"
+    # Install glslc from conda-forge rather than the LunarG SDK: the manylinux
+    # wheel image uses an old glibc where the SDK's prebuilt glslc cannot run
+    # ("GLIBC_2.29 not found"). conda-forge's shaderc is built against an old
+    # sysroot and runs there. Vulkan headers come from the submodules above and
+    # volk dlopen()s the loader at runtime, so only glslc is needed to build.
+    echo "Installing glslc (conda-forge shaderc) for Linux wheel build..."
+    _glslc_prefix="${HOME}/.shaderc"
+    conda create -y -p "${_glslc_prefix}" -c conda-forge shaderc
+    export PATH="${_glslc_prefix}/bin:${PATH}"
+    echo "${_glslc_prefix}/bin" >> "${GITHUB_PATH}"
+    echo "glslc installed: $(command -v glslc)"
+  fi
+fi
diff --git a/.ci/scripts/wheel/test_linux.py b/.ci/scripts/wheel/test_linux.py
index c441bcec91f..7545b4c6f20 100644
--- a/.ci/scripts/wheel/test_linux.py
+++ b/.ci/scripts/wheel/test_linux.py
@@ -31,6 +31,13 @@
         ), f"OpenvinoBackend not found in registered backends: {registered}"
         print("✓ OpenvinoBackend is registered")
 
+        # Vulkan backend is optional: only present when the wheel was built with
+        # EXECUTORCH_BUILD_VULKAN=1 and the Vulkan SDK (glslc) was available.
+        if "VulkanBackend" in registered:
+            print("✓ VulkanBackend is registered")
+        else:
+            print("⚠ VulkanBackend not registered (expected for the default wheel)")
+
     test_base.run_tests(
         model_tests=[
             test_base.ModelTest(
diff --git a/.ci/scripts/wheel/test_windows.py b/.ci/scripts/wheel/test_windows.py
index d2d8b29a534..ba141d4498c 100644
--- a/.ci/scripts/wheel/test_windows.py
+++ b/.ci/scripts/wheel/test_windows.py
@@ -5,6 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import platform
 from typing import List
 
 import torch
@@ -15,6 +16,7 @@
 from executorch.examples.xnnpack.quantization.utils import quantize as quantize_xnn
 from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
 from executorch.extension.pybindings.portable_lib import (
+    _get_registered_backend_names,
     _load_for_executorch_from_buffer,
 )
 from test_base import ModelTest
@@ -63,6 +65,15 @@ def run_tests(model_tests: List[ModelTest]) -> None:
 
 
 if __name__ == "__main__":
+    if platform.system() == "Windows":
+        registered = _get_registered_backend_names()
+        # Vulkan backend is optional: only present when the wheel was built with
+        # EXECUTORCH_BUILD_VULKAN=1 and the Vulkan SDK (glslc) was available.
+        if "VulkanBackend" in registered:
+            print("✓ VulkanBackend is registered")
+        else:
+            print("⚠ VulkanBackend not registered (expected for the default wheel)")
+
     run_tests(
         model_tests=[
             ModelTest(
diff --git a/.github/workflows/test-backend-vulkan.yml b/.github/workflows/test-backend-vulkan.yml
index 0461527b073..5745adbd38b 100644
--- a/.github/workflows/test-backend-vulkan.yml
+++ b/.github/workflows/test-backend-vulkan.yml
@@ -17,6 +17,8 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  # Default coverage: builds + runs on SwiftShader (software Vulkan) on CPU
+  # runners. Runs on every PR and nightly.
   test-vulkan:
     uses: ./.github/workflows/_test_backend.yml
     with:
@@ -28,3 +30,6 @@ jobs:
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 120
       run-linux: true
+
+  # Real-GPU (NVIDIA) and Windows MSVC coverage live in vulkan.yml, which gates
+  # those scarce/expensive runners behind path filtering.
diff --git a/.github/workflows/vulkan.yml b/.github/workflows/vulkan.yml
new file mode 100644
index 00000000000..276868836b9
--- /dev/null
+++ b/.github/workflows/vulkan.yml
@@ -0,0 +1,151 @@
+name: Test Vulkan Backend (specialized runners)
+
+# Vulkan CI jobs that require special runners (an NVIDIA GPU, or a Windows
+# MSVC toolchain). These are separate from test-backend-vulkan.yml (which runs the
+# default SwiftShader coverage on standard runners) so that they only run when
+# Vulkan-related files change or a sampled full run is selected.
+
+on:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      - ciflow/nightly/*
+  pull_request:
+    paths:
+      - .github/workflows/vulkan.yml
+      - backends/vulkan/**
+      - examples/vulkan/**
+      - .ci/scripts/setup-vulkan-linux-deps.sh
+      - .ci/scripts/setup-vulkan-windows-deps.ps1
+      - .ci/scripts/setup-windows-msvc-vulkan.ps1
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  changed-files:
+    name: Get changed files
+    uses: ./.github/workflows/_get-changed-files.yml
+    with:
+      include-push-diff: true # so push commits can also be path-filtered
+
+  run-decision:
+    name: CI run decision
+    uses: ./.github/workflows/_ci-run-decision.yml
+
+  test-vulkan-nvidia:
+    needs: [changed-files, run-decision]
+    # Path-filtered: skip commits that don't touch Vulkan-relevant paths, except
+    # on sampled full runs (see _ci-run-decision.yml).
+    if: |
+      contains(needs.changed-files.outputs.changed-files, 'backends/vulkan/') ||
+      contains(needs.changed-files.outputs.changed-files, 'examples/vulkan/') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/setup-vulkan-linux-deps.sh') ||
+      contains(needs.changed-files.outputs.changed-files, '.github/workflows/vulkan.yml') ||
+      needs.run-decision.outputs.is-full-run == 'true'
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 120
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.6"
+      use-custom-docker-registry: false
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        # Install a glibc-compatible glslc (conda-forge), the Vulkan loader and
+        # tools, and select a real system ICD. The CUDA runner image is
+        # manylinux_2_28 (glibc 2.28), where the LunarG SDK's prebuilt glslc does
+        # not run, so setup-vulkan-linux-deps.sh sources those from conda-forge and
+        # the system package manager instead. The NVIDIA container runtime mounts
+        # the driver's Vulkan library but not its ICD manifest, so the script
+        # synthesizes one and pins the loader to it; if no NVIDIA library is found
+        # it falls back to SwiftShader.
+        # NOTE: first-run check - inspect the vulkaninfo output below to confirm a
+        # real NVIDIA device is selected (not llvmpipe/SwiftShader).
+        source .ci/scripts/setup-vulkan-linux-deps.sh real-gpu
+        vulkaninfo --summary || true
+
+        # Full from-source install. Unlike the SwiftShader jobs in pull.yml, the
+        # CUDA runner image does not pre-install ExecuTorch's dependencies, so
+        # setup-linux.sh's "deps already in the image" assumption does not hold.
+        # CMAKE_ARGS enables Vulkan in the pybindings so the model --test runs and
+        # the pt2e/torchao e2e tests below execute on the GPU (default is OFF).
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON" PYTHON_EXECUTABLE=python ./install_executorch.sh
+
+        # Model coverage (mirrors test-vulkan-models-linux, on real hardware).
+        PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build
+
+        models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4"
+        for model in $models; do
+          python -m examples.vulkan.export --model_name=$model --test
+        done
+
+        # For selected vision models, test with dynamic shapes
+        models="mv2 resnet18 resnet50 ic3 densenet161"
+        for model in $models; do
+          python -m examples.vulkan.export --model_name=$model --test -d
+        done
+
+        # Operator coverage (mirrors test-vulkan-operators-linux, on real hardware).
+        # The custom-op prototyping binaries are GPU microbenchmarks that rely on
+        # GPU timestamp queries; they need a real device and crash on the
+        # SwiftShader software fallback. Always build them (compile coverage), but
+        # only run them when a real GPU was selected (setup-vulkan-linux-deps.sh
+        # exports ETVK_USING_SWIFTSHADER when it falls back to SwiftShader).
+        PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh
+        if [ -z "${ETVK_USING_SWIFTSHADER:-}" ]; then
+          ./cmake-out/backends/vulkan/test/custom_ops/test_add
+          ./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_linear
+          ./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_conv2d
+          ./cmake-out/backends/vulkan/test/custom_ops/test_q4gsw_linear
+          ./cmake-out/backends/vulkan/test/custom_ops/test_choose_qparams_per_row
+          ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq
+          ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone
+          ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary
+        else
+          echo "SwiftShader fallback active: built custom-op benchmarks but skipping execution (they require real-GPU timestamp queries)."
+        fi
+
+        PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build
+
+        # Run e2e testing for selected operators.
+        python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*"
+        python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*torchao*"
+
+  build-vulkan-windows-msvc:
+    needs: [changed-files, run-decision]
+    if: |
+      contains(needs.changed-files.outputs.changed-files, 'backends/vulkan/') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/setup-vulkan-windows-deps.ps1') ||
+      contains(needs.changed-files.outputs.changed-files, '.ci/scripts/setup-windows-msvc-vulkan.ps1') ||
+      contains(needs.changed-files.outputs.changed-files, '.github/workflows/vulkan.yml') ||
+      needs.run-decision.outputs.is-full-run == 'true'
+    name: build-vulkan-windows-msvc
+    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
+    with:
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        git config --global http.sslBackend openssl
+        git submodule update --init backends/vulkan/third-party/Vulkan-Headers backends/vulkan/third-party/volk backends/vulkan/third-party/VulkanMemoryAllocator
+        git submodule update --init
+        conda init powershell
+        powershell -Command "& {
+          Set-PSDebug -Trace 1
+          \$ErrorActionPreference = 'Stop'
+          \$PSNativeCommandUseErrorActionPreference = \$true
+          .ci/scripts/setup-windows-msvc-vulkan.ps1
+        }"
diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt
index d9acde79ecf..5ee7fc03ef8 100644
--- a/backends/vulkan/CMakeLists.txt
+++ b/backends/vulkan/CMakeLists.txt
@@ -41,6 +41,24 @@ set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers)
 set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk)
 set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator)
 
+# These third-party dependencies are git submodules. They are not part of the
+# default submodule set checked out by install_executorch.py, so fail early with
+# an actionable message rather than a confusing missing-header error.
+if(NOT EXISTS "${VOLK_PATH}/volk.c"
+   OR NOT EXISTS "${VULKAN_HEADERS_PATH}/include/vulkan/vulkan.h"
+   OR NOT EXISTS "${VMA_PATH}/include/vk_mem_alloc.h"
+)
+  message(
+    FATAL_ERROR
+      "The Vulkan backend third-party submodules are missing. "
+      "Run the following from the repository root:\n"
+      "  git submodule update --init "
+      "backends/vulkan/third-party/Vulkan-Headers "
+      "backends/vulkan/third-party/volk "
+      "backends/vulkan/third-party/VulkanMemoryAllocator"
+  )
+endif()
+
 set(COMMON_INCLUDES
     $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
     $<BUILD_INTERFACE:${VULKAN_HEADERS_PATH}/include>
@@ -49,7 +67,11 @@ set(COMMON_INCLUDES
 
 # Compile settings
 
-set(VULKAN_CXX_FLAGS "-fexceptions")
+# Exceptions are required: the vk_api layer throws on Vulkan errors (see
+# vk_api/Exception.h). MSVC does not understand -fexceptions and enables C++
+# exceptions via /EHsc instead, so select the flag per compiler.
+set(VULKAN_CXX_FLAGS "$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions>")
+list(APPEND VULKAN_CXX_FLAGS "$<$<CXX_COMPILER_ID:MSVC>:/EHsc>")
 list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_WRAPPER")
 list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_VOLK")
 
@@ -123,7 +145,13 @@ add_library(vulkan_backend ${vulkan_backend_cpp})
 target_include_directories(
   vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES}
 )
-target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core)
+# volk (bundled into this library) calls dlopen/dlsym/dlclose to load libvulkan
+# at runtime. On glibc < 2.34 those live in libdl, so consumers linking an
+# executable against libvulkan_backend.a need -ldl; CMAKE_DL_LIBS provides it
+# (and is harmless where dlopen is already in libc). Matches the other backends.
+target_link_libraries(
+  vulkan_backend PRIVATE vulkan_schema executorch_core ${CMAKE_DL_LIBS}
+)
 # Optionally link boost for stacktraces if boost is available
 if(DEFINED Boost_STACKTRACE_BASIC_LIBRARY)
   target_link_libraries(
diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake
index e2045cbf7da..0fb99757b0c 100644
--- a/backends/vulkan/cmake/ShaderLibrary.cmake
+++ b/backends/vulkan/cmake/ShaderLibrary.cmake
@@ -24,17 +24,33 @@ if(NOT EXECUTORCH_ROOT)
   message("WARNING: EXECUTORCH_ROOT is not set! A failure is likely imminent.")
 endif()
 
-find_program(GLSLC_PATH glslc PATHS $ENV{PATH})
+# find_program already searches PATH and appends .exe on Windows. Also hint the
+# Vulkan SDK's bin/Bin dir; "ENV VULKAN_SDK" adds nothing when the variable is
+# unset, whereas "$ENV{VULKAN_SDK}/bin" would hint "/bin" ahead of the user's PATH.
+find_program(GLSLC_PATH glslc HINTS ENV VULKAN_SDK PATH_SUFFIXES bin Bin)
 
 if(NOT GLSLC_PATH AND EXECUTORCH_BUILD_VULKAN)
-  message(
-    FATAL_ERROR
-      "glslc from the Vulkan SDK must be installed to build the Vulkan backend. "
-      "Please install the Vulkan SDK 1.4.341.1 or newer from "
-      "https://vulkan.lunarg.com/sdk/home and ensure that the glslc binary is in your PATH. "
-      "Note that the glslc distributed with the Android NDK is not compatible since it "
-      "does not support the GL_EXT_integer_dot_product extension. "
-  )
+  if(EXECUTORCH_BUILD_WHEEL_DO_NOT_USE)
+    # In a wheel/pybind build, degrade gracefully so the wheel can still be
+    # produced without the Vulkan backend rather than failing the whole build.
+    message(
+      STATUS
+        "glslc not found; the Vulkan backend will not be included in the wheel."
+    )
+    set(EXECUTORCH_BUILD_VULKAN
+        OFF
+        CACHE BOOL "" FORCE
+    )
+  else()
+    message(
+      FATAL_ERROR
+        "glslc from the Vulkan SDK must be installed to build the Vulkan backend. "
+        "Please install the Vulkan SDK 1.4.341.1 or newer from "
+        "https://vulkan.lunarg.com/sdk/home and ensure that the glslc binary is in your PATH. "
+        "Note that the glslc distributed with the Android NDK is not compatible since it "
+        "does not support the GL_EXT_integer_dot_product extension. "
+    )
+  endif()
 endif()
 
 # Required to enable linking with --whole-archive
diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py
index 60b4c3346f3..fb51a0edfad 100644
--- a/backends/vulkan/partitioner/vulkan_partitioner.py
+++ b/backends/vulkan/partitioner/vulkan_partitioner.py
@@ -378,9 +378,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
             exported_program.graph_module
         )
 
-        texture_limits: utils.ImageExtents = self.options.get(
-            "texture_limits", utils.DEFAULT_TEXTURE_LIMITS
-        )
+        # small_texture_limits opts into the conservative 3D texture limit that is
+        # compatible with most desktop/laptop GPUs (the Vulkan spec only guarantees
+        # 2048). An explicit texture_limits always takes precedence.
+        if "texture_limits" in self.options:
+            texture_limits: utils.ImageExtents = self.options["texture_limits"]
+        elif self.options.get("small_texture_limits", False):
+            texture_limits = utils.SMALL_TEXTURE_LIMITS
+        else:
+            texture_limits = utils.DEFAULT_TEXTURE_LIMITS
         buffer_limit: int = self.options.get("buffer_limit", utils.DEFAULT_BUFFER_LIMIT)
         capability_partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp
index d090a62f370..9d0694fa70f 100644
--- a/backends/vulkan/runtime/api/Context.cpp
+++ b/backends/vulkan/runtime/api/Context.cpp
@@ -212,6 +212,15 @@ void Context::register_blit(
     vkapi::PipelineBarrier& pipeline_barrier,
     vkapi::VulkanImage& src,
     vkapi::VulkanImage& dst) {
+  // vkCmdBlitImage requires a queue with graphics capability; transfer- or
+  // compute-only queues cannot perform blits. The queue is selected by compute
+  // capability only, so on desktop GPUs that expose compute-only queue families
+  // this could otherwise be invalid usage. On mobile the single universal queue
+  // always has this bit set.
+  VK_CHECK_COND(
+      queue_.capabilities & VK_QUEUE_GRAPHICS_BIT,
+      "The Vulkan queue selected for compute does not support blit operations "
+      "(VK_QUEUE_GRAPHICS_BIT is not set).");
   cmd_.insert_barrier(pipeline_barrier);
   cmd_.blit(src, dst);
 }
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index 47cefa1031a..ef892206d48 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -680,10 +680,23 @@ vkapi::VulkanBuffer allocate_buffer(
       return vkapi::VulkanBuffer();
   }
 
+  // Round the underlying allocation up to a whole number of 16-byte texels.
+  // Shaders may read a buffer-backed tensor as vec4/ivec4 (e.g. the
+  // per-output-channel weight scales/sums/bias in the tiled quantized kernels);
+  // when a dimension is not a multiple of 4, the final vec4 load would
+  // otherwise read past the end of the buffer, which silently zeroes the value
+  // on NVIDIA GPUs. This only grows the allocation; the tensor's
+  // physical_numel() is unchanged.
+  const size_t alloc_nbytes =
+      utils::align_up(element_size(dtype) * numel, static_cast<size_t>(16));
+
+  // TODO: this check is incorrect. max_buffer_numel() returns
+  // maxStorageBufferRange, which is a size in bytes, so the comparison should
+  // use the buffer's byte size (alloc_nbytes), not the element count.
   VK_CHECK_COND(numel <= context_ptr->adapter_ptr()->max_buffer_numel());
 
   return adapter_ptr->vma().create_storage_buffer(
-      element_size(dtype) * numel, allocate_memory);
+      alloc_nbytes, allocate_memory);
 }
 
 vTensorStorage::vTensorStorage(
diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py
index 93d6f9e41aa..69c87563bbd 100644
--- a/backends/vulkan/runtime/gen_vulkan_spv.py
+++ b/backends/vulkan/runtime/gen_vulkan_spv.py
@@ -1123,6 +1123,7 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]:
             # Construct name of SPIR-V file to be compiled
             spv_out_path = os.path.join(output_dir, f"{src_file_name}.spv")
 
+            cached_spv_out_path = None
             if cache_dir is not None:
                 # Construct the file names of cached SPIR-V file to check if they exist
                 # in the cache.
@@ -1160,7 +1161,9 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]:
                             subprocess.run(cmd_no_opt, check=True, capture_output=True)
                         except subprocess.CalledProcessError as e_no_opt:
                             # Delete any existing cached SPIR-V file if it exists
-                            if os.path.exists(cached_spv_out_path):
+                            if cached_spv_out_path is not None and os.path.exists(
+                                cached_spv_out_path
+                            ):
                                 os.remove(cached_spv_out_path)
 
                             raise RuntimeError(
@@ -1169,7 +1172,9 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]:
 
                     else:
                         # Delete any existing cached SPIR-V file if it exists
-                        if os.path.exists(cached_spv_out_path):
+                        if cached_spv_out_path is not None and os.path.exists(
+                            cached_spv_out_path
+                        ):
                             os.remove(cached_spv_out_path)
 
                         raise RuntimeError(f"{err_msg_base} {e.stderr}") from e
diff --git a/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml b/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml
index bd5c2377cf6..05b26adfb24 100644
--- a/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml
+++ b/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml
@@ -12,6 +12,10 @@
 
 coopmat_mm:
   parameter_names_with_default_values:
+    # GL_KHR_cooperative_matrix requires SPIR-V 1.6, so target Vulkan 1.3 when
+    # compiling this shader (the default target-env of 1.1 is too low). Other
+    # shaders are unaffected and keep the default.
+    VK_VERSION: '1.3'
     DTYPE: float
     PRECISION: highp
     WEIGHT_LAYOUT: row_major
diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp
index b762c95205b..3d9acae8975 100644
--- a/backends/vulkan/runtime/vk_api/Adapter.cpp
+++ b/backends/vulkan/runtime/vk_api/Adapter.cpp
@@ -140,6 +140,20 @@ VkDevice create_logical_device(
       enabled_device_extensions,
       requested_device_extensions);
 
+  // Enable the base device features that ExecuTorch shaders rely on, but only
+  // those that the physical device reports as supported. With pEnabledFeatures
+  // left null, all base features are disabled; using a shader that performs
+  // e.g. int16 arithmetic without enabling shaderInt16 is invalid usage and
+  // crashes on drivers that enforce it. Unsupported features stay VK_FALSE, so
+  // this is a no-op on devices that lack them.
+  VkPhysicalDeviceFeatures enabled_features{};
+  enabled_features.shaderInt16 =
+      physical_device.supports_int16_shader_types ? VK_TRUE : VK_FALSE;
+  enabled_features.shaderInt64 =
+      physical_device.supports_int64_shader_types ? VK_TRUE : VK_FALSE;
+  enabled_features.shaderFloat64 =
+      physical_device.supports_float64_shader_types ? VK_TRUE : VK_FALSE;
+
   VkDeviceCreateInfo device_create_info{
       VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType
       nullptr, // pNext
@@ -151,7 +165,7 @@ VkDevice create_logical_device(
       static_cast<uint32_t>(
           enabled_device_extensions.size()), // enabledExtensionCount
       enabled_device_extensions.data(), // ppEnabledExtensionNames
-      nullptr, // pEnabledFeatures
+      &enabled_features, // pEnabledFeatures
   };
 
   void* extension_list_top = nullptr;
@@ -234,41 +248,31 @@ VkDevice create_logical_device(
 bool test_linear_tiling_3d_image_support(
     VkDevice device,
     VkPhysicalDevice physical_device) {
-  // Test creating a 3D image with linear tiling to see if it is supported.
-  // According to the Vulkan spec, linear tiling may not be supported for 3D
-  // images.
-  VkExtent3D image_extents{1u, 1u, 1u};
-  const VkImageCreateInfo image_create_info{
-      VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType
-      nullptr, // pNext
-      0u, // flags
-      VK_IMAGE_TYPE_3D, // imageType
-      VK_FORMAT_R32G32B32A32_SFLOAT, // format
-      image_extents, // extents
-      1u, // mipLevels
-      1u, // arrayLayers
-      VK_SAMPLE_COUNT_1_BIT, // samples
-      VK_IMAGE_TILING_LINEAR, // tiling
-      VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, // usage
-      VK_SHARING_MODE_EXCLUSIVE, // sharingMode
-      0u, // queueFamilyIndexCount
-      nullptr, // pQueueFamilyIndices
-      VK_IMAGE_LAYOUT_UNDEFINED, // initialLayout
-  };
-  VkImage image = VK_NULL_HANDLE;
-  VkResult res = vkCreateImage(device, &image_create_info, nullptr, &image);
-
-  if (res == VK_SUCCESS) {
-    vkDestroyImage(device, image, nullptr);
-
-    VkFormatProperties props;
-    vkGetPhysicalDeviceFormatProperties(
-        physical_device, VK_FORMAT_R32G32B32A32_SFLOAT, &props);
-
-    return props.linearTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
-  }
-
-  return false;
+  (void)device;
+  // ExecuTorch allocates 3D image tensors that are used as both sampled and
+  // storage images, with FP32 (VK_FORMAT_R32G32B32A32_SFLOAT) being the most
+  // demanding format. Linear tiling may only be used if the physical device
+  // supports creating such images; per the Vulkan spec, linear tiling support
+  // for 3D images is optional.
+  //
+  // vkGetPhysicalDeviceImageFormatProperties is the authoritative query for
+  // this exact (format, type, tiling, usage) combination. A vkCreateImage probe
+  // is unreliable: some drivers (e.g. NVIDIA) accept a trivial 1x1x1 linear 3D
+  // image even though larger linear 3D storage images of the same format are
+  // unsupported, and checking only the SAMPLED format feature misses that the
+  // STORAGE usage is unsupported -- both lead to VK_ERROR_FORMAT_NOT_SUPPORTED
+  // when allocating real tensors.
+  VkImageFormatProperties format_props;
+  const VkResult res = vkGetPhysicalDeviceImageFormatProperties(
+      physical_device,
+      VK_FORMAT_R32G32B32A32_SFLOAT,
+      VK_IMAGE_TYPE_3D,
+      VK_IMAGE_TILING_LINEAR,
+      VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT,
+      0u,
+      &format_props);
+
+  return res == VK_SUCCESS;
 }
 
 } // namespace
diff --git a/backends/vulkan/runtime/vk_api/Runtime.cpp b/backends/vulkan/runtime/vk_api/Runtime.cpp
index 3d3a146d80d..d7e101d0865 100644
--- a/backends/vulkan/runtime/vk_api/Runtime.cpp
+++ b/backends/vulkan/runtime/vk_api/Runtime.cpp
@@ -10,6 +10,7 @@
 
 #include <executorch/backends/vulkan/runtime/vk_api/Adapter.h>
 
+#include <cstdlib>
 #include <cstring>
 #include <iostream>
 #include <sstream>
@@ -239,19 +240,64 @@ VkDebugReportCallbackEXT create_debug_report_callback(
 // Adapter selection methods
 //
 
-uint32_t select_first(const std::vector<Runtime::DeviceMapping>& devices) {
+// Ranks compute-capable devices so that a real GPU is preferred over a software
+// rasterizer (e.g. SwiftShader/lavapipe, which report as CPU). On a single-GPU
+// system (e.g. mobile) there is only one candidate, so the choice is unchanged.
+int compute_device_priority(const PhysicalDevice& device) {
+  if (device.num_compute_queues == 0) {
+    return -1; // not compute-capable, never select
+  }
+  switch (device.properties.deviceType) {
+    case VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU:
+      return 5;
+    case VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU:
+      return 4;
+    case VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU:
+      return 3;
+    case VK_PHYSICAL_DEVICE_TYPE_CPU:
+      return 1;
+    default:
+      return 2;
+  }
+}
+
+uint32_t select_compute_device(
+    const std::vector<Runtime::DeviceMapping>& devices) {
+  // Out-of-range index signals that no usable device was found.
+  const auto invalid = static_cast<uint32_t>(devices.size() + 1);
   if (devices.empty()) {
-    return devices.size() + 1; // return out of range to signal invalidity
+    return invalid;
+  }
+
+  // Allow overriding device selection via the ETVK_DEVICE_INDEX environment
+  // variable, which is useful on multi-GPU desktop systems. Invalid values fall
+  // through to automatic selection below.
+  const char* device_index_env = std::getenv("ETVK_DEVICE_INDEX");
+  if (device_index_env != nullptr) {
+    char* end = nullptr;
+    const long idx = std::strtol(device_index_env, &end, 10);
+    // strtol always sets `end`; the explicit null-check makes the `*end`
+    // dereference safe for the static analyzer.
+    if (end != nullptr && end != device_index_env && *end == '\0' && idx >= 0 &&
+        static_cast<size_t>(idx) < devices.size() &&
+        devices[static_cast<size_t>(idx)].first.num_compute_queues > 0) {
+      return static_cast<uint32_t>(idx);
+    }
   }
 
-  // Select the first adapter that has compute capability
+  // Otherwise pick the highest-priority compute-capable device, preferring the
+  // first one on ties (preserving the previous first-match behavior).
+  uint32_t best_i = invalid;
+  int best_priority = -1;
   for (size_t i = 0; i < devices.size(); ++i) {
-    if (devices[i].first.num_compute_queues > 0) {
-      return i;
+    const int priority = compute_device_priority(devices[i].first);
+    if (priority > best_priority) {
+      best_priority = priority;
+      best_i = static_cast<uint32_t>(i);
     }
   }
 
-  return devices.size() + 1;
+  return best_i;
 }
 
 //
@@ -283,7 +329,7 @@ std::unique_ptr<Runtime> init_global_vulkan_runtime(
   const RuntimeConfig default_config{
       enable_validation_messages,
       init_default_device,
-      AdapterSelector::First,
+      AdapterSelector::Auto,
       num_requested_queues,
       cache_data_path,
   };
@@ -311,8 +357,8 @@ Runtime::Runtime(const RuntimeConfig config)
   if (config.init_default_device) {
     try {
       switch (config.default_selector) {
-        case AdapterSelector::First:
-          default_adapter_i_ = create_adapter(select_first);
+        case AdapterSelector::Auto:
+          default_adapter_i_ = create_adapter(select_compute_device);
       }
     } catch (...) {
     }
diff --git a/backends/vulkan/runtime/vk_api/Runtime.h b/backends/vulkan/runtime/vk_api/Runtime.h
index 3706d6c73d0..285e979eab3 100644
--- a/backends/vulkan/runtime/vk_api/Runtime.h
+++ b/backends/vulkan/runtime/vk_api/Runtime.h
@@ -31,7 +31,9 @@ namespace vkapi {
 //
 
 enum AdapterSelector {
-  First,
+  // Automatically select the best compute-capable device (highest priority
+  // device type, with an optional ETVK_DEVICE_INDEX override).
+  Auto,
 };
 
 struct RuntimeConfig final {
diff --git a/backends/vulkan/runtime/vk_api/memory/vma_api.h b/backends/vulkan/runtime/vk_api/memory/vma_api.h
index dc7abbf8b1e..cf267a27d11 100644
--- a/backends/vulkan/runtime/vk_api/memory/vma_api.h
+++ b/backends/vulkan/runtime/vk_api/memory/vma_api.h
@@ -25,17 +25,28 @@
 #define VMA_DYNAMIC_VULKAN_FUNCTIONS 1
 #define VMA_VULKAN_VERSION 1002000
 
-#ifdef __clang__
+#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wnullability-completeness"
 #pragma clang diagnostic ignored "-Wunused-variable"
-#endif /* __clang__ */
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4100 4101 4189)
+#endif
 
 #include <vk_mem_alloc.h>
 
-#ifdef __clang__
+#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif /* __clang__ */
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#endif
 
 #else // !ETVK_USE_META_VMA
 
@@ -71,16 +82,27 @@
 */
 #endif /* VULKAN_DEBUG */
 
-#ifdef __clang__
+#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wnullability-completeness"
 #pragma clang diagnostic ignored "-Wunused-variable"
-#endif /* __clang__ */
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4100 4101 4189)
+#endif
 
 #include <include/vk_mem_alloc.h>
 
-#ifdef __clang__
+#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif /* __clang__ */
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#endif
 
 #endif // ETVK_USE_META_VMA
diff --git a/backends/vulkan/test/custom_ops/build_and_run.sh b/backends/vulkan/test/custom_ops/build_and_run.sh
index 2b9ce576e0e..b1195568b1b 100755
--- a/backends/vulkan/test/custom_ops/build_and_run.sh
+++ b/backends/vulkan/test/custom_ops/build_and_run.sh
@@ -120,7 +120,7 @@ ANDROID_MODE=false
 CMAKE_OUT_DIR="cmake-out"
 
 # Check for --android flag and adjust arguments accordingly
-if [[ "$1" == "--android" ]]; then
+if [[ "${1:-}" == "--android" ]]; then
     ANDROID_MODE=true
     CMAKE_OUT_DIR="cmake-android-out"
     shift  # Remove --android from arguments
diff --git a/backends/vulkan/test/test_vulkan_compile_options.py b/backends/vulkan/test/test_vulkan_compile_options.py
new file mode 100644
index 00000000000..f44850d2915
--- /dev/null
+++ b/backends/vulkan/test/test_vulkan_compile_options.py
@@ -0,0 +1,48 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from typing import Any, Dict
+
+from executorch.backends.vulkan.partitioner.vulkan_partitioner import (
+    parse_compile_options,
+)
+from executorch.backends.vulkan.vulkan_preprocess import parse_compile_spec
+
+
+class TestVulkanCompileOptions(unittest.TestCase):
+    """Verify that compile options survive the partitioner -> backend round trip.
+
+    The partitioner serializes the user-provided options into CompileSpecs
+    (parse_compile_options) and the backend deserializes them at preprocess time
+    (parse_compile_spec). Boolean options that are serialized but not handled on
+    the deserialization side are silently dropped, which is a class of bug that
+    previously hid the small_texture_limits desktop-compatibility option.
+    """
+
+    def _round_trip(self, options: Dict[str, Any]) -> Dict[str, Any]:
+        return parse_compile_spec(parse_compile_options(options))
+
+    def test_small_texture_limits_round_trips(self) -> None:
+        round_tripped = self._round_trip({"small_texture_limits": True})
+        self.assertTrue(round_tripped.get("small_texture_limits"))
+
+    def test_skip_memory_planning_round_trips(self) -> None:
+        round_tripped = self._round_trip({"skip_memory_planning": True})
+        self.assertTrue(round_tripped.get("skip_memory_planning"))
+
+    def test_force_fp16_round_trips(self) -> None:
+        round_tripped = self._round_trip({"force_fp16": True})
+        self.assertTrue(round_tripped.get("force_fp16"))
+
+    def test_unset_options_are_absent(self) -> None:
+        round_tripped = self._round_trip({})
+        self.assertNotIn("small_texture_limits", round_tripped)
+        self.assertNotIn("skip_memory_planning", round_tripped)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py
index 7febff260c6..b349fb51001 100644
--- a/backends/vulkan/utils.py
+++ b/backends/vulkan/utils.py
@@ -588,6 +588,10 @@ def node_has_target(node: Any, target: str):
 ImageExtents = Tuple[int, int, int]
 
 DEFAULT_TEXTURE_LIMITS = (16384, 16384, 2048)
+# Conservative 3D texture limit that most desktop/laptop GPUs support (the
+# Vulkan spec only guarantees maxImageDimension3D >= 256); mobile GPUs
+# commonly support 16384. Used when the small_texture_limits option is set.
+SMALL_TEXTURE_LIMITS = (2048, 2048, 2048)
 DEFAULT_BUFFER_LIMIT = 128 * (1024 * 1024)
 
 all_storage_types: Set[VkStorageType] = {
diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py
index e9d5613668a..53a81d1772e 100644
--- a/backends/vulkan/vulkan_preprocess.py
+++ b/backends/vulkan/vulkan_preprocess.py
@@ -6,7 +6,6 @@
 
 # pyre-strict
 
-import copy
 from functools import partial
 from typing import Any, Callable, Dict, final, List
 
@@ -114,6 +113,12 @@ def parse_compile_spec(compile_specs: List[CompileSpec]) -> Dict[str, Any]:
         if spec.key == "force_fp16":
             options[spec.key] = bool.from_bytes(spec.value, byteorder="little")
 
+        if spec.key == "small_texture_limits":
+            options[spec.key] = bool.from_bytes(spec.value, byteorder="little")
+
+        if spec.key == "skip_memory_planning":
+            options[spec.key] = bool.from_bytes(spec.value, byteorder="little")
+
         # Unhandled options are ignored
 
     return options
@@ -130,16 +135,15 @@ def preprocess(  # noqa: C901
     ) -> PreprocessResult:
         compile_options = parse_compile_spec(module_compile_spec)
 
-        default_texture_limits = copy.deepcopy(utils.DEFAULT_TEXTURE_LIMITS)
         # 2048 is the typical limit value for 3D textures, but mobile GPUs often support
         # 16384. Since the Vulkan delegate primarily targets mobile GPUs at the moment,
-        # 16394 is the default texture limit used. This option is provided as a
-        # convenient way to switch to using a limit of 2048 for image textures which
-        # will be compatible with most GPUs.
+        # 16384 is the default texture limit used. The small_texture_limits option is
+        # provided as a convenient way to switch to a limit of 2048 for image textures,
+        # which will be compatible with most desktop/laptop GPUs.
         if compile_options.get("small_texture_limits", False):
-            default_texture_limits[0] = 2048
-            default_texture_limits[1] = 2048
-            default_texture_limits[2] = 2048
+            default_texture_limits = utils.SMALL_TEXTURE_LIMITS
+        else:
+            default_texture_limits = utils.DEFAULT_TEXTURE_LIMITS
 
         limits_x = compile_options.get("texture_limits_x", default_texture_limits[0])
         limits_y = compile_options.get("texture_limits_y", default_texture_limits[1])
diff --git a/setup.py b/setup.py
index 85228bd37ae..cd2bb5332cb 100644
--- a/setup.py
+++ b/setup.py
@@ -134,6 +134,7 @@ def _minimal_cmake_flags() -> List[str]:
         "-DEXECUTORCH_BUILD_PYBIND=OFF",
         "-DEXECUTORCH_BUILD_QNN=OFF",
         "-DEXECUTORCH_BUILD_TESTS=OFF",
+        "-DEXECUTORCH_BUILD_VULKAN=OFF",
         "-DEXECUTORCH_BUILD_XNNPACK=OFF",
     ]
 
@@ -962,6 +963,9 @@ def run(self):  # noqa C901
             if cmake_cache.is_enabled("EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER"):
                 cmake_build_args += ["--target", "_llm_runner"]
 
+            if cmake_cache.is_enabled("EXECUTORCH_BUILD_VULKAN"):
+                cmake_build_args += ["--target", "vulkan_backend"]
+
             if cmake_cache.is_enabled("EXECUTORCH_BUILD_CUDA"):
                 cmake_build_args += ["--target", "aoti_cuda_backend"]
                 cmake_build_args += ["--target", "aoti_common_shims_slim"]
diff --git a/tools/cmake/preset/pybind.cmake b/tools/cmake/preset/pybind.cmake
index ecce850ab3c..9a17f561785 100644
--- a/tools/cmake/preset/pybind.cmake
+++ b/tools/cmake/preset/pybind.cmake
@@ -97,3 +97,31 @@ else()
     FATAL_ERROR "Unsupported CMAKE_SYSTEM_NAME for pybind: ${CMAKE_SYSTEM_NAME}"
   )
 endif()
+
+# Opt-in Vulkan backend for Linux/Windows wheels. Enabled ONLY when the build
+# requests it via the EXECUTORCH_BUILD_VULKAN env var AND glslc (Vulkan SDK) is
+# available to compile the shaders. This keeps the default wheel (and
+# macOS/Android) byte-for-byte unchanged: GPU backends are opt-in rather than
+# bundled into the universal wheel. The env var is read as a CMake boolean, so
+# an empty value or any false constant (0/OFF/NO/FALSE/N, any case) opts out.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux"
+   OR CMAKE_SYSTEM_NAME STREQUAL "Windows"
+   OR CMAKE_SYSTEM_NAME STREQUAL "WIN32"
+)
+  set(_vulkan_env_request "$ENV{EXECUTORCH_BUILD_VULKAN}")
+  if(_vulkan_env_request)
+    find_program(
+      GLSLC_PATH glslc HINTS "$ENV{VULKAN_SDK}/bin" "$ENV{VULKAN_SDK}/Bin"
+    )
+    if(GLSLC_PATH)
+      set_overridable_option(EXECUTORCH_BUILD_VULKAN ON)
+      message(STATUS "Enabling Vulkan backend for wheel; glslc: ${GLSLC_PATH}")
+    else()
+      message(
+        STATUS "EXECUTORCH_BUILD_VULKAN requested but glslc was not found; "
+               "the Vulkan backend will not be included."
+      )
+    endif()
+  endif()
+  unset(_vulkan_env_request)
+endif()

From 391f139fc97693096bd1152970f69894963b48c9 Mon Sep 17 00:00:00 2001
From: Reuben Dunn <reubend@meta.com>
Date: Wed, 17 Jun 2026 19:10:40 -0700
Subject: [PATCH 2/2] fix cast issue for Vulkan allocation sizes

---
 backends/vulkan/runtime/api/containers/Tensor.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
index ef892206d48..15e2660078a 100644
--- a/backends/vulkan/runtime/api/containers/Tensor.cpp
+++ b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -687,8 +687,9 @@ vkapi::VulkanBuffer allocate_buffer(
   // otherwise read past the end of the buffer, which silently zeroes the value
   // on NVIDIA GPUs. This only grows the allocation; the tensor's
   // physical_numel() is unchanged.
-  const size_t alloc_nbytes =
-      utils::align_up(element_size(dtype) * numel, static_cast<size_t>(16));
+  const size_t alloc_nbytes = utils::align_up(
+      element_size(dtype) * static_cast<size_t>(numel),
+      static_cast<size_t>(16));
 
   // TODO: this check is incorrect. max_buffer_numel() returns
   // maxStorageBufferRange, which is a size in bytes, so the comparison should