From 365f5888777777d5086ebedd9ae87340cdac6b1a Mon Sep 17 00:00:00 2001 From: Reuben Dunn Date: Mon, 8 Jun 2026 18:46:24 -0700 Subject: [PATCH 1/2] Vulkan: support Linux/Windows desktop GPUs and opt-in wheel builds The Vulkan backend was developed for Android GPUs. This makes it build and run on Linux/Windows desktop discrete GPUs (NVIDIA/AMD/Intel) and adds opt-in pre-built Vulkan wheels, with no change to Android behavior (build divergence is behind compile-time guards; runtime changes key off queried capabilities). Covers build portability, discrete-GPU correctness fixes (real-GPU device/ICD selection, shaderInt16/Int64/Float64 enablement, a blit queue guard, and texel-rounded buffer allocations to avoid out-of-bounds vec4 reads), and EXECUTORCH_BUILD_VULKAN-gated CI/packaging (a new vulkan.yml runs the real-GPU NVIDIA and Windows MSVC jobs, plus opt-in wheel plumbing). Tested on an NVIDIA A100; the SwiftShader CI path is unchanged. This change was authored with Claude. --- .ci/scripts/setup-vulkan-linux-deps.sh | 125 ++++++++++++++- .ci/scripts/setup-vulkan-windows-deps.ps1 | 37 +++++ .ci/scripts/setup-windows-msvc-vulkan.ps1 | 51 ++++++ .ci/scripts/test_backend.sh | 11 +- .ci/scripts/wheel/pre_build_script.sh | 31 ++++ .ci/scripts/wheel/test_linux.py | 7 + .ci/scripts/wheel/test_windows.py | 11 ++ .github/workflows/test-backend-vulkan.yml | 5 + .github/workflows/vulkan.yml | 151 ++++++++++++++++++ backends/vulkan/CMakeLists.txt | 32 +++- backends/vulkan/cmake/ShaderLibrary.cmake | 34 ++-- .../vulkan/partitioner/vulkan_partitioner.py | 12 +- backends/vulkan/runtime/api/Context.cpp | 9 ++ .../vulkan/runtime/api/containers/Tensor.cpp | 15 +- backends/vulkan/runtime/gen_vulkan_spv.py | 9 +- .../runtime/graph/ops/glsl/coopmat_mm.yaml | 4 + backends/vulkan/runtime/vk_api/Adapter.cpp | 76 ++++----- backends/vulkan/runtime/vk_api/Runtime.cpp | 64 ++++++-- backends/vulkan/runtime/vk_api/Runtime.h | 4 +- .../vulkan/runtime/vk_api/memory/vma_api.h | 38 ++++- 
.../vulkan/test/custom_ops/build_and_run.sh | 2 +- .../test/test_vulkan_compile_options.py | 48 ++++++ backends/vulkan/utils.py | 4 + backends/vulkan/vulkan_preprocess.py | 20 ++- setup.py | 4 + tools/cmake/preset/pybind.cmake | 28 ++++ 26 files changed, 746 insertions(+), 86 deletions(-) create mode 100644 .ci/scripts/setup-vulkan-windows-deps.ps1 create mode 100644 .ci/scripts/setup-windows-msvc-vulkan.ps1 create mode 100644 .github/workflows/vulkan.yml create mode 100644 backends/vulkan/test/test_vulkan_compile_options.py diff --git a/.ci/scripts/setup-vulkan-linux-deps.sh b/.ci/scripts/setup-vulkan-linux-deps.sh index a0dcb75ad4a..debe610a18a 100755 --- a/.ci/scripts/setup-vulkan-linux-deps.sh +++ b/.ci/scripts/setup-vulkan-linux-deps.sh @@ -1,4 +1,3 @@ - #!/bin/bash # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. @@ -22,7 +21,7 @@ install_swiftshader() { tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}" export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json" - export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/" + export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH:-}" export ETVK_USING_SWIFTSHADER=1 } @@ -43,7 +42,125 @@ install_vulkan_sdk() { export PATH="${PATH}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/bin/" } +_maybe_sudo() { + if [ "$(id -u)" -eq 0 ]; then + "$@" + else + sudo "$@" + fi +} + +install_glslc() { + # The glslc shipped in the LunarG SDK is dynamically linked against a newer + # glibc/libstdc++ than the manylinux_2_28 / AlmaLinux 8 CUDA runner image + # provides (glibc 2.28), where it fails to load with "GLIBC_2.29 not found". + # conda-forge's shaderc is built against an old sysroot, runs there, and is + # recent enough for the GL_EXT_integer_dot_product / GL_KHR_cooperative_matrix + # extensions the Vulkan shaders use. 
Install it into an isolated prefix so the + # base conda env that builds ExecuTorch is left untouched, then put it on PATH. + _glslc_prefix=/tmp/shaderc + conda create -y -p "${_glslc_prefix}" -c conda-forge shaderc + export PATH="${_glslc_prefix}/bin:${PATH}" +} + +install_vulkan_loader() { + # libvulkan.so.1 (the Khronos loader that volk dlopen()s at runtime) is not part + # of the NVIDIA driver and is absent from the CUDA builder image; vulkan-tools + # provides vulkaninfo for the device sanity check. Both ship as native el8 RPMs. + if command -v dnf >/dev/null 2>&1; then + _maybe_sudo dnf install -y vulkan-loader vulkan-tools + fi +} + +_find_nvidia_vulkan_library() { + # NVIDIA implements its Vulkan ICD inside libGLX_nvidia.so.0. The NVIDIA + # container runtime mounts this library into the container (it is pulled from + # the driver's ldcache when NVIDIA_DRIVER_CAPABILITIES includes graphics/all), + # so prefer ldconfig and fall back to the usual mount locations. + local lib cand + lib="$(ldconfig -p 2>/dev/null | awk '/libGLX_nvidia\.so\.0/ {print $NF; exit}')" + if [ -z "${lib}" ]; then + for cand in /usr/lib64/libGLX_nvidia.so.0 \ + /usr/lib/x86_64-linux-gnu/libGLX_nvidia.so.0 \ + /usr/lib/libGLX_nvidia.so.0; do + if [ -e "${cand}" ]; then + lib="${cand}" + break + fi + done + fi + printf '%s' "${lib}" +} + +_vulkan_has_real_device() { + # True if the loader enumerates a hardware GPU. vulkaninfo can exit non-zero + # for unrelated reasons (no display/WSI), so key off the reported deviceType. + command -v vulkaninfo >/dev/null 2>&1 || return 0 + vulkaninfo --summary 2>/dev/null | + grep -qE 'PHYSICAL_DEVICE_TYPE_(DISCRETE|INTEGRATED|VIRTUAL)_GPU' +} + +setup_real_gpu_icd() { + # Select a Vulkan ICD so the runtime exercises the real GPU when one is usable. + # Two quirks of the CUDA CI image make this non-trivial: + # 1. 
The NVIDIA container runtime mounts the driver's Vulkan library but does + # not register its ICD manifest, so the loader never discovers the GPU on + # its own. We synthesize the manifest and pin the loader to it. + # 2. Installing vulkan-loader/vulkan-tools pulls in mesa-vulkan-drivers, + # which drop Intel/AMD/lavapipe manifests for absent hardware. lavapipe + # fails vkCreateInstance on this image and, because the loader walks every + # manifest in icd.d, that poisons device enumeration for the whole + # process. Pinning VK_ICD_FILENAMES makes the loader ignore icd.d, so the + # broken stubs cannot interfere. + local nvidia_lib + nvidia_lib="$(_find_nvidia_vulkan_library)" + if [ -n "${nvidia_lib}" ]; then + local icd=/tmp/nvidia_icd.json + cat >"${icd}" </dev/null 2>&1; then + echo "--- NVIDIA Vulkan ICD diagnostic ---" + VK_LOADER_DEBUG=warn vulkaninfo --summary 2>&1 | head -40 || true + echo "--- end diagnostic ---" + fi + unset VK_ICD_FILENAMES + else + echo "WARNING: no NVIDIA Vulkan driver library found; using SwiftShader." + fi + install_swiftshader +} + VULKAN_SDK_VERSION="1.4.321.1" -install_swiftshader -install_vulkan_sdk "${VULKAN_SDK_VERSION}" +# The no-argument default installs SwiftShader so the existing CPU-runner CI is +# unchanged. Pass "real-gpu" to prefer a real system ICD when one is present. +case "${1:-swiftshader}" in + real-gpu) + # Do not download the LunarG SDK here: its prebuilt glslc cannot run on the + # old-glibc CUDA image. glslc comes from conda-forge and the loader from the + # system package manager instead. 
+ install_vulkan_loader + install_glslc + setup_real_gpu_icd + ;; + swiftshader | *) + install_swiftshader + install_vulkan_sdk "${VULKAN_SDK_VERSION}" + ;; +esac diff --git a/.ci/scripts/setup-vulkan-windows-deps.ps1 b/.ci/scripts/setup-vulkan-windows-deps.ps1 new file mode 100644 index 00000000000..335f457714f --- /dev/null +++ b/.ci/scripts/setup-vulkan-windows-deps.ps1 @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Install glslc (the Vulkan shader compiler) on Windows via conda-forge's +# shaderc package, and make sure it is on PATH. glslc is the only build-time +# Vulkan dependency -- the Vulkan headers and the volk loader come from the +# in-tree submodules -- so this avoids depending on the heavyweight LunarG SDK +# installer. Requires conda to be available (the callers create/activate an env). + +$ErrorActionPreference = "Stop" + +Write-Host "Installing shaderc (provides glslc) from conda-forge..." +conda install -y -c conda-forge shaderc +if ($LASTEXITCODE -ne 0) { + Write-Error "Failed to install shaderc from conda-forge (exit ${LASTEXITCODE})" + exit 1 +} + +$glslc = Get-Command glslc -ErrorAction SilentlyContinue +if (-not $glslc) { + Write-Error "glslc not found on PATH after installing shaderc" + exit 1 +} + +# Expose glslc to the current process and, when running as a GitHub Actions step, +# to subsequent steps. 
+$glslcDir = Split-Path -Parent $glslc.Source +$env:PATH = "${glslcDir};${env:PATH}" +if ($env:GITHUB_PATH) { + Add-Content -Path $env:GITHUB_PATH -Value $glslcDir +} + +Write-Host "glslc available at $($glslc.Source)" +& glslc --version diff --git a/.ci/scripts/setup-windows-msvc-vulkan.ps1 b/.ci/scripts/setup-windows-msvc-vulkan.ps1 new file mode 100644 index 00000000000..7fa2006e83f --- /dev/null +++ b/.ci/scripts/setup-windows-msvc-vulkan.ps1 @@ -0,0 +1,51 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Build-validation for the Vulkan backend under MSVC on Windows. Mirrors +# setup-windows-msvc.ps1 but installs glslc (the Vulkan shader compiler) and +# configures/builds the vulkan_backend target. This is a bring-up job: it exists +# to surface MSVC portability issues in the Vulkan/volk/VMA code, so it may need +# iteration. + +conda create --yes --quiet -n et python=3.12 +conda activate et + +# Install cmake +conda install -y cmake + +# Activate the VS environment - this is required for MSVC to work. +& "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64 + +# Install glslc (via conda-forge shaderc) and put it on PATH in this process. +.ci/scripts/setup-vulkan-windows-deps.ps1 + +# Install CI requirements +pip install -r .ci/docker/requirements-ci.txt + +$buildDir = "cmake-out-vulkan" +if (Test-Path -Path $buildDir) { + Remove-Item -Path $buildDir -Recurse -Force +} +New-Item -Path $buildDir -ItemType Directory + +cmake -S . -B $buildDir ` + -DCMAKE_BUILD_TYPE=Release ` + -DEXECUTORCH_BUILD_VULKAN=ON ` + -DPYTHON_EXECUTABLE=python + +if ($LASTEXITCODE -ne 0) { + Write-Host "CMake configuration failed. Exit code: $LASTEXITCODE." 
+ exit $LASTEXITCODE +} + +cmake --build $buildDir --config Release --target vulkan_backend -j16 + +if ($LASTEXITCODE -ne 0) { + Write-Host "Vulkan backend MSVC build failed. Exit code: $LASTEXITCODE." + exit $LASTEXITCODE +} + +Write-Host "Vulkan backend MSVC build completed successfully!" diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh index fe9b564a18f..80352fe1393 100755 --- a/.ci/scripts/test_backend.sh +++ b/.ci/scripts/test_backend.sh @@ -51,8 +51,15 @@ if [[ "$FLOW" == *qnn* ]]; then fi if [[ "$FLOW" == *vulkan* ]]; then - # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate. - source .ci/scripts/setup-vulkan-linux-deps.sh + # Setup Vulkan deps and select an ICD: use the real system GPU ICD when either + # standard ICD directory holds a manifest, otherwise fall back to SwiftShader. + # Each glob is tested separately: one `ls` fails if either glob is unmatched. + if compgen -G '/etc/vulkan/icd.d/*.json' >/dev/null || + compgen -G '/usr/share/vulkan/icd.d/*.json' >/dev/null; then + source .ci/scripts/setup-vulkan-linux-deps.sh "real-gpu" + else + source .ci/scripts/setup-vulkan-linux-deps.sh "swiftshader" + fi EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON" fi diff --git a/.ci/scripts/wheel/pre_build_script.sh b/.ci/scripts/wheel/pre_build_script.sh index 365398d27a4..2c10e75fafd 100755 --- a/.ci/scripts/wheel/pre_build_script.sh +++ b/.ci/scripts/wheel/pre_build_script.sh @@ -69,3 +69,34 @@ if [[ "$(uname -s)" == "Linux" && "$(uname -m)" == "x86_64" ]]; then echo "QNN_SDK_ROOT=${QNN_SDK_ROOT}" >> "${GITHUB_ENV}" echo "QNN SDK downloaded to ${QNN_SDK_ROOT}" fi + +# Provision the Vulkan SDK (glslc) and submodules ONLY when explicitly requested +# via EXECUTORCH_BUILD_VULKAN. The default wheel build leaves this unset, so it +# does no extra work (no submodule fetch, no SDK download) and is unaffected.
+if [[ "${EXECUTORCH_BUILD_VULKAN:-0}" != "0" \ + && "${EXECUTORCH_BUILD_VULKAN:-OFF}" != "OFF" ]]; then + echo "Initializing Vulkan backend third-party submodules..." + VULKAN_SUBMODULES=( + backends/vulkan/third-party/Vulkan-Headers + backends/vulkan/third-party/volk + backends/vulkan/third-party/VulkanMemoryAllocator + ) + if [[ $UNAME_S == *"MINGW"* || $UNAME_S == *"MSYS"* ]]; then + git -c http.sslBackend=openssl submodule update --init "${VULKAN_SUBMODULES[@]}" + echo "Installing Vulkan SDK for Windows wheel build..." + powershell -ExecutionPolicy Bypass -File .ci/scripts/setup-vulkan-windows-deps.ps1 + else + git submodule update --init "${VULKAN_SUBMODULES[@]}" + # Install glslc from conda-forge rather than the LunarG SDK: the manylinux + # wheel image uses an old glibc where the SDK's prebuilt glslc cannot run + # ("GLIBC_2.29 not found"). conda-forge's shaderc is built against an old + # sysroot and runs there. Vulkan headers come from the submodules above and + # volk dlopen()s the loader at runtime, so only glslc is needed to build. + echo "Installing glslc (conda-forge shaderc) for Linux wheel build..." + _glslc_prefix="${HOME}/.shaderc" + conda create -y -p "${_glslc_prefix}" -c conda-forge shaderc + export PATH="${_glslc_prefix}/bin:${PATH}" + echo "${_glslc_prefix}/bin" >> "${GITHUB_PATH}" + echo "glslc installed: $(command -v glslc)" + fi +fi diff --git a/.ci/scripts/wheel/test_linux.py b/.ci/scripts/wheel/test_linux.py index c441bcec91f..7545b4c6f20 100644 --- a/.ci/scripts/wheel/test_linux.py +++ b/.ci/scripts/wheel/test_linux.py @@ -31,6 +31,13 @@ ), f"OpenvinoBackend not found in registered backends: {registered}" print("✓ OpenvinoBackend is registered") + # Vulkan backend is optional: only present when the wheel was built with + # EXECUTORCH_BUILD_VULKAN=1 and the Vulkan SDK (glslc) was available. 
+ if "VulkanBackend" in registered: + print("✓ VulkanBackend is registered") + else: + print("⚠ VulkanBackend not registered (expected for the default wheel)") + test_base.run_tests( model_tests=[ test_base.ModelTest( diff --git a/.ci/scripts/wheel/test_windows.py b/.ci/scripts/wheel/test_windows.py index d2d8b29a534..ba141d4498c 100644 --- a/.ci/scripts/wheel/test_windows.py +++ b/.ci/scripts/wheel/test_windows.py @@ -5,6 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import platform from typing import List import torch @@ -15,6 +16,7 @@ from executorch.examples.xnnpack.quantization.utils import quantize as quantize_xnn from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower from executorch.extension.pybindings.portable_lib import ( + _get_registered_backend_names, _load_for_executorch_from_buffer, ) from test_base import ModelTest @@ -63,6 +65,15 @@ def run_tests(model_tests: List[ModelTest]) -> None: if __name__ == "__main__": + if platform.system() == "Windows": + registered = _get_registered_backend_names() + # Vulkan backend is optional: only present when the wheel was built with + # EXECUTORCH_BUILD_VULKAN=1 and the Vulkan SDK (glslc) was available. + if "VulkanBackend" in registered: + print("✓ VulkanBackend is registered") + else: + print("⚠ VulkanBackend not registered (expected for the default wheel)") + run_tests( model_tests=[ ModelTest( diff --git a/.github/workflows/test-backend-vulkan.yml b/.github/workflows/test-backend-vulkan.yml index 0461527b073..5745adbd38b 100644 --- a/.github/workflows/test-backend-vulkan.yml +++ b/.github/workflows/test-backend-vulkan.yml @@ -17,6 +17,8 @@ concurrency: cancel-in-progress: true jobs: + # Default coverage: builds + runs on SwiftShader (software Vulkan) on CPU + # runners. Runs on every PR and nightly. 
test-vulkan: uses: ./.github/workflows/_test_backend.yml with: @@ -28,3 +30,6 @@ jobs: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: 120 run-linux: true + + # Real-GPU (NVIDIA) and Windows MSVC coverage live in vulkan.yml, which gates + # those scarce/expensive runners behind path filtering. diff --git a/.github/workflows/vulkan.yml b/.github/workflows/vulkan.yml new file mode 100644 index 00000000000..276868836b9 --- /dev/null +++ b/.github/workflows/vulkan.yml @@ -0,0 +1,151 @@ +name: Test Vulkan Backend (specialized runners) + +# Vulkan CI jobs that require special runners (an NVIDIA GPU, or a Windows +# MSVC toolchain). These are separate from test-backend-vulkan.yml (which runs the +# default SwiftShader coverage on standard runners) so that they're only run +# when Vulkan related files change. + +on: + push: + branches: + - main + - release/* + tags: + - ciflow/nightly/* + pull_request: + paths: + - .github/workflows/vulkan.yml + - backends/vulkan/** + - examples/vulkan/** + - .ci/scripts/setup-vulkan-linux-deps.sh + - .ci/scripts/setup-vulkan-windows-deps.ps1 + - .ci/scripts/setup-windows-msvc-vulkan.ps1 + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + changed-files: + name: Get changed files + uses: ./.github/workflows/_get-changed-files.yml + with: + include-push-diff: true # so push commits can also be path-filtered + + run-decision: + name: CI run decision + uses: ./.github/workflows/_ci-run-decision.yml + + test-vulkan-nvidia: + needs: [changed-files, run-decision] + # Path-filtered: skip commits that don't touch Vulkan-relevant paths, except + # on sampled full runs (see _ci-run-decision.yml). 
+ if: | + contains(needs.changed-files.outputs.changed-files, 'backends/vulkan/') || + contains(needs.changed-files.outputs.changed-files, 'examples/vulkan/') || + contains(needs.changed-files.outputs.changed-files, '.ci/scripts/setup-vulkan-linux-deps.sh') || + contains(needs.changed-files.outputs.changed-files, '.github/workflows/vulkan.yml') || + needs.run-decision.outputs.is-full-run == 'true' + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + timeout: 120 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "12.6" + use-custom-docker-registry: false + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + # Install a glibc-compatible glslc (conda-forge), the Vulkan loader and + # tools, and select a real system ICD. The CUDA runner image is + # manylinux_2_28 (glibc 2.28), where the LunarG SDK's prebuilt glslc does + # not run, so setup-vulkan-linux-deps.sh sources those from conda-forge and + # the system package manager instead. The NVIDIA container runtime mounts + # the driver's Vulkan library but not its ICD manifest, so the script + # synthesizes one and pins the loader to it; if no NVIDIA library is found + # it falls back to SwiftShader. + # NOTE: first-run check - inspect the vulkaninfo output below to confirm a + # real NVIDIA device is selected (not llvmpipe/SwiftShader). + source .ci/scripts/setup-vulkan-linux-deps.sh real-gpu + vulkaninfo --summary || true + + # Full from-source install. Unlike the SwiftShader jobs in pull.yml, the + # CUDA runner image does not pre-install ExecuTorch's dependencies, so + # setup-linux.sh's "deps already in the image" assumption does not hold. + # CMAKE_ARGS enables Vulkan in the pybindings so the model --test runs and + # the pt2e/torchao e2e tests below execute on the GPU (default is OFF). 
+ CMAKE_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON" PYTHON_EXECUTABLE=python ./install_executorch.sh + + # Model coverage (mirrors test-vulkan-models-linux, on real hardware). + PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_model.sh --build + + models="mv2 mv3 edsr resnet18 resnet50 dl3 w2l ic3 ic4" + for model in $models; do + python -m examples.vulkan.export --model_name=$model --test + done + + # For selected vision models, test with dynamic shapes + models="mv2 resnet18 resnet50 ic3 densenet161" + for model in $models; do + python -m examples.vulkan.export --model_name=$model --test -d + done + + # Operator coverage (mirrors test-vulkan-operators-linux, on real hardware). + # The custom-op prototyping binaries are GPU microbenchmarks that rely on + # GPU timestamp queries; they need a real device and crash on the + # SwiftShader software fallback. Always build them (compile coverage), but + # only run them when a real GPU was selected (setup-vulkan-linux-deps.sh + # exports ETVK_USING_SWIFTSHADER when it falls back to SwiftShader). + PYTHON_EXECUTABLE=python bash backends/vulkan/test/custom_ops/build_and_run.sh + if [ -z "${ETVK_USING_SWIFTSHADER:-}" ]; then + ./cmake-out/backends/vulkan/test/custom_ops/test_add + ./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_linear + ./cmake-out/backends/vulkan/test/custom_ops/test_q8csw_conv2d + ./cmake-out/backends/vulkan/test/custom_ops/test_q4gsw_linear + ./cmake-out/backends/vulkan/test/custom_ops/test_choose_qparams_per_row + ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_qdq + ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_clone + ./cmake-out/backends/vulkan/test/custom_ops/test_q8ta_binary + else + echo "SwiftShader fallback active: built custom-op benchmarks but skipping execution (they require real-GPU timestamp queries)." + fi + + PYTHON_EXECUTABLE=python bash backends/vulkan/test/scripts/test_op.sh --build + + # Run e2e testing for selected operators. 
+ python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*" + python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*torchao*" + + build-vulkan-windows-msvc: + needs: [changed-files, run-decision] + if: | + contains(needs.changed-files.outputs.changed-files, 'backends/vulkan/') || + contains(needs.changed-files.outputs.changed-files, '.ci/scripts/setup-vulkan-windows-deps.ps1') || + contains(needs.changed-files.outputs.changed-files, '.ci/scripts/setup-windows-msvc-vulkan.ps1') || + contains(needs.changed-files.outputs.changed-files, '.github/workflows/vulkan.yml') || + needs.run-decision.outputs.is-full-run == 'true' + name: build-vulkan-windows-msvc + uses: pytorch/test-infra/.github/workflows/windows_job.yml@main + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 + script: | + git config --global http.sslBackend openssl + git submodule update --init backends/vulkan/third-party/Vulkan-Headers backends/vulkan/third-party/volk backends/vulkan/third-party/VulkanMemoryAllocator + git submodule update --init + conda init powershell + powershell -Command "& { + Set-PSDebug -Trace 1 + \$ErrorActionPreference = 'Stop' + \$PSNativeCommandUseErrorActionPreference = \$true + .ci/scripts/setup-windows-msvc-vulkan.ps1 + }" diff --git a/backends/vulkan/CMakeLists.txt b/backends/vulkan/CMakeLists.txt index d9acde79ecf..5ee7fc03ef8 100644 --- a/backends/vulkan/CMakeLists.txt +++ b/backends/vulkan/CMakeLists.txt @@ -41,6 +41,24 @@ set(VULKAN_HEADERS_PATH ${VULKAN_THIRD_PARTY_PATH}/Vulkan-Headers) set(VOLK_PATH ${VULKAN_THIRD_PARTY_PATH}/volk) set(VMA_PATH ${VULKAN_THIRD_PARTY_PATH}/VulkanMemoryAllocator) +# These third-party dependencies are git submodules. They are not part of the +# default submodule set checked out by install_executorch.py, so fail early with +# an actionable message rather than a confusing missing-header error. 
+if(NOT EXISTS "${VOLK_PATH}/volk.c" + OR NOT EXISTS "${VULKAN_HEADERS_PATH}/include/vulkan/vulkan.h" + OR NOT EXISTS "${VMA_PATH}/include/vk_mem_alloc.h" +) + message( + FATAL_ERROR + "The Vulkan backend third-party submodules are missing. " + "Run the following from the repository root:\n" + " git submodule update --init " + "backends/vulkan/third-party/Vulkan-Headers " + "backends/vulkan/third-party/volk " + "backends/vulkan/third-party/VulkanMemoryAllocator" + ) +endif() + set(COMMON_INCLUDES $ $ @@ -49,7 +67,11 @@ set(COMMON_INCLUDES # Compile settings -set(VULKAN_CXX_FLAGS "-fexceptions") +# Exceptions are required: the vk_api layer throws on Vulkan errors (see +# vk_api/Exception.h). MSVC does not understand -fexceptions and enables C++ +# exceptions via /EHsc instead, so select the flag per compiler. +set(VULKAN_CXX_FLAGS "$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions>") +list(APPEND VULKAN_CXX_FLAGS "$<$<CXX_COMPILER_ID:MSVC>:/EHsc>") list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_WRAPPER") list(APPEND VULKAN_CXX_FLAGS "-DUSE_VULKAN_VOLK") @@ -123,7 +145,13 @@ add_library(vulkan_backend ${vulkan_backend_cpp}) target_include_directories( vulkan_backend PRIVATE ${SCHEMA_INCLUDE_DIR} ${COMMON_INCLUDES} ) -target_link_libraries(vulkan_backend PRIVATE vulkan_schema executorch_core) +# volk (bundled into this library) calls dlopen/dlsym/dlclose to load libvulkan +# at runtime. On glibc < 2.34 those live in libdl, so consumers linking an +# executable against libvulkan_backend.a need -ldl; CMAKE_DL_LIBS provides it +# (and is harmless where dlopen is already in libc). Matches the other backends.
+target_link_libraries( + vulkan_backend PRIVATE vulkan_schema executorch_core ${CMAKE_DL_LIBS} +) # Optionally link boost for stacktraces if boost is available if(DEFINED Boost_STACKTRACE_BASIC_LIBRARY) target_link_libraries( diff --git a/backends/vulkan/cmake/ShaderLibrary.cmake b/backends/vulkan/cmake/ShaderLibrary.cmake index e2045cbf7da..0fb99757b0c 100644 --- a/backends/vulkan/cmake/ShaderLibrary.cmake +++ b/backends/vulkan/cmake/ShaderLibrary.cmake @@ -24,17 +24,33 @@ if(NOT EXECUTORCH_ROOT) message("WARNING: EXECUTORCH_ROOT is not set! A failure is likely imminent.") endif() -find_program(GLSLC_PATH glslc PATHS $ENV{PATH}) +# find_program already searches the PATH environment variable and appends the +# platform executable suffix (.exe on Windows). Add the Vulkan SDK bin dir as a +# hint so glslc is found on Windows even when only VULKAN_SDK is set. +find_program(GLSLC_PATH glslc HINTS $ENV{VULKAN_SDK}/bin $ENV{VULKAN_SDK}/Bin) if(NOT GLSLC_PATH AND EXECUTORCH_BUILD_VULKAN) - message( - FATAL_ERROR - "glslc from the Vulkan SDK must be installed to build the Vulkan backend. " - "Please install the Vulkan SDK 1.4.341.1 or newer from " - "https://vulkan.lunarg.com/sdk/home and ensure that the glslc binary is in your PATH. " - "Note that the glslc distributed with the Android NDK is not compatible since it " - "does not support the GL_EXT_integer_dot_product extension. " - ) + if(EXECUTORCH_BUILD_WHEEL_DO_NOT_USE) + # In a wheel/pybind build, degrade gracefully so the wheel can still be + # produced without the Vulkan backend rather than failing the whole build. + message( + STATUS + "glslc not found; the Vulkan backend will not be included in the wheel." + ) + set(EXECUTORCH_BUILD_VULKAN + OFF + CACHE BOOL "" FORCE + ) + else() + message( + FATAL_ERROR + "glslc from the Vulkan SDK must be installed to build the Vulkan backend. 
" + "Please install the Vulkan SDK 1.4.341.1 or newer from " + "https://vulkan.lunarg.com/sdk/home and ensure that the glslc binary is in your PATH. " + "Note that the glslc distributed with the Android NDK is not compatible since it " + "does not support the GL_EXT_integer_dot_product extension. " + ) + endif() endif() # Required to enable linking with --whole-archive diff --git a/backends/vulkan/partitioner/vulkan_partitioner.py b/backends/vulkan/partitioner/vulkan_partitioner.py index 60b4c3346f3..fb51a0edfad 100644 --- a/backends/vulkan/partitioner/vulkan_partitioner.py +++ b/backends/vulkan/partitioner/vulkan_partitioner.py @@ -378,9 +378,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: exported_program.graph_module ) - texture_limits: utils.ImageExtents = self.options.get( - "texture_limits", utils.DEFAULT_TEXTURE_LIMITS - ) + # small_texture_limits opts into the conservative 3D texture limit that is + # compatible with most desktop/laptop GPUs (the Vulkan spec only guarantees + # 2048). An explicit texture_limits always takes precedence. 
+ if "texture_limits" in self.options: + texture_limits: utils.ImageExtents = self.options["texture_limits"] + elif self.options.get("small_texture_limits", False): + texture_limits = utils.SMALL_TEXTURE_LIMITS + else: + texture_limits = utils.DEFAULT_TEXTURE_LIMITS buffer_limit: int = self.options.get("buffer_limit", utils.DEFAULT_BUFFER_LIMIT) capability_partitioner = CapabilityBasedPartitioner( exported_program.graph_module, diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index d090a62f370..9d0694fa70f 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -212,6 +212,15 @@ void Context::register_blit( vkapi::PipelineBarrier& pipeline_barrier, vkapi::VulkanImage& src, vkapi::VulkanImage& dst) { + // vkCmdBlitImage requires a queue with graphics capability; transfer- or + // compute-only queues cannot perform blits. The queue is selected by compute + // capability only, so on desktop GPUs that expose compute-only queue families + // this could otherwise be invalid usage. On mobile the single universal queue + // always has this bit set. + VK_CHECK_COND( + queue_.capabilities & VK_QUEUE_GRAPHICS_BIT, + "The Vulkan queue selected for compute does not support blit operations " + "(VK_QUEUE_GRAPHICS_BIT is not set)."); cmd_.insert_barrier(pipeline_barrier); cmd_.blit(src, dst); } diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index 47cefa1031a..ef892206d48 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -680,10 +680,23 @@ vkapi::VulkanBuffer allocate_buffer( return vkapi::VulkanBuffer(); } + // Round the underlying allocation up to a whole number of 16-byte texels. + // Shaders may read a buffer-backed tensor as vec4/ivec4 (e.g. 
the + // per-output-channel weight scales/sums/bias in the tiled quantized kernels); + // when a dimension is not a multiple of 4, the final vec4 load would + // otherwise read past the end of the buffer, which silently zeroes the value + // on NVIDIA GPUs. This only grows the allocation; the tensor's + // physical_numel() is unchanged. + const size_t alloc_nbytes = + utils::align_up(element_size(dtype) * numel, static_cast<size_t>(16)); + + // TODO: this check is incorrect. max_buffer_numel() returns + // maxStorageBufferRange, which is a size in bytes, so the comparison should + // use the buffer's byte size (alloc_nbytes), not the element count. VK_CHECK_COND(numel <= context_ptr->adapter_ptr()->max_buffer_numel()); return adapter_ptr->vma().create_storage_buffer( - element_size(dtype) * numel, allocate_memory); + alloc_nbytes, allocate_memory); } vTensorStorage::vTensorStorage( diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index 93d6f9e41aa..69c87563bbd 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -1123,6 +1123,7 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]: # Construct name of SPIR-V file to be compiled spv_out_path = os.path.join(output_dir, f"{src_file_name}.spv") + cached_spv_out_path = None if cache_dir is not None: # Construct the file names of cached SPIR-V file to check if they exist # in the cache.
@@ -1160,7 +1161,9 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]: subprocess.run(cmd_no_opt, check=True, capture_output=True) except subprocess.CalledProcessError as e_no_opt: # Delete any existing cached SPIR-V file if it exists - if os.path.exists(cached_spv_out_path): + if cached_spv_out_path is not None and os.path.exists( + cached_spv_out_path + ): os.remove(cached_spv_out_path) raise RuntimeError( @@ -1169,7 +1172,9 @@ def compile_spirv(shader_paths_pair) -> Tuple[str, str]: else: # Delete any existing cached SPIR-V file if it exists - if os.path.exists(cached_spv_out_path): + if cached_spv_out_path is not None and os.path.exists( + cached_spv_out_path + ): os.remove(cached_spv_out_path) raise RuntimeError(f"{err_msg_base} {e.stderr}") from e diff --git a/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml b/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml index bd5c2377cf6..05b26adfb24 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml +++ b/backends/vulkan/runtime/graph/ops/glsl/coopmat_mm.yaml @@ -12,6 +12,10 @@ coopmat_mm: parameter_names_with_default_values: + # GL_KHR_cooperative_matrix requires SPIR-V 1.6, so target Vulkan 1.3 when + # compiling this shader (the default target-env of 1.1 is too low). Other + # shaders are unaffected and keep the default. + VK_VERSION: '1.3' DTYPE: float PRECISION: highp WEIGHT_LAYOUT: row_major diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp index b762c95205b..3d9acae8975 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ b/backends/vulkan/runtime/vk_api/Adapter.cpp @@ -140,6 +140,20 @@ VkDevice create_logical_device( enabled_device_extensions, requested_device_extensions); + // Enable the base device features that ExecuTorch shaders rely on, but only + // those that the physical device reports as supported. With pEnabledFeatures + // left null, all base features are disabled; using a shader that performs + // e.g. 
int16 arithmetic without enabling shaderInt16 is invalid usage and + // crashes on drivers that enforce it. Unsupported features stay VK_FALSE, so + // this is a no-op on devices that lack them. + VkPhysicalDeviceFeatures enabled_features{}; + enabled_features.shaderInt16 = + physical_device.supports_int16_shader_types ? VK_TRUE : VK_FALSE; + enabled_features.shaderInt64 = + physical_device.supports_int64_shader_types ? VK_TRUE : VK_FALSE; + enabled_features.shaderFloat64 = + physical_device.supports_float64_shader_types ? VK_TRUE : VK_FALSE; + VkDeviceCreateInfo device_create_info{ VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, // sType nullptr, // pNext @@ -151,7 +165,7 @@ VkDevice create_logical_device( static_cast( enabled_device_extensions.size()), // enabledExtensionCount enabled_device_extensions.data(), // ppEnabledExtensionNames - nullptr, // pEnabledFeatures + &enabled_features, // pEnabledFeatures }; void* extension_list_top = nullptr; @@ -234,41 +248,31 @@ VkDevice create_logical_device( bool test_linear_tiling_3d_image_support( VkDevice device, VkPhysicalDevice physical_device) { - // Test creating a 3D image with linear tiling to see if it is supported. - // According to the Vulkan spec, linear tiling may not be supported for 3D - // images. 
- VkExtent3D image_extents{1u, 1u, 1u}; - const VkImageCreateInfo image_create_info{ - VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, // sType - nullptr, // pNext - 0u, // flags - VK_IMAGE_TYPE_3D, // imageType - VK_FORMAT_R32G32B32A32_SFLOAT, // format - image_extents, // extents - 1u, // mipLevels - 1u, // arrayLayers - VK_SAMPLE_COUNT_1_BIT, // samples - VK_IMAGE_TILING_LINEAR, // tiling - VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, // usage - VK_SHARING_MODE_EXCLUSIVE, // sharingMode - 0u, // queueFamilyIndexCount - nullptr, // pQueueFamilyIndices - VK_IMAGE_LAYOUT_UNDEFINED, // initialLayout - }; - VkImage image = VK_NULL_HANDLE; - VkResult res = vkCreateImage(device, &image_create_info, nullptr, &image); - - if (res == VK_SUCCESS) { - vkDestroyImage(device, image, nullptr); - - VkFormatProperties props; - vkGetPhysicalDeviceFormatProperties( - physical_device, VK_FORMAT_R32G32B32A32_SFLOAT, &props); - - return props.linearTilingFeatures & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT; - } - - return false; + (void)device; + // ExecuTorch allocates 3D image tensors that are used as both sampled and + // storage images, with FP32 (VK_FORMAT_R32G32B32A32_SFLOAT) being the most + // demanding format. Linear tiling may only be used if the physical device + // supports creating such images; per the Vulkan spec, linear tiling support + // for 3D images is optional. + // + // vkGetPhysicalDeviceImageFormatProperties is the authoritative query for + // this exact (format, type, tiling, usage) combination. A vkCreateImage probe + // is unreliable: some drivers (e.g. NVIDIA) accept a trivial 1x1x1 linear 3D + // image even though larger linear 3D storage images of the same format are + // unsupported, and checking only the SAMPLED format feature misses that the + // STORAGE usage is unsupported -- both lead to VK_ERROR_FORMAT_NOT_SUPPORTED + // when allocating real tensors. 
+ VkImageFormatProperties format_props; + const VkResult res = vkGetPhysicalDeviceImageFormatProperties( + physical_device, + VK_FORMAT_R32G32B32A32_SFLOAT, + VK_IMAGE_TYPE_3D, + VK_IMAGE_TILING_LINEAR, + VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT, + 0u, + &format_props); + + return res == VK_SUCCESS; } } // namespace diff --git a/backends/vulkan/runtime/vk_api/Runtime.cpp b/backends/vulkan/runtime/vk_api/Runtime.cpp index 3d3a146d80d..d7e101d0865 100644 --- a/backends/vulkan/runtime/vk_api/Runtime.cpp +++ b/backends/vulkan/runtime/vk_api/Runtime.cpp @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -239,19 +240,64 @@ VkDebugReportCallbackEXT create_debug_report_callback( // Adapter selection methods // -uint32_t select_first(const std::vector& devices) { +// Ranks compute-capable devices so that a real GPU is preferred over a software +// rasterizer (e.g. SwiftShader/lavapipe, which report as CPU). On a single-GPU +// system (e.g. mobile) there is only one candidate, so the choice is unchanged. +int compute_device_priority(const PhysicalDevice& device) { + if (device.num_compute_queues == 0) { + return -1; // not compute-capable, never select + } + switch (device.properties.deviceType) { + case VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU: + return 5; + case VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU: + return 4; + case VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU: + return 3; + case VK_PHYSICAL_DEVICE_TYPE_CPU: + return 1; + default: + return 2; + } +} + +uint32_t select_compute_device( + const std::vector& devices) { + const uint32_t invalid = + devices.size() + 1; // out of range signals invalidity if (devices.empty()) { - return devices.size() + 1; // return out of range to signal invalidity + return invalid; + } + + // Allow overriding device selection via the ETVK_DEVICE_INDEX environment + // variable, which is useful on multi-GPU desktop systems. Invalid values fall + // through to automatic selection below. 
+ const char* device_index_env = std::getenv("ETVK_DEVICE_INDEX"); + if (device_index_env != nullptr) { + char* end = nullptr; + const long idx = std::strtol(device_index_env, &end, 10); + // strtol always sets `end`; the explicit null-check makes the `*end` + // dereference safe for the static analyzer. + if (end != nullptr && end != device_index_env && *end == '\0' && idx >= 0 && + static_cast(idx) < devices.size() && + devices[static_cast(idx)].first.num_compute_queues > 0) { + return static_cast(idx); + } } - // Select the first adapter that has compute capability + // Otherwise pick the highest-priority compute-capable device, preferring the + // first one on ties (preserving the previous first-match behavior). + uint32_t best_i = invalid; + int best_priority = -1; for (size_t i = 0; i < devices.size(); ++i) { - if (devices[i].first.num_compute_queues > 0) { - return i; + const int priority = compute_device_priority(devices[i].first); + if (priority > best_priority) { + best_priority = priority; + best_i = static_cast(i); } } - return devices.size() + 1; + return best_i; } // @@ -283,7 +329,7 @@ std::unique_ptr init_global_vulkan_runtime( const RuntimeConfig default_config{ enable_validation_messages, init_default_device, - AdapterSelector::First, + AdapterSelector::Auto, num_requested_queues, cache_data_path, }; @@ -311,8 +357,8 @@ Runtime::Runtime(const RuntimeConfig config) if (config.init_default_device) { try { switch (config.default_selector) { - case AdapterSelector::First: - default_adapter_i_ = create_adapter(select_first); + case AdapterSelector::Auto: + default_adapter_i_ = create_adapter(select_compute_device); } } catch (...) 
{ } diff --git a/backends/vulkan/runtime/vk_api/Runtime.h b/backends/vulkan/runtime/vk_api/Runtime.h index 3706d6c73d0..285e979eab3 100644 --- a/backends/vulkan/runtime/vk_api/Runtime.h +++ b/backends/vulkan/runtime/vk_api/Runtime.h @@ -31,7 +31,9 @@ namespace vkapi { // enum AdapterSelector { - First, + // Automatically select the best compute-capable device (highest priority + // device type, with an optional ETVK_DEVICE_INDEX override). + Auto, }; struct RuntimeConfig final { diff --git a/backends/vulkan/runtime/vk_api/memory/vma_api.h b/backends/vulkan/runtime/vk_api/memory/vma_api.h index dc7abbf8b1e..cf267a27d11 100644 --- a/backends/vulkan/runtime/vk_api/memory/vma_api.h +++ b/backends/vulkan/runtime/vk_api/memory/vma_api.h @@ -25,17 +25,28 @@ #define VMA_DYNAMIC_VULKAN_FUNCTIONS 1 #define VMA_VULKAN_VERSION 1002000 -#ifdef __clang__ +#if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnullability-completeness" #pragma clang diagnostic ignored "-Wunused-variable" -#endif /* __clang__ */ +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4100 4101 4189) +#endif #include -#ifdef __clang__ +#if defined(__clang__) #pragma clang diagnostic pop -#endif /* __clang__ */ +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif #else // !ETVK_USE_META_VMA @@ -71,16 +82,27 @@ */ #endif /* VULKAN_DEBUG */ -#ifdef __clang__ +#if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wnullability-completeness" #pragma clang diagnostic ignored "-Wunused-variable" -#endif /* __clang__ */ +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#elif defined(_MSC_VER) +#pragma 
warning(push) +#pragma warning(disable : 4100 4101 4189) +#endif #include -#ifdef __clang__ +#if defined(__clang__) #pragma clang diagnostic pop -#endif /* __clang__ */ +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif #endif // ETVK_USE_META_VMA diff --git a/backends/vulkan/test/custom_ops/build_and_run.sh b/backends/vulkan/test/custom_ops/build_and_run.sh index 2b9ce576e0e..b1195568b1b 100755 --- a/backends/vulkan/test/custom_ops/build_and_run.sh +++ b/backends/vulkan/test/custom_ops/build_and_run.sh @@ -120,7 +120,7 @@ ANDROID_MODE=false CMAKE_OUT_DIR="cmake-out" # Check for --android flag and adjust arguments accordingly -if [[ "$1" == "--android" ]]; then +if [[ "${1:-}" == "--android" ]]; then ANDROID_MODE=true CMAKE_OUT_DIR="cmake-android-out" shift # Remove --android from arguments diff --git a/backends/vulkan/test/test_vulkan_compile_options.py b/backends/vulkan/test/test_vulkan_compile_options.py new file mode 100644 index 00000000000..f44850d2915 --- /dev/null +++ b/backends/vulkan/test/test_vulkan_compile_options.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +from typing import Any, Dict + +from executorch.backends.vulkan.partitioner.vulkan_partitioner import ( + parse_compile_options, +) +from executorch.backends.vulkan.vulkan_preprocess import parse_compile_spec + + +class TestVulkanCompileOptions(unittest.TestCase): + """Verify that compile options survive the partitioner -> backend round trip. + + The partitioner serializes the user-provided options into CompileSpecs + (parse_compile_options) and the backend deserializes them at preprocess time + (parse_compile_spec). 
Boolean options that are serialized but not handled on + the deserialization side are silently dropped, which is a class of bug that + previously hid the small_texture_limits desktop-compatibility option. + """ + + def _round_trip(self, options: Dict[str, Any]) -> Dict[str, Any]: + return parse_compile_spec(parse_compile_options(options)) + + def test_small_texture_limits_round_trips(self) -> None: + round_tripped = self._round_trip({"small_texture_limits": True}) + self.assertTrue(round_tripped.get("small_texture_limits")) + + def test_skip_memory_planning_round_trips(self) -> None: + round_tripped = self._round_trip({"skip_memory_planning": True}) + self.assertTrue(round_tripped.get("skip_memory_planning")) + + def test_force_fp16_round_trips(self) -> None: + round_tripped = self._round_trip({"force_fp16": True}) + self.assertTrue(round_tripped.get("force_fp16")) + + def test_unset_options_are_absent(self) -> None: + round_tripped = self._round_trip({}) + self.assertNotIn("small_texture_limits", round_tripped) + self.assertNotIn("skip_memory_planning", round_tripped) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/vulkan/utils.py b/backends/vulkan/utils.py index 7febff260c6..b349fb51001 100644 --- a/backends/vulkan/utils.py +++ b/backends/vulkan/utils.py @@ -588,6 +588,10 @@ def node_has_target(node: Any, target: str): ImageExtents = Tuple[int, int, int] DEFAULT_TEXTURE_LIMITS = (16384, 16384, 2048) +# Conservative 3D texture limit compatible with most desktop/laptop GPUs. The +# Vulkan spec only guarantees maxImageDimension3D >= 256, but 2048 is typical; +# mobile GPUs commonly support 16384. Used with the small_texture_limits option.
+SMALL_TEXTURE_LIMITS = (2048, 2048, 2048) DEFAULT_BUFFER_LIMIT = 128 * (1024 * 1024) all_storage_types: Set[VkStorageType] = { diff --git a/backends/vulkan/vulkan_preprocess.py b/backends/vulkan/vulkan_preprocess.py index e9d5613668a..53a81d1772e 100644 --- a/backends/vulkan/vulkan_preprocess.py +++ b/backends/vulkan/vulkan_preprocess.py @@ -6,7 +6,6 @@ # pyre-strict -import copy from functools import partial from typing import Any, Callable, Dict, final, List @@ -114,6 +113,12 @@ def parse_compile_spec(compile_specs: List[CompileSpec]) -> Dict[str, Any]: if spec.key == "force_fp16": options[spec.key] = bool.from_bytes(spec.value, byteorder="little") + if spec.key == "small_texture_limits": + options[spec.key] = bool.from_bytes(spec.value, byteorder="little") + + if spec.key == "skip_memory_planning": + options[spec.key] = bool.from_bytes(spec.value, byteorder="little") + # Unhandled options are ignored return options @@ -130,16 +135,15 @@ def preprocess( # noqa: C901 ) -> PreprocessResult: compile_options = parse_compile_spec(module_compile_spec) - default_texture_limits = copy.deepcopy(utils.DEFAULT_TEXTURE_LIMITS) # 2048 is the typical limit value for 3D textures, but mobile GPUs often support # 16384. Since the Vulkan delegate primarily targets mobile GPUs at the moment, - # 16394 is the default texture limit used. This option is provided as a - # convenient way to switch to using a limit of 2048 for image textures which - # will be compatible with most GPUs. + # 16384 is the default texture limit used. The small_texture_limits option is + # provided as a convenient way to switch to a limit of 2048 for image textures, + # which will be compatible with most desktop/laptop GPUs. 
if compile_options.get("small_texture_limits", False): - default_texture_limits[0] = 2048 - default_texture_limits[1] = 2048 - default_texture_limits[2] = 2048 + default_texture_limits = utils.SMALL_TEXTURE_LIMITS + else: + default_texture_limits = utils.DEFAULT_TEXTURE_LIMITS limits_x = compile_options.get("texture_limits_x", default_texture_limits[0]) limits_y = compile_options.get("texture_limits_y", default_texture_limits[1]) diff --git a/setup.py b/setup.py index 85228bd37ae..cd2bb5332cb 100644 --- a/setup.py +++ b/setup.py @@ -134,6 +134,7 @@ def _minimal_cmake_flags() -> List[str]: "-DEXECUTORCH_BUILD_PYBIND=OFF", "-DEXECUTORCH_BUILD_QNN=OFF", "-DEXECUTORCH_BUILD_TESTS=OFF", + "-DEXECUTORCH_BUILD_VULKAN=OFF", "-DEXECUTORCH_BUILD_XNNPACK=OFF", ] @@ -962,6 +963,9 @@ def run(self): # noqa C901 if cmake_cache.is_enabled("EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER"): cmake_build_args += ["--target", "_llm_runner"] + if cmake_cache.is_enabled("EXECUTORCH_BUILD_VULKAN"): + cmake_build_args += ["--target", "vulkan_backend"] + if cmake_cache.is_enabled("EXECUTORCH_BUILD_CUDA"): cmake_build_args += ["--target", "aoti_cuda_backend"] cmake_build_args += ["--target", "aoti_common_shims_slim"] diff --git a/tools/cmake/preset/pybind.cmake b/tools/cmake/preset/pybind.cmake index ecce850ab3c..9a17f561785 100644 --- a/tools/cmake/preset/pybind.cmake +++ b/tools/cmake/preset/pybind.cmake @@ -97,3 +97,31 @@ else() FATAL_ERROR "Unsupported CMAKE_SYSTEM_NAME for pybind: ${CMAKE_SYSTEM_NAME}" ) endif() + +# Opt-in Vulkan backend for Linux/Windows wheels. Enabled ONLY when the build +# requests it via the EXECUTORCH_BUILD_VULKAN env var AND glslc (Vulkan SDK) is +# available to compile the shaders. This keeps the default wheel (and +# macOS/Android) byte-for-byte unchanged: GPU backends are opt-in rather than +# bundled into the universal wheel. 
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux" + OR CMAKE_SYSTEM_NAME STREQUAL "Windows" + OR CMAKE_SYSTEM_NAME STREQUAL "WIN32" +) + if(DEFINED ENV{EXECUTORCH_BUILD_VULKAN} + AND NOT "$ENV{EXECUTORCH_BUILD_VULKAN}" STREQUAL "0" + AND NOT "$ENV{EXECUTORCH_BUILD_VULKAN}" STREQUAL "OFF" + ) + find_program( + GLSLC_PATH glslc HINTS $ENV{VULKAN_SDK}/bin $ENV{VULKAN_SDK}/Bin + ) + if(GLSLC_PATH) + set_overridable_option(EXECUTORCH_BUILD_VULKAN ON) + message(STATUS "Enabling Vulkan backend for wheel; glslc: ${GLSLC_PATH}") + else() + message( + STATUS "EXECUTORCH_BUILD_VULKAN requested but glslc was not found; " + "the Vulkan backend will not be included." + ) + endif() + endif() +endif() From 391f139fc97693096bd1152970f69894963b48c9 Mon Sep 17 00:00:00 2001 From: Reuben Dunn Date: Wed, 17 Jun 2026 19:10:40 -0700 Subject: [PATCH 2/2] fix cast issue for Vulkan allocation sizes --- backends/vulkan/runtime/api/containers/Tensor.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp index ef892206d48..15e2660078a 100644 --- a/backends/vulkan/runtime/api/containers/Tensor.cpp +++ b/backends/vulkan/runtime/api/containers/Tensor.cpp @@ -687,8 +687,9 @@ vkapi::VulkanBuffer allocate_buffer( // otherwise read past the end of the buffer, which silently zeroes the value // on NVIDIA GPUs. This only grows the allocation; the tensor's // physical_numel() is unchanged. - const size_t alloc_nbytes = - utils::align_up(element_size(dtype) * numel, static_cast(16)); + const size_t alloc_nbytes = utils::align_up( + element_size(dtype) * static_cast(numel), + static_cast(16)); // TODO: this check is incorrect. max_buffer_numel() returns // maxStorageBufferRange, which is a size in bytes, so the comparison should