From d5d134e65726830acfc08f107acedecd595d5cda Mon Sep 17 00:00:00 2001
From: Mark Caldwell <mark@cloudhands.ai>
Date: Sun, 14 Jun 2026 09:46:13 -0700
Subject: [PATCH] feat: automatic VAE-tiling fallback when an untiled decode
 exceeds the backend buffer limit

VAE decode can hard-fail on integrated / low-VRAM GPUs because the untiled compute
buffer exceeds the backend's maximum single-buffer allocation (e.g. Vulkan's
suballocation limit) even when total memory is plentiful. sd.cpp already supports
tiling that keeps each compute buffer small, but it had to be requested up front
with --vae-tiling, so users hit a hard failure one flag away from the working path.

Make the fallback automatic and on by default:

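- sd_tiling_params_t gains a bool auto_tile, appended as the last member so
  existing positional initializers keep compiling (they leave auto_tile false,
  i.e. the previous hard-fail behavior). The struct grows, so callers must be
  rebuilt against the new header. In AUTO (the default: --vae-tiling off,
  auto_tile on) VAE::decode tries the untiled decode and, if it produces no
  output (typically because its compute buffer can't be allocated), frees the
  buffer and retries once with tiling.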
- --vae-tiling stays the original boolean flag (force tiling on);
  --no-vae-tiling-fallback turns the auto fallback off (hard-fail like before).
- GGMLRunner gets an opt-in probe (set_probe_compute_buffer_fits) so AUTO can
  decline a too-large untiled decode before the backend emits its raw allocation
  error. On Vulkan it checks each op against the device's real per-buffer limit via
  ggml_backend_supports_op (the reported max buffer size, not the smaller
  suballocation block); other backends compare the planned compute buffer against
  ggml_backend_buft_get_max_size. The reactive output-empty -> tile path still
  backstops a genuine runtime OOM.
- extra_tiling_args gains a max_buffer_size=<bytes> key: in AUTO the fallback also
  tiles when the planned untiled compute buffer would exceed it, letting a user cap
  VAE VRAM on any backend.
---
 examples/common/common.cpp | 13 +++++-
 examples/common/common.h   |  2 +-
 examples/server/api.md     |  2 +-
 include/stable-diffusion.h |  3 +-
 src/core/ggml_extend.hpp   | 83 +++++++++++++++++++++++++++++++++++++-
 src/model/vae/vae.hpp      | 48 ++++++++++++++++++++++
 src/stable-diffusion.cpp   |  6 +--
 7 files changed, 148 insertions(+), 9 deletions(-)
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index e9b8bc85a..126b91e37 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -899,7 +899,7 @@ ArgOptions SDGenerationParams::get_options() {
          &extra_sample_args},
         {"",
          "--extra-tiling-args",
-         "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
+         "extra VAE tiling args, key=value list. max_buffer_size (bytes) forces the auto fallback to tile when an untiled VAE compute buffer would exceed it. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
          &extra_tiling_args},
     };
 
@@ -1084,6 +1084,12 @@ ArgOptions SDGenerationParams::get_options() {
          "process vae in tiles to reduce memory usage",
          true,
          &vae_tiling_params.enabled},
+        {"",
+         "--no-vae-tiling-fallback",
+         "disable the automatic fallback to VAE tiling when an untiled decode would exceed the "
+         "backend's max buffer size (fail instead of tiling)",
+         false,
+         &vae_tiling_params.auto_tile},
         {"",
          "--temporal-tiling",
          "enable temporal tiling for LTX video VAE decode",
@@ -1828,6 +1834,9 @@ bool SDGenerationParams::from_json_str(
         if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
             vae_tiling_params.enabled = tiling_json["enabled"];
         }
+        if (tiling_json.contains("auto_tile") && tiling_json["auto_tile"].is_boolean()) {
+            vae_tiling_params.auto_tile = tiling_json["auto_tile"];
+        }
         if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
             vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
         }
@@ -2641,10 +2650,12 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
     }
 
     if (gen_params.vae_tiling_params.enabled ||
+        !gen_params.vae_tiling_params.auto_tile ||
         gen_params.vae_tiling_params.temporal_tiling ||
         !gen_params.extra_tiling_args.empty()) {
         root["vae_tiling"] = {
             {"enabled", gen_params.vae_tiling_params.enabled},
+            {"auto_tile", gen_params.vae_tiling_params.auto_tile},
             {"temporal_tiling", gen_params.vae_tiling_params.temporal_tiling},
             {"tile_size_x", gen_params.vae_tiling_params.tile_size_x},
             {"tile_size_y", gen_params.vae_tiling_params.tile_size_y},
diff --git a/examples/common/common.h b/examples/common/common.h
index 55fa5ac0a..d82c1856b 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -227,7 +227,7 @@ struct SDGenerationParams {
     int video_frames                     = 1;
     int fps                              = 16;
     float vace_strength                  = 1.f;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     std::string extra_tiling_args;
 
     std::string pm_id_images_dir;
diff --git a/examples/server/api.md b/examples/server/api.md
index adcec26ff..ae68d379d 100644
--- a/examples/server/api.md
+++ b/examples/server/api.md
@@ -518,7 +518,7 @@ Shared default fields used by both `img_gen` and `vid_gen`:
 | `output_format` | `string` |
 | `output_compression` | `integer` |
 
-`vae_tiling_params.extra_tiling_args` accepts a key=value list. For LTX video VAE temporal tiling, `temporal_tile_frames` defaults to `4` and `temporal_tile_overlap` defaults to `1`.
+`vae_tiling_params.extra_tiling_args` accepts a key=value list. `max_buffer_size` (bytes) forces the automatic tiling fallback when an untiled VAE compute buffer would exceed it. For LTX video VAE temporal tiling, `temporal_tile_frames` defaults to `4` and `temporal_tile_overlap` defaults to `1`.
 
 `img_gen`-specific default fields:
 
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 00f3e4e97..36a444572 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -153,7 +153,7 @@ enum lora_apply_mode_t {
 };
 
 typedef struct {
-    bool enabled;
+    bool enabled;  // true => always tile (ON)
     bool temporal_tiling;
     int tile_size_x;
     int tile_size_y;
@@ -161,6 +161,7 @@ typedef struct {
     float rel_size_x;
     float rel_size_y;
     const char* extra_tiling_args;
+    bool auto_tile;  // AUTO: when enabled is false, retry with tiling if the untiled VAE decode produces no output (e.g. compute buffer over the backend limit); sd_img_gen_params_init/sd_vid_gen_params_init set it to true
 } sd_tiling_params_t;
 
 typedef struct {
diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
index a3dda16b2..40910b4d5 100644
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@@ -1705,11 +1705,21 @@ struct GGMLRunner {
 
     ggml_context* compute_ctx    = nullptr;
     ggml_gallocr* compute_allocr = nullptr;
+    // Set when alloc_compute_buffer() deliberately defers to tiling (probe found the
+    // untiled buffer exceeds the backend max); lets callers skip the failure error.
+    bool compute_buffer_deferred_to_tiling = false;
 
     size_t max_graph_vram_bytes           = 0;
     bool stream_layers_enabled            = false;
     size_t observed_max_effective_budget_ = 0;
 
+    // When set, alloc_compute_buffer measures the planned compute buffer (no alloc)
+    // and bails if it exceeds the backend max, so VAE AUTO can fall back to tiling.
+    bool probe_compute_buffer_fits_ = false;
+    // Optional user cap (bytes): also fall back to tiling if the planned compute
+    // buffer would exceed this, regardless of the backend limit. 0 = no cap.
+    size_t probe_max_bytes_ = 0;
+
     std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
     std::weak_ptr<RunnerWeightManager> weight_manager;
     std::unordered_set<const ggml_tensor*> kept_compute_param_tensor_set;
@@ -1978,10 +1988,66 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        compute_buffer_deferred_to_tiling = false;
         if (compute_allocr != nullptr) {
             return true;
         }
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend);
+
+        if (probe_compute_buffer_fits_) {
+            // Probe whether an untiled decode fits before allocating; if not, defer to
+            // tiling here instead of letting the real reserve below fail with a raw
+            // backend error. A genuine runtime OOM still surfaces below as a backstop.
+            if (probe_max_bytes_ > 0) {
+                // User-requested cap (extra_tiling_args max_buffer_size): tile when the
+                // planned untiled buffer would exceed it, on any backend.
+                ggml_gallocr* probe = ggml_gallocr_new(buft);
+                size_t sizes[1]     = {0};
+                ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes);
+                ggml_gallocr_free(probe);
+                if (sizes[0] > probe_max_bytes_) {
+                    LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds requested max_buffer_size %.2f MB; deferring to tiling",
+                              get_desc().c_str(),
+                              sizes[0] / 1024.0 / 1024.0,
+                              probe_max_bytes_ / 1024.0 / 1024.0);
+                    compute_buffer_deferred_to_tiling = true;
+                    return false;
+                }
+            }
+            if (sd_backend_is(runtime_backend, "Vulkan")) {
+                // supports_op rejects any op larger than the device's real max buffer
+                // size, which is the true per-buffer limit -- unlike buft_get_max_size,
+                // which on Vulkan reports only the ~1 GB suballocation block.
+                for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+                    ggml_tensor* op = ggml_graph_node(gf, i);
+                    if (!ggml_backend_supports_op(runtime_backend, op)) {
+                        LOG_DEBUG("%s: untiled compute op %.2f MB exceeds backend support; deferring to tiling",
+                                  get_desc().c_str(),
+                                  ggml_nbytes(op) / 1024.0 / 1024.0);
+                        compute_buffer_deferred_to_tiling = true;
+                        return false;
+                    }
+                }
+            } else {
+                size_t max_size = ggml_backend_buft_get_max_size(buft);
+                if (max_size > 0) {
+                    ggml_gallocr* probe = ggml_gallocr_new(buft);
+                    size_t sizes[1]     = {0};
+                    ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes);
+                    ggml_gallocr_free(probe);
+                    if (sizes[0] > max_size) {
+                        LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling",
+                                  get_desc().c_str(),
+                                  sizes[0] / 1024.0 / 1024.0,
+                                  max_size / 1024.0 / 1024.0);
+                        compute_buffer_deferred_to_tiling = true;
+                        return false;
+                    }
+                }
+            }
+        }
+
+        compute_allocr = ggml_gallocr_new(buft);
 
         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -2432,7 +2498,11 @@ struct GGMLRunner {
         GraphWeightDoneGuard graph_weight_done_guard(this, &params_to_prepare);
 
         if (!alloc_compute_buffer(gf)) {
-            LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
+            // compute_buffer_deferred_to_tiling: alloc_compute_buffer declined a too-large
+            // untiled buffer on purpose (VAE AUTO will retry with tiling) -- not a real error.
+            if (!compute_buffer_deferred_to_tiling) {
+                LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
+            }
             return std::nullopt;
         }
         struct ComputeBufferGuard {
@@ -2822,6 +2892,15 @@ struct GGMLRunner {
     void set_stream_layers_enabled(bool enabled) {
         stream_layers_enabled = enabled;
     }
+
+    // Toggles the pre-allocation probe in alloc_compute_buffer(). While enabled, a
+    // planned compute buffer that exceeds the backend limit, or max_bytes when it is
+    // non-zero, is declined (failure returned) instead of attempting the allocation.
+    // The setting persists until disabled; disabling also resets the byte cap to 0.
+    void set_probe_compute_buffer_fits(bool enabled, size_t max_bytes = 0) {
+        probe_compute_buffer_fits_ = enabled;
+        probe_max_bytes_           = enabled ? max_bytes : 0;
+    }
 };
 
 class GGMLBlock {
diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp
index af091bb57..37901dfc2 100644
--- a/src/model/vae/vae.hpp
+++ b/src/model/vae/vae.hpp
@@ -199,7 +199,67 @@ struct VAE : public GGMLRunner {
                 "vae decode compute failed while processing a tile",
                 silent);
         } else {
+            // AUTO (tiling not forced, auto_tile set): probe the untiled decode buffer first so a
+            // too-large one defers to tiling below without the backend's raw alloc error; the
+            // output.empty() check still backstops a real OOM.
+            const bool auto_probe = !tiling_params.enabled && tiling_params.auto_tile;
+            if (auto_probe) {
+                // Optional user cap in bytes (extra_tiling_args max_buffer_size); 0 = no cap.
+                size_t max_bytes = 0;
+                if (tiling_params.extra_tiling_args != nullptr) {
+                    for (const auto& [key, value] : parse_key_value_args(tiling_params.extra_tiling_args, "VAE extra tiling arg")) {
+                        if (key == "max_buffer_size") {
+                            // Accept only a plain decimal byte count: a bare strtoull would silently
+                            // read "512M" as 512 bytes, wrap "-1" to a huge value and map garbage to 0.
+                            const char* text  = value.c_str();
+                            char* end         = nullptr;
+                            const auto parsed = strtoull(text, &end, 10);
+                            if (text[0] >= '0' && text[0] <= '9' && *end == '\0') {
+                                max_bytes = static_cast<size_t>(parsed);
+                            } else if (!silent) {
+                                LOG_WARN("vae: ignoring invalid max_buffer_size '%s' (expected a byte count)", text);
+                            }
+                        }
+                    }
+                }
+                set_probe_compute_buffer_fits(true, max_bytes);
+            }
             output = _compute(n_threads, input, true);
+            // Clear the probe again so it only applies to the untiled attempt above.
+            if (auto_probe) {
+                set_probe_compute_buffer_fits(false);
+            }
+            if (output.empty() && auto_probe) {
+                // Untiled decode produced no output, typically because its compute buffer exceeded the
+                // backend's per-buffer limit (common on iGPUs) -- fall back to tiling instead of failing.
+                free_compute_buffer();
+                if (!silent) {
+                    LOG_WARN("vae: untiled decode buffer exceeded the backend limit; retrying with tiling");
+                }
+                sd_tiling_params_t auto_tiling = tiling_params;
+                auto_tiling.enabled            = true;  // default tile size (32) via get_tile_sizes
+                set_tiling_params(auto_tiling);
+                const int scale_factor = get_scale_factor();
+                int64_t W              = input.shape()[0] * scale_factor;
+                int64_t H              = input.shape()[1] * scale_factor;
+                float tile_overlap;
+                int tile_size_x, tile_size_y;
+                get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, auto_tiling, input.shape()[0], input.shape()[1]);
+                output = tiled_compute(
+                    input,
+                    n_threads,
+                    static_cast<int>(W),
+                    static_cast<int>(H),
+                    scale_factor,
+                    tile_size_x,
+                    tile_size_y,
+                    tile_overlap,
+                    circular_x,
+                    circular_y,
+                    true,
+                    "vae decode compute failed while processing a tile",
+                    silent);
+            }
         }
 
         free_compute_buffer();
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 836b0f85b..38dd21696 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -187,7 +187,7 @@ class StableDiffusionGGML {
     bool apply_lora_immediately = false;
 
     std::string taesd_path;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr, true};  // auto_tile=true (AUTO default)
     bool enable_mmap                     = false;
     sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
     bool stream_layers = false;
@@ -2795,7 +2795,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
     sd_img_gen_params->batch_count       = 1;
     sd_img_gen_params->control_strength  = 0.9f;
     sd_img_gen_params->pm_params         = {nullptr, 0, nullptr, 20.f};
-    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_cache_params_init(&sd_img_gen_params->cache);
     sd_hires_params_init(&sd_img_gen_params->hires);
 }
@@ -2882,7 +2882,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     sd_vid_gen_params->fps                                   = 16;
     sd_vid_gen_params->moe_boundary                          = 0.875f;
     sd_vid_gen_params->vace_strength                         = 1.f;
-    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_vid_gen_params->hires.enabled                         = false;
     sd_vid_gen_params->hires.upscaler                        = SD_HIRES_UPSCALER_LATENT;
     sd_vid_gen_params->hires.scale                           = 2.f;