leejet · RapidMark · Jun 14, 2026
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -899,7 +899,7 @@ ArgOptions SDGenerationParams::get_options() {
          &extra_sample_args},
         {"",
          "--extra-tiling-args",
-         "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
+         "extra VAE tiling args, key=value list. max_buffer_size (bytes) forces the auto fallback to tile when an untiled VAE compute buffer would exceed it. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
          &extra_tiling_args},
     };
 
@@ -1084,6 +1084,12 @@ ArgOptions SDGenerationParams::get_options() {
          "process vae in tiles to reduce memory usage",
          true,
          &vae_tiling_params.enabled},
+        {"",
+         "--no-vae-tiling-fallback",
+         "disable the automatic fallback to VAE tiling when an untiled decode would exceed the "
+         "backend's max buffer size (fail instead of tiling)",
+         false,
+         &vae_tiling_params.auto_tile},
         {"",
          "--temporal-tiling",
          "enable temporal tiling for LTX video VAE decode",
@@ -1828,6 +1834,9 @@ bool SDGenerationParams::from_json_str(
         if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
             vae_tiling_params.enabled = tiling_json["enabled"];
         }
+        if (tiling_json.contains("auto_tile") && tiling_json["auto_tile"].is_boolean()) {
+            vae_tiling_params.auto_tile = tiling_json["auto_tile"];
+        }
         if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
             vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
         }
@@ -2641,10 +2650,12 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
     }
 
     if (gen_params.vae_tiling_params.enabled ||
+        !gen_params.vae_tiling_params.auto_tile ||
         gen_params.vae_tiling_params.temporal_tiling ||
         !gen_params.extra_tiling_args.empty()) {
         root["vae_tiling"] = {
             {"enabled", gen_params.vae_tiling_params.enabled},
+            {"auto_tile", gen_params.vae_tiling_params.auto_tile},
             {"temporal_tiling", gen_params.vae_tiling_params.temporal_tiling},
             {"tile_size_x", gen_params.vae_tiling_params.tile_size_x},
             {"tile_size_y", gen_params.vae_tiling_params.tile_size_y},

diff --git a/examples/common/common.h b/examples/common/common.h
@@ -227,7 +227,7 @@ struct SDGenerationParams {
     int video_frames                     = 1;
     int fps                              = 16;
     float vace_strength                  = 1.f;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     std::string extra_tiling_args;
 
     std::string pm_id_images_dir;

diff --git a/examples/server/api.md b/examples/server/api.md
@@ -518,7 +518,7 @@ Shared default fields used by both `img_gen` and `vid_gen`:
 | `output_format` | `string` |
 | `output_compression` | `integer` |
 
-`vae_tiling_params.extra_tiling_args` accepts a key=value list. For LTX video VAE temporal tiling, `temporal_tile_frames` defaults to `4` and `temporal_tile_overlap` defaults to `1`.
+`vae_tiling_params.extra_tiling_args` accepts a key=value list. `max_buffer_size` (bytes) caps the untiled VAE compute buffer: when tiling is not already enabled and the automatic tiling fallback is active, a decode whose planned buffer would exceed the cap is tiled instead. For LTX video VAE temporal tiling, `temporal_tile_frames` defaults to `4` and `temporal_tile_overlap` defaults to `1`.
 
 `img_gen`-specific default fields:
 

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -153,14 +153,15 @@ enum lora_apply_mode_t {
 };
 
 typedef struct {
-    bool enabled;
+    bool enabled;  // true => always tile (ON); auto_tile is only consulted when this is false
     bool temporal_tiling;
     int tile_size_x;
     int tile_size_y;
     float target_overlap;
     float rel_size_x;
     float rel_size_y;
     const char* extra_tiling_args;
+    bool auto_tile;  // AUTO: with enabled=false, retry the VAE decode tiled when the untiled one would exceed the buffer limit or fails; the sd_*_gen_params_init helpers set it to true
 } sd_tiling_params_t;
 
 typedef struct {

diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
@@ -1705,11 +1705,21 @@ struct GGMLRunner {
 
     ggml_context* compute_ctx    = nullptr;
     ggml_gallocr* compute_allocr = nullptr;
+    // Set when alloc_compute_buffer() deliberately defers to tiling (probe found the
+    // untiled buffer exceeds the backend max); lets callers skip the failure error.
+    bool compute_buffer_deferred_to_tiling = false;
 
     size_t max_graph_vram_bytes           = 0;
     bool stream_layers_enabled            = false;
     size_t observed_max_effective_budget_ = 0;
 
+    // When set, alloc_compute_buffer checks the graph before allocating (planned buffer size, or
+    // per-op backend support on Vulkan) and bails if it will not fit, so VAE AUTO can fall back to tiling.
+    bool probe_compute_buffer_fits_ = false;
+    // Optional user cap (bytes): also fall back to tiling if the planned compute
+    // buffer would exceed this, regardless of the backend limit. 0 = no cap.
+    size_t probe_max_bytes_ = 0;
+
     std::shared_ptr<WeightAdapter> weight_adapter = nullptr;
     std::weak_ptr<RunnerWeightManager> weight_manager;
     std::unordered_set<const ggml_tensor*> kept_compute_param_tensor_set;
@@ -1978,10 +1988,66 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        compute_buffer_deferred_to_tiling = false;
         if (compute_allocr != nullptr) {
             return true;
         }
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend);
+
+        if (probe_compute_buffer_fits_) {
+            // Probe whether an untiled decode fits before allocating; if not, defer to
+            // tiling here instead of letting the real reserve below fail with a raw
+            // backend error. A genuine runtime OOM still surfaces below as a backstop.
+            if (probe_max_bytes_ > 0) {
+                // User-requested cap (extra_tiling_args max_buffer_size): tile when the
+                // planned untiled buffer would exceed it, on any backend.
+                ggml_gallocr* probe = ggml_gallocr_new(buft);
+                size_t sizes[1]     = {0};
+                ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes);
+                ggml_gallocr_free(probe);
+                if (sizes[0] > probe_max_bytes_) {
+                    LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds requested max_buffer_size %.2f MB; deferring to tiling",
+                              get_desc().c_str(),
+                              sizes[0] / 1024.0 / 1024.0,
+                              probe_max_bytes_ / 1024.0 / 1024.0);
+                    compute_buffer_deferred_to_tiling = true;
+                    return false;
+                }
+            }
+            if (sd_backend_is(runtime_backend, "Vulkan")) {
+                // supports_op rejects any op larger than the device's real max buffer
+                // size, which is the true per-buffer limit -- unlike buft_get_max_size,
+                // which on Vulkan reports only the ~1 GB suballocation block.
+                for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+                    ggml_tensor* op = ggml_graph_node(gf, i);
+                    if (!ggml_backend_supports_op(runtime_backend, op)) {
+                        LOG_DEBUG("%s: untiled compute op %.2f MB exceeds backend support; deferring to tiling",
+                                  get_desc().c_str(),
+                                  ggml_nbytes(op) / 1024.0 / 1024.0);
+                        compute_buffer_deferred_to_tiling = true;
+                        return false;
+                    }
+                }
+            } else {
+                size_t max_size = ggml_backend_buft_get_max_size(buft);
+                if (max_size > 0) {
+                    ggml_gallocr* probe = ggml_gallocr_new(buft);
+                    size_t sizes[1]     = {0};
+                    ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes);
+                    ggml_gallocr_free(probe);
+                    if (sizes[0] > max_size) {
+                        LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling",
+                                  get_desc().c_str(),
+                                  sizes[0] / 1024.0 / 1024.0,
+                                  max_size / 1024.0 / 1024.0);
+                        compute_buffer_deferred_to_tiling = true;
+                        return false;
+                    }
+                }
+            }
+        }
+
+        compute_allocr = ggml_gallocr_new(buft);
 
         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -2432,7 +2498,11 @@ struct GGMLRunner {
         GraphWeightDoneGuard graph_weight_done_guard(this, &params_to_prepare);
 
         if (!alloc_compute_buffer(gf)) {
-            LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
+            // compute_buffer_deferred_to_tiling: alloc_compute_buffer declined a too-large
+            // untiled buffer on purpose (VAE AUTO will retry with tiling) -- not a real error.
+            if (!compute_buffer_deferred_to_tiling) {
+                LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
+            }
             return std::nullopt;
         }
         struct ComputeBufferGuard {
@@ -2822,6 +2892,15 @@ struct GGMLRunner {
     void set_stream_layers_enabled(bool enabled) {
         stream_layers_enabled = enabled;
     }
+
+    // While enabled, alloc_compute_buffer() checks the graph before allocating and
+    // declines (returning failure) if the compute buffer would exceed the backend's
+    // limit or, when max_bytes > 0, that cap. The setting persists until disabled;
+    // disabling also clears the cap. See probe_compute_buffer_fits_.
+    void set_probe_compute_buffer_fits(bool enabled, size_t max_bytes = 0) {
+        probe_compute_buffer_fits_ = enabled;
+        probe_max_bytes_           = enabled ? max_bytes : 0;
+    }
 };
 
 class GGMLBlock {

diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp
@@ -199,7 +199,55 @@ struct VAE : public GGMLRunner {
                 "vae decode compute failed while processing a tile",
                 silent);
         } else {
+            // AUTO: probe the untiled decode buffer first so a too-large one defers to tiling below
+            // without the backend's raw alloc error; the output.empty() check still backstops a real OOM.
+            const bool auto_probe = !tiling_params.enabled && tiling_params.auto_tile;
+            if (auto_probe) {
+                size_t max_bytes = 0;
+                if (tiling_params.extra_tiling_args != nullptr) {
+                    for (const auto& [key, value] : parse_key_value_args(tiling_params.extra_tiling_args, "VAE extra tiling arg")) {
+                        if (key == "max_buffer_size") {
+                            max_bytes = strtoull(value.c_str(), nullptr, 10);
+                        }
+                    }
+                }
+                set_probe_compute_buffer_fits(true, max_bytes);
+            }
             output = _compute(n_threads, input, true);
+            if (auto_probe) {
+                set_probe_compute_buffer_fits(false);
+            }
+            if (output.empty() && !tiling_params.enabled && tiling_params.auto_tile) {
+                // Untiled decode produced no output: either the probe deferred (backend per-buffer limit,
+                // common on iGPUs, or the max_buffer_size cap) or the decode itself failed -- retry with tiling.
+                free_compute_buffer();
+                if (!silent) {
+                    LOG_WARN("vae: untiled decode buffer exceeded the backend limit; retrying with tiling");
+                }
+                sd_tiling_params_t auto_tiling = tiling_params;
+                auto_tiling.enabled            = true;  // default tile size (32) via get_tile_sizes
+                set_tiling_params(auto_tiling);
+                const int scale_factor = get_scale_factor();
+                int64_t W              = input.shape()[0] * scale_factor;
+                int64_t H              = input.shape()[1] * scale_factor;
+                float tile_overlap;
+                int tile_size_x, tile_size_y;
+                get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, auto_tiling, input.shape()[0], input.shape()[1]);
+                output = tiled_compute(
+                    input,
+                    n_threads,
+                    static_cast<int>(W),
+                    static_cast<int>(H),
+                    scale_factor,
+                    tile_size_x,
+                    tile_size_y,
+                    tile_overlap,
+                    circular_x,
+                    circular_y,
+                    true,
+                    "vae decode compute failed while processing a tile",
+                    silent);
+            }
         }
 
         free_compute_buffer();

diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
@@ -187,7 +187,7 @@ class StableDiffusionGGML {
     bool apply_lora_immediately = false;
 
     std::string taesd_path;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr, true};  // auto_tile=true (AUTO default)
     bool enable_mmap                     = false;
     sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
     bool stream_layers = false;
@@ -2795,7 +2795,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
     sd_img_gen_params->batch_count       = 1;
     sd_img_gen_params->control_strength  = 0.9f;
     sd_img_gen_params->pm_params         = {nullptr, 0, nullptr, 20.f};
-    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_cache_params_init(&sd_img_gen_params->cache);
     sd_hires_params_init(&sd_img_gen_params->hires);
 }
@@ -2882,7 +2882,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     sd_vid_gen_params->fps                                   = 16;
     sd_vid_gen_params->moe_boundary                          = 0.875f;
     sd_vid_gen_params->vace_strength                         = 1.f;
-    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_vid_gen_params->vae_tiling_params                     = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_vid_gen_params->hires.enabled                         = false;
     sd_vid_gen_params->hires.upscaler                        = SD_HIRES_UPSCALER_LATENT;
     sd_vid_gen_params->hires.scale                           = 2.f;