diff --git a/examples/common/common.cpp b/examples/common/common.cpp
index e9b8bc85a..126b91e37 100644
--- a/examples/common/common.cpp
+++ b/examples/common/common.cpp
@@ -899,7 +899,7 @@ ArgOptions SDGenerationParams::get_options() {
          &extra_sample_args},
         {"",
          "--extra-tiling-args",
-         "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
+         "extra VAE tiling args, key=value list. max_buffer_size (bytes) forces the auto fallback to tile when an untiled VAE compute buffer would exceed it. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)",
          &extra_tiling_args},
     };
 
@@ -1084,6 +1084,12 @@ ArgOptions SDGenerationParams::get_options() {
          "process vae in tiles to reduce memory usage",
          true,
          &vae_tiling_params.enabled},
+        {"",
+         "--no-vae-tiling-fallback",
+         "disable the automatic fallback to VAE tiling when an untiled decode would exceed the "
+         "backend's max buffer size (fail instead of tiling)",
+         false,
+         &vae_tiling_params.auto_tile},
         {"",
          "--temporal-tiling",
          "enable temporal tiling for LTX video VAE decode",
@@ -1828,6 +1834,9 @@ bool SDGenerationParams::from_json_str(
             if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) {
                 vae_tiling_params.enabled = tiling_json["enabled"];
             }
+            if (tiling_json.contains("auto_tile") && tiling_json["auto_tile"].is_boolean()) {
+                vae_tiling_params.auto_tile = tiling_json["auto_tile"];
+            }
             if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) {
                 vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"];
             }
@@ -2641,10 +2650,12 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params,
     }
 
     if (gen_params.vae_tiling_params.enabled ||
+        !gen_params.vae_tiling_params.auto_tile ||
         gen_params.vae_tiling_params.temporal_tiling ||
         !gen_params.extra_tiling_args.empty()) {
         root["vae_tiling"] = {
             {"enabled", gen_params.vae_tiling_params.enabled},
+            {"auto_tile", gen_params.vae_tiling_params.auto_tile},
             {"temporal_tiling", gen_params.vae_tiling_params.temporal_tiling},
             {"tile_size_x", gen_params.vae_tiling_params.tile_size_x},
             {"tile_size_y", gen_params.vae_tiling_params.tile_size_y},
diff --git a/examples/common/common.h b/examples/common/common.h
index 55fa5ac0a..d82c1856b 100644
--- a/examples/common/common.h
+++ b/examples/common/common.h
@@ -227,7 +227,7 @@ struct SDGenerationParams {
     int video_frames = 1;
     int fps = 16;
     float vace_strength = 1.f;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     std::string extra_tiling_args;
 
     std::string pm_id_images_dir;
diff --git a/examples/server/api.md b/examples/server/api.md
index adcec26ff..ae68d379d 100644
--- a/examples/server/api.md
+++ b/examples/server/api.md
@@ -518,7 +518,7 @@ Shared default fields used by both `img_gen` and `vid_gen`:
 | `output_format` | `string` |
 | `output_compression` | `integer` |
 
-`vae_tiling_params.extra_tiling_args` accepts a key=value list. For LTX video VAE temporal tiling, `temporal_tile_frames` defaults to `4` and `temporal_tile_overlap` defaults to `1`.
+`vae_tiling_params.extra_tiling_args` accepts a key=value list. `max_buffer_size` (bytes) forces the automatic tiling fallback when an untiled VAE compute buffer would exceed it. For LTX video VAE temporal tiling, `temporal_tile_frames` defaults to `4` and `temporal_tile_overlap` defaults to `1`.
 
 `img_gen`-specific default fields:
 
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
index 00f3e4e97..36a444572 100644
--- a/include/stable-diffusion.h
+++ b/include/stable-diffusion.h
@@ -153,7 +153,7 @@ enum lora_apply_mode_t {
 };
 
 typedef struct {
-    bool enabled;
+    bool enabled;  // true => always tile (ON)
     bool temporal_tiling;
     int tile_size_x;
     int tile_size_y;
@@ -161,6 +161,10 @@ typedef struct {
     float rel_size_x;
     float rel_size_y;
     const char* extra_tiling_args;
+    // AUTO fallback: when `enabled` is false, tile only if an untiled VAE decode would exceed the
+    // backend's max buffer size. sd_img_gen_params_init()/sd_vid_gen_params_init() set this to true;
+    // a zero-initialized struct (or an older 8-field initializer) leaves it false, i.e. no fallback.
+    bool auto_tile;
 } sd_tiling_params_t;
 
 typedef struct {
diff --git a/src/core/ggml_extend.hpp b/src/core/ggml_extend.hpp
index a3dda16b2..40910b4d5 100644
--- a/src/core/ggml_extend.hpp
+++ b/src/core/ggml_extend.hpp
@@ -1705,11 +1705,21 @@ struct GGMLRunner {
 
     ggml_context* compute_ctx = nullptr;
     ggml_gallocr* compute_allocr = nullptr;
+    // Set when alloc_compute_buffer() deliberately defers to tiling (probe found the
+    // untiled buffer exceeds the backend max); lets callers skip the failure error.
+    bool compute_buffer_deferred_to_tiling = false;
 
     size_t max_graph_vram_bytes = 0;
     bool stream_layers_enabled = false;
     size_t observed_max_effective_budget_ = 0;
 
+    // When set, alloc_compute_buffer measures the planned compute buffer (no alloc)
+    // and bails if it exceeds the backend max, so VAE AUTO can fall back to tiling.
+    bool probe_compute_buffer_fits_ = false;
+    // Optional user cap (bytes): also fall back to tiling if the planned compute
+    // buffer would exceed this, regardless of the backend limit. 0 = no cap.
+    size_t probe_max_bytes_ = 0;
+
     std::shared_ptr weight_adapter = nullptr;
     std::weak_ptr weight_manager;
     std::unordered_set kept_compute_param_tensor_set;
@@ -1978,10 +1988,74 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(ggml_cgraph* gf) {
+        compute_buffer_deferred_to_tiling = false;
         if (compute_allocr != nullptr) {
             return true;
         }
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend));
+        ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend);
+
+        if (probe_compute_buffer_fits_) {
+            // Probe whether an untiled decode fits before allocating; if not, defer to
+            // tiling here instead of letting the real reserve below fail with a raw
+            // backend error. A genuine runtime OOM still surfaces below as a backstop.
+            //
+            // The planned size is measured lazily and at most once (plan only, no
+            // allocation), so the user cap and the backend cap share one measurement.
+            size_t planned_bytes = 0;
+            bool planned_bytes_known = false;
+            auto measure_planned_bytes = [&]() -> size_t {
+                if (!planned_bytes_known) {
+                    ggml_gallocr* probe = ggml_gallocr_new(buft);
+                    size_t sizes[1] = {0};
+                    ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes);
+                    ggml_gallocr_free(probe);
+                    planned_bytes = sizes[0];
+                    planned_bytes_known = true;
+                }
+                return planned_bytes;
+            };
+
+            // User-requested cap (extra_tiling_args max_buffer_size): tile when the
+            // planned untiled buffer would exceed it, on any backend.
+            if (probe_max_bytes_ > 0 && measure_planned_bytes() > probe_max_bytes_) {
+                LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds requested max_buffer_size %.2f MB; deferring to tiling",
+                          get_desc().c_str(),
+                          measure_planned_bytes() / 1024.0 / 1024.0,
+                          probe_max_bytes_ / 1024.0 / 1024.0);
+                compute_buffer_deferred_to_tiling = true;
+                return false;
+            }
+            if (sd_backend_is(runtime_backend, "Vulkan")) {
+                // supports_op rejects any op larger than the device's real max buffer
+                // size, which is the true per-buffer limit -- unlike buft_get_max_size,
+                // which on Vulkan reports only the ~1 GB suballocation block.
+                // supports_op may also reject an op for reasons unrelated to its size,
+                // so the log names the op rather than asserting a size overflow.
+                for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+                    ggml_tensor* op = ggml_graph_node(gf, i);
+                    if (!ggml_backend_supports_op(runtime_backend, op)) {
+                        LOG_DEBUG("%s: untiled compute op %s (%.2f MB) is not supported by the backend; deferring to tiling",
+                                  get_desc().c_str(),
+                                  ggml_op_name(op->op),
+                                  ggml_nbytes(op) / 1024.0 / 1024.0);
+                        compute_buffer_deferred_to_tiling = true;
+                        return false;
+                    }
+                }
+            } else {
+                const size_t max_size = ggml_backend_buft_get_max_size(buft);
+                if (max_size > 0 && measure_planned_bytes() > max_size) {
+                    LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling",
+                              get_desc().c_str(),
+                              measure_planned_bytes() / 1024.0 / 1024.0,
+                              max_size / 1024.0 / 1024.0);
+                    compute_buffer_deferred_to_tiling = true;
+                    return false;
+                }
+            }
+        }
+
+        compute_allocr = ggml_gallocr_new(buft);
 
         if (!ggml_gallocr_reserve(compute_allocr, gf)) {
             // failed to allocate the compute buffer
@@ -2432,7 +2506,11 @@ struct GGMLRunner {
         GraphWeightDoneGuard graph_weight_done_guard(this, &params_to_prepare);
 
         if (!alloc_compute_buffer(gf)) {
-            LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
+            // compute_buffer_deferred_to_tiling: alloc_compute_buffer declined a too-large
+            // untiled buffer on purpose (VAE AUTO will retry with tiling) -- not a real error.
+            if (!compute_buffer_deferred_to_tiling) {
+                LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str());
+            }
             return std::nullopt;
         }
         struct ComputeBufferGuard {
@@ -2822,6 +2900,16 @@ struct GGMLRunner {
     void set_stream_layers_enabled(bool enabled) {
         stream_layers_enabled = enabled;
     }
+
+    // While enabled, alloc_compute_buffer() probes the untiled graph before allocating and
+    // declines (returns false with compute_buffer_deferred_to_tiling set, no error logged) when
+    // it would exceed the backend's per-buffer limit or the optional `max_bytes` cap (0 = no cap).
+    // The setting is sticky: disable it again once the probed compute is done; disabling also
+    // clears the cap. See probe_compute_buffer_fits_ / probe_max_bytes_.
+    void set_probe_compute_buffer_fits(bool enabled, size_t max_bytes = 0) {
+        probe_compute_buffer_fits_ = enabled;
+        probe_max_bytes_ = enabled ? max_bytes : 0;
+    }
 };
 
 class GGMLBlock {
diff --git a/src/model/vae/vae.hpp b/src/model/vae/vae.hpp
index af091bb57..37901dfc2 100644
--- a/src/model/vae/vae.hpp
+++ b/src/model/vae/vae.hpp
@@ -199,7 +199,64 @@ struct VAE : public GGMLRunner {
                     "vae decode compute failed while processing a tile",
                     silent);
             } else {
+                // AUTO: probe the untiled decode buffer first so a too-large one defers to tiling below
+                // without the backend's raw alloc error; the output.empty() check still backstops a real OOM.
+                const bool auto_probe = !tiling_params.enabled && tiling_params.auto_tile;
+                if (auto_probe) {
+                    size_t max_bytes = 0;
+                    if (tiling_params.extra_tiling_args != nullptr) {
+                        for (const auto& [key, value] : parse_key_value_args(tiling_params.extra_tiling_args, "VAE extra tiling arg")) {
+                            if (key == "max_buffer_size") {
+                                // Accept only a plain non-negative byte count: strtoull alone would
+                                // read "512M" as 512 bytes and wrap "-1" around to a huge value.
+                                char* end = nullptr;
+                                const unsigned long long parsed = strtoull(value.c_str(), &end, 10);
+                                const bool valid = end != value.c_str() && *end == '\0' && value.find('-') == std::string::npos;
+                                if (valid) {
+                                    max_bytes = static_cast<size_t>(parsed);
+                                } else if (!silent) {
+                                    LOG_WARN("vae: ignoring invalid max_buffer_size '%s' (expected a byte count)", value.c_str());
+                                }
+                            }
+                        }
+                    }
+                    set_probe_compute_buffer_fits(true, max_bytes);
+                }
                 output = _compute(n_threads, input, true);
+                if (auto_probe) {
+                    set_probe_compute_buffer_fits(false);
+                }
+                if (output.empty() && auto_probe) {
+                    // Untiled decode exceeded the backend's per-buffer limit (common on iGPUs, where the
+                    // cap is per-buffer, not total memory) -- fall back to tiling instead of failing.
+                    free_compute_buffer();
+                    if (!silent) {
+                        LOG_WARN("vae: untiled decode buffer exceeded the backend limit; retrying with tiling");
+                    }
+                    sd_tiling_params_t auto_tiling = tiling_params;
+                    auto_tiling.enabled = true;  // default tile size (32) via get_tile_sizes
+                    set_tiling_params(auto_tiling);
+                    const int scale_factor = get_scale_factor();
+                    int64_t W = input.shape()[0] * scale_factor;
+                    int64_t H = input.shape()[1] * scale_factor;
+                    float tile_overlap;
+                    int tile_size_x, tile_size_y;
+                    get_tile_sizes(tile_size_x, tile_size_y, tile_overlap, auto_tiling, input.shape()[0], input.shape()[1]);
+                    output = tiled_compute(
+                        input,
+                        n_threads,
+                        static_cast<int>(W),
+                        static_cast<int>(H),
+                        scale_factor,
+                        tile_size_x,
+                        tile_size_y,
+                        tile_overlap,
+                        circular_x,
+                        circular_y,
+                        true,
+                        "vae decode compute failed while processing a tile",
+                        silent);
+                }
             }
 
             free_compute_buffer();
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index 836b0f85b..38dd21696 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -187,7 +187,7 @@ class StableDiffusionGGML {
     bool apply_lora_immediately = false;
 
     std::string taesd_path;
-    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr};
+    sd_tiling_params_t vae_tiling_params = {false, false, 0, 0, 0.5f, 0, 0, nullptr, true};  // auto_tile=true (AUTO default)
     bool enable_mmap = false;
     sd::ggml_graph_cut::MaxVramAssignment max_vram_assignment;
     bool stream_layers = false;
@@ -2795,7 +2795,7 @@ void sd_img_gen_params_init(sd_img_gen_params_t* sd_img_gen_params) {
     sd_img_gen_params->batch_count = 1;
     sd_img_gen_params->control_strength = 0.9f;
     sd_img_gen_params->pm_params = {nullptr, 0, nullptr, 20.f};
-    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_img_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_cache_params_init(&sd_img_gen_params->cache);
     sd_hires_params_init(&sd_img_gen_params->hires);
 }
@@ -2882,7 +2882,7 @@ void sd_vid_gen_params_init(sd_vid_gen_params_t* sd_vid_gen_params) {
     sd_vid_gen_params->fps = 16;
     sd_vid_gen_params->moe_boundary = 0.875f;
     sd_vid_gen_params->vace_strength = 1.f;
-    sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr};
+    sd_vid_gen_params->vae_tiling_params = {false, false, 0, 0, 0.5f, 0.0f, 0.0f, nullptr, true};  // auto_tile=true (AUTO)
     sd_vid_gen_params->hires.enabled = false;
     sd_vid_gen_params->hires.upscaler = SD_HIRES_UPSCALER_LATENT;
     sd_vid_gen_params->hires.scale = 2.f;