leejet · pwilkin · Apr 30, 2026 · Jun 12, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -431,6 +431,18 @@ ArgOptions SDContextParams::get_options() {
          "--rpc-servers",
          "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
          &rpc_servers},
+        {"",
+         "--multi-gpu-mode",
+         "how to split a too-large DiT across GPUs (auto-fit): "
+         "row (matmul rows, CUDA/SYCL), layer (whole blocks, generic), or off "
+         "(default: row)",
+         &multi_gpu_mode},
+        {"",
+         "--fit-compute-reserve",
+         "auto-fit: per-component compute-buffer reserve in MiB as a component "
+         "map, e.g. dit=2048,vae=1024,cond=512 (missing keys keep the built-in "
+         "defaults)",
+         &fit_compute_reserve},
         {"",
          "--max-vram",
          "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
@@ -447,6 +459,10 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-t5-mask-pad",
          "t5 mask pad size of chroma",
          &chroma_t5_mask_pad},
+        {"",
+         "--fit-target",
+         "auto-fit: MiB of free memory to leave on each GPU (default: 512)",
+         &auto_fit_target_mb},
     };
 
     options.bool_options = {
@@ -518,6 +534,24 @@ ArgOptions SDContextParams::get_options() {
          "--chroma-enable-t5-mask",
          "enable t5 mask for chroma",
          true, &chroma_use_t5_mask},
+        {"",
+         "--auto-fit",
+         "automatically pick DiT/VAE/Conditioner device placements based on "
+         "free GPU memory (default ON)",
+         true, &auto_fit},
+        {"",
+         "--no-auto-fit",
+         "disable auto-fit and use the explicit --backend / --params-backend flags",
+         false, &auto_fit},
+        {"",
+         "--no-multi-gpu",
+         "auto-fit: keep all components on a single GPU when they fit "
+         "(by default, multi-GPU placements are preferred to balance load)",
+         false, &auto_multi_gpu},
+        {"",
+         "--fit-dry-run",
+         "auto-fit: print the computed plan and exit without loading models",
+         true, &auto_fit_dry_run},
     };
 
     auto on_type_arg = [&](int argc, const char** argv, int index) {
@@ -616,6 +650,15 @@ ArgOptions SDContextParams::get_options() {
          "but it usually offers faster inference speed and, in some cases, lower memory usage. "
          "The at_runtime mode, on the other hand, is exactly the opposite.",
          on_lora_apply_mode_arg},
+        {"",
+         "--list-devices",
+         "list available ggml backend devices (one per line, "
+         "name<TAB>description) and exit",
+         [](int /*argc*/, const char** /*argv*/, int /*index*/) {
+             sd_list_devices();
+             std::exit(0);
+             return 0;
+         }},
     };
 
     return options;
@@ -760,9 +803,12 @@ std::string SDContextParams::to_string() const {
         << "  backend: \"" << backend << "\",\n"
         << "  params_backend: \"" << params_backend << "\",\n"
         << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
-        << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
-        << "  clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
-        << "  vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
+        << "  auto_fit: " << (auto_fit ? "true" : "false") << ",\n"
+        << "  auto_fit_target_mb: " << auto_fit_target_mb << ",\n"
+        << "  auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n"
+        << "  fit_compute_reserve: \"" << fit_compute_reserve << "\",\n"
+        << "  auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n"
+        << "  multi_gpu_mode: \"" << multi_gpu_mode << "\",\n"
         << "  flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
         << "  diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
@@ -837,6 +883,12 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
     sd_ctx_params.stream_layers                   = stream_layers;
     sd_ctx_params.backend                         = effective_backend.c_str();
     sd_ctx_params.params_backend                  = effective_params_backend.c_str();
+    sd_ctx_params.auto_fit                        = auto_fit;
+    sd_ctx_params.auto_fit_target_mb              = auto_fit_target_mb;
+    sd_ctx_params.auto_fit_dry_run                = auto_fit_dry_run;
+    sd_ctx_params.auto_fit_compute_reserve        = fit_compute_reserve.c_str();
+    sd_ctx_params.auto_multi_gpu                  = auto_multi_gpu;
+    sd_ctx_params.multi_gpu_mode                  = multi_gpu_mode.c_str();
     sd_ctx_params.rpc_servers                     = rpc_servers.c_str();
     return sd_ctx_params;
 }

diff --git a/examples/common/common.h b/examples/common/common.h
@@ -152,9 +152,6 @@ struct SDContextParams {
     std::string effective_backend;
     std::string effective_params_backend;
     bool enable_mmap           = false;
-    bool control_net_cpu       = false;
-    bool clip_on_cpu           = false;
-    bool vae_on_cpu            = false;
     bool flash_attn            = false;
     bool diffusion_flash_attn  = false;
     bool diffusion_conv_direct = false;
@@ -170,6 +167,23 @@ struct SDContextParams {
 
     bool qwen_image_zero_cond_t = false;
 
+    // Auto-fit defaults — placement is computed automatically based on free
+    // VRAM. Pass --no-auto-fit to disable and use explicit --backend specs.
+    bool auto_fit           = true;
+    int  auto_fit_target_mb = 512;
+    bool auto_fit_dry_run   = false;
+    // Per-component compute-buffer reserve in MiB as a component map,
+    // e.g. "dit=2048,vae=1024,cond=512"; missing keys keep built-in defaults.
+    std::string fit_compute_reserve;
+    bool auto_multi_gpu = true;
+    std::string multi_gpu_mode = "row";
+
+    // Deprecated aliases for --backend <component>=cpu (kept for
+    // backwards compatibility with the pre-auto-fit CLI).
+    bool control_net_cpu = false;
+    bool clip_on_cpu     = false;
+    bool vae_on_cpu      = false;
+
     prediction_t prediction           = PREDICTION_COUNT;
     lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;
 

diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -220,6 +220,35 @@ typedef struct {
     bool stream_layers;  // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
     const char* backend;
     const char* params_backend;
+
+    // Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
+    // When `auto_fit` is true (default), `backend` / `params_backend` are
+    // ignored and the placement is computed automatically (the plan is fed
+    // into the same backend assignment that `backend` / `params_backend` use).
+    // `auto_fit_target_mb` is the memory, in MiB, to leave free per GPU (default 512).
+    // `auto_fit_dry_run` prints the plan and aborts init before loading.
+    // `auto_fit_compute_reserve` tunes the per-component compute-buffer
+    // reserve in MiB as a component map, e.g. "dit=2048,vae=1024,cond=512"
+    // (same component-key style as `backend`); missing keys / NULL keep the
+    // built-in defaults.
+    bool auto_fit;
+    int  auto_fit_target_mb;
+    bool auto_fit_dry_run;
+    const char* auto_fit_compute_reserve;
+
+    // When more than one GPU device is present, prefer placing different
+    // components on different GPUs to balance load and fit larger total
+    // working sets. Set false to keep all components on a single GPU when
+    // they fit. Defaults to true. Each component still lives entirely on
+    // one device unless multi_gpu_mode splits it (see below).
+    bool auto_multi_gpu;
+
+    // How to split a single component (currently only the DiT) across GPUs
+    // when it doesn't fit on one but fits across several: "row" (matmul rows
+    // split via the backend's stock split buffer type, CUDA/SYCL),
+    // "layer" (whole blocks per GPU, routed by a scheduler, backend-generic),
+    // or "off" (never split a single component). NULL / empty => "row".
+    const char* multi_gpu_mode;
     const char* rpc_servers;
 } sd_ctx_params_t;
 
@@ -485,6 +514,11 @@ SD_API bool preprocess_canny(sd_image_t image,
 SD_API const char* sd_commit(void);
 SD_API const char* sd_version(void);
 
+// Print the available ggml backend devices to stdout, one device per line, as
+// `name<TAB>description`. The output is intended to be parsed by tools, and the
+// names can be used in the --backend / --params-backend assignment specs.
+SD_API void sd_list_devices(void);
+
 // for C API, caller needs to call free_sd_images to free the memory after use
 // This helps avoid CRT problems on Windows when memory is allocated in the library but freed in the caller, which may use a different CRT.
 SD_API void free_sd_images(sd_image_t* result_images, int num_images);