Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 55 additions & 3 deletions examples/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,18 @@ ArgOptions SDContextParams::get_options() {
"--rpc-servers",
"comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052",
&rpc_servers},
{"",
"--multi-gpu-mode",
"how to split a too-large DiT across GPUs (auto-fit): "
"row (matmul rows, CUDA/SYCL), layer (whole blocks, generic), or off "
"(default: row)",
&multi_gpu_mode},
{"",
"--fit-compute-reserve",
"auto-fit: per-component compute-buffer reserve in MiB as a component "
"map, e.g. dit=2048,vae=1024,cond=512 (missing keys keep the built-in "
"defaults)",
&fit_compute_reserve},
{"",
"--max-vram",
"maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value",
Expand All @@ -447,6 +459,10 @@ ArgOptions SDContextParams::get_options() {
"--chroma-t5-mask-pad",
"t5 mask pad size of chroma",
&chroma_t5_mask_pad},
{"",

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar concern here: this introduces multiple --fit-* flags per component (dit/vae/cond).

Could we consider a generic form instead, for example:

--fit-compute-reserve dit=...,vae=...,cond=...

and parse it into a component -> value map?

This would keep the CLI/API more scalable and consistent, and avoid growing a large number of per-component flags as new modules are added.

"--fit-target",
"auto-fit: MiB of free memory to leave on each GPU (default: 512)",
&auto_fit_target_mb},
};

options.bool_options = {
Expand Down Expand Up @@ -518,6 +534,24 @@ ArgOptions SDContextParams::get_options() {
"--chroma-enable-t5-mask",
"enable t5 mask for chroma",
true, &chroma_use_t5_mask},
{"",
"--auto-fit",
"automatically pick DiT/VAE/Conditioner device placements based on "
"free GPU memory (default ON)",
true, &auto_fit},

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would this include Vulkan backends with a main dGPU and an iGPU? If so, auto-fit by default may not be a good idea.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I still have to check how well this works on Vulkan (I don't think I have a setup with an iGPU to test on, though), but the idea is that the algorithm is supposed to take those kinds of quirks into account. So far I've only tested on my CUDA setup (I also have to fix row-split so that it is supported on Vulkan, if possible).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do have a Vulkan SDK setup, FWIW, and I've already built llama.cpp with Vulkan, so I'll set up the Vulkan build and test it next.

{"",
"--no-auto-fit",
"disable auto-fit and use the explicit --backend / --params-backend flags",
false, &auto_fit},
{"",
"--no-multi-gpu",
"auto-fit: keep all components on a single GPU when they fit "
"(by default, multi-GPU placements are preferred to balance load)",
false, &auto_multi_gpu},
{"",
"--fit-dry-run",
"auto-fit: print the computed plan and exit without loading models",
true, &auto_fit_dry_run},
};

auto on_type_arg = [&](int argc, const char** argv, int index) {
Expand Down Expand Up @@ -616,6 +650,15 @@ ArgOptions SDContextParams::get_options() {
"but it usually offers faster inference speed and, in some cases, lower memory usage. "
"The at_runtime mode, on the other hand, is exactly the opposite.",
on_lora_apply_mode_arg},
{"",
"--list-devices",
"list available ggml backend devices (one per line, "
"name<TAB>description) and exit",
[](int /*argc*/, const char** /*argv*/, int /*index*/) {
sd_list_devices();
std::exit(0);
return 0;
}},
};

return options;
Expand Down Expand Up @@ -760,9 +803,12 @@ std::string SDContextParams::to_string() const {
<< " backend: \"" << backend << "\",\n"
<< " params_backend: \"" << params_backend << "\",\n"
<< " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
<< " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
<< " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n"
<< " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n"
<< " auto_fit: " << (auto_fit ? "true" : "false") << ",\n"
<< " auto_fit_target_mb: " << auto_fit_target_mb << ",\n"
<< " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n"
<< " fit_compute_reserve: \"" << fit_compute_reserve << "\",\n"
<< " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n"
<< " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n"
<< " flash_attn: " << (flash_attn ? "true" : "false") << ",\n"
<< " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n"
<< " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n"
Expand Down Expand Up @@ -837,6 +883,12 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) {
sd_ctx_params.stream_layers = stream_layers;
sd_ctx_params.backend = effective_backend.c_str();
sd_ctx_params.params_backend = effective_params_backend.c_str();
sd_ctx_params.auto_fit = auto_fit;
sd_ctx_params.auto_fit_target_mb = auto_fit_target_mb;
sd_ctx_params.auto_fit_dry_run = auto_fit_dry_run;
sd_ctx_params.auto_fit_compute_reserve = fit_compute_reserve.c_str();
sd_ctx_params.auto_multi_gpu = auto_multi_gpu;
sd_ctx_params.multi_gpu_mode = multi_gpu_mode.c_str();
sd_ctx_params.rpc_servers = rpc_servers.c_str();
return sd_ctx_params;
}
Expand Down
20 changes: 17 additions & 3 deletions examples/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,6 @@ struct SDContextParams {
std::string effective_backend;
std::string effective_params_backend;
bool enable_mmap = false;
bool control_net_cpu = false;
bool clip_on_cpu = false;
bool vae_on_cpu = false;
bool flash_attn = false;
bool diffusion_flash_attn = false;
bool diffusion_conv_direct = false;
Expand All @@ -170,6 +167,23 @@ struct SDContextParams {

bool qwen_image_zero_cond_t = false;

// Auto-fit defaults — placement is computed automatically based on free
// VRAM. Pass --no-auto-fit to disable and use explicit --backend specs.
bool auto_fit = true;
int auto_fit_target_mb = 512;
bool auto_fit_dry_run = false;
// Per-component compute-buffer reserve in MiB as a component map,
// e.g. "dit=2048,vae=1024,cond=512"; missing keys keep built-in defaults.
std::string fit_compute_reserve;
bool auto_multi_gpu = true;
std::string multi_gpu_mode = "row";

// Deprecated aliases for --backend <component>=cpu (kept for
// backwards compatibility with the pre-auto-fit CLI).
bool control_net_cpu = false;
bool clip_on_cpu = false;
bool vae_on_cpu = false;

prediction_t prediction = PREDICTION_COUNT;
lora_apply_mode_t lora_apply_mode = LORA_APPLY_AUTO;

Expand Down
34 changes: 34 additions & 0 deletions include/stable-diffusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,35 @@ typedef struct {
bool stream_layers; // Enable residency+prefetch streaming on top of --max-vram (no effect without --max-vram)
const char* backend;
const char* params_backend;

// Auto-fit: pick DiT/VAE/Conditioner devices based on free GPU memory.
// When `auto_fit` is true (default), `backend` / `params_backend` are
// ignored and the placement is computed automatically (the plan is fed
// into the same backend assignment that `backend` / `params_backend` use).
// `auto_fit_target_mb` is the memory to leave free per GPU (default 512).
// `auto_fit_dry_run` prints the plan and aborts init before loading.
// `auto_fit_compute_reserve` tunes the per-component compute-buffer
// reserve in MiB as a component map, e.g. "dit=2048,vae=1024,cond=512"
// (same component-key style as `backend`); missing keys / NULL keep the
// built-in defaults.
bool auto_fit;
int auto_fit_target_mb;
bool auto_fit_dry_run;
const char* auto_fit_compute_reserve;

// When more than one GPU device is present, prefer placing different
// components on different GPUs to balance load and fit larger total
// working sets. Set false to keep all components on a single GPU when
// they fit. Defaults to true. Each component still lives entirely on
// one device unless multi_gpu_mode splits it (see below).
bool auto_multi_gpu;

// How to split a single component (currently only the DiT) across GPUs
// when it doesn't fit on one but fits across several: "row" (matmul rows
// split via the backend's stock split buffer type, CUDA/SYCL),
// "layer" (whole blocks per GPU, routed by a scheduler, backend-generic),
// or "off" (never split a single component). NULL / empty => "row".
const char* multi_gpu_mode;
const char* rpc_servers;
} sd_ctx_params_t;

Expand Down Expand Up @@ -485,6 +514,11 @@ SD_API bool preprocess_canny(sd_image_t image,
SD_API const char* sd_commit(void);
SD_API const char* sd_version(void);

// List available ggml backend devices to stdout, in `name<TAB>description<NL>`
// per-line format. The output is intended to be parsed by tools and used as
// device names in the --backend / --params-backend assignment specs.
SD_API void sd_list_devices(void);

// for C API, caller needs to call free_sd_images to free the memory after use
// This helps avoid CRT problems on Windows when memory is allocated in the library but freed in the caller, which may use a different CRT.
SD_API void free_sd_images(sd_image_t* result_images, int num_images);
Expand Down
Loading
Loading