-
Notifications
You must be signed in to change notification settings - Fork 665
Add tensor splitting (row + tensor), lazy loading and autofitting logic #1470
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -431,6 +431,18 @@ ArgOptions SDContextParams::get_options() { | |
| "--rpc-servers", | ||
| "comma-separated list of RPC servers to connect to for offloading, in the format host:port, e.g. localhost:50052,192.168.1.3:50052", | ||
| &rpc_servers}, | ||
| {"", | ||
| "--multi-gpu-mode", | ||
| "how to split a too-large DiT across GPUs (auto-fit): " | ||
| "row (matmul rows, CUDA/SYCL), layer (whole blocks, generic), or off " | ||
| "(default: row)", | ||
| &multi_gpu_mode}, | ||
| {"", | ||
| "--fit-compute-reserve", | ||
| "auto-fit: per-component compute-buffer reserve in MiB as a component " | ||
| "map, e.g. dit=2048,vae=1024,cond=512 (missing keys keep the built-in " | ||
| "defaults)", | ||
| &fit_compute_reserve}, | ||
| {"", | ||
| "--max-vram", | ||
| "maximum VRAM budget in GiB for graph-cut segmented execution. Accepts a single value or assignments by backend/device, e.g. 6 or cuda0=6,vulkan0=4. 0 disables graph splitting; a negative value auto-detects free VRAM, sparing the specified value", | ||
|
|
@@ -447,6 +459,10 @@ ArgOptions SDContextParams::get_options() { | |
| "--chroma-t5-mask-pad", | ||
| "t5 mask pad size of chroma", | ||
| &chroma_t5_mask_pad}, | ||
| {"", | ||
| "--fit-target", | ||
| "auto-fit: MiB of free memory to leave on each GPU (default: 512)", | ||
| &auto_fit_target_mb}, | ||
| }; | ||
|
|
||
| options.bool_options = { | ||
|
|
@@ -518,6 +534,24 @@ ArgOptions SDContextParams::get_options() { | |
| "--chroma-enable-t5-mask", | ||
| "enable t5 mask for chroma", | ||
| true, &chroma_use_t5_mask}, | ||
| {"", | ||
| "--auto-fit", | ||
| "automatically pick DiT/VAE/Conditioner device placements based on " | ||
| "free GPU memory (default ON)", | ||
| true, &auto_fit}, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would this include Vulkan backends with a main dGPU and an iGPU? If so, enabling auto-fit by default may not be a good idea.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. To be honest, I'd still have to check how well this works on Vulkan (I don't think I have a setup with an iGPU to test, though), but the idea is that the algorithm is supposed to take those types of quirks into account. For now I have only tested on my CUDA setup (I also have to fix row-split so that it is supported on Vulkan, if possible).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I do have a Vulkan SDK setup, FWIW, and I've already built llama.cpp with Vulkan, so I'll set up the Vulkan build and test next. |
||
| {"", | ||
| "--no-auto-fit", | ||
| "disable auto-fit and use the explicit --backend / --params-backend flags", | ||
| false, &auto_fit}, | ||
| {"", | ||
| "--no-multi-gpu", | ||
| "auto-fit: keep all components on a single GPU when they fit " | ||
| "(by default, multi-GPU placements are preferred to balance load)", | ||
| false, &auto_multi_gpu}, | ||
| {"", | ||
| "--fit-dry-run", | ||
| "auto-fit: print the computed plan and exit without loading models", | ||
| true, &auto_fit_dry_run}, | ||
| }; | ||
|
|
||
| auto on_type_arg = [&](int argc, const char** argv, int index) { | ||
|
|
@@ -616,6 +650,15 @@ ArgOptions SDContextParams::get_options() { | |
| "but it usually offers faster inference speed and, in some cases, lower memory usage. " | ||
| "The at_runtime mode, on the other hand, is exactly the opposite.", | ||
| on_lora_apply_mode_arg}, | ||
| {"", | ||
| "--list-devices", | ||
| "list available ggml backend devices (one per line, " | ||
| "name<TAB>description) and exit", | ||
| [](int /*argc*/, const char** /*argv*/, int /*index*/) { | ||
| sd_list_devices(); | ||
| std::exit(0); | ||
| return 0; | ||
| }}, | ||
| }; | ||
|
|
||
| return options; | ||
|
|
@@ -760,9 +803,12 @@ std::string SDContextParams::to_string() const { | |
| << " backend: \"" << backend << "\",\n" | ||
| << " params_backend: \"" << params_backend << "\",\n" | ||
| << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" | ||
| << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" | ||
| << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" | ||
| << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" | ||
| << " auto_fit: " << (auto_fit ? "true" : "false") << ",\n" | ||
| << " auto_fit_target_mb: " << auto_fit_target_mb << ",\n" | ||
| << " auto_fit_dry_run: " << (auto_fit_dry_run ? "true" : "false") << ",\n" | ||
| << " fit_compute_reserve: \"" << fit_compute_reserve << "\",\n" | ||
| << " auto_multi_gpu: " << (auto_multi_gpu ? "true" : "false") << ",\n" | ||
| << " multi_gpu_mode: \"" << multi_gpu_mode << "\",\n" | ||
| << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" | ||
| << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" | ||
| << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" | ||
|
|
@@ -837,6 +883,12 @@ sd_ctx_params_t SDContextParams::to_sd_ctx_params_t(bool taesd_preview) { | |
| sd_ctx_params.stream_layers = stream_layers; | ||
| sd_ctx_params.backend = effective_backend.c_str(); | ||
| sd_ctx_params.params_backend = effective_params_backend.c_str(); | ||
| sd_ctx_params.auto_fit = auto_fit; | ||
| sd_ctx_params.auto_fit_target_mb = auto_fit_target_mb; | ||
| sd_ctx_params.auto_fit_dry_run = auto_fit_dry_run; | ||
| sd_ctx_params.auto_fit_compute_reserve = fit_compute_reserve.c_str(); | ||
| sd_ctx_params.auto_multi_gpu = auto_multi_gpu; | ||
| sd_ctx_params.multi_gpu_mode = multi_gpu_mode.c_str(); | ||
| sd_ctx_params.rpc_servers = rpc_servers.c_str(); | ||
| return sd_ctx_params; | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Similar concern here: this introduces multiple --fit-* flags per component (dit/vae/cond).
Could we consider a generic form instead, for example:
--fit-compute-reserve dit=...,vae=...,cond=...
and parse it into a component -> value map?
This would keep the CLI/API more scalable and consistent, and avoid growing a large number of per-component flags as new modules are added.