From 6e49bbddde8706312f12f14c0f68481337a2ebf7 Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 21:21:54 +0100 Subject: [PATCH 1/5] LoRA: adapter loading + example + README - Add safetensors reader and dit_ggml_load_lora (PEFT adapter_model.safetensors) - Apply LoRA at linear layers in DiT (self/cross-attn, MLP) when base weights are separate - CLI: --lora and --lora-scale in dit-vae - Example: examples/lora.sh + lora.json (duckdbot/acestep-lora-cryda) - README: LoRA section, example list, dit-vae options Made-with: Cursor --- CMakeLists.txt | 4 +- README.md | 7 ++ examples/lora.json | 11 +++ examples/lora.sh | 31 +++++++ src/dit-graph.h | 47 +++++++---- src/dit-lora.cpp | 198 +++++++++++++++++++++++++++++++++++++++++++++ src/dit.h | 20 +++++ src/safetensors.h | 107 ++++++++++++++++++++++++ tools/dit-vae.cpp | 23 +++++- 9 files changed, 429 insertions(+), 19 deletions(-) create mode 100644 examples/lora.json create mode 100755 examples/lora.sh create mode 100644 src/dit-lora.cpp create mode 100644 src/safetensors.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 551a072..503145a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,8 +53,8 @@ add_library(acestep-core STATIC link_ggml_backends(acestep-core) target_include_directories(acestep-core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) -add_executable(dit-vae tools/dit-vae.cpp) +# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) + LoRA support +add_executable(dit-vae tools/dit-vae.cpp src/dit-lora.cpp) target_link_libraries(dit-vae PRIVATE acestep-core) link_ggml_backends(dit-vae) diff --git a/README.md b/README.md index 178fe39..9ad2a2b 100644 --- a/README.md +++ b/README.md @@ -146,11 +146,14 @@ cd examples ./cover.sh # cover mode: decode precomputed audio_codes (no LLM) ./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength +./lora.sh # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/) ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). +**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. [duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly. + ## Generation modes The LLM fills what's missing in the JSON and generates audio codes. @@ -254,6 +257,10 @@ Required: --dit DiT GGUF file --vae VAE GGUF file +LoRA: + --lora LoRA adapter (adapter_model.safetensors) + --lora-scale LoRA scale, e.g. alpha/rank (default: 1.0) + Batch: --batch DiT variations per request (default: 1, max 9) diff --git a/examples/lora.json b/examples/lora.json new file mode 100644 index 0000000..8317521 --- /dev/null +++ b/examples/lora.json @@ -0,0 +1,11 @@ +{ + "task_type": "text2music", + "caption": "Emotional vocal track with soft synths", + "lyrics": "", + "duration": 10, + "inference_steps": 8, + "guidance_scale": 1, + "shift": 3, + "seed": 42, + "vocal_language": "en" +} diff --git a/examples/lora.sh b/examples/lora.sh new file mode 100755 index 0000000..9c25d33 --- /dev/null +++ b/examples/lora.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# LoRA example: generate with a PEFT LoRA adapter (e.g. duckdbot/acestep-lora-cryda). +# Requires adapter_model.safetensors in lora/ (download once; see below). +set -eu +cd "$(dirname "$0")" + +ADAPTER="lora/adapter_model.safetensors" +if [ ! -f "$ADAPTER" ]; then + echo "LoRA adapter not found at $ADAPTER" + echo "Download once (e.g. from Hugging Face):" + echo " mkdir -p lora" + echo " curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'" + echo "Or: pip install hf && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora" + exit 1 +fi + +# LLM: fill lyrics + codes +../build/ace-qwen3 \ + --request lora.json \ + --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf + +# DiT+VAE with LoRA (scale = alpha/rank; 1.0 is typical) +../build/dit-vae \ + --request lora0.json \ + --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \ + --dit ../models/acestep-v15-turbo-Q8_0.gguf \ + --vae ../models/vae-BF16.gguf \ + --lora "$ADAPTER" \ + --lora-scale 1.0 + +echo "Done. Check lora00.wav" diff --git a/src/dit-graph.h b/src/dit-graph.h index 2a92324..ab5839e 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -44,6 +44,23 @@ static struct ggml_tensor * dit_ggml_linear( return ggml_mul_mat(ctx, weight, input); } +// Linear with optional LoRA: out = W@x + scale * (B@(A@x)). lora_a/lora_b may be NULL. +static struct ggml_tensor * dit_ggml_linear_lora( + struct ggml_context * ctx, + struct ggml_tensor * weight, + struct ggml_tensor * lora_a, // [in, r] + struct ggml_tensor * lora_b, // [r, out] + float lora_scale, + struct ggml_tensor * input) { + struct ggml_tensor * out = ggml_mul_mat(ctx, weight, input); + if (lora_a && lora_b && lora_scale != 0.0f) { + struct ggml_tensor * ax = ggml_mul_mat(ctx, lora_a, input); + struct ggml_tensor * bax = ggml_mul_mat(ctx, lora_b, ax); + out = ggml_add(ctx, out, ggml_scale(ctx, bax, lora_scale)); + } + return out; +} + // Helper: Linear layer with bias static struct ggml_tensor * dit_ggml_linear_bias( struct ggml_context * ctx, @@ -164,6 +181,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * q, * k, * v; int q_dim = Nh * D; int kv_dim = Nkv * D; + float lora_scale = m->lora_scale; if (ly->sa_qkv) { struct ggml_tensor * qkv = dit_ggml_linear(ctx, ly->sa_qkv, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); @@ -173,11 +191,11 @@ static struct ggml_tensor * dit_ggml_build_self_attn( struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); - v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); + v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } else { - q = dit_ggml_linear(ctx, ly->sa_q_proj, norm_sa); - k = dit_ggml_linear(ctx, ly->sa_k_proj, norm_sa); - v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa); + q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); + k = dit_ggml_linear_lora(ctx, ly->sa_k_proj, ly->lora_sa_k_a, ly->lora_sa_k_b, lora_scale, norm_sa); + v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } // 2) Reshape to heads: [Nh*D, S, N] -> [D, Nh, S, N] @@ -239,7 +257,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn( } // 8) O projection: [Nh*D, S, N] -> [H, S, N] - struct ggml_tensor * out = dit_ggml_linear(ctx, ly->sa_o_proj, attn); + struct ggml_tensor * out = dit_ggml_linear_lora(ctx, ly->sa_o_proj, ly->lora_sa_o_a, ly->lora_sa_o_b, m->lora_scale, attn); return out; } @@ -253,20 +271,21 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { + float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); ff = ggml_swiglu(ctx, gu); } else { - // Separate: two matmuls + split swiglu - struct ggml_tensor * gate = dit_ggml_linear(ctx, ly->gate_proj, norm_ffn); - struct ggml_tensor * up = dit_ggml_linear(ctx, ly->up_proj, norm_ffn); + // Separate: two matmuls + split swiglu (with optional LoRA) + struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); + struct ggml_tensor * up = dit_ggml_linear_lora(ctx, ly->up_proj, ly->lora_up_a, ly->lora_up_b, lora_scale, norm_ffn); ff = ggml_swiglu_split(ctx, gate, up); } // Down projection: [I, S] -> [H, S] - return dit_ggml_linear(ctx, ly->down_proj, ff); + return dit_ggml_linear_lora(ctx, ly->down_proj, ly->lora_down_a, ly->lora_down_b, lora_scale, ff); } // Build cross-attention sub-graph for a single layer. @@ -305,14 +324,14 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); } else { - q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca); - k = dit_ggml_linear(ctx, ly->ca_k_proj, enc); - v = dit_ggml_linear(ctx, ly->ca_v_proj, enc); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); + k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); + v = dit_ggml_linear_lora(ctx, ly->ca_v_proj, ly->lora_ca_v_a, ly->lora_ca_v_b, m->lora_scale, enc); } // reshape to [D, heads, seq, N] then permute to [D, seq, heads, N] @@ -342,7 +361,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( attn = ggml_reshape_3d(ctx, attn, Nh * D, S, N); // O projection - return dit_ggml_linear(ctx, ly->ca_o_proj, attn); + return dit_ggml_linear_lora(ctx, ly->ca_o_proj, ly->lora_ca_o_a, ly->lora_ca_o_b, m->lora_scale, attn); } // Build one full DiT layer (AdaLN + self-attn + cross-attn + FFN + gated residuals) diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp new file mode 100644 index 0000000..881d941 --- /dev/null +++ b/src/dit-lora.cpp @@ -0,0 +1,198 @@ +// dit-lora.cpp: Load LoRA adapters from safetensors into DiT (ACE-Step). +// Compatible with PEFT adapter_model.safetensors (lora_A / lora_B per target layer). + +#include "dit.h" +#include "safetensors.h" +#include +#include +#include +#include + +// Normalize adapter key to base name: decoder.layers.N. +// e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj" +static std::string lora_key_to_base(const std::string & key) { + std::string s = key; + const char * prefixes[] = { "base_model.model.model.", "base_model.model." }; + for (const char * p : prefixes) { + size_t pl = strlen(p); + if (s.size() >= pl && s.compare(0, pl, p) == 0) { + s = s.substr(pl); + break; + } + } + if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) + s = s.substr(0, s.size() - 7); + else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) + s = s.substr(0, s.size() - 7); + return s; +} + +static bool is_lora_a(const std::string & key) { + return key.find("lora_A") != std::string::npos; +} + +// Slot index for layer: 0=sa_q, 1=sa_k, 2=sa_v, 3=sa_o, 4=ca_q, 5=ca_k, 6=ca_v, 7=ca_o, 8=gate, 9=up, 10=down +enum LoraSlot { + SA_Q, SA_K, SA_V, SA_O, CA_Q, CA_K, CA_V, CA_O, GATE, UP, DOWN, N_SLOTS +}; + +static bool parse_base_name(const std::string & base, int * layer_idx, LoraSlot * slot) { + int L = -1; + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = SA_Q; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = SA_K; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = SA_V; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.self_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = SA_O; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.q_proj", &L) == 1) { *layer_idx = L; *slot = CA_Q; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.k_proj", &L) == 1) { *layer_idx = L; *slot = CA_K; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.v_proj", &L) == 1) { *layer_idx = L; *slot = CA_V; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.cross_attn.o_proj", &L) == 1) { *layer_idx = L; *slot = CA_O; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.gate_proj", &L) == 1) { *layer_idx = L; *slot = GATE; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.up_proj", &L) == 1) { *layer_idx = L; *slot = UP; return true; } + if (sscanf(base.c_str(), "decoder.layers.%d.mlp.down_proj", &L) == 1) { *layer_idx = L; *slot = DOWN; return true; } + return false; +} + +static struct ggml_tensor ** slot_to_ptr(DiTGGMLLayer * ly, LoraSlot slot, bool is_b) { + if (is_b) { + switch (slot) { + case SA_Q: return &ly->lora_sa_q_b; case SA_K: return &ly->lora_sa_k_b; case SA_V: return &ly->lora_sa_v_b; case SA_O: return &ly->lora_sa_o_b; + case CA_Q: return &ly->lora_ca_q_b; case CA_K: return &ly->lora_ca_k_b; case CA_V: return &ly->lora_ca_v_b; case CA_O: return &ly->lora_ca_o_b; + case GATE: return &ly->lora_gate_b; case UP: return &ly->lora_up_b; case DOWN: return &ly->lora_down_b; + default: return nullptr; + } + } else { + switch (slot) { + case SA_Q: return &ly->lora_sa_q_a; case SA_K: return &ly->lora_sa_k_a; case SA_V: return &ly->lora_sa_v_a; case SA_O: return &ly->lora_sa_o_a; + case CA_Q: return &ly->lora_ca_q_a; case CA_K: return &ly->lora_ca_k_a; case CA_V: return &ly->lora_ca_v_a; case CA_O: return &ly->lora_ca_o_a; + case GATE: return &ly->lora_gate_a; case UP: return &ly->lora_up_a; case DOWN: return &ly->lora_down_a; + default: return nullptr; + } + } +} + +bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { + FILE * fp = fopen(lora_path, "rb"); + if (!fp) { + fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); + return false; + } + uint8_t h8[8]; + if (fread(h8, 1, 8, fp) != 8) { + fclose(fp); + return false; + } + uint64_t header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) + | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); + uint64_t data_section_start = 8 + header_len; + + std::unordered_map tensors; + int n = safetensors_parse_lora(fp, &tensors); + if (n == 0) { + fclose(fp); + fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); + return false; + } + + // Count pairs we will load: for each lora_A key, find the matching lora_B (same base name) + std::unordered_map> pairs; // base -> (key_a, key_b) + std::unordered_map base_to_b; + for (const auto & kv : tensors) { + std::string base = lora_key_to_base(kv.first); + if (base.empty()) continue; + if (is_lora_a(kv.first)) + base_to_b[base] = ""; // mark base as having A; we'll find B next + } + for (const auto & kv : tensors) { + std::string base = lora_key_to_base(kv.first); + if (base.empty()) continue; + if (base_to_b.count(base) && kv.first.find("lora_B") != std::string::npos) + base_to_b[base] = kv.first; + } + for (const auto & kv : tensors) { + if (!is_lora_a(kv.first)) continue; + std::string base = lora_key_to_base(kv.first); + auto it = base_to_b.find(base); + if (it != base_to_b.end() && !it->second.empty()) + pairs[base] = { kv.first, it->second }; + } + + int n_pairs = (int)pairs.size(); + wctx_init(&m->lora_wctx, n_pairs * 2); // A and B per pair + + fseek(fp, (long)data_section_start, SEEK_SET); + + for (const auto & p : pairs) { + const std::string & base = p.first; + const std::string & key_a = p.second.first; + const std::string & key_b = p.second.second; + int layer_idx = 0; + LoraSlot slot = N_SLOTS; + if (!parse_base_name(base, &layer_idx, &slot) || layer_idx < 0 || layer_idx >= m->cfg.n_layers) continue; + + DiTGGMLLayer * ly = &m->layers[layer_idx]; + SafeTensorInfo & info_a = tensors[key_a]; + SafeTensorInfo & info_b = tensors[key_b]; + if (info_a.n_dims != 2 || info_b.n_dims != 2) continue; + // A_pt [r, in], B_pt [out, r]. We need A_ggml [r, in] for mul_mat(A,x)=[r,S], B_ggml [out, r] for mul_mat(B, Ax)=[out,S]. + // ggml layout: ne[0]=cols, ne[1]=rows. So A: [r, in] -> ne[0]=in, ne[1]=r. B: [out, r] -> ne[0]=r, ne[1]=out. + int64_t r = info_a.shape[0], in_dim = info_a.shape[1]; + int64_t out_dim = info_b.shape[0]; + if (info_b.shape[1] != r) continue; + + struct ggml_tensor * ta = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)in_dim, (int64_t)r); + struct ggml_tensor * tb = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, (int64_t)r, (int64_t)out_dim); + ggml_set_name(ta, key_a.c_str()); + ggml_set_name(tb, key_b.c_str()); + + // Copy A: file is row-major [r, in], we need ggml col-major [in, r] (transpose) + size_t na = (size_t)(r * in_dim); + m->lora_wctx.staging.emplace_back(na); + float * buf_a = m->lora_wctx.staging.back().data(); + if (!safetensors_read_tensor_data(fp, data_section_start, info_a.data_start, info_a.data_end, buf_a)) { + fclose(fp); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_wctx.staging.emplace_back(na); + float * transposed_a = m->lora_wctx.staging.back().data(); + for (int64_t i = 0; i < r; i++) + for (int64_t j = 0; j < in_dim; j++) + transposed_a[(size_t)(j * r + i)] = buf_a[(size_t)(i * in_dim + j)]; + m->lora_wctx.pending.push_back({ ta, transposed_a, na * sizeof(float), 0 }); + + size_t nb = (size_t)(out_dim * r); + m->lora_wctx.staging.emplace_back(nb); + float * buf_b = m->lora_wctx.staging.back().data(); + if (!safetensors_read_tensor_data(fp, data_section_start, info_b.data_start, info_b.data_end, buf_b)) { + fclose(fp); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_wctx.staging.emplace_back(nb); + float * transposed_b = m->lora_wctx.staging.back().data(); + for (int64_t i = 0; i < out_dim; i++) + for (int64_t j = 0; j < r; j++) + transposed_b[(size_t)(j * out_dim + i)] = buf_b[(size_t)(i * r + j)]; + m->lora_wctx.pending.push_back({ tb, transposed_b, nb * sizeof(float), 0 }); + + struct ggml_tensor ** pa = slot_to_ptr(ly, slot, false); + struct ggml_tensor ** pb = slot_to_ptr(ly, slot, true); + if (pa) *pa = ta; + if (pb) *pb = tb; + } + fclose(fp); + fp = nullptr; + + if (!wctx_alloc(&m->lora_wctx, m->backend)) { + fprintf(stderr, "[LoRA] failed to allocate LoRA tensors on backend\n"); + wctx_free(&m->lora_wctx); + return false; + } + m->lora_scale = scale; + fprintf(stderr, "[LoRA] loaded %d adapter pairs from %s (scale=%.4f)\n", n_pairs, lora_path, scale); + return true; +} diff --git a/src/dit.h b/src/dit.h index 524dd76..9c842b0 100644 --- a/src/dit.h +++ b/src/dit.h @@ -81,6 +81,19 @@ struct DiTGGMLLayer { // AdaLN scale-shift table: [6*hidden] (6 rows of [hidden]) struct ggml_tensor * scale_shift_table; // [hidden, 6] in ggml layout + // Optional LoRA adapters (F32, applied when base projection is separate) + struct ggml_tensor * lora_sa_q_a, * lora_sa_q_b; + struct ggml_tensor * lora_sa_k_a, * lora_sa_k_b; + struct ggml_tensor * lora_sa_v_a, * lora_sa_v_b; + struct ggml_tensor * lora_sa_o_a, * lora_sa_o_b; + struct ggml_tensor * lora_ca_q_a, * lora_ca_q_b; + struct ggml_tensor * lora_ca_k_a, * lora_ca_k_b; + struct ggml_tensor * lora_ca_v_a, * lora_ca_v_b; + struct ggml_tensor * lora_ca_o_a, * lora_ca_o_b; + struct ggml_tensor * lora_gate_a, * lora_gate_b; + struct ggml_tensor * lora_up_a, * lora_up_b; + struct ggml_tensor * lora_down_a, * lora_down_b; + int layer_type; // 0=sliding, 1=full }; @@ -122,6 +135,8 @@ struct DiTGGML { // Weight storage WeightCtx wctx; + WeightCtx lora_wctx; // optional LoRA adapter tensors (when lora_scale > 0) + float lora_scale; // alpha/rank for LoRA (0 = no LoRA) // Pre-allocated constant for AdaLN (1+scale) fusion struct ggml_tensor * scalar_one; // [1] = 1.0f, broadcast in ggml_add @@ -389,10 +404,15 @@ static void dit_ggml_init_backend(DiTGGML * m) { m->use_flash_attn = (bp.backend != bp.cpu_backend); } +// Load LoRA adapter from safetensors (e.g. adapter_model.safetensors). +// scale = alpha/rank (typical 1.0). Call after dit_ggml_load. Returns false on error. +bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale); + static void dit_ggml_free(DiTGGML * m) { if (m->sched) ggml_backend_sched_free(m->sched); if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend); if (m->cpu_backend) ggml_backend_free(m->cpu_backend); wctx_free(&m->wctx); + if (m->lora_wctx.ctx) wctx_free(&m->lora_wctx); *m = {}; } diff --git a/src/safetensors.h b/src/safetensors.h new file mode 100644 index 0000000..74d5967 --- /dev/null +++ b/src/safetensors.h @@ -0,0 +1,107 @@ +#pragma once +// safetensors.h: minimal reader for LoRA adapter_model.safetensors +// +// Format: 8-byte header length (LE uint64), then JSON header, then raw tensor data. +// We only parse keys that look like "*lora_A*" / "*lora_B*" and extract shape + data_offsets. + +#include +#include +#include +#include +#include +#include +#include + +struct SafeTensorInfo { + std::string dtype; // "F32", "F16", "BF16" + int64_t shape[2]; // [dim0, dim1] from JSON + int n_dims; + uint64_t data_start; // byte offset in file (after header) + uint64_t data_end; +}; + +// Open file, read header, parse tensor metadata for LoRA tensors. +// Returns number of LoRA tensors found; fills *out with tensor name -> info. +// Caller must fclose(fp) and free the map; file position is left at start of data section. +static int safetensors_parse_lora(FILE * fp, std::unordered_map * out) { + out->clear(); + uint64_t header_len = 0; + uint8_t h8[8]; + if (fread(h8, 1, 8, fp) != 8) return 0; + header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) + | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); + if (header_len == 0 || header_len > 10 * 1024 * 1024) return 0; // cap 10MB header + std::vector buf(header_len + 1); + if (fread(buf.data(), 1, header_len, fp) != header_len) return 0; + buf[header_len] = '\0'; + const char * json = buf.data(); + + // Find each key that contains "lora_A" or "lora_B" + const char * p = json; + int count = 0; + while ((p = strstr(p, "\"")) != nullptr) { + const char * key_start = p + 1; + p = strchr(key_start, '"'); + if (!p) break; + std::string key(key_start, (size_t)(p - key_start)); + p++; + if (key.find("lora_A") == std::string::npos && key.find("lora_B") == std::string::npos) { + continue; + } + // Find the value object for this key: skip ": + while (*p && (*p == ' ' || *p == ':')) p++; + if (*p != '{') continue; + const char * obj = p; + SafeTensorInfo info = {}; + info.shape[0] = info.shape[1] = 1; + info.n_dims = 0; + // "shape":[n,m] or [n] + const char * sh = strstr(obj, "\"shape\""); + if (sh) { + const char * br = strchr(sh, '['); + if (br) { + long long a = 0, b = 0; + int n = sscanf(br, "[%lld,%lld]", &a, &b); + if (n >= 1) { info.shape[0] = (int64_t)a; info.n_dims = 1; } + if (n >= 2) { info.shape[1] = (int64_t)b; info.n_dims = 2; } + } + } + const char * dt = strstr(obj, "\"dtype\""); + if (dt) { + const char * q = strchr(dt, '"'); + if (q) q = strchr(q + 1, '"'); + if (q) { + const char * start = q + 1; + const char * end = strchr(start, '"'); + if (end) info.dtype = std::string(start, end - start); + } + } + const char * off = strstr(obj, "\"data_offsets\""); + if (off) { + const char * br = strchr(off, '['); + if (br) { + uint64_t s = 0, e = 0; + if (sscanf(br, "[%llu,%llu]", (unsigned long long*)&s, (unsigned long long*)&e) == 2) { + info.data_start = s; + info.data_end = e; + } + } + } + if (info.dtype.empty() || info.n_dims == 0) continue; + (*out)[key] = info; + count++; + } + return count; +} + +// Read raw tensor data from file. File must be positioned at start of data section +// (i.e. after the 8-byte header length + header bytes). +// data_offset in the JSON is relative to the start of the data section. +static bool safetensors_read_tensor_data(FILE * fp, uint64_t data_section_start, + uint64_t tensor_start, uint64_t tensor_end, void * out_buf) { + uint64_t off = data_section_start + tensor_start; + uint64_t nbytes = tensor_end - tensor_start; + if (fseek(fp, (long)off, SEEK_SET) != 0) return false; + if (fread(out_buf, 1, nbytes, fp) != nbytes) return false; + return true; +} diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index cac80a5..d889da1 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -67,6 +67,9 @@ static void print_usage(const char * prog) { " --text-encoder Text encoder GGUF file\n" " --dit DiT GGUF file\n" " --vae VAE GGUF file\n\n" + "LoRA:\n" + " --lora LoRA adapter (adapter_model.safetensors)\n" + " --lora-scale LoRA scale, e.g. alpha/rank (default: 1.0)\n\n" "Batch:\n" " --batch DiT variations per request (default: 1, max 9)\n\n" "Output naming: input.json -> input0.wav, input1.wav, ... (last digit = batch index)\n\n" @@ -99,9 +102,11 @@ int main(int argc, char ** argv) { const char * dit_gguf = NULL; const char * vae_gguf = NULL; const char * dump_dir = NULL; - int batch_n = 1; - int vae_chunk = 256; - int vae_overlap = 64; + const char * lora_path = NULL; + float lora_scale = 1.0f; + int batch_n = 1; + int vae_chunk = 256; + int vae_overlap = 64; for (int i = 1; i < argc; i++) { if (strcmp(argv[i], "--request") == 0) { @@ -116,6 +121,8 @@ int main(int argc, char ** argv) { else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]); else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]); + else if (strcmp(argv[i], "--lora") == 0 && i+1 < argc) lora_path = argv[++i]; + else if (strcmp(argv[i], "--lora-scale") == 0 && i+1 < argc) lora_scale = (float)atof(argv[++i]); else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { print_usage(argv[0]); return 0; } else { @@ -161,6 +168,16 @@ int main(int argc, char ** argv) { } fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", timer.ms()); + if (lora_path) { + timer.reset(); + if (!dit_ggml_load_lora(&model, lora_path, lora_scale)) { + fprintf(stderr, "FATAL: failed to load LoRA from %s\n", lora_path); + dit_ggml_free(&model); + return 1; + } + fprintf(stderr, "[Load] LoRA: %.1f ms\n", timer.ms()); + } + // Read DiT GGUF metadata + silence_latent tensor (once) bool is_turbo = false; std::vector silence_full; // [15000, 64] f32 From 907a068a24dba3208671edd45775a7cd1334d9ad Mon Sep 17 00:00:00 2001 From: qxip Date: Sat, 28 Feb 2026 22:57:57 +0100 Subject: [PATCH 2/5] LoRA: apply on fused layers, add custom_tag/genre, update example - dit-graph.h: apply LoRA deltas when base uses fused QKV/gate_up/ca_qkv so self-attn, MLP, and cross-attn all use adapters (fixes no audible effect) - dit-lora.cpp: fix safetensors parse (rewind fp before parse); normalize keys for base_model.model.layers.* and .lora_A.weight/.lora_B.weight - request: add custom_tag (LoRA trigger) and genre; parse language, is_instrumental, formatted_lyrics - dit-vae: append custom_tag to caption for condition encoder when set - examples/lora.json: nu-disco example with custom_tag crydamoure Made-with: Cursor --- examples/lora.json | 16 +++++++++++----- src/dit-graph.h | 48 ++++++++++++++++++++++++++++++++++++++++++++-- src/dit-lora.cpp | 20 +++++++++++-------- src/request.cpp | 13 +++++++++++++ src/request.h | 6 +++++- tools/dit-vae.cpp | 7 +++++-- 6 files changed, 92 insertions(+), 18 deletions(-) diff --git a/examples/lora.json b/examples/lora.json index 8317521..c872efb 100644 --- a/examples/lora.json +++ b/examples/lora.json @@ -1,11 +1,17 @@ { "task_type": "text2music", - "caption": "Emotional vocal track with soft synths", - "lyrics": "", - "duration": 10, + "caption": "An energetic nu-disco track built on a foundation of a tight, funky slap bassline and a crisp, four-on-the-floor drum machine beat. The song opens with a distinctive, filtered wah-wah guitar riff that serves as a recurring motif. The arrangement is layered with shimmering synth pads, punchy synth stabs, and subtle arpeggiated synth textures that add movement. The track progresses through dynamic sections, including a brief atmospheric breakdown before rebuilding the main groove.", + "genre": "Nu-disco", + "lyrics": "[Instrumental]", + "bpm": 115, + "keyscale": "C# major", + "timesignature": "4", + "duration": 256, + "language": "unknown", + "instrumental": true, + "custom_tag": "crydamoure", "inference_steps": 8, "guidance_scale": 1, "shift": 3, - "seed": 42, - "vocal_language": "en" + "seed": -1 } diff --git a/src/dit-graph.h b/src/dit-graph.h index ab5839e..1241bc5 100644 --- a/src/dit-graph.h +++ b/src/dit-graph.h @@ -187,10 +187,25 @@ static struct ggml_tensor * dit_ggml_build_self_attn( q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)q_dim * qkv->nb[0])); v = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)(q_dim + kv_dim) * qkv->nb[0])); + // LoRA on fused path: add scale * (B @ (A @ x)) per projection when adapters are loaded + if (lora_scale != 0.0f) { + if (ly->lora_sa_q_a && ly->lora_sa_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); + if (ly->lora_sa_k_a && ly->lora_sa_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); + if (ly->lora_sa_v_a && ly->lora_sa_v_b) + v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_v_b, ggml_mul_mat(ctx, ly->lora_sa_v_a, norm_sa)), lora_scale)); + } } else if (ly->sa_qk) { struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa); q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0)); k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0])); + if (lora_scale != 0.0f) { + if (ly->lora_sa_q_a && ly->lora_sa_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale)); + if (ly->lora_sa_k_a && ly->lora_sa_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale)); + } v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa); } else { q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa); @@ -271,12 +286,25 @@ static struct ggml_tensor * dit_ggml_build_mlp( struct ggml_tensor * norm_ffn, int S) { + DiTGGMLConfig & c = m->cfg; + int I = c.intermediate_size; + int N = (int)norm_ffn->ne[2]; float lora_scale = m->lora_scale; struct ggml_tensor * ff; if (ly->gate_up) { // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0] struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn); - ff = ggml_swiglu(ctx, gu); + if (lora_scale != 0.0f && ((ly->lora_gate_a && ly->lora_gate_b) || (ly->lora_up_a && ly->lora_up_b))) { + struct ggml_tensor * gate = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], 0)); + struct ggml_tensor * up = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], (size_t)I * gu->nb[0])); + if (ly->lora_gate_a && ly->lora_gate_b) + gate = ggml_add(ctx, gate, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_gate_b, ggml_mul_mat(ctx, ly->lora_gate_a, norm_ffn)), lora_scale)); + if (ly->lora_up_a && ly->lora_up_b) + up = ggml_add(ctx, up, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_up_b, ggml_mul_mat(ctx, ly->lora_up_a, norm_ffn)), lora_scale)); + ff = ggml_swiglu_split(ctx, gate, up); + } else { + ff = ggml_swiglu(ctx, gu); + } } else { // Separate: two matmuls + split swiglu (with optional LoRA) struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn); @@ -311,6 +339,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( // Q from hidden, KV from encoder (full fused, Q+KV partial, separate) int q_dim = Nh * D; int kv_dim = Nkv * D; + float lora_scale = m->lora_scale; struct ggml_tensor * q, * k, * v; if (ly->ca_qkv) { // Full QKV fused: split Q from hidden, KV from enc via weight views @@ -322,12 +351,27 @@ static struct ggml_tensor * dit_ggml_build_cross_attn( struct ggml_tensor * kv = ggml_mul_mat(ctx, w_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); + // LoRA on fused path: add scale * (B @ (A @ x)) for Q (from norm_ca), K/V (from enc) + if (lora_scale != 0.0f) { + if (ly->lora_ca_q_a && ly->lora_ca_q_b) + q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_q_b, ggml_mul_mat(ctx, ly->lora_ca_q_a, norm_ca)), lora_scale)); + if (ly->lora_ca_k_a && ly->lora_ca_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); + if (ly->lora_ca_v_a && ly->lora_ca_v_b) + v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); + } } else if (ly->ca_kv) { // Q separate, K+V fused - q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); + q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, lora_scale, norm_ca); struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc); k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0)); v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0])); + if (lora_scale != 0.0f) { + if (ly->lora_ca_k_a && ly->lora_ca_k_b) + k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale)); + if (ly->lora_ca_v_a && ly->lora_ca_v_b) + v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale)); + } } else { q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca); k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc); diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp index 881d941..b14e090 100644 --- a/src/dit-lora.cpp +++ b/src/dit-lora.cpp @@ -9,7 +9,7 @@ #include // Normalize adapter key to base name: decoder.layers.N. -// e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj" +// Handles: base_model.model.model., base_model.model.; decoder.layers. or layers.; .lora_A.default/.lora_B.default or .lora_A.weight/.lora_B.weight static std::string lora_key_to_base(const std::string & key) { std::string s = key; const char * prefixes[] = { "base_model.model.model.", "base_model.model." }; @@ -20,14 +20,22 @@ static std::string lora_key_to_base(const std::string & key) { break; } } + // PEFT-style suffix if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0) s = s.substr(0, s.size() - 14); else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0) s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.weight") == 0) + s = s.substr(0, s.size() - 14); + else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.weight") == 0) + s = s.substr(0, s.size() - 14); else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0) s = s.substr(0, s.size() - 7); else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0) s = s.substr(0, s.size() - 7); + // HuggingFace adapter: layers.N -> decoder.layers.N for our DiT naming + if (s.size() >= 7 && s.compare(0, 7, "layers.") == 0) + s = "decoder." + s; return s; } @@ -80,17 +88,13 @@ bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) { fprintf(stderr, "[LoRA] cannot open %s\n", lora_path); return false; } - uint8_t h8[8]; - if (fread(h8, 1, 8, fp) != 8) { + std::unordered_map tensors; + if (fseek(fp, 0, SEEK_SET) != 0) { fclose(fp); return false; } - uint64_t header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24) - | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56); - uint64_t data_section_start = 8 + header_len; - - std::unordered_map tensors; int n = safetensors_parse_lora(fp, &tensors); + uint64_t data_section_start = (uint64_t)ftell(fp); if (n == 0) { fclose(fp); fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path); diff --git a/src/request.cpp b/src/request.cpp index a24d838..f85873a 100644 --- a/src/request.cpp +++ b/src/request.cpp @@ -16,6 +16,8 @@ void request_init(AceRequest * r) { r->caption = ""; r->lyrics = ""; r->instrumental = false; + r->custom_tag = ""; + r->genre = ""; r->bpm = 0; r->duration = -1.0f; r->keyscale = ""; @@ -227,7 +229,11 @@ bool request_parse(AceRequest * r, const char * path) { if (k == "task_type") r->task_type = v; else if (k == "caption") r->caption = v; else if (k == "lyrics") r->lyrics = v; + else if (k == "custom_tag") r->custom_tag = v; + else if (k == "genre") r->genre = v; else if (k == "keyscale") r->keyscale = v; + else if (k == "formatted_lyrics") r->lyrics = v; // alias for lyrics + else if (k == "language") r->vocal_language = v; // alias for vocal_language else if (k == "timesignature") r->timesignature = v; else if (k == "vocal_language") r->vocal_language = v; else if (k == "reference_audio") r->reference_audio = v; @@ -254,6 +260,7 @@ bool request_parse(AceRequest * r, const char * path) { // bools else if (k == "instrumental") r->instrumental = (v == "true"); + else if (k == "is_instrumental") r->instrumental = (v == "true"); // unknown keys: silently ignored (forward compat) } @@ -274,6 +281,10 @@ bool request_write(const AceRequest * r, const char * path) { fprintf(f, " \"lyrics\": \"%s\",\n", json_escape(r->lyrics).c_str()); if (r->instrumental) fprintf(f, " \"instrumental\": true,\n"); + if (!r->custom_tag.empty()) + fprintf(f, " \"custom_tag\": \"%s\",\n", json_escape(r->custom_tag).c_str()); + if (!r->genre.empty()) + fprintf(f, " \"genre\": \"%s\",\n", json_escape(r->genre).c_str()); fprintf(f, " \"bpm\": %d,\n", r->bpm); fprintf(f, " \"duration\": %.1f,\n", r->duration); fprintf(f, " \"keyscale\": \"%s\",\n", json_escape(r->keyscale).c_str()); @@ -310,6 +321,8 @@ void request_dump(const AceRequest * r, FILE * f) { fprintf(f, " caption: %.60s%s\n", r->caption.c_str(), r->caption.size() > 60 ? "..." : ""); fprintf(f, " lyrics: %zu bytes\n", r->lyrics.size()); + if (!r->custom_tag.empty()) + fprintf(f, " custom_tag: %s\n", r->custom_tag.c_str()); fprintf(f, " bpm=%d dur=%.0f key=%s ts=%s lang=%s\n", r->bpm, r->duration, r->keyscale.c_str(), r->timesignature.c_str(), r->vocal_language.c_str()); diff --git a/src/request.h b/src/request.h index e9222a0..ef4b41f 100644 --- a/src/request.h +++ b/src/request.h @@ -19,7 +19,11 @@ struct AceRequest { // text content std::string caption; // "" std::string lyrics; // "" - bool instrumental; // false + bool instrumental; // false + + // LoRA / style trigger (appended to caption for condition encoder when set) + std::string custom_tag; // "" e.g. "crydamoure" + std::string genre; // "" e.g. "Nu-disco" // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp index d889da1..fd5fe47 100644 --- a/tools/dit-vae.cpp +++ b/tools/dit-vae.cpp @@ -243,8 +243,11 @@ int main(int argc, char ** argv) { continue; } - // Extract params - const char * caption = req.caption.c_str(); + // Extract params (append custom_tag to caption for LoRA/condition so trigger is in text) + std::string caption_for_cond = req.caption; + if (!req.custom_tag.empty()) + caption_for_cond += ", " + req.custom_tag; + const char * caption = caption_for_cond.c_str(); const char * lyrics = req.lyrics.c_str(); char bpm_str[16] = "N/A"; if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm); From acd84020a3f43a0ebfd0dbec323dca7f2781cafa Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:23:08 +0100 Subject: [PATCH 3/5] Fix formatting and clarify LoRA adapter instructions --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9ad2a2b..d809146 100644 --- a/README.md +++ b/README.md @@ -146,13 +146,13 @@ cd examples ./cover.sh # cover mode: decode precomputed audio_codes (no LLM) ./cover-reference.sh # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3) ./test-reference.sh # reference_audio (WAV or MP3) + audio_cover_strength -./lora.sh # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/) +./lora.sh # DiT + LoRA adapter ``` Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0) alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights). -**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. [duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly. +**LoRA adapters**: use `--lora ` and optional `--lora-scale ` with dit-vae to run the DiT with PEFT-style Ace-Step LoRAs. ## Generation modes @@ -214,7 +214,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music Key fields: `seed` -1 means random (resolved once, then +1 per batch element). `audio_codes` is generated by ace-qwen3 and consumed by dit-vae (comma separated FSQ token IDs). When present, the LLM is -skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md). +skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style. `src_audio`: not yet implemented (see docs/MODES.md). Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG). SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`. From 9b087b2ee191ecfce4c97bbb09cdae3cb93a7aa7 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:24:06 +0100 Subject: [PATCH 4/5] Remove download instructions for LoRA adapter Removed instructions for downloading LoRA adapter from Hugging Face. --- examples/lora.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/lora.sh b/examples/lora.sh index 9c25d33..db7ce2b 100755 --- a/examples/lora.sh +++ b/examples/lora.sh @@ -7,10 +7,6 @@ cd "$(dirname "$0")" ADAPTER="lora/adapter_model.safetensors" if [ ! -f "$ADAPTER" ]; then echo "LoRA adapter not found at $ADAPTER" - echo "Download once (e.g. from Hugging Face):" - echo " mkdir -p lora" - echo " curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'" - echo "Or: pip install hf && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora" exit 1 fi From fc2408ae7cda0e0fdd5b5fb59318d4db88545f69 Mon Sep 17 00:00:00 2001 From: Lorenzo Mangani Date: Sat, 28 Feb 2026 23:24:56 +0100 Subject: [PATCH 5/5] Update comments for custom_tag and genre fields --- src/request.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/request.h b/src/request.h index ef4b41f..ba85821 100644 --- a/src/request.h +++ b/src/request.h @@ -22,8 +22,8 @@ struct AceRequest { bool instrumental; // false // LoRA / style trigger (appended to caption for condition encoder when set) - std::string custom_tag; // "" e.g. "crydamoure" - std::string genre; // "" e.g. "Nu-disco" + std::string custom_tag; // "" LoRA trigger word + std::string genre; // "" LoRA genre // metadata (user-provided or LLM-enriched) int bpm; // 0 = unset