From 6e49bbddde8706312f12f14c0f68481337a2ebf7 Mon Sep 17 00:00:00 2001
From: qxip <qxip@mini-ams.local>
Date: Sat, 28 Feb 2026 21:21:54 +0100
Subject: [PATCH 1/5] LoRA: adapter loading + example + README

- Add safetensors reader and dit_ggml_load_lora (PEFT adapter_model.safetensors)
- Apply LoRA at linear layers in DiT (self/cross-attn, MLP) when base weights are separate
- CLI: --lora <path> and --lora-scale <float> in dit-vae
- Example: examples/lora.sh + lora.json (duckdbot/acestep-lora-cryda)
- README: LoRA section, example list, dit-vae options

Made-with: Cursor
---
 CMakeLists.txt     |   4 +-
 README.md          |   7 ++
 examples/lora.json |  11 +++
 examples/lora.sh   |  31 +++++++
 src/dit-graph.h    |  47 +++++++----
 src/dit-lora.cpp   | 198 +++++++++++++++++++++++++++++++++++++++++++++
 src/dit.h          |  20 +++++
 src/safetensors.h  | 107 ++++++++++++++++++++++++
 tools/dit-vae.cpp  |  23 +++++-
 9 files changed, 429 insertions(+), 19 deletions(-)
 create mode 100644 examples/lora.json
 create mode 100755 examples/lora.sh
 create mode 100644 src/dit-lora.cpp
 create mode 100644 src/safetensors.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 551a072..503145a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,8 +53,8 @@ add_library(acestep-core STATIC
 link_ggml_backends(acestep-core)
 target_include_directories(acestep-core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 
-# dit-vae: full pipeline (text-enc + cond + dit + vae + wav)
-add_executable(dit-vae tools/dit-vae.cpp)
+# dit-vae: full pipeline (text-enc + cond + dit + vae + wav) + LoRA support
+add_executable(dit-vae tools/dit-vae.cpp src/dit-lora.cpp)
 target_link_libraries(dit-vae PRIVATE acestep-core)
 link_ggml_backends(dit-vae)
 
diff --git a/README.md b/README.md
index 178fe39..9ad2a2b 100644
--- a/README.md
+++ b/README.md
@@ -146,11 +146,14 @@ cd examples
 ./cover.sh            # cover mode: decode precomputed audio_codes (no LLM)
 ./cover-reference.sh  # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3)
 ./test-reference.sh   # reference_audio (WAV or MP3) + audio_cover_strength
+./lora.sh             # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/)
 ```
 
 Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0)
 alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights).
 
+**LoRA adapters**: use `--lora <path>` and optional `--lora-scale <float>` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. [duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly.
+
 ## Generation modes
 
 The LLM fills what's missing in the JSON and generates audio codes.
@@ -254,6 +257,10 @@ Required:
   --dit <gguf>            DiT GGUF file
   --vae <gguf>            VAE GGUF file
 
+LoRA:
+  --lora <path>           LoRA adapter (adapter_model.safetensors)
+  --lora-scale <float>    LoRA scale, e.g. alpha/rank (default: 1.0)
+
 Batch:
   --batch <N>             DiT variations per request (default: 1, max 9)
 
diff --git a/examples/lora.json b/examples/lora.json
new file mode 100644
index 0000000..8317521
--- /dev/null
+++ b/examples/lora.json
@@ -0,0 +1,11 @@
+{
+  "task_type": "text2music",
+  "caption": "Emotional vocal track with soft synths",
+  "lyrics": "",
+  "duration": 10,
+  "inference_steps": 8,
+  "guidance_scale": 1,
+  "shift": 3,
+  "seed": 42,
+  "vocal_language": "en"
+}
diff --git a/examples/lora.sh b/examples/lora.sh
new file mode 100755
index 0000000..9c25d33
--- /dev/null
+++ b/examples/lora.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# LoRA example: generate with a PEFT LoRA adapter (e.g. duckdbot/acestep-lora-cryda).
+# Requires adapter_model.safetensors in lora/ (download once; see below).
+set -eu
+cd "$(dirname "$0")"
+
+ADAPTER="lora/adapter_model.safetensors"
+if [ ! -f "$ADAPTER" ]; then
+    echo "LoRA adapter not found at $ADAPTER"
+    echo "Download once (e.g. from Hugging Face):"
+    echo "  mkdir -p lora"
+    echo "  curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'"
+    echo "Or: pip install -U huggingface_hub && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora"
+    exit 1
+fi
+
+# LLM: fill lyrics + codes
+../build/ace-qwen3 \
+    --request lora.json \
+    --model ../models/acestep-5Hz-lm-4B-Q8_0.gguf
+
+# DiT+VAE with LoRA (scale = alpha/rank; 1.0 is typical)
+../build/dit-vae \
+    --request lora0.json \
+    --text-encoder ../models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+    --dit ../models/acestep-v15-turbo-Q8_0.gguf \
+    --vae ../models/vae-BF16.gguf \
+    --lora "$ADAPTER" \
+    --lora-scale 1.0
+
+echo "Done. Check lora00.wav"
diff --git a/src/dit-graph.h b/src/dit-graph.h
index 2a92324..ab5839e 100644
--- a/src/dit-graph.h
+++ b/src/dit-graph.h
@@ -44,6 +44,23 @@ static struct ggml_tensor * dit_ggml_linear(
     return ggml_mul_mat(ctx, weight, input);
 }
 
+// Linear layer with an optional low-rank (LoRA) update: out = W@x + lora_scale * (B@(A@x)).
+// The update is skipped when lora_a or lora_b is NULL, or when lora_scale is exactly 0.
+static struct ggml_tensor * dit_ggml_linear_lora(
+        struct ggml_context * ctx,
+        struct ggml_tensor * weight,
+        struct ggml_tensor * lora_a,   // [in, r]
+        struct ggml_tensor * lora_b,   // [r, out]
+        float lora_scale,
+        struct ggml_tensor * input) {
+    struct ggml_tensor * base = ggml_mul_mat(ctx, weight, input);
+    if (!lora_a || !lora_b || lora_scale == 0.0f) return base;
+
+    struct ggml_tensor * down  = ggml_mul_mat(ctx, lora_a, input);   // A@x
+    struct ggml_tensor * delta = ggml_scale(ctx, ggml_mul_mat(ctx, lora_b, down), lora_scale);
+    return ggml_add(ctx, base, delta);
+}
+
 // Helper: Linear layer with bias
 static struct ggml_tensor * dit_ggml_linear_bias(
         struct ggml_context * ctx,
@@ -164,6 +181,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn(
     struct ggml_tensor * q, * k, * v;
     int q_dim  = Nh * D;
     int kv_dim = Nkv * D;
+    float lora_scale = m->lora_scale;
     if (ly->sa_qkv) {
         struct ggml_tensor * qkv = dit_ggml_linear(ctx, ly->sa_qkv, norm_sa);
         q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0));
@@ -173,11 +191,11 @@ static struct ggml_tensor * dit_ggml_build_self_attn(
         struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa);
         q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0));
         k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0]));
-        v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa);
+        v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa);
     } else {
-        q = dit_ggml_linear(ctx, ly->sa_q_proj, norm_sa);
-        k = dit_ggml_linear(ctx, ly->sa_k_proj, norm_sa);
-        v = dit_ggml_linear(ctx, ly->sa_v_proj, norm_sa);
+        q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa);
+        k = dit_ggml_linear_lora(ctx, ly->sa_k_proj, ly->lora_sa_k_a, ly->lora_sa_k_b, lora_scale, norm_sa);
+        v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa);
     }
 
     // 2) Reshape to heads: [Nh*D, S, N] -> [D, Nh, S, N]
@@ -239,7 +257,7 @@ static struct ggml_tensor * dit_ggml_build_self_attn(
     }
 
     // 8) O projection: [Nh*D, S, N] -> [H, S, N]
-    struct ggml_tensor * out = dit_ggml_linear(ctx, ly->sa_o_proj, attn);
+    struct ggml_tensor * out = dit_ggml_linear_lora(ctx, ly->sa_o_proj, ly->lora_sa_o_a, ly->lora_sa_o_b, m->lora_scale, attn);
     return out;
 }
 
@@ -253,20 +271,21 @@ static struct ggml_tensor * dit_ggml_build_mlp(
         struct ggml_tensor * norm_ffn,
         int S) {
 
+    float lora_scale = m->lora_scale;
     struct ggml_tensor * ff;
     if (ly->gate_up) {
         // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0]
         struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn);
         ff = ggml_swiglu(ctx, gu);
     } else {
-        // Separate: two matmuls + split swiglu
-        struct ggml_tensor * gate = dit_ggml_linear(ctx, ly->gate_proj, norm_ffn);
-        struct ggml_tensor * up   = dit_ggml_linear(ctx, ly->up_proj, norm_ffn);
+        // Separate: two matmuls + split swiglu (with optional LoRA)
+        struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn);
+        struct ggml_tensor * up   = dit_ggml_linear_lora(ctx, ly->up_proj, ly->lora_up_a, ly->lora_up_b, lora_scale, norm_ffn);
         ff = ggml_swiglu_split(ctx, gate, up);
     }
 
     // Down projection: [I, S] -> [H, S]
-    return dit_ggml_linear(ctx, ly->down_proj, ff);
+    return dit_ggml_linear_lora(ctx, ly->down_proj, ly->lora_down_a, ly->lora_down_b, lora_scale, ff);
 }
 
 // Build cross-attention sub-graph for a single layer.
@@ -305,14 +324,14 @@ static struct ggml_tensor * dit_ggml_build_cross_attn(
         v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0]));
     } else if (ly->ca_kv) {
         // Q separate, K+V fused
-        q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca);
+        q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca);
         struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc);
         k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0));
         v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0]));
     } else {
-        q = dit_ggml_linear(ctx, ly->ca_q_proj, norm_ca);
-        k = dit_ggml_linear(ctx, ly->ca_k_proj, enc);
-        v = dit_ggml_linear(ctx, ly->ca_v_proj, enc);
+        q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca);
+        k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc);
+        v = dit_ggml_linear_lora(ctx, ly->ca_v_proj, ly->lora_ca_v_a, ly->lora_ca_v_b, m->lora_scale, enc);
     }
 
     // reshape to [D, heads, seq, N] then permute to [D, seq, heads, N]
@@ -342,7 +361,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn(
     attn = ggml_reshape_3d(ctx, attn, Nh * D, S, N);
 
     // O projection
-    return dit_ggml_linear(ctx, ly->ca_o_proj, attn);
+    return dit_ggml_linear_lora(ctx, ly->ca_o_proj, ly->lora_ca_o_a, ly->lora_ca_o_b, m->lora_scale, attn);
 }
 
 // Build one full DiT layer (AdaLN + self-attn + cross-attn + FFN + gated residuals)
diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp
new file mode 100644
index 0000000..881d941
--- /dev/null
+++ b/src/dit-lora.cpp
@@ -0,0 +1,198 @@
+// dit-lora.cpp: Load LoRA adapters from safetensors into DiT (ACE-Step).
+// Compatible with PEFT adapter_model.safetensors (lora_A / lora_B per target layer).
+
+#include "dit.h"
+#include "safetensors.h"
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <unordered_map>
+
+// Normalize an adapter tensor name to its base module name (decoder.layers.N.<proj>) by removing one
+// known PEFT prefix and one LoRA suffix, e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj"
+static std::string lora_key_to_base(const std::string & key) {
+    std::string s = key;
+    const char * prefixes[] = { "base_model.model.model.", "base_model.model." };
+    for (const char * p : prefixes) {
+        size_t pl = strlen(p);
+        if (s.size() >= pl && s.compare(0, pl, p) == 0) {
+            s = s.substr(pl);
+            break;
+        }
+    }
+    // Suffix lengths come from strlen: ".lora_A.default" is 15 chars, so a literal 14 never matches.
+    // Longest forms first; the ".weight" forms are what PEFT writes to adapter_model.safetensors.
+    const char * suffixes[] = { ".lora_A.default.weight", ".lora_B.default.weight", ".lora_A.weight",
+                                ".lora_B.weight", ".lora_A.default", ".lora_B.default", ".lora_A", ".lora_B" };
+    for (const char * suf : suffixes) {
+        size_t sl = strlen(suf);
+        if (s.size() > sl && s.compare(s.size() - sl, sl, suf) == 0) { s.resize(s.size() - sl); break; }
+    }
+    return s;
+}
+
+static bool is_lora_a(const std::string & key) {  // true when the tensor name mentions "lora_A"
+    return std::string::npos != key.find("lora_A");
+}
+
+// Slot index for layer: 0=sa_q, 1=sa_k, 2=sa_v, 3=sa_o, 4=ca_q, 5=ca_k, 6=ca_v, 7=ca_o, 8=gate, 9=up, 10=down; N_SLOTS (11) is the count and serves as "no slot"
+enum LoraSlot {
+    SA_Q, SA_K, SA_V, SA_O, CA_Q, CA_K, CA_V, CA_O, GATE, UP, DOWN, N_SLOTS
+};
+
+static bool parse_base_name(const std::string & base, int * layer_idx, LoraSlot * slot) {
+    // Split "decoder.layers.<N>.<proj>" into layer index + slot; false for anything else. sscanf returns 1
+    // once %d converts even when the literal after it differs, so the suffix is compared explicitly.
+    static const char * const names[N_SLOTS] = {  // indexed by LoraSlot
+        ".self_attn.q_proj", ".self_attn.k_proj", ".self_attn.v_proj", ".self_attn.o_proj",
+        ".cross_attn.q_proj", ".cross_attn.k_proj", ".cross_attn.v_proj", ".cross_attn.o_proj",
+        ".mlp.gate_proj", ".mlp.up_proj", ".mlp.down_proj" };
+    int L = -1, used = 0;  // used = characters consumed through the layer number (set by %n)
+    if (sscanf(base.c_str(), "decoder.layers.%d%n", &L, &used) != 1) return false;
+    for (int s = 0; s < N_SLOTS; s++) {
+        if (base.compare((size_t)used, std::string::npos, names[s]) != 0) continue;
+        *layer_idx = L; *slot = (LoraSlot)s; return true;
+    }
+    return false;
+}
+
+static struct ggml_tensor ** slot_to_ptr(DiTGGMLLayer * ly, LoraSlot slot, bool is_b) {
+    // Returns the address of the layer field holding the A matrix (is_b == false) or the
+    // B matrix (is_b == true) for `slot`, or nullptr when slot is not one of the eleven
+    // projection slots. Implemented as a pointer-to-member lookup table.
+    typedef struct ggml_tensor * DiTGGMLLayer::* Field;
+    static const Field fields[N_SLOTS][2] = {   // rows follow the LoraSlot order, columns are {A, B}
+        { &DiTGGMLLayer::lora_sa_q_a, &DiTGGMLLayer::lora_sa_q_b }, { &DiTGGMLLayer::lora_sa_k_a, &DiTGGMLLayer::lora_sa_k_b },
+        { &DiTGGMLLayer::lora_sa_v_a, &DiTGGMLLayer::lora_sa_v_b }, { &DiTGGMLLayer::lora_sa_o_a, &DiTGGMLLayer::lora_sa_o_b },
+        { &DiTGGMLLayer::lora_ca_q_a, &DiTGGMLLayer::lora_ca_q_b }, { &DiTGGMLLayer::lora_ca_k_a, &DiTGGMLLayer::lora_ca_k_b },
+        { &DiTGGMLLayer::lora_ca_v_a, &DiTGGMLLayer::lora_ca_v_b }, { &DiTGGMLLayer::lora_ca_o_a, &DiTGGMLLayer::lora_ca_o_b },
+        { &DiTGGMLLayer::lora_gate_a, &DiTGGMLLayer::lora_gate_b }, { &DiTGGMLLayer::lora_up_a, &DiTGGMLLayer::lora_up_b },
+        { &DiTGGMLLayer::lora_down_a, &DiTGGMLLayer::lora_down_b },
+    };
+    const int idx = (int)slot;
+    if (idx < 0 || idx >= (int)N_SLOTS) return nullptr;
+    return &(ly->*fields[idx][is_b ? 1 : 0]);
+}
+
+// Read one adapter tensor of `count` elements into dst as F32.
+// The element width is derived from the header byte range: 4 bytes per element is copied as-is,
+// 2 bytes per element is widened from F16/BF16 according to info.dtype. Any other size fails,
+// so inconsistent offsets can never overrun dst.
+static bool lora_read_f32(FILE * fp, uint64_t data_start, const SafeTensorInfo & info,
+        size_t count, float * dst) {
+    if (info.data_end < info.data_start) return false;
+    const uint64_t nbytes = info.data_end - info.data_start;
+    if (nbytes == (uint64_t)count * sizeof(float))
+        return safetensors_read_tensor_data(fp, data_start, info.data_start, info.data_end, dst);
+    const bool is_f16  = info.dtype == "F16";
+    const bool is_bf16 = info.dtype == "BF16";
+    if (nbytes != (uint64_t)count * sizeof(uint16_t) || (!is_f16 && !is_bf16)) return false;
+    std::vector<uint16_t> raw(count);
+    if (!safetensors_read_tensor_data(fp, data_start, info.data_start, info.data_end, raw.data()))
+        return false;
+    if (is_f16) ggml_fp16_to_fp32_row((const ggml_fp16_t *)raw.data(), dst, (int64_t)count);
+    else        ggml_bf16_to_fp32_row((const ggml_bf16_t *)raw.data(), dst, (int64_t)count);
+    return true;
+}
+
+// Load a PEFT LoRA adapter (safetensors) and attach its A/B matrices to the DiT layers.
+// Every complete lora_A/lora_B pair that maps to a known projection of an existing layer is
+// staged as F32 in m->lora_wctx and uploaded by wctx_alloc. Layer pointers and m->lora_scale are
+// written only after the upload succeeded. Returns false (after logging) on any error.
+bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) {
+    FILE * fp = fopen(lora_path, "rb");
+    if (!fp) {
+        fprintf(stderr, "[LoRA] cannot open %s\n", lora_path);
+        return false;
+    }
+    // safetensors_parse_lora reads the 8-byte length prefix and the JSON header itself, so it has
+    // to start at offset 0; the data section begins exactly where it stops reading.
+    std::unordered_map<std::string, SafeTensorInfo> tensors;
+    int n = safetensors_parse_lora(fp, &tensors);
+    long data_pos = ftell(fp);
+    if (n == 0 || data_pos < 8) {
+        fclose(fp);
+        fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path);
+        return false;
+    }
+    const uint64_t data_section_start = (uint64_t)data_pos;
+
+    // Group tensor names by base name: pairs[base] = (lora_A key, lora_B key).
+    std::unordered_map<std::string, std::pair<std::string, std::string>> pairs;
+    for (const auto & kv : tensors) {
+        std::string base = lora_key_to_base(kv.first);
+        if (base.empty()) continue;
+        if (is_lora_a(kv.first))
+            pairs[base].first = kv.first;
+        else if (kv.first.find("lora_B") != std::string::npos)
+            pairs[base].second = kv.first;
+    }
+
+    wctx_init(&m->lora_wctx, (int)pairs.size() * 2);  // upper bound: A and B per pair
+
+    // (layer slot, tensor) bindings. They are applied after wctx_alloc so that a failed load
+    // never leaves layer pointers into a freed context.
+    std::vector<std::pair<struct ggml_tensor **, struct ggml_tensor *>> binds;
+    bool ok = true;
+    for (const auto & p : pairs) {
+        const std::string & key_a = p.second.first;
+        const std::string & key_b = p.second.second;
+        if (key_a.empty() || key_b.empty()) continue;  // A without B (or B without A)
+        int layer_idx = 0;
+        LoraSlot slot = N_SLOTS;
+        if (!parse_base_name(p.first, &layer_idx, &slot) || layer_idx < 0 || layer_idx >= m->cfg.n_layers) continue;
+
+        DiTGGMLLayer * ly = &m->layers[layer_idx];
+        const SafeTensorInfo & info_a = tensors[key_a];
+        const SafeTensorInfo & info_b = tensors[key_b];
+        if (info_a.n_dims != 2 || info_b.n_dims != 2) continue;
+        // File layout (row-major): A is [r, in], B is [out, r]. ggml stores ne[0] contiguously, so
+        // the same bytes are a ggml tensor with ne = [in, r] (resp. [r, out]): mul_mat(A, x) gives
+        // [r, S] and mul_mat(B, Ax) gives [out, S] without any transpose of the data.
+        int64_t r = info_a.shape[0], in_dim = info_a.shape[1];
+        int64_t out_dim = info_b.shape[0];
+        if (r <= 0 || in_dim <= 0 || out_dim <= 0 || info_b.shape[1] != r) continue;
+
+        struct ggml_tensor * ta = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, in_dim, r);
+        struct ggml_tensor * tb = ggml_new_tensor_2d(m->lora_wctx.ctx, GGML_TYPE_F32, r, out_dim);
+        ggml_set_name(ta, key_a.c_str());
+        ggml_set_name(tb, key_b.c_str());
+
+        // Stage A, then B: read (and widen) into a staging buffer and queue it for upload.
+        const SafeTensorInfo * infos[2] = { &info_a, &info_b };
+        struct ggml_tensor * tens[2] = { ta, tb };
+        const size_t counts[2] = { (size_t)(r * in_dim), (size_t)(out_dim * r) };
+        for (int t = 0; t < 2; t++) {
+            m->lora_wctx.staging.emplace_back(counts[t]);
+            float * dst = m->lora_wctx.staging.back().data();
+            if (!lora_read_f32(fp, data_section_start, *infos[t], counts[t], dst)) {
+                fprintf(stderr, "[LoRA] cannot read %s (unsupported dtype, bad offsets or short file)\n", (t == 0 ? key_a : key_b).c_str());
+                ok = false;
+                break;
+            }
+            m->lora_wctx.pending.push_back({ tens[t], dst, counts[t] * sizeof(float), 0 });
+        }
+        if (!ok) break;
+
+        binds.push_back({ slot_to_ptr(ly, slot, false), ta });
+        binds.push_back({ slot_to_ptr(ly, slot, true), tb });
+    }
+    fclose(fp);
+
+    if (ok && binds.empty()) {
+        fprintf(stderr, "[LoRA] no usable adapter pairs in %s\n", lora_path);
+        ok = false;
+    }
+    if (ok && !wctx_alloc(&m->lora_wctx, m->backend)) {
+        fprintf(stderr, "[LoRA] failed to allocate LoRA tensors on backend\n");
+        ok = false;
+    }
+    if (!ok) {
+        wctx_free(&m->lora_wctx);
+        return false;
+    }
+    for (const auto & b : binds) if (b.first) *b.first = b.second;
+    m->lora_scale = scale;
+    fprintf(stderr, "[LoRA] loaded %d adapter pairs from %s (scale=%.4f)\n", (int)(binds.size() / 2), lora_path, scale);
+    return true;
+}
diff --git a/src/dit.h b/src/dit.h
index 524dd76..9c842b0 100644
--- a/src/dit.h
+++ b/src/dit.h
@@ -81,6 +81,19 @@ struct DiTGGMLLayer {
     // AdaLN scale-shift table: [6*hidden] (6 rows of [hidden])
     struct ggml_tensor * scale_shift_table; // [hidden, 6] in ggml layout
 
+    // Optional LoRA adapters (F32, applied when base projection is separate)
+    struct ggml_tensor * lora_sa_q_a, * lora_sa_q_b;
+    struct ggml_tensor * lora_sa_k_a, * lora_sa_k_b;
+    struct ggml_tensor * lora_sa_v_a, * lora_sa_v_b;
+    struct ggml_tensor * lora_sa_o_a, * lora_sa_o_b;
+    struct ggml_tensor * lora_ca_q_a, * lora_ca_q_b;
+    struct ggml_tensor * lora_ca_k_a, * lora_ca_k_b;
+    struct ggml_tensor * lora_ca_v_a, * lora_ca_v_b;
+    struct ggml_tensor * lora_ca_o_a, * lora_ca_o_b;
+    struct ggml_tensor * lora_gate_a, * lora_gate_b;
+    struct ggml_tensor * lora_up_a, * lora_up_b;
+    struct ggml_tensor * lora_down_a, * lora_down_b;
+
     int layer_type;  // 0=sliding, 1=full
 };
 
@@ -122,6 +135,8 @@ struct DiTGGML {
 
     // Weight storage
     WeightCtx wctx;
+    WeightCtx lora_wctx;     // optional LoRA adapter tensors (when lora_scale > 0)
+    float lora_scale;        // alpha/rank for LoRA (0 = no LoRA)
 
     // Pre-allocated constant for AdaLN (1+scale) fusion
     struct ggml_tensor * scalar_one;  // [1] = 1.0f, broadcast in ggml_add
@@ -389,10 +404,15 @@ static void dit_ggml_init_backend(DiTGGML * m) {
     m->use_flash_attn = (bp.backend != bp.cpu_backend);
 }
 
+// Load LoRA adapter from safetensors (e.g. adapter_model.safetensors).
+// scale = alpha/rank (typical 1.0). Call after dit_ggml_load. Returns false on error.
+bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale);
+
 static void dit_ggml_free(DiTGGML * m) {
     if (m->sched) ggml_backend_sched_free(m->sched);
     if (m->backend && m->backend != m->cpu_backend) ggml_backend_free(m->backend);
     if (m->cpu_backend) ggml_backend_free(m->cpu_backend);
     wctx_free(&m->wctx);
+    if (m->lora_wctx.ctx) wctx_free(&m->lora_wctx);
     *m = {};
 }
diff --git a/src/safetensors.h b/src/safetensors.h
new file mode 100644
index 0000000..74d5967
--- /dev/null
+++ b/src/safetensors.h
@@ -0,0 +1,107 @@
+#pragma once
+// safetensors.h: minimal reader for LoRA adapter_model.safetensors
+//
+// Format: 8-byte header length (LE uint64), then JSON header, then raw tensor data.
+// We only parse keys that look like "*lora_A*" / "*lora_B*" and extract shape + data_offsets.
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+struct SafeTensorInfo {
+    std::string dtype;       // dtype string from the JSON header, e.g. "F32", "F16", "BF16"
+    int64_t shape[2];        // [dim0, dim1] from JSON
+    int n_dims;              // number of shape entries parsed from JSON (0 = none)
+    uint64_t data_start;     // first byte of the tensor, relative to the start of the data section
+    uint64_t data_end;       // end offset, same base as data_start (byte size = data_end - data_start)
+};
+
+// Parse the safetensors JSON header and collect metadata for tensors whose name contains
+// "lora_A" or "lora_B". fp must be at offset 0; on success it is left at the start of the data
+// section. Returns the number of entries stored in *out (name -> info); 0 on a bad header.
+static int safetensors_parse_lora(FILE * fp, std::unordered_map<std::string, SafeTensorInfo> * out) {
+    out->clear();
+    uint64_t header_len = 0;
+    uint8_t h8[8];
+    if (fread(h8, 1, 8, fp) != 8) return 0;
+    header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24)
+        | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56);
+    if (header_len == 0 || header_len > 10 * 1024 * 1024) return 0;  // cap 10MB header
+    std::vector<char> buf(header_len + 1);
+    if (fread(buf.data(), 1, header_len, fp) != header_len) return 0;
+    buf[header_len] = '\0';
+    const char * json = buf.data();
+
+    // Find each key that contains "lora_A" or "lora_B"
+    const char * p = json;
+    int count = 0;
+    while ((p = strstr(p, "\"")) != nullptr) {
+        const char * key_start = p + 1;
+        p = strchr(key_start, '"');
+        if (!p) break;
+        std::string key(key_start, (size_t)(p - key_start));
+        p++;
+        if (key.find("lora_A") == std::string::npos && key.find("lora_B") == std::string::npos) {
+            continue;
+        }
+        // Find the value object for this key: skip ":
+        while (*p && (*p == ' ' || *p == ':')) p++;
+        if (*p != '{') continue;
+        // Tensor entries are flat objects, so the first '}' closes this one. Copying just that span
+        // keeps a missing field from being satisfied by a later tensor's entry.
+        const char * obj_end = strchr(p, '}');
+        if (!obj_end) break;
+        const std::string obj(p, (size_t)(obj_end - p) + 1);
+        p = obj_end + 1;
+        SafeTensorInfo info = {};
+        info.shape[0] = info.shape[1] = 1;
+        info.n_dims = 0;
+        // "shape":[n,m] or [n]; more dims are reported as n_dims = 3 so 2-D-only callers reject them.
+        size_t sh = obj.find("\"shape\"");
+        size_t br = (sh == std::string::npos) ? sh : obj.find('[', sh);
+        if (br != std::string::npos) {
+            long long a = 0, b = 0, c = 0;
+            int n = sscanf(obj.c_str() + br, "[ %lld , %lld , %lld", &a, &b, &c);
+            if (n >= 1) { info.shape[0] = (int64_t)a; info.n_dims = 1; }
+            if (n >= 2) { info.shape[1] = (int64_t)b; info.n_dims = 2; }
+            if (n >= 3) info.n_dims = 3;
+        }
+        // The dtype value is the quoted string after the "dtype" key (7 chars including its quotes).
+        size_t dt = obj.find("\"dtype\"");
+        size_t q0 = (dt == std::string::npos) ? dt : obj.find('"', dt + 7);
+        size_t q1 = (q0 == std::string::npos) ? q0 : obj.find('"', q0 + 1);
+        if (q1 != std::string::npos) info.dtype = obj.substr(q0 + 1, q1 - q0 - 1);
+        // data_offsets are [begin, end] byte offsets; entries without a sane pair are dropped.
+        bool have_offsets = false;
+        size_t off = obj.find("\"data_offsets\"");
+        br = (off == std::string::npos) ? off : obj.find('[', off);
+        if (br != std::string::npos) {
+            unsigned long long s = 0, e = 0;
+            if (sscanf(obj.c_str() + br, "[ %llu , %llu", &s, &e) == 2 && s <= e) {
+                info.data_start = s;
+                info.data_end = e;
+                have_offsets = true;
+            }
+        }
+        if (info.dtype.empty() || info.n_dims == 0 || !have_offsets) continue;
+        (*out)[key] = info;
+        count++;
+    }
+    return count;
+}
+
+// Read the raw bytes of one tensor into out_buf, which must hold tensor_end - tensor_start bytes.
+// tensor_start/tensor_end are the JSON data_offsets (relative to the data section); the function
+// seeks to the absolute position itself, so the current file position does not matter.
+static bool safetensors_read_tensor_data(FILE * fp, uint64_t data_section_start,
+        uint64_t tensor_start, uint64_t tensor_end, void * out_buf) {
+    if (tensor_end < tensor_start) return false;  // inverted offsets would wrap nbytes
+    const uint64_t nbytes = tensor_end - tensor_start, off = data_section_start + tensor_start;
+    if (off < tensor_start || (long)off < 0 || (uint64_t)(long)off != off) return false;  // must fit fseek's long
+    if (fseek(fp, (long)off, SEEK_SET) != 0) return false;
+    return fread(out_buf, 1, (size_t)nbytes, fp) == nbytes;
+}
diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp
index cac80a5..d889da1 100644
--- a/tools/dit-vae.cpp
+++ b/tools/dit-vae.cpp
@@ -67,6 +67,9 @@ static void print_usage(const char * prog) {
         "  --text-encoder <gguf>   Text encoder GGUF file\n"
         "  --dit <gguf>            DiT GGUF file\n"
         "  --vae <gguf>            VAE GGUF file\n\n"
+        "LoRA:\n"
+        "  --lora <path>           LoRA adapter (adapter_model.safetensors)\n"
+        "  --lora-scale <float>    LoRA scale, e.g. alpha/rank (default: 1.0)\n\n"
         "Batch:\n"
         "  --batch <N>             DiT variations per request (default: 1, max 9)\n\n"
         "Output naming: input.json -> input0.wav, input1.wav, ... (last digit = batch index)\n\n"
@@ -99,9 +102,11 @@ int main(int argc, char ** argv) {
     const char * dit_gguf      = NULL;
     const char * vae_gguf       = NULL;
     const char * dump_dir      = NULL;
-    int batch_n                = 1;
-    int vae_chunk              = 256;
-    int vae_overlap            = 64;
+    const char * lora_path     = NULL;
+    float lora_scale            = 1.0f;
+    int batch_n                 = 1;
+    int vae_chunk               = 256;
+    int vae_overlap             = 64;
 
     for (int i = 1; i < argc; i++) {
         if (strcmp(argv[i], "--request") == 0) {
@@ -116,6 +121,8 @@ int main(int argc, char ** argv) {
         else if (strcmp(argv[i], "--batch") == 0 && i+1 < argc) batch_n = atoi(argv[++i]);
         else if (strcmp(argv[i], "--vae-chunk") == 0 && i+1 < argc) vae_chunk = atoi(argv[++i]);
         else if (strcmp(argv[i], "--vae-overlap") == 0 && i+1 < argc) vae_overlap = atoi(argv[++i]);
+        else if (strcmp(argv[i], "--lora") == 0 && i+1 < argc) lora_path = argv[++i];
+        else if (strcmp(argv[i], "--lora-scale") == 0 && i+1 < argc) lora_scale = (float)atof(argv[++i]);
         else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
             print_usage(argv[0]); return 0;
         } else {
@@ -161,6 +168,16 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "[Load] DiT weight load: %.1f ms\n", timer.ms());
 
+    if (lora_path) {
+        timer.reset();
+        if (!dit_ggml_load_lora(&model, lora_path, lora_scale)) {
+            fprintf(stderr, "FATAL: failed to load LoRA from %s\n", lora_path);
+            dit_ggml_free(&model);
+            return 1;
+        }
+        fprintf(stderr, "[Load] LoRA: %.1f ms\n", timer.ms());
+    }
+
     // Read DiT GGUF metadata + silence_latent tensor (once)
     bool is_turbo = false;
     std::vector<float> silence_full;  // [15000, 64] f32

From 907a068a24dba3208671edd45775a7cd1334d9ad Mon Sep 17 00:00:00 2001
From: qxip <qxip@mini-ams.local>
Date: Sat, 28 Feb 2026 22:57:57 +0100
Subject: [PATCH 2/5] LoRA: apply on fused layers, add custom_tag/genre, update
 example

- dit-graph.h: apply LoRA deltas when base uses fused QKV/gate_up/ca_qkv
  so self-attn, MLP, and cross-attn all use adapters (fixes no audible effect)
- dit-lora.cpp: fix safetensors parse (rewind fp before parse); normalize
  keys for base_model.model.layers.* and .lora_A.weight/.lora_B.weight
- request: add custom_tag (LoRA trigger) and genre; parse language,
  is_instrumental, formatted_lyrics
- dit-vae: append custom_tag to caption for condition encoder when set
- examples/lora.json: nu-disco example with custom_tag crydamoure

Made-with: Cursor
---
 examples/lora.json | 16 +++++++++++-----
 src/dit-graph.h    | 48 ++++++++++++++++++++++++++++++++++++++++++++--
 src/dit-lora.cpp   | 20 +++++++++++--------
 src/request.cpp    | 13 +++++++++++++
 src/request.h      |  6 +++++-
 tools/dit-vae.cpp  |  7 +++++--
 6 files changed, 92 insertions(+), 18 deletions(-)

diff --git a/examples/lora.json b/examples/lora.json
index 8317521..c872efb 100644
--- a/examples/lora.json
+++ b/examples/lora.json
@@ -1,11 +1,17 @@
 {
   "task_type": "text2music",
-  "caption": "Emotional vocal track with soft synths",
-  "lyrics": "",
-  "duration": 10,
+  "caption": "An energetic nu-disco track built on a foundation of a tight, funky slap bassline and a crisp, four-on-the-floor drum machine beat. The song opens with a distinctive, filtered wah-wah guitar riff that serves as a recurring motif. The arrangement is layered with shimmering synth pads, punchy synth stabs, and subtle arpeggiated synth textures that add movement. The track progresses through dynamic sections, including a brief atmospheric breakdown before rebuilding the main groove.",
+  "genre": "Nu-disco",
+  "lyrics": "[Instrumental]",
+  "bpm": 115,
+  "keyscale": "C# major",
+  "timesignature": "4",
+  "duration": 256,
+  "language": "unknown",
+  "instrumental": true,
+  "custom_tag": "crydamoure",
   "inference_steps": 8,
   "guidance_scale": 1,
   "shift": 3,
-  "seed": 42,
-  "vocal_language": "en"
+  "seed": -1
 }
diff --git a/src/dit-graph.h b/src/dit-graph.h
index ab5839e..1241bc5 100644
--- a/src/dit-graph.h
+++ b/src/dit-graph.h
@@ -187,10 +187,25 @@ static struct ggml_tensor * dit_ggml_build_self_attn(
         q = ggml_cont(ctx, ggml_view_3d(ctx, qkv, q_dim, S, N, qkv->nb[1], qkv->nb[2], 0));
         k = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)q_dim * qkv->nb[0]));
         v = ggml_cont(ctx, ggml_view_3d(ctx, qkv, kv_dim, S, N, qkv->nb[1], qkv->nb[2], (size_t)(q_dim + kv_dim) * qkv->nb[0]));
+        // LoRA on fused path: add scale * (B @ (A @ x)) per projection when adapters are loaded
+        if (lora_scale != 0.0f) {
+            if (ly->lora_sa_q_a && ly->lora_sa_q_b)
+                q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale));
+            if (ly->lora_sa_k_a && ly->lora_sa_k_b)
+                k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale));
+            if (ly->lora_sa_v_a && ly->lora_sa_v_b)
+                v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_v_b, ggml_mul_mat(ctx, ly->lora_sa_v_a, norm_sa)), lora_scale));
+        }
     } else if (ly->sa_qk) {
         struct ggml_tensor * qk = dit_ggml_linear(ctx, ly->sa_qk, norm_sa);
         q = ggml_cont(ctx, ggml_view_3d(ctx, qk, q_dim, S, N, qk->nb[1], qk->nb[2], 0));
         k = ggml_cont(ctx, ggml_view_3d(ctx, qk, kv_dim, S, N, qk->nb[1], qk->nb[2], (size_t)q_dim * qk->nb[0]));
+        if (lora_scale != 0.0f) {
+            if (ly->lora_sa_q_a && ly->lora_sa_q_b)
+                q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_q_b, ggml_mul_mat(ctx, ly->lora_sa_q_a, norm_sa)), lora_scale));
+            if (ly->lora_sa_k_a && ly->lora_sa_k_b)
+                k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_sa_k_b, ggml_mul_mat(ctx, ly->lora_sa_k_a, norm_sa)), lora_scale));
+        }
         v = dit_ggml_linear_lora(ctx, ly->sa_v_proj, ly->lora_sa_v_a, ly->lora_sa_v_b, lora_scale, norm_sa);
     } else {
         q = dit_ggml_linear_lora(ctx, ly->sa_q_proj, ly->lora_sa_q_a, ly->lora_sa_q_b, lora_scale, norm_sa);
@@ -271,12 +286,25 @@ static struct ggml_tensor * dit_ggml_build_mlp(
         struct ggml_tensor * norm_ffn,
         int S) {
 
+    DiTGGMLConfig & c = m->cfg;
+    int I = c.intermediate_size;
+    int N = (int)norm_ffn->ne[2];
     float lora_scale = m->lora_scale;
     struct ggml_tensor * ff;
     if (ly->gate_up) {
         // Fused: single matmul [H, 2*I] x [H, S, N] -> [2*I, S, N], then swiglu splits ne[0]
         struct ggml_tensor * gu = dit_ggml_linear(ctx, ly->gate_up, norm_ffn);
-        ff = ggml_swiglu(ctx, gu);
+        if (lora_scale != 0.0f && ((ly->lora_gate_a && ly->lora_gate_b) || (ly->lora_up_a && ly->lora_up_b))) {
+            struct ggml_tensor * gate = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], 0));
+            struct ggml_tensor * up   = ggml_cont(ctx, ggml_view_3d(ctx, gu, I, S, N, gu->nb[1], gu->nb[2], (size_t)I * gu->nb[0]));
+            if (ly->lora_gate_a && ly->lora_gate_b)
+                gate = ggml_add(ctx, gate, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_gate_b, ggml_mul_mat(ctx, ly->lora_gate_a, norm_ffn)), lora_scale));
+            if (ly->lora_up_a && ly->lora_up_b)
+                up   = ggml_add(ctx, up, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_up_b, ggml_mul_mat(ctx, ly->lora_up_a, norm_ffn)), lora_scale));
+            ff = ggml_swiglu_split(ctx, gate, up);
+        } else {
+            ff = ggml_swiglu(ctx, gu);
+        }
     } else {
         // Separate: two matmuls + split swiglu (with optional LoRA)
         struct ggml_tensor * gate = dit_ggml_linear_lora(ctx, ly->gate_proj, ly->lora_gate_a, ly->lora_gate_b, lora_scale, norm_ffn);
@@ -311,6 +339,7 @@ static struct ggml_tensor * dit_ggml_build_cross_attn(
     // Q from hidden, KV from encoder (full fused, Q+KV partial, separate)
     int q_dim  = Nh * D;
     int kv_dim = Nkv * D;
+    float lora_scale = m->lora_scale;
     struct ggml_tensor * q, * k, * v;
     if (ly->ca_qkv) {
         // Full QKV fused: split Q from hidden, KV from enc via weight views
@@ -322,12 +351,27 @@ static struct ggml_tensor * dit_ggml_build_cross_attn(
         struct ggml_tensor * kv = ggml_mul_mat(ctx, w_kv, enc);
         k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0));
         v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0]));
+        // LoRA on fused path: add scale * (B @ (A @ x)) for Q (from norm_ca), K/V (from enc)
+        if (lora_scale != 0.0f) {
+            if (ly->lora_ca_q_a && ly->lora_ca_q_b)
+                q = ggml_add(ctx, q, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_q_b, ggml_mul_mat(ctx, ly->lora_ca_q_a, norm_ca)), lora_scale));
+            if (ly->lora_ca_k_a && ly->lora_ca_k_b)
+                k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale));
+            if (ly->lora_ca_v_a && ly->lora_ca_v_b)
+                v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale));
+        }
     } else if (ly->ca_kv) {
         // Q separate, K+V fused
-        q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca);
+        q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, lora_scale, norm_ca);
         struct ggml_tensor * kv = ggml_mul_mat(ctx, ly->ca_kv, enc);
         k = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], 0));
         v = ggml_cont(ctx, ggml_view_3d(ctx, kv, kv_dim, enc_S, N, kv->nb[1], kv->nb[2], (size_t)kv_dim * kv->nb[0]));
+        if (lora_scale != 0.0f) {
+            if (ly->lora_ca_k_a && ly->lora_ca_k_b)
+                k = ggml_add(ctx, k, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_k_b, ggml_mul_mat(ctx, ly->lora_ca_k_a, enc)), lora_scale));
+            if (ly->lora_ca_v_a && ly->lora_ca_v_b)
+                v = ggml_add(ctx, v, ggml_scale(ctx, ggml_mul_mat(ctx, ly->lora_ca_v_b, ggml_mul_mat(ctx, ly->lora_ca_v_a, enc)), lora_scale));
+        }
     } else {
         q = dit_ggml_linear_lora(ctx, ly->ca_q_proj, ly->lora_ca_q_a, ly->lora_ca_q_b, m->lora_scale, norm_ca);
         k = dit_ggml_linear_lora(ctx, ly->ca_k_proj, ly->lora_ca_k_a, ly->lora_ca_k_b, m->lora_scale, enc);
diff --git a/src/dit-lora.cpp b/src/dit-lora.cpp
index 881d941..b14e090 100644
--- a/src/dit-lora.cpp
+++ b/src/dit-lora.cpp
@@ -9,7 +9,7 @@
 #include <unordered_map>
 
 // Normalize adapter key to base name: decoder.layers.N.<proj>
-// e.g. "base_model.model.model.decoder.layers.0.self_attn.q_proj.lora_A.default" -> "decoder.layers.0.self_attn.q_proj"
+// Handles: base_model.model.model., base_model.model.; decoder.layers. or layers.; .lora_A.default/.lora_B.default or .lora_A.weight/.lora_B.weight
 static std::string lora_key_to_base(const std::string & key) {
     std::string s = key;
     const char * prefixes[] = { "base_model.model.model.", "base_model.model." };
@@ -20,14 +20,22 @@ static std::string lora_key_to_base(const std::string & key) {
             break;
         }
     }
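+    // Strip PEFT-style suffix. NOTE(review): ".lora_A.default"/".lora_B.default" are 15 chars, so the two 14-char compares just below can never match.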
     if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.default") == 0)
         s = s.substr(0, s.size() - 14);
     else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.default") == 0)
         s = s.substr(0, s.size() - 14);
+    else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_A.weight") == 0)
+        s = s.substr(0, s.size() - 14);
+    else if (s.size() > 14 && s.compare(s.size() - 14, 14, ".lora_B.weight") == 0)
+        s = s.substr(0, s.size() - 14);
     else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_A") == 0)
         s = s.substr(0, s.size() - 7);
     else if (s.size() > 7 && s.compare(s.size() - 7, 7, ".lora_B") == 0)
         s = s.substr(0, s.size() - 7);
+    // HuggingFace adapter: layers.N -> decoder.layers.N for our DiT naming
+    if (s.size() >= 7 && s.compare(0, 7, "layers.") == 0)
+        s = "decoder." + s;
     return s;
 }
 
@@ -80,17 +88,13 @@ bool dit_ggml_load_lora(DiTGGML * m, const char * lora_path, float scale) {
         fprintf(stderr, "[LoRA] cannot open %s\n", lora_path);
         return false;
     }
-    uint8_t h8[8];
-    if (fread(h8, 1, 8, fp) != 8) {
+    std::unordered_map<std::string, SafeTensorInfo> tensors;
+    if (fseek(fp, 0, SEEK_SET) != 0) {
         fclose(fp);
         return false;
     }
-    uint64_t header_len = (uint64_t)h8[0] | ((uint64_t)h8[1] << 8) | ((uint64_t)h8[2] << 16) | ((uint64_t)h8[3] << 24)
-        | ((uint64_t)h8[4] << 32) | ((uint64_t)h8[5] << 40) | ((uint64_t)h8[6] << 48) | ((uint64_t)h8[7] << 56);
-    uint64_t data_section_start = 8 + header_len;
-
-    std::unordered_map<std::string, SafeTensorInfo> tensors;
     int n = safetensors_parse_lora(fp, &tensors);
+    uint64_t data_section_start = (uint64_t)ftell(fp);
     if (n == 0) {
         fclose(fp);
         fprintf(stderr, "[LoRA] no LoRA tensors found in %s\n", lora_path);
diff --git a/src/request.cpp b/src/request.cpp
index a24d838..f85873a 100644
--- a/src/request.cpp
+++ b/src/request.cpp
@@ -16,6 +16,8 @@ void request_init(AceRequest * r) {
     r->caption            = "";
     r->lyrics             = "";
     r->instrumental       = false;
+    r->custom_tag         = "";
+    r->genre              = "";
     r->bpm                = 0;
     r->duration           = -1.0f;
     r->keyscale           = "";
@@ -227,7 +229,11 @@ bool request_parse(AceRequest * r, const char * path) {
         if      (k == "task_type")          r->task_type          = v;
         else if (k == "caption")            r->caption            = v;
         else if (k == "lyrics")             r->lyrics             = v;
+        else if (k == "custom_tag")         r->custom_tag         = v;
+        else if (k == "genre")              r->genre             = v;
         else if (k == "keyscale")           r->keyscale           = v;
+        else if (k == "formatted_lyrics")    r->lyrics             = v;  // alias for lyrics
+        else if (k == "language")           r->vocal_language    = v;  // alias for vocal_language
         else if (k == "timesignature")      r->timesignature      = v;
         else if (k == "vocal_language")     r->vocal_language     = v;
         else if (k == "reference_audio")   r->reference_audio    = v;
@@ -254,6 +260,7 @@ bool request_parse(AceRequest * r, const char * path) {
 
         // bools
         else if (k == "instrumental")       r->instrumental       = (v == "true");
+        else if (k == "is_instrumental")    r->instrumental       = (v == "true");
         // unknown keys: silently ignored (forward compat)
     }
 
@@ -274,6 +281,10 @@ bool request_write(const AceRequest * r, const char * path) {
     fprintf(f, "  \"lyrics\": \"%s\",\n",             json_escape(r->lyrics).c_str());
     if (r->instrumental)
         fprintf(f, "  \"instrumental\": true,\n");
+    if (!r->custom_tag.empty())
+        fprintf(f, "  \"custom_tag\": \"%s\",\n",     json_escape(r->custom_tag).c_str());
+    if (!r->genre.empty())
+        fprintf(f, "  \"genre\": \"%s\",\n",          json_escape(r->genre).c_str());
     fprintf(f, "  \"bpm\": %d,\n",                    r->bpm);
     fprintf(f, "  \"duration\": %.1f,\n",             r->duration);
     fprintf(f, "  \"keyscale\": \"%s\",\n",           json_escape(r->keyscale).c_str());
@@ -310,6 +321,8 @@ void request_dump(const AceRequest * r, FILE * f) {
     fprintf(f, "  caption:    %.60s%s\n",
             r->caption.c_str(), r->caption.size() > 60 ? "..." : "");
     fprintf(f, "  lyrics:     %zu bytes\n", r->lyrics.size());
+    if (!r->custom_tag.empty())
+        fprintf(f, "  custom_tag: %s\n", r->custom_tag.c_str());
     fprintf(f, "  bpm=%d dur=%.0f key=%s ts=%s lang=%s\n",
             r->bpm, r->duration, r->keyscale.c_str(),
             r->timesignature.c_str(), r->vocal_language.c_str());
diff --git a/src/request.h b/src/request.h
index e9222a0..ef4b41f 100644
--- a/src/request.h
+++ b/src/request.h
@@ -19,7 +19,11 @@ struct AceRequest {
     // text content
     std::string caption;            // ""
     std::string lyrics;             // ""
-    bool        instrumental;       // false
+    bool        instrumental;      // false
+
+    // LoRA / style trigger (appended to caption for condition encoder when set)
+    std::string custom_tag;         // "" e.g. "crydamoure"
+    std::string genre;             // "" e.g. "Nu-disco"
 
     // metadata (user-provided or LLM-enriched)
     int         bpm;                // 0 = unset
diff --git a/tools/dit-vae.cpp b/tools/dit-vae.cpp
index d889da1..fd5fe47 100644
--- a/tools/dit-vae.cpp
+++ b/tools/dit-vae.cpp
@@ -243,8 +243,11 @@ int main(int argc, char ** argv) {
             continue;
         }
 
-        // Extract params
-        const char * caption  = req.caption.c_str();
+        // Extract params; when custom_tag is set it is appended to the caption so the LoRA trigger word is part of the conditioning text
+        std::string caption_for_cond = req.caption;
+        if (!req.custom_tag.empty())
+            caption_for_cond += ", " + req.custom_tag;
+        const char * caption  = caption_for_cond.c_str();
         const char * lyrics   = req.lyrics.c_str();
         char bpm_str[16] = "N/A";
         if (req.bpm > 0) snprintf(bpm_str, sizeof(bpm_str), "%d", req.bpm);

From acd84020a3f43a0ebfd0dbec323dca7f2781cafa Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Sat, 28 Feb 2026 23:23:08 +0100
Subject: [PATCH 3/5] Fix formatting and clarify LoRA adapter instructions

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 9ad2a2b..d809146 100644
--- a/README.md
+++ b/README.md
@@ -146,13 +146,13 @@ cd examples
 ./cover.sh            # cover mode: decode precomputed audio_codes (no LLM)
 ./cover-reference.sh  # cover + reference_audio for timbre (WAV/MP3; needs reference.wav or .mp3)
 ./test-reference.sh   # reference_audio (WAV or MP3) + audio_cover_strength
-./lora.sh             # DiT + LoRA adapter (e.g. duckdbot/acestep-lora-cryda; put adapter in examples/lora/)
+./lora.sh             # DiT + LoRA adapter
 ```
 
 Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0)
 alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights).
 
-**LoRA adapters**: use `--lora <path>` and optional `--lora-scale <float>` with dit-vae to run the DiT with a PEFT-style LoRA (e.g. [duckdbot/acestep-lora-cryda](https://huggingface.co/duckdbot/acestep-lora-cryda)). Adapter must be `adapter_model.safetensors` (safetensors with `lora_A` / `lora_B` keys matching `decoder.layers.*`). Put the file in `examples/lora/` and run `./lora.sh`, or pass the path explicitly.
+**LoRA adapters**: use `--lora <path>` and optional `--lora-scale <float>` with dit-vae to run the DiT with PEFT-style Ace-Step LoRAs.
 
 ## Generation modes
 
@@ -214,7 +214,7 @@ All fields with defaults. Only `caption` is required. Built-in modes (text2music
 Key fields: `seed` -1 means random (resolved once, then +1 per batch
 element). `audio_codes` is generated by ace-qwen3 and consumed by
 dit-vae (comma separated FSQ token IDs). When present, the LLM is
-skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md).
+skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style. `src_audio`: not yet implemented (see docs/MODES.md).
 
 Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG).
 SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`.

From 9b087b2ee191ecfce4c97bbb09cdae3cb93a7aa7 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Sat, 28 Feb 2026 23:24:06 +0100
Subject: [PATCH 4/5] Remove download instructions for LoRA adapter

Removed instructions for downloading LoRA adapter from Hugging Face.
---
 examples/lora.sh | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/lora.sh b/examples/lora.sh
index 9c25d33..db7ce2b 100755
--- a/examples/lora.sh
+++ b/examples/lora.sh
@@ -7,10 +7,6 @@ cd "$(dirname "$0")"
 ADAPTER="lora/adapter_model.safetensors"
 if [ ! -f "$ADAPTER" ]; then
     echo "LoRA adapter not found at $ADAPTER"
-    echo "Download once (e.g. from Hugging Face):"
-    echo "  mkdir -p lora"
-    echo "  curl -L -o $ADAPTER 'https://huggingface.co/duckdbot/acestep-lora-cryda/resolve/main/adapter_model.safetensors'"
-    echo "Or: pip install hf && huggingface-cli download duckdbot/acestep-lora-cryda adapter_model.safetensors --local-dir lora"
     exit 1
 fi
 

From fc2408ae7cda0e0fdd5b5fb59318d4db88545f69 Mon Sep 17 00:00:00 2001
From: Lorenzo Mangani <lorenzo.mangani@gmail.com>
Date: Sat, 28 Feb 2026 23:24:56 +0100
Subject: [PATCH 5/5] Update comments for custom_tag and genre fields

---
 src/request.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/request.h b/src/request.h
index ef4b41f..ba85821 100644
--- a/src/request.h
+++ b/src/request.h
@@ -22,8 +22,8 @@ struct AceRequest {
     bool        instrumental;      // false
 
     // LoRA / style trigger (appended to caption for condition encoder when set)
-    std::string custom_tag;         // "" e.g. "crydamoure"
-    std::string genre;             // "" e.g. "Nu-disco"
+    std::string custom_tag;         // "" LoRA trigger word
+    std::string genre;              // "" genre label (free text)
 
     // metadata (user-provided or LLM-enriched)
     int         bpm;                // 0 = unset