audiohacking · lmangani · Feb 28, 2026 · Feb 28, 2026
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,44 @@
+# Validate that the project builds on Ubuntu and macOS (no model download).
+name: Build
+
+# Runs for pushes to, and pull requests targeting, main or master.
+on:
+  push:
+    branches: [main, master]
+  pull_request:
+    branches: [main, master]
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      # Keep the other OS running when one of them fails.
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # Ubuntu build: installs OpenBLAS and configures with -DGGML_BLAS=ON.
+      - name: Build (Ubuntu)
+        if: matrix.os == 'ubuntu-latest'
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev
+          mkdir build && cd build
+          cmake .. -DGGML_BLAS=ON
+          cmake --build . --config Release -j$(nproc)
+
+      # macOS build: default CMake configuration, no extra packages.
+      - name: Build (macOS)
+        if: matrix.os == 'macos-latest'
+        run: |
+          mkdir build && cd build
+          cmake ..
+          cmake --build . --config Release -j$(sysctl -n hw.ncpu)
+
+      - name: Smoke test
+        run: |
+          # Each pipeline below ends in `head`, so its exit status is head's and
+          # not the binary's; a missing executable would otherwise go unnoticed.
+          # Fail explicitly if any expected binary was not produced.
+          for bin in ace-qwen3 dit-vae quantize; do
+            test -x "./build/$bin" || { echo "missing ./build/$bin" >&2; exit 1; }
+          done
+          ./build/ace-qwen3 --help 2>&1 | head -5
+          ./build/dit-vae --help 2>&1 | head -5
+          ./build/quantize --help 2>&1 | head -3
diff --git a/.github/workflows/test-generation.yml b/.github/workflows/test-generation.yml
@@ -0,0 +1,72 @@
+# Build, download models (cached), and run short generation tests for various modes.
+# Runs on release (published) or manual trigger only. Uses short duration (5s) and few steps (4).
+name: Test generation
+
+on:
+  workflow_dispatch: {}
+  release:
+    types: [published]
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+    # Hard cap for the whole job (build + model download + three generation runs).
+    timeout-minutes: 60
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      # Same Ubuntu/OpenBLAS build recipe as the Build workflow.
+      - name: Build
+        run: |
+          sudo apt-get update -qq
+          sudo apt-get install -y -qq cmake build-essential pkg-config libopenblas-dev
+          mkdir build && cd build
+          cmake .. -DGGML_BLAS=ON
+          cmake --build . --config Release -j$(nproc)
+
+      # Cache the models/ directory; the key changes whenever models.sh changes.
+      - name: Cache models
+        id: cache-models
+        uses: actions/cache@v4
+        with:
+          path: models
+          key: acestep-models-q8-${{ hashFiles('models.sh') }}
+          restore-keys: acestep-models-q8-
+
+      # Skipped only on an exact cache-key hit; a restore-keys fallback is
+      # expected to leave cache-hit unset/false, so models.sh runs again then.
+      # NOTE(review): models.sh is not visible here - confirm that `hf` is the
+      # pip package that provides whatever CLI the script invokes.
+      - name: Download models
+        if: steps.cache-models.outputs.cache-hit != 'true'
+        run: |
+          pip install -q hf
+          ./models.sh
+
+      # The WAV checks below assume dit-vae writes <request-stem>0.wav next to
+      # the request JSON (inferred from the paths tested; confirm in dit-vae).
+      - name: Test mode text2music (short)
+        run: |
+          ./build/dit-vae \
+            --request tests/fixtures/ci-text2music.json \
+            --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+            --dit models/acestep-v15-turbo-Q8_0.gguf \
+            --vae models/vae-BF16.gguf
+          test -f tests/fixtures/ci-text2music0.wav && echo "text2music WAV OK"
+
+      - name: Test mode cover with WAV reference (short)
+        run: |
+          ./build/dit-vae \
+            --request tests/fixtures/ci-cover.json \
+            --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+            --dit models/acestep-v15-turbo-Q8_0.gguf \
+            --vae models/vae-BF16.gguf
+          test -f tests/fixtures/ci-cover0.wav && echo "cover WAV OK"
+
+      # Two-stage run: ace-qwen3 is expected to emit request0.json in the
+      # working directory, which dit-vae then renders to request00.wav.
+      - name: Test full pipeline (LLM + DiT, short)
+        run: |
+          ./build/ace-qwen3 \
+            --request tests/fixtures/ci-text2music.json \
+            --model models/acestep-5Hz-lm-4B-Q8_0.gguf
+          test -f request0.json
+          ./build/dit-vae \
+            --request request0.json \
+            --text-encoder models/Qwen3-Embedding-0.6B-Q8_0.gguf \
+            --dit models/acestep-v15-turbo-Q8_0.gguf \
+            --vae models/vae-BF16.gguf
+          test -f request00.wav && echo "full pipeline WAV OK"
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,8 @@ build/
 *.bf16
 
 tests/*/
+!tests/fixtures/
+!tests/fixtures/*.json
 
 checkpoints/
 models/

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -46,7 +46,7 @@ macro(link_ggml_backends target)
 endmacro()
 
 # dit-vae: full pipeline (text-enc + cond + dit + vae + wav)
-add_executable(dit-vae dit-vae.cpp request.cpp)
+add_executable(dit-vae dit-vae.cpp request.cpp audio_loader.cpp)
 link_ggml_backends(dit-vae)
 
 # ace-qwen3: LLM inference (CoT + audio codes)

diff --git a/README.md b/README.md
@@ -31,6 +31,10 @@ cmake --build . --config Release -j$(nproc)
 
 Builds two binaries: `ace-qwen3` (LLM) and `dit-vae` (DiT + VAE).
 
+**CI (GitHub Actions)**  
+- **Build**: on every push/PR to `main` or `master`, builds on Ubuntu (BLAS) and macOS (Metal); smoke test runs each binary `--help`.  
+- **Test generation**: on manual trigger or when a release is published; builds, caches models, then runs short (5 s, 4 steps) generation for text2music, cover, and full pipeline (LLM → DiT → WAV). See `.github/workflows/`.
+
 ## Models
 
 Pre-quantized GGUFs on [Hugging Face](https://huggingface.co/Serveurperso/ACE-Step-1.5-GGUF).
@@ -139,10 +143,11 @@ cd examples
 ./partial.sh          # caption + lyrics + duration
 ./full.sh             # all metadata provided
 ./dit-only.sh         # skip LLM, DiT from noise
+./test-reference.sh   # reference_audio (WAV or MP3) + audio_cover_strength
 ```
 
 Each example has a `-sft` variant (SFT model, 50 steps, CFG 7.0)
-alongside the turbo default (8 steps, no CFG).
+alongside the turbo default (8 steps, no CFG). For **reference timbre**, set `reference_audio` to a **WAV or MP3** path; dit-vae loads it (MP3 decoded in memory via header-only minimp3, no temp files), encodes with the VAE encoder (requires a full VAE GGUF that includes encoder weights).
 
 ## Generation modes
 
@@ -170,10 +175,11 @@ Run `dit-vae` to decode existing codes. See `examples/dit-only.json`.
 
 ## Request JSON reference
 
-All fields with defaults. Only `caption` is required.
+All fields with defaults. Only `caption` is required. Built-in modes (text2music, cover, repaint) and audio inputs follow the [ACE-Step 1.5 Tutorial](https://github.com/ace-step/ACE-Step-1.5/blob/main/docs/en/Tutorial.md); see [docs/MODES.md](docs/MODES.md) for what is implemented.
 
 ```json
 {
+    "task_type":          "text2music",
     "caption":            "",
     "lyrics":             "",
     "instrumental":       false,
@@ -188,7 +194,12 @@ All fields with defaults. Only `caption` is required.
     "lm_top_p":           0.9,
     "lm_top_k":           0,
     "lm_negative_prompt": "",
+    "reference_audio":    "",
+    "src_audio":          "",
     "audio_codes":        "",
+    "audio_cover_strength": 1.0,
+    "repainting_start":   0.0,
+    "repainting_end":     0.0,
     "inference_steps":    8,
     "guidance_scale":     7.0,
     "shift":              3.0
@@ -198,7 +209,7 @@ All fields with defaults. Only `caption` is required.
 Key fields: `seed` -1 means random (resolved once, then +1 per batch
 element). `audio_codes` is generated by ace-qwen3 and consumed by
 dit-vae (comma separated FSQ token IDs). When present, the LLM is
-skipped entirely.
+skipped entirely (cover-style generation). `reference_audio`: path to a **WAV or MP3** file for global timbre/style (MP3 decoded in memory; encoded via built-in VAE encoder; requires VAE GGUF with encoder weights). `src_audio`: not yet implemented (see docs/MODES.md).
 
 Turbo preset: `inference_steps=8, shift=3.0` (no guidance_scale, turbo models don't use CFG).
 SFT preset: `inference_steps=50, guidance_scale=4.0, shift=6.0`.

diff --git a/audio.h b/audio.h
@@ -0,0 +1,17 @@
+// audio.h: unified reference-audio loader (WAV + MP3 → stereo 48kHz float)
+// Header-only for WAV; MP3 implementation in audio_loader.cpp (minimp3, no temp files).
+
+#pragma once
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+// Load WAV or MP3 file into stereo float32 at 48kHz.
+// The decoder is picked from the file extension (".mp3" / ".wav", ASCII
+// case-insensitive); any other extension is rejected.
+// Out: interleaved L,R,L,R,...; on the MP3 path out->size() is 2 * num_samples.
+// (The WAV path is delegated to wav_load_48k_stereo; presumably the same
+// layout - confirm in wav.h.)
+// Returns num_samples (per channel), or -1 on error (null path/out,
+// unsupported extension, or a failure reported by the format loader).
+// No temp files; MP3 decoded in memory via minimp3 (header-only dep).
+int load_audio_48k_stereo(const char * path, std::vector<float> * out);
+
+// MP3 implementation (in audio_loader.cpp; do not call from other TUs without linking it)
+// Reads the whole file at `path` into memory and decodes it with minimp3.
+// Same output convention as load_audio_48k_stereo: interleaved stereo float
+// at 48kHz in *out, return value is samples per channel, -1 on error.
+// Unlike load_audio_48k_stereo it does not check path/out for null.
+int mp3_load_48k_stereo(const char * path, std::vector<float> * out);
diff --git a/audio_loader.cpp b/audio_loader.cpp
@@ -0,0 +1,117 @@
+// audio_loader.cpp: MP3 decode for reference audio (minimp3, no deps, no temp files)
+
+#define MINIMP3_IMPLEMENTATION
+#include "third_party/minimp3.h"
+
+#include "wav.h"
+#include "audio.h"
+#include <cstdio>
+#include <cstring>
+#include <vector>
+#include <algorithm>
+
+// Returns true when `path` ends with `suffix`, ignoring ASCII letter case.
+// Only 'A'..'Z' are folded; all other bytes must match exactly.
+// Both arguments must be non-null, NUL-terminated strings.
+static bool path_ends_with_ci(const char * path, const char * suffix) {
+    const size_t path_len   = strlen(path);
+    const size_t suffix_len = strlen(suffix);
+    if (suffix_len > path_len) {
+        return false;
+    }
+
+    // Lower-case a single ASCII letter; leave everything else untouched.
+    auto fold = [](char ch) -> char {
+        return (ch >= 'A' && ch <= 'Z') ? (char)(ch + 32) : ch;
+    };
+
+    const char * tail = path + (path_len - suffix_len);
+    size_t idx = 0;
+    while (idx < suffix_len && fold(tail[idx]) == fold(suffix[idx])) {
+        idx++;
+    }
+    return idx == suffix_len;
+}
+
+// Convert 16-bit PCM into interleaved stereo float (scaled by 1/32768) and,
+// when the source rate differs from 48000 Hz, resample to 48 kHz with linear
+// interpolation between neighbouring frames.
+//   pcm          input samples (num_samples values for mono, 2*num_samples otherwise)
+//   num_samples  number of frames, i.e. samples per channel
+//   channels     1 duplicates each sample into L and R; any other value is
+//                read as two interleaved channels
+//   sample_rate  source rate in Hz (must be non-zero)
+//   out          overwritten with interleaved L,R,L,R,... data
+static void pcm_to_float_stereo_48k(
+    const int16_t * pcm, size_t num_samples, int channels, unsigned int sample_rate,
+    std::vector<float> * out)
+{
+    const float to_unit = 1.0f / 32768.0f;
+    std::vector<float> & stereo = *out;
+    stereo.resize(num_samples * 2);
+
+    if (channels != 1) {
+        // Already interleaved: straight scale of every value.
+        const size_t total = num_samples * 2;
+        for (size_t n = 0; n < total; n++) {
+            stereo[n] = (float)pcm[n] * to_unit;
+        }
+    } else {
+        // Mono: write the same value to both channels.
+        for (size_t n = 0; n < num_samples; n++) {
+            const float v = (float)pcm[n] * to_unit;
+            stereo[n * 2]     = v;
+            stereo[n * 2 + 1] = v;
+        }
+    }
+
+    if (sample_rate == 48000) {
+        return;
+    }
+
+    const size_t src_frames = num_samples;
+    const size_t dst_frames = (size_t)((double)src_frames * 48000.0 / (double)sample_rate);
+    std::vector<float> resampled(dst_frames * 2);
+    for (size_t n = 0; n < dst_frames; n++) {
+        // Fractional source position of output frame n.
+        const double pos  = (double)n * (double)src_frames / (double)dst_frames;
+        const size_t lo   = (size_t)pos;
+        const size_t hi   = std::min(lo + 1, src_frames - 1);  // clamp at the last frame
+        const float  frac = (float)(pos - (double)lo);
+        resampled[n * 2]     = stereo[lo * 2]     * (1.0f - frac) + stereo[hi * 2]     * frac;
+        resampled[n * 2 + 1] = stereo[lo * 2 + 1] * (1.0f - frac) + stereo[hi * 2 + 1] * frac;
+    }
+    stereo = std::move(resampled);
+}
+
+// Decode an MP3 file into interleaved stereo float32 at 48 kHz.
+//
+// The whole file is read into memory (no temp files) and decoded frame by
+// frame with minimp3; the result is converted by pcm_to_float_stereo_48k.
+//   path  file to read (must be non-null)
+//   out   receives interleaved L,R,L,R,... samples
+// Returns the number of samples per channel, or -1 when the file cannot be
+// opened or read, is empty or larger than 200 MiB, or holds no decodable frame.
+// Sample rate and channel count are taken from the first decoded frame.
+int mp3_load_48k_stereo(const char * path, std::vector<float> * out) {
+    FILE * f = fopen(path, "rb");
+    if (!f) return -1;
+    fseek(f, 0, SEEK_END);
+    long sz = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    // ftell failure (-1), empty files and oversized files are all rejected.
+    if (sz <= 0 || sz > 200 * 1024 * 1024) {
+        fclose(f);
+        return -1;
+    }
+    std::vector<uint8_t> buf((size_t)sz);
+    if (fread(buf.data(), 1, (size_t)sz, f) != (size_t)sz) {
+        fclose(f);
+        return -1;
+    }
+    fclose(f);
+
+    mp3dec_t dec;
+    mp3dec_init(&dec);
+    mp3dec_frame_info_t info = {};
+    std::vector<int16_t> pcm;
+    const uint8_t * read_pos = buf.data();
+    int remaining = (int)buf.size();
+    int first_hz = 0, first_ch = 0;
+    // Cap on decoded int16 values: 60 s worth of 48 kHz stereo.
+    const size_t max_samples = (size_t)(60 * 48000 * 2);
+
+    while (remaining > 0) {
+        size_t old_size = pcm.size();
+        if (old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME > max_samples) break;
+        // Reserve room for a worst-case frame; trimmed to the real size below.
+        pcm.resize(old_size + (size_t)MINIMP3_MAX_SAMPLES_PER_FRAME);
+        int frame_samples = mp3dec_decode_frame(&dec, read_pos, remaining, pcm.data() + old_size, &info);
+        if (frame_samples <= 0) {
+            pcm.resize(old_size);
+            // minimp3 contract: 0 samples with frame_bytes > 0 means the decoder
+            // skipped that many bytes (ID3 tag, junk, undecodable frame);
+            // 0 samples with frame_bytes == 0 means there is not enough data left.
+            // Skipping exactly frame_bytes keeps the decoder in sync and avoids
+            // rescanning the rest of the buffer once per byte.
+            if (info.frame_bytes <= 0 || info.frame_bytes > remaining) break;
+            read_pos += info.frame_bytes;
+            remaining -= info.frame_bytes;
+            continue;
+        }
+        if (first_hz == 0) {
+            first_hz = info.hz;
+            first_ch = info.channels;
+        }
+        pcm.resize(old_size + (size_t)(frame_samples * info.channels));
+        read_pos += info.frame_bytes;
+        remaining -= info.frame_bytes;
+    }
+
+    if (pcm.empty() || first_hz == 0) return -1;
+    size_t num_samples = pcm.size() / (size_t)first_ch;
+    pcm_to_float_stereo_48k(pcm.data(), num_samples, first_ch, (unsigned)first_hz, out);
+    return (int)(out->size() / 2);
+}
+
+// Entry point for reference audio: picks the loader from the file extension.
+// ".mp3" goes to mp3_load_48k_stereo and ".wav" to wav_load_48k_stereo, both
+// matched without regard to ASCII case. Returns the chosen loader's result,
+// or -1 for a null argument or any other extension.
+int load_audio_48k_stereo(const char * path, std::vector<float> * out) {
+    const bool args_ok = (path != nullptr) && (out != nullptr);
+    if (!args_ok) {
+        return -1;
+    }
+
+    if (path_ends_with_ci(path, ".mp3")) {
+        return mp3_load_48k_stereo(path, out);
+    }
+    if (!path_ends_with_ci(path, ".wav")) {
+        return -1;
+    }
+    return wav_load_48k_stereo(path, out);
+}