Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@ mkdir build
cd build

cmake .. -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
#cmake .. -DGGML_VULKAN=ON
cmake --build . --config Release -j "$(nproc)"
8 changes: 8 additions & 0 deletions buildtermux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Clean out-of-tree CMake build with the GGML BLAS backend enabled.
# Reads PREFIX from the environment to locate the OpenBLAS headers
# (presumably the Termux install prefix -- confirm against the target setup).

# Stop at the first failing command: without this, a failed mkdir/cd would
# let "cmake .." configure against the wrong directory.
set -e

rm -rf build
mkdir build
cd build

# Quoted so a PREFIX containing spaces or glob characters stays one argument.
cmake .. -DGGML_BLAS=ON "-DBLAS_INCLUDE_DIRS=${PREFIX}/include/openblas"
cmake --build . --config Release -j "$(nproc)"
8 changes: 8 additions & 0 deletions buildvulkan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Clean out-of-tree CMake build with the GGML Vulkan backend enabled.

# Stop at the first failing command: without this, a failed mkdir/cd would
# let "cmake .." configure against the wrong directory.
set -e

rm -rf build
mkdir build
cd build

cmake .. -DGGML_VULKAN=ON
cmake --build . --config Release -j "$(nproc)"
14 changes: 6 additions & 8 deletions examples/full.json
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
{
"caption": "Vibrant French house meets tech-house fusion track featuring filtered disco samples, driving funky basslines, and classic four-on-the-floor beats with signature Bob Sinclar vocal chops. Analytical yet euphoric mood blending advanced AI technical vocabulary with dancefloor energy. Instruments include talkbox lead vocals, analog Moog bass synths, glitchy arpeggiated sequencers, punchy TR-808 drum machine, and shimmering high-hat rolls.",
"lyrics": "[Intro - Filtered Disco Sample & Synth Arp]\n\n[Verse 1]\nOptimisation des poids synaptiques en temps réel\nRéseaux neuronaux convolutifs, profondeur idéale\nBackpropagation stochastique, gradient ajusté\nModèles GAN génératifs, data-set finalisé\n\n[Pre-Chorus]\nLatence zéro, flux continu\nL'IA évolue, circuit virtuel\n\n[Chorus - Talkbox Vocals]\nC'est l'ère de l'intelligence artificielle\nAlgorithmes de backpropagation dansent sur le beat\nDeep learning en action, réseau fully connected\nProcessing en temps réel, le futur est lancé !\n\n[Bridge - Synth Pad Build-Up]\nStochastic gradient descent, optimise le modèle\nEarly stopping activé, le réseau se stabilise\n\n[Outro - Vinyl Crackles & Fade Out]\nIA... Intelligence Artificielle...\nRéseaux neuronaux... dansent... forever...",
"caption": "Upbeat French house with infectious disco-inspired bassline, crisp four-on-the-floor kick pattern, wah-wah filtered guitar riffs, retro synth stabs, soulful male lead vocals with gospel-style backing harmonies, smooth saxophone accents, warm vinyl crackle texture, bright summer vibe, polished modern mix with vintage analog warmth, driving yet laid-back energy perfect for rooftop parties and sunset drives",
"lyrics": "[Intro - Ligne de Basse Funk & Beat House]\n\n[Verse 1]\nSous le soleil de Paris, on danse sans fin\nLa nuit s'allume, le beat nous guide\nLes étoiles scintillent au rythme du kick\nUn sourire léger, tout est si vivant\n\n[Pre-Chorus]\nLaisse-toi porter par la musique qui chante\n\n[Chorus]\nOn danse sous le ciel étoilé\nLe monde s'arrête, on s'envole\nAvec ce groove qui nous emporte\nJusqu'au matin, on ne s'arrête pas\n\n[Verse 2]\nLa ville respire au son des cuivres légers\nLes mains en l'air, on oublie le temps\nLa basse funk nous secoue les pieds\nUn été éternel, rien ne peut nous briser\n\n[Chorus]\nOn danse sous le ciel étoilé\nLe monde s'arrête, on s'envole\nAvec ce groove qui nous emporte\nJusqu'au matin, on ne s'arrête pas\n\n[Guitar Solo - Wah-Wah Funk]\n\n[Bridge - Saxophone & Cordes]\nRespire profondément, l'univers t'appelle\n\n[Outro - Synth Fade avec Craquement Vinyle]",
"duration": 240,
"bpm": 124,
"duration": 220,
"keyscale": "F# minor",
"timesignature": "4",
"vocal_language": "fr",
"inference_steps": 8,
"shift": 3.0
}
"keyscale": "F# major",
"timesignature": "4"
}
26 changes: 22 additions & 4 deletions src/vae.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,19 +248,27 @@ static void vae_ggml_load(VAEGGML * m, const char * path) {
vae_load_snake_inv(m->sb, gf, "decoder.snake1.beta");
vae_fuse_wn(m->c2w, gf, "decoder.conv2");

fprintf(stderr, "[VAE] Loaded: 5 blocks, upsample=1920x\n");
fprintf(stderr, "[VAE] Loaded: 5 blocks, upsample=1920x, BF16 activations\n");
gf_close(&gf);
}

// Graph building
// Snake activation (fused): y = x + sin^2(a * x) * inv_b
// x: [T, C], exp_a: [1, C], inv_b: [1, C] (pre-computed at load)
//
// An F32 input is cast to BF16 before ggml_snake; inputs of any other type
// are passed to ggml_snake unchanged. A result whose type is not F32 is cast
// back to F32 before being returned.
//
// ctx:   ggml context the new graph nodes are created in
// x:     input activations
// exp_a: per-channel alpha term, forwarded to ggml_snake untouched
// inv_b: per-channel inverse beta term, forwarded to ggml_snake untouched
// Returns the last graph node produced (the F32 cast, or the snake node
// itself when it is already F32).
static struct ggml_tensor * vae_snake(
        struct ggml_context * ctx,
        struct ggml_tensor *  x,
        struct ggml_tensor *  exp_a,
        struct ggml_tensor *  inv_b) {
    // Down-cast only F32 activations so the snake op runs on BF16 data.
    if (x->type == GGML_TYPE_F32) {
        x = ggml_cast(ctx, x, GGML_TYPE_BF16);
    }

    x = ggml_snake(ctx, x, exp_a, inv_b);

    // Hand an F32 tensor back to the caller regardless of what snake produced.
    if (x->type != GGML_TYPE_F32) {
        x = ggml_cast(ctx, x, GGML_TYPE_F32);
    }
    return x;
}

// Conv1d + bias: data [T, IC] -> [T_out, OC]
Expand Down Expand Up @@ -298,11 +306,21 @@ static struct ggml_tensor * vae_conv_t1d(
// w: [IC, K*OC] xt: [IC, T_in] -> col: [K*OC, T_in]
struct ggml_tensor * col = ggml_mul_mat(ctx, w, xt);

// Step 3: col2im - scatter-add columns to signal, fused padding crop
// Step 3: cast to BF16 before col2im_1d
if (col->type == GGML_TYPE_F32) {
col = ggml_cast(ctx, col, GGML_TYPE_BF16);
}

// Step 4: col2im - scatter-add columns to signal, fused padding crop
// [K*OC, T_in] -> [T_out, OC] where T_out = (T_in-1)*stride + K - 2*padding
struct ggml_tensor * y = ggml_col2im_1d(ctx, col, stride, oc, padding);

// Step 4: Add bias
// Step 5: cast back to F32
if (y->type != GGML_TYPE_F32) {
y = ggml_cast(ctx, y, GGML_TYPE_F32);
}

// Step 6: Add bias
if (b) {
struct ggml_tensor * b2d = ggml_reshape_2d(ctx, b, 1, b->ne[0]);
y = ggml_add(ctx, y, b2d);
Expand Down
8 changes: 4 additions & 4 deletions tests/BF16.log
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ Using precomputed LM hints
dit_step6_xt 0.988188
dit_step7_vt 0.969375
dit_x0 0.979213
vae_audio 0.901411
vae_audio (STFT cosine) 0.975533
vae_audio 0.901377
vae_audio (STFT cosine) 0.975525
[Turbo] Error growth GGML vs Python
stage cos max_err mean_err mean_A std_A mean_B std_B
dit_step0_xt 0.999945 0.135628 0.006709 -0.002312 0.972932 -0.002342 0.972003
Expand Down Expand Up @@ -114,8 +114,8 @@ Using precomputed LM hints
dit_step49_vt_cond 0.983553
dit_step49_vt 0.924041
dit_x0 0.990243
vae_audio 0.956365
vae_audio (STFT cosine) 0.981932
vae_audio 0.956370
vae_audio (STFT cosine) 0.981929
[SFT] Error growth GGML vs Python
stage cos max_err mean_err mean_A std_A mean_B std_B
dit_step0_xt 0.999998 0.038950 0.002063 -0.001725 0.980009 -0.001741 0.980402
Expand Down
166 changes: 82 additions & 84 deletions tests/CPU_BF16.log
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
[Request] Loaded request0.json
[Noise] Generating T=2200 (duration=88.0s)...
[Noise] Generated rng_philox_seed42.bf16: [2200, 64] bf16 (281600 bytes)
[Turbo] steps=8, shift=3.0 | acestep-v15-turbo-BF16.gguf
[GGML] Running acestep-v15-turbo-BF16.gguf...
[GGML] Done, 47 dump files
Expand All @@ -12,48 +10,48 @@ Using precomputed LM hints
[Python] Done, 40 dump files
[Turbo] Cosine similarities GGML vs Python
stage GGML vs Python
text_hidden 0.999815
text_hidden 0.999816
lyric_embed 1.000000
enc_hidden 0.999809
enc_hidden 0.999841
detok_output 0.999995
context 0.999997
noise 1.000000
temb_t 0.999999
hidden_after_proj_in 0.999988
enc_after_cond_emb 0.999791
enc_after_cond_emb 0.999832
layer0_sa_output 0.999960
hidden_after_layer0 0.999981
hidden_after_layer0 0.999982
hidden_after_layer6 0.999924
hidden_after_layer12 0.999220
hidden_after_layer18 0.996570
hidden_after_layer23 0.993544
dit_step0_vt 0.974748
dit_step0_xt 0.999944
dit_step1_vt 0.978754
dit_step1_xt 0.999828
dit_step2_vt 0.980585
dit_step2_xt 0.999534
dit_step3_vt 0.982107
dit_step3_xt 0.998886
dit_step4_vt 0.981219
dit_step4_xt 0.997453
dit_step5_vt 0.978640
dit_step5_xt 0.994290
dit_step6_vt 0.975385
dit_step6_xt 0.988279
dit_step7_vt 0.969594
dit_x0 0.979396
vae_audio 0.900831
vae_audio (log spectral) 0.999827
hidden_after_layer12 0.999332
hidden_after_layer18 0.996692
hidden_after_layer23 0.993786
dit_step0_vt 0.975712
dit_step0_xt 0.999946
dit_step1_vt 0.979525
dit_step1_xt 0.999833
dit_step2_vt 0.981808
dit_step2_xt 0.999552
dit_step3_vt 0.982382
dit_step3_xt 0.998917
dit_step4_vt 0.980777
dit_step4_xt 0.997480
dit_step5_vt 0.978078
dit_step5_xt 0.994264
dit_step6_vt 0.974849
dit_step6_xt 0.988142
dit_step7_vt 0.969102
dit_x0 0.979106
vae_audio 0.901370
vae_audio (STFT cosine) 0.975816
[Turbo] Error growth GGML vs Python
stage cos max_err mean_err mean_A std_A mean_B std_B
dit_step0_xt 0.999944 0.139834 0.006735 -0.002329 0.972955 -0.002342 0.972003
dit_step1_xt 0.999828 0.271987 0.011491 -0.005329 0.942705 -0.005313 0.941730
dit_step2_xt 0.999534 0.461474 0.017839 -0.009366 0.909252 -0.009311 0.908527
dit_step3_xt 0.998886 0.660416 0.026355 -0.014729 0.873915 -0.014577 0.873624
dit_step4_xt 0.997453 0.836992 0.038155 -0.021772 0.842138 -0.021660 0.841995
dit_step5_xt 0.994290 1.265321 0.055698 -0.031831 0.825532 -0.032109 0.824593
dit_step6_xt 0.988279 2.080415 0.082441 -0.046092 0.856505 -0.046482 0.855546
dit_step0_xt 0.999946 0.136541 0.006626 -0.002312 0.972951 -0.002342 0.972003
dit_step1_xt 0.999833 0.265486 0.011288 -0.005309 0.942692 -0.005313 0.941730
dit_step2_xt 0.999552 0.451896 0.017477 -0.009347 0.909217 -0.009311 0.908527
dit_step3_xt 0.998917 0.642624 0.025957 -0.014710 0.873863 -0.014577 0.873624
dit_step4_xt 0.997480 0.778374 0.037868 -0.021751 0.842047 -0.021660 0.841995
dit_step5_xt 0.994264 1.244624 0.055630 -0.031814 0.825360 -0.032109 0.824593
dit_step6_xt 0.988142 2.080976 0.082605 -0.046091 0.856212 -0.046482 0.855546
[SFT] steps=50, shift=1.0, CFG=7.0 | acestep-v15-sft-BF16.gguf
[GGML] Running acestep-v15-sft-BF16.gguf...
[GGML] Done, 233 dump files
Expand All @@ -65,68 +63,68 @@ Using precomputed LM hints
[Python] Done, 218 dump files
[SFT] Cosine similarities GGML vs Python
stage GGML vs Python
text_hidden 0.999815
text_hidden 0.999816
lyric_embed 1.000000
enc_hidden 0.999809
enc_hidden 0.999841
detok_output 0.999995
context 0.999997
noise 1.000000
temb_t 0.999998
hidden_after_proj_in 0.999988
enc_after_cond_emb 0.999794
enc_after_cond_emb 0.999834
layer0_sa_output 0.999959
hidden_after_layer0 0.999984
hidden_after_layer6 0.999852
hidden_after_layer6 0.999851
hidden_after_layer12 0.999471
hidden_after_layer18 0.998743
hidden_after_layer23 0.998998
hidden_after_layer18 0.998749
hidden_after_layer23 0.998994
null_condition_emb 1.000000
null_enc_hidden 1.000000
dit_step0_vt_cond 0.998968
dit_step0_vt_uncond 0.998728
dit_step0_vt 0.995811
dit_step0_vt_cond 0.998963
dit_step0_vt_uncond 0.998717
dit_step0_vt 0.995766
dit_step0_xt 0.999998
dit_step5_vt_cond 0.999488
dit_step5_vt 0.993679
dit_step5_vt_cond 0.999507
dit_step5_vt 0.993884
dit_step5_xt 0.999963
dit_step10_vt_cond 0.998829
dit_step10_vt 0.993532
dit_step10_xt 0.999888
dit_step15_vt_cond 0.996990
dit_step15_vt 0.985156
dit_step15_xt 0.999658
dit_step20_vt_cond 0.994347
dit_step20_vt 0.979694
dit_step20_xt 0.999039
dit_step25_vt_cond 0.989302
dit_step25_vt 0.969289
dit_step25_xt 0.997768
dit_step30_vt_cond 0.984077
dit_step30_vt 0.964410
dit_step30_xt 0.995793
dit_step35_vt_cond 0.979244
dit_step35_vt 0.955834
dit_step35_xt 0.993373
dit_step40_vt_cond 0.976554
dit_step40_vt 0.950866
dit_step40_xt 0.991062
dit_step45_vt_cond 0.979301
dit_step45_vt 0.952689
dit_step45_xt 0.989446
dit_step49_vt_cond 0.981868
dit_step49_vt 0.921025
dit_x0 0.988916
vae_audio 0.947636
vae_audio (log spectral) 0.999651
dit_step10_vt_cond 0.998797
dit_step10_vt 0.993423
dit_step10_xt 0.999887
dit_step15_vt_cond 0.997670
dit_step15_vt 0.988372
dit_step15_xt 0.999682
dit_step20_vt_cond 0.995498
dit_step20_vt 0.982137
dit_step20_xt 0.999190
dit_step25_vt_cond 0.991181
dit_step25_vt 0.972161
dit_step25_xt 0.998167
dit_step30_vt_cond 0.986183
dit_step30_vt 0.967394
dit_step30_xt 0.996519
dit_step35_vt_cond 0.981815
dit_step35_vt 0.959696
dit_step35_xt 0.994436
dit_step40_vt_cond 0.979298
dit_step40_vt 0.954151
dit_step40_xt 0.992400
dit_step45_vt_cond 0.981642
dit_step45_vt 0.955459
dit_step45_xt 0.990953
dit_step49_vt_cond 0.982680
dit_step49_vt 0.941788
dit_x0 0.990427
vae_audio 0.960778
vae_audio (STFT cosine) 0.984703
[SFT] Error growth GGML vs Python
stage cos max_err mean_err mean_A std_A mean_B std_B
dit_step0_xt 0.999998 0.038570 0.002031 -0.001742 0.980032 -0.001741 0.980402
dit_step5_xt 0.999963 0.128356 0.005832 -0.006967 0.888993 -0.007143 0.887999
dit_step10_xt 0.999888 0.221095 0.008891 -0.012449 0.810442 -0.012603 0.811299
dit_step15_xt 0.999658 0.362084 0.013808 -0.017776 0.745381 -0.018114 0.745268
dit_step20_xt 0.999039 0.559352 0.021038 -0.023205 0.699913 -0.023808 0.699582
dit_step25_xt 0.997768 0.856427 0.030573 -0.028769 0.678624 -0.029311 0.679278
dit_step30_xt 0.995793 1.124315 0.042139 -0.034362 0.684716 -0.035027 0.685262
dit_step35_xt 0.993373 1.609566 0.055461 -0.039908 0.716931 -0.040716 0.717195
dit_step40_xt 0.991062 2.075305 0.069778 -0.045420 0.770802 -0.046462 0.771853
dit_step45_xt 0.989446 2.361056 0.083301 -0.051419 0.842083 -0.052475 0.843036
dit_step0_xt 0.999998 0.038465 0.002037 -0.001739 0.980023 -0.001741 0.980402
dit_step5_xt 0.999963 0.130767 0.005794 -0.006951 0.888986 -0.007143 0.887999
dit_step10_xt 0.999887 0.230145 0.008907 -0.012421 0.810420 -0.012603 0.811299
dit_step15_xt 0.999682 0.369882 0.013468 -0.017757 0.745283 -0.018114 0.745268
dit_step20_xt 0.999190 0.439784 0.019899 -0.023189 0.699688 -0.023808 0.699582
dit_step25_xt 0.998167 0.657918 0.028642 -0.028736 0.678283 -0.029311 0.679278
dit_step30_xt 0.996519 1.070616 0.039415 -0.034342 0.684394 -0.035027 0.685262
dit_step35_xt 0.994436 1.684599 0.051968 -0.039891 0.716568 -0.040716 0.717195
dit_step40_xt 0.992400 2.115248 0.065570 -0.045402 0.770424 -0.046462 0.771853
dit_step45_xt 0.990953 2.369087 0.078496 -0.051406 0.841668 -0.052475 0.843036
Loading