All commands expect the following environment variable for common arguments:
export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \
--ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \
--opt-v-dtype=bf16 --gpus=8 --use-cuda-graphs"
./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=2
# [T] step 0 [ 0.1%] | time: 1098 ms | norm 15.321494 | loss 12.093583 | tps 477k | sol 23.9%
# [T] step 1 [ 0.2%] | time: 719 ms | norm 17.247507 | loss 11.460354 | tps 729k | sol 36.4%
# [T] step 2 [ 0.3%] | time: 711 ms | norm 12.041599 | loss 10.899013 | tps 737k | sol 36.8%
# [T] step 3 [ 0.5%] | time: 707 ms | norm 9.213974 | loss 10.506249 | tps 741k | sol 37.1%
# [T] step 4 [ 0.6%] | time: 704 ms | norm 7.609219 | loss 10.230234 | tps 744k | sol 37.2%
# [T] step 5 [ 0.7%] | time: 704 ms | norm 6.597405 | loss 10.015131 | tps 744k | sol 37.2%
./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2
# [T] step 0 [ 0.1%] | time: 1168 ms | norm 15.406463 | loss 12.091159 | tps 448k | sol 34.4%
# [T] step 1 [ 0.2%] | time: 775 ms | norm 17.309547 | loss 11.454817 | tps 676k | sol 51.9%
# [T] step 2 [ 0.3%] | time: 776 ms | norm 11.997014 | loss 10.893351 | tps 675k | sol 51.8%
# [T] step 3 [ 0.5%] | time: 778 ms | norm 9.204950 | loss 10.500839 | tps 673k | sol 51.7%
# [T] step 4 [ 0.6%] | time: 777 ms | norm 7.631546 | loss 10.224687 | tps 674k | sol 51.7%
# [T] step 5 [ 0.7%] | time: 779 ms | norm 6.641865 | loss 10.008981 | tps 673k | sol 51.6%
./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=2
# [T] step 0 [ 0.1%] | time: 2053 ms | norm 15.450644 | loss 12.257782 | tps 255k | sol 35.4%
# [T] step 1 [ 0.2%] | time: 1661 ms | norm 21.253056 | loss 11.091463 | tps 315k | sol 43.8%
# [T] step 2 [ 0.3%] | time: 1662 ms | norm 12.888207 | loss 9.959963 | tps 315k | sol 43.7%
# [T] step 3 [ 0.5%] | time: 1661 ms | norm 11.486399 | loss 9.328218 | tps 315k | sol 43.8%
# [T] step 4 [ 0.6%] | time: 1664 ms | norm 13.613173 | loss 9.046066 | tps 315k | sol 43.7%
# [T] step 5 [ 0.7%] | time: 1664 ms | norm 9.329375 | loss 8.733832 | tps 315k | sol 43.7%
./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=2
# [T] step 0 [ 0.1%] | time: 2429 ms | norm 15.608638 | loss 12.256838 | tps 215k | sol 51.0%
# [T] step 1 [ 0.2%] | time: 2039 ms | norm 21.531446 | loss 11.080791 | tps 257k | sol 60.7%
# [T] step 2 [ 0.3%] | time: 2042 ms | norm 12.891563 | loss 9.946579 | tps 256k | sol 60.6%
# [T] step 3 [ 0.5%] | time: 2044 ms | norm 11.485165 | loss 9.316048 | tps 256k | sol 60.6%
# [T] step 4 [ 0.6%] | time: 2048 ms | norm 13.283589 | loss 9.023841 | tps 256k | sol 60.4%
# [T] step 5 [ 0.7%] | time: 2050 ms | norm 8.698628 | loss 8.707534 | tps 255k | sol 60.4%
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4
# [T] step 0 [ 0.1%] | time: 3272 ms | norm 16.752102 | loss 12.349745 | tps 160k | sol 42.3%
# [T] step 1 [ 0.2%] | time: 2882 ms | norm 29.804232 | loss 10.759310 | tps 181k | sol 48.0%
# [T] step 2 [ 0.3%] | time: 2884 ms | norm 169.263565 | loss 11.268322 | tps 181k | sol 48.0%
# [T] step 3 [ 0.5%] | time: 2889 ms | norm 63.430531 | loss 9.774895 | tps 181k | sol 47.9%
# [T] step 4 [ 0.6%] | time: 2888 ms | norm 59.478142 | loss 9.590723 | tps 181k | sol 47.9%
# [T] step 5 [ 0.7%] | time: 2890 ms | norm 21.055012 | loss 9.486804 | tps 181k | sol 47.9%
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4
# [T] step 0 [ 0.1%] | time: 4208 ms | norm 16.829838 | loss 12.350895 | tps 124k | sol 58.6%
# [T] step 1 [ 0.2%] | time: 3852 ms | norm 29.815002 | loss 10.748739 | tps 136k | sol 64.0%
# [T] step 2 [ 0.3%] | time: 3859 ms | norm 167.841187 | loss 11.228068 | tps 135k | sol 63.9%
# [T] step 3 [ 0.5%] | time: 3858 ms | norm 63.267101 | loss 9.765855 | tps 135k | sol 63.9%
# [T] step 4 [ 0.6%] | time: 3863 ms | norm 60.819016 | loss 9.600936 | tps 135k | sol 63.8%
# [T] step 5 [ 0.7%] | time: 3863 ms | norm 22.034645 | loss 9.491041 | tps 135k | sol 63.8%
./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=4 --attn-bwd-chunks=4
# [T] step 0 [ 0.1%] | time: 6041 ms | norm 12.707057 | loss 12.665241 | tps 86788 | sol 50.5%
# [T] step 1 [ 0.2%] | time: 5724 ms | norm 24.067322 | loss 10.758235 | tps 91594 | sol 53.3%
# [T] step 2 [ 0.3%] | time: 5733 ms | norm 204.781250 | loss 15.891487 | tps 91450 | sol 53.2%
# [T] step 3 [ 0.5%] | time: 5734 ms | norm 111.757889 | loss 14.925350 | tps 91434 | sol 53.2%
# [T] step 4 [ 0.6%] | time: 5740 ms | norm 169.976746 | loss 14.186973 | tps 91339 | sol 53.2%
# [T] step 5 [ 0.7%] | time: 5741 ms | norm 114.847771 | loss 13.307644 | tps 91323 | sol 53.1%
./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=8
# [T] step 0 [ 0.1%] | time: 8677 ms | norm 12.690809 | loss 12.665518 | tps 60422 | sol 64.5%
# [T] step 1 [ 0.2%] | time: 8363 ms | norm 23.996948 | loss 10.748487 | tps 62691 | sol 66.9%
# [T] step 2 [ 0.3%] | time: 8377 ms | norm 205.359970 | loss 15.948483 | tps 62586 | sol 66.8%
# [T] step 3 [ 0.5%] | time: 8385 ms | norm 113.560005 | loss 14.996553 | tps 62526 | sol 66.8%
# [T] step 4 [ 0.6%] | time: 8386 ms | norm 171.203033 | loss 14.272202 | tps 62519 | sol 66.7%
# [T] step 5 [ 0.7%] | time: 8386 ms | norm 113.847122 | loss 13.266503 | tps 62519 | sol 66.7%
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=8 \
--grad-accumulation=8 --lmhead-chunks=2 --offload-opt-m --offload-opt-v --offload-residual
# [T] step 0 [ 0.2%] | time: 22535 ms | norm 15.902722 | loss 12.959281 | tps 46530 | sol 52.9%
# [T] step 1 [ 0.5%] | time: 22311 ms | norm 208.689987 | loss 12.179295 | tps 46998 | sol 53.4%
# [T] step 2 [ 0.7%] | time: 22344 ms | norm 60.867252 | loss 11.580514 | tps 46928 | sol 53.3%
# [T] step 3 [ 0.9%] | time: 22360 ms | norm 207.767715 | loss 20.771935 | tps 46895 | sol 53.3%
# [T] step 4 [ 1.2%] | time: 22343 ms | norm 206.262772 | loss 20.127741 | tps 46930 | sol 53.3%
# [T] step 5 [ 1.4%] | time: 22348 ms | norm 193.107193 | loss 19.831905 | tps 46920 | sol 53.3%
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=4 --grad-accumulation=16 --lmhead-chunks=2
# [T] step 0 [ 0.1%] | time: 17588 ms | norm 18.313215 | loss 12.960766 | tps 29809 | sol 63.2%
# [T] step 1 [ 0.2%] | time: 17211 ms | norm 180.026840 | loss 11.875872 | tps 30462 | sol 64.6%
# [T] step 2 [ 0.3%] | time: 17235 ms | norm 53.158806 | loss 11.824190 | tps 30419 | sol 64.5%
# [T] step 3 [ 0.5%] | time: 17257 ms | norm 198.049500 | loss 20.663370 | tps 30381 | sol 64.4%
# [T] step 4 [ 0.6%] | time: 17253 ms | norm 156.915009 | loss 19.534800 | tps 30388 | sol 64.4%
# [T] step 5 [ 0.7%] | time: 17249 ms | norm 172.824493 | loss 18.288172 | tps 30395 | sol 64.4%
./build/train ${ARGS} --model=Qwen2.5-32B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=2 --offload-opt-m --offload-opt-v --recompute-ffn --shard-gradients --recompute-norm --offload-residual
# [T] step 0 [ 0.1%] | time: 28372 ms | norm 18.194458 | loss 12.948755 | tps 18479 | sol 46.0%
# [T] step 1 [ 0.2%] | time: 28033 ms | norm 23.873095 | loss 10.658402 | tps 18702 | sol 46.5%
# [T] step 2 [ 0.3%] | time: 28045 ms | norm 239.612473 | loss 21.614901 | tps 18694 | sol 46.5%
# [T] step 3 [ 0.5%] | time: 28068 ms | norm 203.879898 | loss 22.488796 | tps 18679 | sol 46.5%
# [T] step 4 [ 0.6%] | time: 28037 ms | norm 319.090271 | loss 20.166912 | tps 18699 | sol 46.5%
# [T] step 5 [ 0.7%] | time: 28041 ms | norm 252.984528 | loss 18.991983 | tps 18697 | sol 46.5%
./build/train ${ARGS} --model=Qwen2.5-32B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=4 \
--lmhead-chunks=2 --recompute-ffn --recompute-norm --shard-gradients --shard-weights
# [T] step 0 [ 0.1%] | time: 43153 ms | norm 18.271938 | loss 12.949495 | tps 12149 | sol 58.5%
# [T] step 1 [ 0.2%] | time: 42757 ms | norm 24.360046 | loss 10.648568 | tps 12262 | sol 59.0%
# [T] step 2 [ 0.3%] | time: 42821 ms | norm 220.537399 | loss 18.453310 | tps 12243 | sol 58.9%
# [T] step 3 [ 0.5%] | time: 42823 ms | norm 246.115005 | loss 17.319511 | tps 12243 | sol 58.9%
# [T] step 4 [ 0.6%] | time: 42832 ms | norm 73.238007 | loss 16.348003 | tps 12240 | sol 58.9%
# [T] step 5 [ 0.7%] | time: 42816 ms | norm 199.496277 | loss 18.565773 | tps 12245 | sol 58.9%