# .github/workflows/wheel.yml — IST-DASLab/llmq (dev branch)
# Display name of the workflow.
name: Build Wheels

# Triggers: manual dispatch, pushes to master / release-* / dev,
# and pull requests that target master or dev.
on:
  workflow_dispatch:
  push:
    branches: [master, "release-*", dev]
  pull_request:
    branches: [master, dev]

jobs:
  wheel:
    # Builds one wheel per CUDA version inside the matching nvidia/cuda devel
    # container, repairs it with auditwheel, and uploads it as the artifact
    # `wheel-<wheel_tag>`.
    name: Build wheel (CUDA ${{ matrix.cuda_version }})
    runs-on: ubuntu-24.04
    container:
      image: ${{ matrix.cuda_image }}

    strategy:
      matrix:
        # Per-CUDA-version settings:
        #   cuda_image   - container image the job runs in
        #   wheel_tag    - used in the ccache key and the artifact name, and
        #                  passed to add_cuda_deps.py
        #   archs        - exported to the build as CUDAARCHS
        #   runtime_deps - requirement lines passed to add_cuda_deps.py
        include:
          - cuda_version: "12.8"
            cuda_image: "nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04"
            wheel_tag: "cu128"
            archs: "89;90;100;120"
            runtime_deps: |
              nvidia-cuda-runtime-cu12>=12.8
              nvidia-cudnn-cu12>=9.0
              nvidia-cufile-cu12>=1.10
              nvidia-cublas-cu12>=12.8
              nvidia-nccl-cu12>=2.0
          - cuda_version: "13.0"
            cuda_image: "nvidia/cuda:13.0.1-cudnn-devel-ubuntu22.04"
            wheel_tag: "cu130"
            archs: "89;90;100f;120f"
            runtime_deps: |
              cuda-toolkit[cudart,cublas,cufile]>=13.0
              nvidia-cudnn-cu13>=9.0
              nvidia-nccl-cu13>=2.0

    env:
      CMAKE_GENERATOR: Ninja
      CMAKE_C_COMPILER: gcc-12
      CMAKE_CXX_COMPILER: g++-12
      # Route C, C++ and CUDA compilation through ccache.
      CMAKE_C_COMPILER_LAUNCHER: ccache
      CMAKE_CXX_COMPILER_LAUNCHER: ccache
      CMAKE_CUDA_COMPILER_LAUNCHER: ccache
      CUDAARCHS: ${{ matrix.archs }}

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install deps
        # apt-get (not apt) is the interface intended for scripts; the
        # noninteractive frontend keeps package configuration from blocking
        # on a prompt inside the container.
        env:
          DEBIAN_FRONTEND: noninteractive
        run: apt-get update && apt-get install -y git g++-12

      - name: ccache
        uses: hendrikmuhs/ccache-action@v1.2
        with:
          # One cache per CUDA flavour.
          key: wheel-${{ matrix.wheel_tag }}

      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v7

      # We cannot statically declare the CUDA dependencies in pyproject.toml,
      # so we have to patch them in here.
      - name: Add CUDA runtime dependencies
        run: uv run --no-project --with tomlkit python3 .github/scripts/add_cuda_deps.py "${{ matrix.runtime_deps }}" "${{ matrix.wheel_tag }}"

      - name: Build wheel
        run: uv build --wheel

      - name: Repair wheel with auditwheel
        # The excluded libraries are not bundled into the repaired wheel.
        # Both the CUDA 12 and CUDA 13 sonames are listed so the same command
        # serves every matrix entry.
        run: |
          uv run --no-project --with auditwheel --with patchelf \
            auditwheel repair dist/*.whl -w wheelhouse/ \
            --exclude libcuda.so.1 \
            --exclude libcudart.so.12 \
            --exclude libcudart.so.13 \
            --exclude libcudnn.so.9 \
            --exclude libcufile.so.0 \
            --exclude libnccl.so.2 \
            --exclude libcublasLt.so.12 \
            --exclude libcublasLt.so.13 \
            --exclude libnvidia-ml.so.1
          rm dist/*-linux_*.whl  # Remove non-repaired wheel

      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
          name: wheel-${{ matrix.wheel_tag }}
          path: wheelhouse/pyllmq*.whl
          # Fail here if the glob matches nothing, instead of only warning and
          # letting a later artifact download fail.
          if-no-files-found: error

  deploy-modal:
    # Deploys the Modal test app after the wheel job has finished.
    # Only the cu128 wheel artifact is downloaded for the deployment.
    name: Deploy to Modal
    needs: wheel
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download wheel artifact
        uses: actions/download-artifact@v4
        with:
          name: wheel-cu128
          path: wheelhouse/

      - name: Install Modal
        run: pip install modal

      - name: Set Modal token
        # Pass the secrets to the shell as quoted environment variables rather
        # than expanding them into the script text, so a value containing
        # shell metacharacters cannot change the command.
        env:
          TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: modal token set --token-id "$TOKEN_ID" --token-secret "$TOKEN_SECRET"

      - name: Deploy to Modal
        run: modal deploy scripts/modal_test_app.py

  test-modal-recompute:
    # These tests verify that recomputation options do not change the results at all.
    # Every `recompute` entry is combined with every `dtype` entry (7 x 2 jobs).
    name: Recompute - ${{ matrix.recompute.name }} - ${{ matrix.dtype.name }}
    needs: deploy-modal
    runs-on: ubuntu-latest
    strategy:
      # A failing combination does not cancel the others; at most three
      # combinations run at the same time.
      fail-fast: false
      max-parallel: 3
      matrix:
        recompute:
          - name: "Determinism test"
            args: ""
          - name: "Full Block"
            args: "--recompute-block --offload-residual --use-cuda-graphs"
          - name: "Blockwise"
            args: "--recompute-att --recompute-ffn"
          - name: "Non-linearities"
            args: "--recompute-swiglu --recompute-norm"
          - name: "Offload Opt"
            args: "--offload-opt-m --offload-opt-v --offload-master"
          - name: "Offload Gradient"
            args: "--shard-gradients --offload-grads"
          # While not strictly a recomputation, chunked attention should be bitwise identical, too
          - name: "Chunked attention"
            args: "--recompute-att --attn-bwd-chunks=2"
        dtype:
          - name: "BF16"
            args: "--matmul-dtype=bf16"
          - name: "FP8"
            args: "--matmul-dtype=e4m3"
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Note: No need to download wheel again, it's already in the deployed image

      - name: Install Modal
        run: pip install modal

      - name: Set Modal token
        # Pass the secrets to the shell as quoted environment variables rather
        # than expanding them into the script text, so a value containing
        # shell metacharacters cannot change the command.
        env:
          TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: modal token set --token-id "$TOKEN_ID" --token-secret "$TOKEN_SECRET"

      - name: Run test on Modal
        # The matrix args are expanded unquoted on purpose: each string holds
        # several flags that must reach the script as separate arguments.
        run: python3 scripts/modal_test_ci.py recompute ${{ matrix.recompute.args }} ${{ matrix.dtype.args }}


  test-modal-fixed:
    # These tests run a few steps and compare the resulting losses and norms against a known, fixed reference.
    # A failure in these tests doesn't necessarily mean that the code is broken, but it indicates that
    # the changes need to be *carefully* reviewed, and tested end-to-end, before the reference is updated.
    name: Test fixed - ${{ matrix.config.name }}
    needs: deploy-modal
    runs-on: ubuntu-latest
    strategy:
      # A failing configuration does not cancel the others; at most three
      # configurations run at the same time.
      fail-fast: false
      max-parallel: 3
      matrix:
        # `args` is the full argument list handed to scripts/modal_test_ci.py.
        config:
          - name: "LLMQ BF16"
            args: "fixed bf16"
          - name: "LLMQ FP8"
            args: "fixed e4m3"
          - name: "LLMQ E5M2"
            args: "fixed e5m2"
          - name: "Torch BF16 ga=1"
            args: "torch-step --grad-accum 1 --model-dtype bf16 --matmul-dtype bf16"
          - name: "Torch BF16 ga=4"
            args: "torch-step --grad-accum 4 --model-dtype bf16 --matmul-dtype bf16"
          - name: "Torch FP32"
            args: "torch-step --grad-accum 2 --model-dtype fp32 --matmul-dtype fp32"
          - name: "Torch AMP"
            args: "torch-step --grad-accum 2 --model-dtype fp32 --matmul-dtype bf16"
          - name: "Torch Chunking"
            args: "torch-step --grad-accum 4 --model-dtype bf16 --matmul-dtype bf16 --lmhead-chunks 4"
          - name: "Torch BF16 Custom Matmul"
            args: "torch-step --grad-accum 1 --model-dtype bf16 --matmul-dtype bf16 --custom-matmul"
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Modal
        run: pip install modal

      - name: Set Modal token
        # Pass the secrets to the shell as quoted environment variables rather
        # than expanding them into the script text, so a value containing
        # shell metacharacters cannot change the command.
        env:
          TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
          TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
        run: modal token set --token-id "$TOKEN_ID" --token-secret "$TOKEN_SECRET"

      - name: Run test on Modal
        # The matrix args are expanded unquoted on purpose so that they split
        # into separate command-line arguments.
        run: python3 scripts/modal_test_ci.py ${{ matrix.config.args }}

  release:
    # Creates a draft GitHub release with every built wheel attached.
    # Runs only for manual dispatches, release-* branches, or tag refs, and
    # only after both Modal test jobs have completed successfully.
    if: github.event_name == 'workflow_dispatch' || startsWith(github.ref, 'refs/heads/release-') || startsWith(github.ref, 'refs/tags/')
    needs:
      - test-modal-recompute
      - test-modal-fixed

    runs-on: ubuntu-latest
    permissions:
      # Needed to create the release and upload its assets.
      contents: write
    steps:
      - name: Download all wheels
        uses: actions/download-artifact@v4
        with:
          # Collect every wheel-* artifact into a single flat directory.
          pattern: wheel-*
          path: wheelhouse/
          merge-multiple: true

      - name: Create Release
        # v2 is the maintained major version of this action; v1 is outdated.
        uses: softprops/action-gh-release@v2
        with:
          # NOTE(review): for a branch or manual run, ref_name is the branch
          # name, so the draft release is tagged with it — confirm intended.
          tag_name: ${{ github.ref_name }}
          files: wheelhouse/*.whl
          # Fail rather than publish a release that has no wheels attached.
          fail_on_unmatched_files: true
          generate_release_notes: true
          draft: true