diff --git a/.github/workflows/pipeline.yaml b/.github/workflows/pipeline.yaml index ab17a5d01..5d42c8f91 100644 --- a/.github/workflows/pipeline.yaml +++ b/.github/workflows/pipeline.yaml @@ -5,11 +5,11 @@ on: inputs: gpu: description: "GPU type for regional calibration" - default: "T4" + default: "A100-40GB" type: string national_gpu: description: "GPU type for national calibration" - default: "T4" + default: "A100-40GB" type: string epochs: description: "Epochs for regional calibration" @@ -116,8 +116,8 @@ jobs: MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} PIPELINE_BRANCH: main - GPU: ${{ inputs.gpu || 'T4' }} - NATIONAL_GPU: ${{ inputs.national_gpu || 'T4' }} + GPU: ${{ inputs.gpu || 'A100-40GB' }} + NATIONAL_GPU: ${{ inputs.national_gpu || 'A100-40GB' }} EPOCHS: ${{ inputs.epochs || '1000' }} NATIONAL_EPOCHS: ${{ inputs.national_epochs || '1000' }} NUM_WORKERS: ${{ inputs.num_workers || '50' }} diff --git a/Makefile b/Makefile index 88c35899a..eee414205 100644 --- a/Makefile +++ b/Makefile @@ -5,9 +5,9 @@ SOI_TARGET_YEAR ?= 2023 YEAR ?= 2024 -GPU ?= T4 +GPU ?= A100-40GB EPOCHS ?= 1000 -NATIONAL_GPU ?= T4 +NATIONAL_GPU ?= A100-40GB NATIONAL_EPOCHS ?= 1000 BRANCH ?= $(shell git rev-parse --abbrev-ref HEAD) NUM_WORKERS ?= 8 diff --git a/changelog.d/default-publication-a100.changed.md b/changelog.d/default-publication-a100.changed.md new file mode 100644 index 000000000..81febac75 --- /dev/null +++ b/changelog.d/default-publication-a100.changed.md @@ -0,0 +1 @@ +Default publication calibration fits to A100-40GB GPUs to avoid T4 memory failures on the full target matrix. diff --git a/modal_app/pipeline.py b/modal_app/pipeline.py index 39a437808..473007e23 100644 --- a/modal_app/pipeline.py +++ b/modal_app/pipeline.py @@ -904,9 +904,9 @@ def _new_run_metadata( ) def run_pipeline( branch: str = "main", - gpu: str = "T4", + gpu: str = "A100-40GB", epochs: int = 1000, - national_gpu: str = "T4", + national_gpu: str = "A100-40GB", national_epochs: int = 1000, num_workers: int = 50, n_clones: int = 430, @@ -2097,9 +2097,9 @@ def main( branch: str = "main", run_id: str = None, resume_run_id: str = None, - gpu: str = "T4", + gpu: str = "A100-40GB", epochs: int = 1000, - national_gpu: str = "T4", + national_gpu: str = "A100-40GB", national_epochs: int = 1000, num_workers: int = 50, n_clones: int = 430, diff --git a/tests/unit/test_pipeline_source_contracts.py b/tests/unit/test_pipeline_source_contracts.py index a022ef3fe..52d1d801d 100644 --- a/tests/unit/test_pipeline_source_contracts.py +++ b/tests/unit/test_pipeline_source_contracts.py @@ -20,6 +20,25 @@ def _name(node: ast.AST) -> str | None: return node.id if isinstance(node, ast.Name) else None +def test_run_pipeline_defaults_to_a100_for_full_target_matrix() -> None: + tree = ast.parse(PIPELINE_SOURCE.read_text()) + run_pipeline = _function_def(tree, "run_pipeline") + defaults = dict( + zip( + [ + arg.arg + for arg in run_pipeline.args.args[-len(run_pipeline.args.defaults) :] + ], + run_pipeline.args.defaults, + ) + ) + + assert isinstance(defaults["gpu"], ast.Constant) + assert defaults["gpu"].value == "A100-40GB" + assert isinstance(defaults["national_gpu"], ast.Constant) + assert defaults["national_gpu"].value == "A100-40GB" + + def test_promote_run_uses_single_full_release_promotion() -> None: tree = ast.parse(PIPELINE_SOURCE.read_text()) promote_run = _function_def(tree, "promote_run")