Skip to content

SkypilotJobsExecutor crashes after submission when SkyPilot returns job id as [1] #481

@jesintharnold

Description

@jesintharnold

Summary

When using run.SkypilotJobsExecutor against a remote SkyPilot API server on Kubernetes, the job is accepted by SkyPilot but NeMo Run crashes immediately afterward while parsing the returned job id. The managed job keeps running on the cluster, but NeMo Run loses track of it.

Environment

  • nemo-run version: 0.8.1
  • skypilot version: 0.11.2
  • Python: 3.11.9
  • Backend: SkyPilot API server + Kubernetes
  • SKYPILOT_API_SERVER_ENDPOINT: <skypilot api endpoint>

Minimal Reproducer

import os
# NOTE(review): the SkyPilot endpoint is exported before nemo_run is imported —
# presumably the SkyPilot client reads it at import/configuration time; confirm
# against the SkyPilot client docs before reordering these lines.
os.environ["SKYPILOT_API_SERVER_ENDPOINT"] = "<SKY-PILOT-API-SERVER-URL>"

import nemo_run as run
from nemo.collections import llm
import nemo.lightning as nl
from lightning.pytorch.loggers import MLFlowLogger
from nemo.collections.llm.peft.lora import LoRA
from lightning.pytorch.callbacks import EarlyStopping

# Cluster-side paths (mounted volumes) and experiment identifiers.
NEMO_MODEL_PATH = "/mnt/models/nemo-models/Mistral-7B-v0.3"
DATASET_ROOT = "/mnt/datasets/demo-v1"
experiment_name="demo-mistral-lora-ft-exp4"
run_name="demo-mistral-lora-ft-exp4-lora-r32a64-100steps"
OUTPUT_DIR = "/mnt/experiments/demo-mistral-lora-ft-exp4"
LOG_DIR = "/mnt/experiments/kyc_edd_mistral_lora_ft_logs-exp4"
MLFLOW_TRACKING_URI = "<ML-FLOW-URL>"

def configure_recipe(nodes: int = 1, gpus_per_node: int = 4):
    """Build the Mistral-7B LoRA fine-tuning recipe.

    Configures checkpoint resume, the fine-tuning data module, NeMo/MLflow
    logging, the LoRA adapter, early stopping, and trainer overrides on top
    of the stock ``mistral_7b.finetune_recipe``.
    """
    recipe = llm.mistral_7b.finetune_recipe(
        dir=OUTPUT_DIR,
        name="mistral_lora",
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="lora",
    )

    # Restore weights from the pretrained NeMo checkpoint on first run.
    restore_cfg = run.Config(nl.RestoreConfig, path=NEMO_MODEL_PATH)
    recipe.resume = run.Config(
        nl.AutoResume,
        restore_config=restore_cfg,
        resume_if_exists=True,
    )

    # Fine-tuning dataset.
    recipe.data = run.Config(
        llm.FineTuningDataModule,
        dataset_root=DATASET_ROOT,
        seq_length=4096,
        micro_batch_size=1,
        global_batch_size=64,
    )

    # Checkpointing: full weights + context, every 100 training steps.
    checkpoint_cfg = run.Config(
        nl.ModelCheckpoint,
        save_last=True,
        every_n_train_steps=100,
        save_weights_only=False,
        always_save_context=True,
        save_context_on_train_end=True,
    )

    # MLflow run tracking alongside the default NeMo logger.
    mlflow_logger_cfg = run.Config(
        MLFlowLogger,
        experiment_name=experiment_name,
        run_name=run_name,
        tracking_uri=MLFLOW_TRACKING_URI,
        log_model=False,
    )
    recipe.log = run.Config(
        nl.NeMoLogger,
        name="mistral-lora-ft",
        log_dir=LOG_DIR,
        use_datetime_version=False,
        ckpt=checkpoint_cfg,
        explicit_log_dir=LOG_DIR,
        extra_loggers=[mlflow_logger_cfg],
    )

    # LoRA adapter (r=32, alpha=64) on attention and MLP projections.
    recipe.peft = run.Config(
        LoRA,
        target_modules=["linear_qkv", "linear_proj", "linear_fc1", "linear_fc2"],
        exclude_modules=[],
        dim=32,
        alpha=64,
        dropout=0.05,
        dropout_position="pre",
        lora_A_init_method="xavier",
        lora_B_init_method="zero",
        a2a_experimental=False,
        lora_dtype=None,
        dropout_recompute=False,
    )

    # Stop early once validation loss stops improving.
    early_stopping_cfg = run.Config(
        EarlyStopping,
        monitor="val_loss",
        mode="min",
        patience=3,
        min_delta=0.0,
        strict=True,
        verbose=True,
    )

    # Trainer overrides.
    recipe.trainer.max_steps = 1000
    recipe.trainer.num_sanity_val_steps = 0
    recipe.trainer.val_check_interval = 5
    recipe.trainer.strategy.ckpt_async_save = False
    recipe.trainer.strategy.context_parallel_size = 1
    recipe.trainer.strategy.ddp = "megatron"

    if recipe.trainer.callbacks is None:
        recipe.trainer.callbacks = []
    recipe.trainer.callbacks.append(early_stopping_cfg)
    return recipe

def skypilot_executor(nodes: int = 1, gpus_per_node: int = 4) -> run.SkypilotExecutor:
    """Create a SkyPilot executor targeting the Kubernetes backend.

    Args:
        nodes: Number of nodes to request.
        gpus_per_node: H100 GPUs per node.

    Returns:
        A configured ``run.SkypilotExecutor``.
    """
    return run.SkypilotExecutor(
        gpus="H100",
        gpus_per_node=gpus_per_node,
        num_nodes=nodes,
        cloud="kubernetes",
        container_image="nvcr.io/nvidia/nemo:25.07",
        cluster_name="demo-mistral-finetune",
        # The setup command runs as a shell script on the cluster. The
        # requirement specifier must be quoted: unquoted, the shell parses
        # `mlflow>=1.0.0` as `pip install mlflow` with stdout redirected to
        # a file named `=1.0.0`, silently dropping the version constraint.
        setup="pip install 'mlflow>=1.0.0'",
        env_vars={
            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
            "NCCL_NVLS_ENABLE": "0",
            "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
            "NVTE_ASYNC_AMAX_REDUCTION": "1",
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
            "MLFLOW_TRACKING_URI": MLFLOW_TRACKING_URI,
        },
    )

def finetune_mistral():
    """Submit the LoRA fine-tuning run as a single-node, single-GPU experiment."""
    node_count, gpu_count = 1, 1
    training_recipe = configure_recipe(nodes=node_count, gpus_per_node=gpu_count)
    sky_executor = skypilot_executor(nodes=node_count, gpus_per_node=gpu_count)
    with run.Experiment("demo-mistral-7b-peft-finetuning-exp4") as exp:
        exp.add(training_recipe, executor=sky_executor, name="demo_mistral_peft_finetuning-exp4")
        exp.run(sequential=True, tail_logs=False)
# Script entry point: submit the fine-tuning experiment when run directly.
if __name__ == "__main__":
    finetune_mistral()

Observed Output

⚙︎ Job submitted, ID: 1
[06:23:54] Error running job mistral_peft_finetuning_demo_1: invalid literal for int() with base 10: '[1]'
Traceback (most recent call last):
  ...
  File ".../nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py", line 115, in schedule
    task_details = SkypilotJobsExecutor.status(app_id=app_id)
  File ".../nemo_run/core/execution/skypilot_jobs.py", line 228, in status
    _, _, job_id = cls.parse_app(app_id)
  File ".../nemo_run/core/execution/skypilot_jobs.py", line 151, in parse_app
    return cluster, task, int(job_id)
ValueError: invalid literal for int() with base 10: '[1]'

Expected Behavior

  • NeMo Run should complete submission successfully.
  • The job id returned by SkyPilot (here the string '[1]') should be normalized — e.g. stripped of the surrounding brackets or extracted from the list — before being parsed with int().
  • nemo experiment status and nemo experiment logs should continue working for the submitted job.

Actual Behavior

  • SkyPilot accepts the job and prints Job submitted, ID: 1.
  • NeMo Run immediately raises ValueError.
  • The managed job continues running on Kubernetes, but NeMo Run no longer tracks it.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions