-
Notifications
You must be signed in to change notification settings - Fork 93
SkypilotJobsExecutor crashes after submission when SkyPilot returns job id as [1] #481
Copy link
Copy link
Open
Labels
Description
Summary
When using run.SkypilotJobsExecutor against a remote SkyPilot API server on Kubernetes, the job is accepted by SkyPilot but NeMo Run crashes immediately afterward while parsing the returned job id. The managed job keeps running on the cluster, but NeMo Run loses track of it.
Environment
- nemo-run version: 0.8.1
- skypilot version: 0.11.2
- Python: 3.11.9
- Backend: SkyPilot API server + Kubernetes
- SKYPILOT_API_SERVER_ENDPOINT: <skypilot api endpoint>
Minimal Reproducer
# Reproducer preamble.
# NOTE(review): the API-server endpoint is set BEFORE importing nemo_run —
# presumably it is read at import/executor-creation time; keep this ordering.
import os
os.environ["SKYPILOT_API_SERVER_ENDPOINT"] = "<SKY-PILOT-API-SERVER-URL>"

import nemo_run as run
from nemo.collections import llm
import nemo.lightning as nl
from lightning.pytorch.loggers import MLFlowLogger
from nemo.collections.llm.peft.lora import LoRA
from lightning.pytorch.callbacks import EarlyStopping

# Paths on the shared cluster filesystem plus experiment identifiers.
NEMO_MODEL_PATH = "/mnt/models/nemo-models/Mistral-7B-v0.3"
DATASET_ROOT = "/mnt/datasets/demo-v1"
experiment_name = "demo-mistral-lora-ft-exp4"
run_name = "demo-mistral-lora-ft-exp4-lora-r32a64-100steps"
OUTPUT_DIR = "/mnt/experiments/demo-mistral-lora-ft-exp4"
LOG_DIR = "/mnt/experiments/kyc_edd_mistral_lora_ft_logs-exp4"
MLFLOW_TRACKING_URI = "<ML-FLOW-URL>"
def configure_recipe(nodes: int = 1, gpus_per_node: int = 4):
    """Build the Mistral-7B LoRA finetuning recipe used by the reproducer.

    Args:
        nodes: Number of nodes for the trainer.
        gpus_per_node: GPUs requested per node.

    Returns:
        The configured ``llm.mistral_7b.finetune_recipe`` object, with
        resume/data/logging/PEFT/trainer settings overridden below.
    """
    recipe = llm.mistral_7b.finetune_recipe(
        dir=OUTPUT_DIR,
        name="mistral_lora",
        num_nodes=nodes,
        num_gpus_per_node=gpus_per_node,
        peft_scheme="lora",
    )
    # Restore weights from the pretrained checkpoint; pick up a previous
    # run's checkpoint if one already exists.
    recipe.resume = run.Config(
        nl.AutoResume,
        restore_config=run.Config(
            nl.RestoreConfig,
            path=NEMO_MODEL_PATH,
        ),
        resume_if_exists=True,
    )
    recipe.data = run.Config(
        llm.FineTuningDataModule,
        dataset_root=DATASET_ROOT,
        seq_length=4096,
        micro_batch_size=1,
        global_batch_size=64,
    )
    # Checkpointing: keep the last checkpoint and save full context so the
    # run can be restored/continued later.
    ckpt = run.Config(
        nl.ModelCheckpoint,
        save_last=True,
        every_n_train_steps=100,
        save_weights_only=False,
        always_save_context=True,
        save_context_on_train_end=True,
    )
    recipe.log = run.Config(
        nl.NeMoLogger,
        name="mistral-lora-ft",
        log_dir=LOG_DIR,
        use_datetime_version=False,
        ckpt=ckpt,
        explicit_log_dir=LOG_DIR,
        extra_loggers=[
            run.Config(
                MLFlowLogger,
                experiment_name=experiment_name,
                run_name=run_name,
                tracking_uri=MLFLOW_TRACKING_URI,
                log_model=False,
            )
        ],
    )
    # LoRA adapters on the attention and MLP projections (r=32, alpha=64).
    recipe.peft = run.Config(
        LoRA,
        target_modules=[
            'linear_qkv',
            'linear_proj',
            'linear_fc1',
            'linear_fc2'
        ],
        exclude_modules=[],
        dim=32,
        alpha=64,
        dropout=0.05,
        dropout_position='pre',
        lora_A_init_method='xavier',
        lora_B_init_method='zero',
        a2a_experimental=False,
        lora_dtype=None,
        dropout_recompute=False
    )
    early_stop = run.Config(
        EarlyStopping,
        monitor="val_loss",
        mode="min",
        patience=3,
        min_delta=0.0,
        strict=True,
        verbose=True,
    )
    recipe.trainer.max_steps = 1000
    recipe.trainer.num_sanity_val_steps = 0
    recipe.trainer.val_check_interval = 5
    recipe.trainer.strategy.ckpt_async_save = False
    recipe.trainer.strategy.context_parallel_size = 1
    recipe.trainer.strategy.ddp = "megatron"
    # trainer.callbacks may start out as None, so initialize before appending.
    if recipe.trainer.callbacks is None:
        recipe.trainer.callbacks = []
    recipe.trainer.callbacks.append(early_stop)
    return recipe
def skypilot_executor(nodes: int = 1, gpus_per_node: int = 4) -> run.SkypilotExecutor:
    """Create a SkypilotExecutor targeting the Kubernetes-backed API server.

    Args:
        nodes: Number of nodes to request.
        gpus_per_node: H100 GPUs per node.

    Returns:
        A configured ``run.SkypilotExecutor``.
    """
    return run.SkypilotExecutor(
        gpus="H100",
        gpus_per_node=gpus_per_node,
        num_nodes=nodes,
        cloud="kubernetes",
        container_image="nvcr.io/nvidia/nemo:25.07",
        cluster_name="demo-mistral-finetune",
        # Quote the requirement specifier: the setup command runs in a shell,
        # where unquoted `mlflow>=1.0.0` is parsed as an output redirection
        # (`pip install mlflow > =1.0.0`), silently dropping the constraint.
        setup="pip install 'mlflow>=1.0.0'",
        env_vars={
            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
            "NCCL_NVLS_ENABLE": "0",
            "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
            "NVTE_ASYNC_AMAX_REDUCTION": "1",
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",
            "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
            "MLFLOW_TRACKING_URI": MLFLOW_TRACKING_URI,
        },
    )
def finetune_mistral():
    """Configure the recipe and executor, then submit the run as an experiment."""
    num_nodes, num_gpus = 1, 1
    training_recipe = configure_recipe(nodes=num_nodes, gpus_per_node=num_gpus)
    sky_executor = skypilot_executor(nodes=num_nodes, gpus_per_node=num_gpus)
    with run.Experiment("demo-mistral-7b-peft-finetuning-exp4") as exp:
        exp.add(training_recipe, executor=sky_executor, name="demo_mistral_peft_finetuning-exp4")
        exp.run(sequential=True, tail_logs=False)
if __name__ == "__main__":
    finetune_mistral()

Observed Output
⚙︎ Job submitted, ID: 1
[06:23:54] Error running job mistral_peft_finetuning_demo_1: invalid literal for int() with base 10: '[1]'
Traceback (most recent call last):
...
File ".../nemo_run/run/torchx_backend/schedulers/skypilot_jobs.py", line 115, in schedule
task_details = SkypilotJobsExecutor.status(app_id=app_id)
File ".../nemo_run/core/execution/skypilot_jobs.py", line 228, in status
_, _, job_id = cls.parse_app(app_id)
File ".../nemo_run/core/execution/skypilot_jobs.py", line 151, in parse_app
return cluster, task, int(job_id)
ValueError: invalid literal for int() with base 10: '[1]'
Expected Behavior
- NeMo Run should complete submission successfully.
- The returned job id should be normalized before parsing.
- `nemo experiment status` and `nemo experiment logs` should continue working for the submitted job.
Actual Behavior
- SkyPilot accepts the job and prints `Job submitted, ID: 1`.
- NeMo Run immediately raises `ValueError`.
- The managed job continues running on Kubernetes, but NeMo Run no longer tracks it.
Reactions are currently unavailable