19 changes: 14 additions & 5 deletions src/art/dev/validate.py

@@ -36,11 +36,7 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
 
     if set(trainer_gpu_ids) & set(inference_gpu_ids):
         raise ValueError("trainer_gpu_ids and inference_gpu_ids must not overlap")
-
-    if len(inference_gpu_ids) > 1:
-        raise ValueError(
-            "Multi-GPU inference not yet supported; inference_gpu_ids must have exactly one GPU"
-        )
+    inference_gpu_count = len(inference_gpu_ids)
 
     if trainer_gpu_ids[0] != 0:
         raise ValueError(
@@ -65,3 +61,16 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
             "enable_sleep_mode is incompatible with dedicated mode "
             "(dedicated mode runs vLLM on a separate GPU, sleep/wake is not needed)"
         )
+
+    engine_args = config.get("engine_args", {})
+    for key in ("data_parallel_size", "data_parallel_size_local"):
+        value = engine_args.get(key)
+        if value is None:
+            continue
+        if isinstance(value, bool) or not isinstance(value, int):
+            raise ValueError(f"{key} must be an integer in dedicated mode")
+        if value != inference_gpu_count:
+            raise ValueError(
+                f"{key} must equal len(inference_gpu_ids) ({inference_gpu_count}) "
+                "in dedicated mode"
+            )
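A note on the new type check: bool is a subclass of int in Python, so isinstance(True, int) is True, and without the explicit bool guard a config like engine_args={"data_parallel_size": True} would be treated as the integer 1 and could pass the equality check on a single-GPU setup. A standalone sketch (illustration only, not part of the diff):

# Minimal demonstration of why the bool check must come before the int check.
def check_int(key: str, value: object) -> None:
    if isinstance(value, bool) or not isinstance(value, int):
        raise ValueError(f"{key} must be an integer in dedicated mode")

check_int("data_parallel_size", 2)  # OK: plain int passes
try:
    check_int("data_parallel_size", True)  # bool is an int subclass, still rejected
except ValueError as exc:
    print(exc)  # data_parallel_size must be an integer in dedicated mode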
43 changes: 43 additions & 0 deletions src/art/unsloth/service.py

@@ -300,7 +300,22 @@ async def _start_vllm_subprocess(
         """Launch vLLM as a subprocess on inference GPUs. Returns (host, port)."""
         import atexit
 
+        def _parse_int_arg(name: str, value: object) -> int:
+            if isinstance(value, bool):
+                raise ValueError(f"{name} must be an integer, got bool")
+            if isinstance(value, int):
+                return value
+            if isinstance(value, str):
+                try:
+                    return int(value)
+                except ValueError as exc:
+                    raise ValueError(
+                        f"{name} must be an integer, got {value!r}"
+                    ) from exc
+            raise ValueError(f"{name} must be an integer, got {type(value).__name__}")
+
         inference_gpu_ids = self.config["inference_gpu_ids"]
+        inference_gpu_count = len(inference_gpu_ids)
         cuda_devices = ",".join(str(g) for g in inference_gpu_ids)
 
         # Build server_args: ART defaults, then user overrides, strip CLI-handled keys
@@ -311,6 +326,16 @@ async def _start_vllm_subprocess(
         }
         if config and "server_args" in config:
             server_args.update(dict(config["server_args"]))
+        api_server_count = server_args.pop("api_server_count", None)
+        if api_server_count is not None:
+            parsed_api_server_count = _parse_int_arg(
+                "api_server_count", api_server_count
+            )
+            if parsed_api_server_count != 1:
+                raise ValueError(
+                    "api_server_count must be 1 in dedicated mode when runtime "
+                    "LoRA updating is enabled"
+                )
         for key in ("port", "host", "lora_modules", "api_key"):
             server_args.pop(key, None)
 
@@ -319,6 +344,24 @@ async def _start_vllm_subprocess(
         engine_args = dict(self.config.get("engine_args", {}))
         if config and "engine_args" in config:
             engine_args.update(dict(config["engine_args"]))
+
+        for key in ("data_parallel_size", "data_parallel_size_local"):
+            value = engine_args.get(key)
+            if value is None:
+                continue
+            parsed_value = _parse_int_arg(key, value)
+            if parsed_value != inference_gpu_count:
+                raise ValueError(
+                    f"{key} must equal len(inference_gpu_ids) "
+                    f"({inference_gpu_count}) in dedicated mode"
+                )
+            engine_args[key] = parsed_value
+
+        if inference_gpu_count > 1:
+            engine_args.setdefault("data_parallel_size", inference_gpu_count)
+            engine_args.setdefault("data_parallel_size_local", inference_gpu_count)
+            engine_args.setdefault("distributed_executor_backend", "mp")
+
         engine_args.setdefault("generation_config", "vllm")
         engine_args["enable_lora"] = True
         engine_args.setdefault("max_loras", 2)
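To make the multi-GPU defaulting concrete: for a hypothetical dedicated config with inference_gpu_ids=[1, 2] and no user-supplied engine_args, the setdefault chain above produces the following. This is a standalone sketch mirroring the diff's logic, not the actual service code:

# Hypothetical example: defaults applied for inference_gpu_ids=[1, 2]
# when the user supplies no engine_args.
engine_args: dict[str, object] = {}
inference_gpu_count = 2  # len([1, 2])

if inference_gpu_count > 1:
    engine_args.setdefault("data_parallel_size", inference_gpu_count)
    engine_args.setdefault("data_parallel_size_local", inference_gpu_count)
    engine_args.setdefault("distributed_executor_backend", "mp")

engine_args.setdefault("generation_config", "vllm")
engine_args["enable_lora"] = True
engine_args.setdefault("max_loras", 2)

print(engine_args)
# {'data_parallel_size': 2, 'data_parallel_size_local': 2,
#  'distributed_executor_backend': 'mp', 'generation_config': 'vllm',
#  'enable_lora': True, 'max_loras': 2}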
40 changes: 38 additions & 2 deletions tests/unit/test_dedicated_config.py

@@ -77,12 +77,48 @@ def test_overlapping_gpu_ids():
 
 
 def test_multi_gpu_inference():
-    with pytest.raises(ValueError, match="Multi-GPU inference not yet supported"):
+    validate_dedicated_config(
+        InternalModelConfig(trainer_gpu_ids=[0], inference_gpu_ids=[1, 2])
+    )
+
+
+def test_dedicated_data_parallel_size_must_match_inference_gpus():
+    with pytest.raises(ValueError, match="data_parallel_size must equal"):
         validate_dedicated_config(
-            InternalModelConfig(trainer_gpu_ids=[0], inference_gpu_ids=[1, 2])
+            InternalModelConfig(
+                trainer_gpu_ids=[0],
+                inference_gpu_ids=[1, 2],
+                engine_args={"data_parallel_size": 1},  # type: ignore[typeddict-item]
+            )
         )
+
+
+def test_dedicated_data_parallel_size_local_must_match_inference_gpus():
+    with pytest.raises(ValueError, match="data_parallel_size_local must equal"):
+        validate_dedicated_config(
+            InternalModelConfig(
+                trainer_gpu_ids=[0],
+                inference_gpu_ids=[1, 2],
+                engine_args={  # type: ignore[typeddict-item]
+                    "data_parallel_size_local": 1
+                },
+            )
+        )
+
+
+def test_dedicated_data_parallel_size_allows_matching_values():
+    validate_dedicated_config(
+        InternalModelConfig(
+            trainer_gpu_ids=[0],
+            inference_gpu_ids=[1, 2],
+            engine_args={  # type: ignore[typeddict-item]
+                "data_parallel_size": 2,
+                "data_parallel_size_local": 2,
+            },
+        )
+    )
 
 
 def test_trainer_not_starting_at_zero():
     with pytest.raises(ValueError, match="must start at GPU 0"):
         validate_dedicated_config(
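One behavior the tests above do not cover: _parse_int_arg in service.py also coerces numeric strings, so an override such as "data_parallel_size": "2" is accepted at subprocess launch. A standalone copy of the helper with example calls (illustration only, not importable from the nested definition):

# Standalone copy of _parse_int_arg from the service.py diff above.
def _parse_int_arg(name: str, value: object) -> int:
    if isinstance(value, bool):
        raise ValueError(f"{name} must be an integer, got bool")
    if isinstance(value, int):
        return value
    if isinstance(value, str):
        try:
            return int(value)
        except ValueError as exc:
            raise ValueError(f"{name} must be an integer, got {value!r}") from exc
    raise ValueError(f"{name} must be an integer, got {type(value).__name__}")

print(_parse_int_arg("data_parallel_size", "2"))  # 2: numeric strings are coerced
print(_parse_int_arg("data_parallel_size", 2))    # 2: ints pass through
# _parse_int_arg("data_parallel_size", "two")  -> ValueError
# _parse_int_arg("data_parallel_size", True)   -> ValueError (bool rejected)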