diff --git a/src/art/dev/validate.py b/src/art/dev/validate.py
index 031464e0..281b1a6d 100644
--- a/src/art/dev/validate.py
+++ b/src/art/dev/validate.py
@@ -36,11 +36,7 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
 
     if set(trainer_gpu_ids) & set(inference_gpu_ids):
         raise ValueError("trainer_gpu_ids and inference_gpu_ids must not overlap")
-
-    if len(inference_gpu_ids) > 1:
-        raise ValueError(
-            "Multi-GPU inference not yet supported; inference_gpu_ids must have exactly one GPU"
-        )
+    inference_gpu_count = len(inference_gpu_ids)
 
     if trainer_gpu_ids[0] != 0:
         raise ValueError(
@@ -65,3 +61,16 @@ def validate_dedicated_config(config: InternalModelConfig) -> None:
             "enable_sleep_mode is incompatible with dedicated mode "
             "(dedicated mode runs vLLM on a separate GPU, sleep/wake is not needed)"
         )
+
+    engine_args = config.get("engine_args", {})
+    for key in ("data_parallel_size", "data_parallel_size_local"):
+        value = engine_args.get(key)
+        if value is None:
+            continue
+        if isinstance(value, bool) or not isinstance(value, int):
+            raise ValueError(f"{key} must be an integer in dedicated mode")
+        if value != inference_gpu_count:
+            raise ValueError(
+                f"{key} must equal len(inference_gpu_ids) ({inference_gpu_count}) "
+                "in dedicated mode"
+            )
diff --git a/src/art/unsloth/service.py b/src/art/unsloth/service.py
index cfb95b3c..5440db1b 100644
--- a/src/art/unsloth/service.py
+++ b/src/art/unsloth/service.py
@@ -300,7 +300,22 @@ async def _start_vllm_subprocess(
        """Launch vLLM as a subprocess on inference GPUs. Returns (host, port)."""
        import atexit
 
+        def _parse_int_arg(name: str, value: object) -> int:
+            if isinstance(value, bool):
+                raise ValueError(f"{name} must be an integer, got bool")
+            if isinstance(value, int):
+                return value
+            if isinstance(value, str):
+                try:
+                    return int(value)
+                except ValueError as exc:
+                    raise ValueError(
+                        f"{name} must be an integer, got {value!r}"
+                    ) from exc
+            raise ValueError(f"{name} must be an integer, got {type(value).__name__}")
+
         inference_gpu_ids = self.config["inference_gpu_ids"]
+        inference_gpu_count = len(inference_gpu_ids)
         cuda_devices = ",".join(str(g) for g in inference_gpu_ids)
 
         # Build server_args: ART defaults, then user overrides, strip CLI-handled keys
@@ -311,6 +326,16 @@ async def _start_vllm_subprocess(
         }
         if config and "server_args" in config:
             server_args.update(dict(config["server_args"]))
+        api_server_count = server_args.pop("api_server_count", None)
+        if api_server_count is not None:
+            parsed_api_server_count = _parse_int_arg(
+                "api_server_count", api_server_count
+            )
+            if parsed_api_server_count != 1:
+                raise ValueError(
+                    "api_server_count must be 1 in dedicated mode when runtime "
+                    "LoRA updating is enabled"
+                )
         for key in ("port", "host", "lora_modules", "api_key"):
             server_args.pop(key, None)
 
@@ -319,6 +344,24 @@ async def _start_vllm_subprocess(
         engine_args = dict(self.config.get("engine_args", {}))
         if config and "engine_args" in config:
             engine_args.update(dict(config["engine_args"]))
+
+        for key in ("data_parallel_size", "data_parallel_size_local"):
+            value = engine_args.get(key)
+            if value is None:
+                continue
+            parsed_value = _parse_int_arg(key, value)
+            if parsed_value != inference_gpu_count:
+                raise ValueError(
+                    f"{key} must equal len(inference_gpu_ids) "
+                    f"({inference_gpu_count}) in dedicated mode"
+                )
+            engine_args[key] = parsed_value
+
+        if inference_gpu_count > 1:
+            engine_args.setdefault("data_parallel_size", inference_gpu_count)
+            engine_args.setdefault("data_parallel_size_local", inference_gpu_count)
+            engine_args.setdefault("distributed_executor_backend", "mp")
+            engine_args.setdefault("generation_config", "vllm")
 
         engine_args["enable_lora"] = True
         engine_args.setdefault("max_loras", 2)
diff --git a/tests/unit/test_dedicated_config.py b/tests/unit/test_dedicated_config.py
index 3de780ef..8e96b4b4 100644
--- a/tests/unit/test_dedicated_config.py
+++ b/tests/unit/test_dedicated_config.py
@@ -77,12 +77,48 @@ def test_overlapping_gpu_ids():
 
 
 def test_multi_gpu_inference():
-    with pytest.raises(ValueError, match="Multi-GPU inference not yet supported"):
+    validate_dedicated_config(
+        InternalModelConfig(trainer_gpu_ids=[0], inference_gpu_ids=[1, 2])
+    )
+
+
+def test_dedicated_data_parallel_size_must_match_inference_gpus():
+    with pytest.raises(ValueError, match="data_parallel_size must equal"):
         validate_dedicated_config(
-            InternalModelConfig(trainer_gpu_ids=[0], inference_gpu_ids=[1, 2])
+            InternalModelConfig(
+                trainer_gpu_ids=[0],
+                inference_gpu_ids=[1, 2],
+                engine_args={"data_parallel_size": 1},  # type: ignore[typeddict-item]
+            )
         )
 
 
+def test_dedicated_data_parallel_size_local_must_match_inference_gpus():
+    with pytest.raises(ValueError, match="data_parallel_size_local must equal"):
+        validate_dedicated_config(
+            InternalModelConfig(
+                trainer_gpu_ids=[0],
+                inference_gpu_ids=[1, 2],
+                engine_args={  # type: ignore[typeddict-item]
+                    "data_parallel_size_local": 1
+                },
+            )
+        )
+
+
+def test_dedicated_data_parallel_size_allows_matching_values():
+    validate_dedicated_config(
+        InternalModelConfig(
+            trainer_gpu_ids=[0],
+            inference_gpu_ids=[1, 2],
+            engine_args={  # type: ignore[typeddict-item]
+                "data_parallel_size": 2,
+                "data_parallel_size_local": 2,
+            },
+        )
+    )
+
+
 def test_trainer_not_starting_at_zero():
     with pytest.raises(ValueError, match="must start at GPU 0"):
         validate_dedicated_config(
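
Usage sketch: with these checks, a dedicated-mode config may list multiple
inference GPUs as long as any explicit data-parallel settings agree with the
GPU count. A minimal illustration mirroring the tests above; the
validate_dedicated_config path follows the diff, while the InternalModelConfig
import path is an assumption:

    from art.dev.validate import validate_dedicated_config  # path per the diff
    from art.dev import InternalModelConfig  # import path assumed

    # Two inference GPUs now pass validation outright.
    validate_dedicated_config(
        InternalModelConfig(trainer_gpu_ids=[0], inference_gpu_ids=[1, 2])
    )

    # A mismatched data_parallel_size is rejected before vLLM ever launches.
    try:
        validate_dedicated_config(
            InternalModelConfig(
                trainer_gpu_ids=[0],
                inference_gpu_ids=[1, 2],
                engine_args={"data_parallel_size": 1},
            )
        )
    except ValueError as err:
        # data_parallel_size must equal len(inference_gpu_ids) (2) in dedicated mode
        print(err)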