diff --git a/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm b/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
index a0c777af..a2d80a3f 100644
--- a/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
+++ b/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
@@ -34,11 +34,14 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
 FROM base AS vllm_base

 ARG VLLM_VERSION
+# VLLM_PIP_VERSION allows specifying a pip version string separately from VLLM_VERSION
+# (useful when the pip version contains '+' which is invalid in Docker image tags)
+ARG VLLM_PIP_VERSION=${VLLM_VERSION}
 ARG VLLM_WHEEL_INDEX=""
 ARG TORCH_VERSION=""
 RUN pip install --no-cache-dir --pre --upgrade \
     ${VLLM_WHEEL_INDEX:+--extra-index-url "$VLLM_WHEEL_INDEX"} \
-    "vllm[audio]==${VLLM_VERSION}" \
+    "vllm[audio]==${VLLM_PIP_VERSION}" \
     ${TORCH_VERSION:+"torch==${TORCH_VERSION}"}

 # Post-install fixups: ensure flashinfer and pplx-kernels match the new vllm/torch.
@@ -51,24 +54,6 @@ RUN FLASHINFER_CUDA_TAG="$(python3 -c 'import torch; print((torch.version.cuda o
         --extra-index-url "https://flashinfer.ai/whl/cu${FLASHINFER_CUDA_TAG}" && \
     pip uninstall -y pplx-kernels || true

-# Ensure pip-installed CUDA libs (matching torch) take priority over the stale
-# system-level 12.9.x libs in the base image at runtime. Write the path into a
-# file at build time, then source it via ENV.
-RUN python3 -c "\
-import os, importlib, pathlib
-dirs = []
-for pkg in ['nvidia.cublas','nvidia.cuda_runtime','nvidia.cuda_nvrtc',\
-'nvidia.curand','nvidia.cufft','nvidia.cusolver','nvidia.cusparse',\
-'nvidia.nvjitlink','nvidia.nvtx','nvidia.cudnn','nvidia.nccl','nvidia.cufile']:
-    try:
-        m = importlib.import_module(pkg)
-        d = str(pathlib.Path(m.__file__).parent / 'lib')
-        if os.path.isdir(d): dirs.append(d)
-    except ImportError: pass
-print(':'.join(dirs))
-" > /tmp/nvidia_lib_path.txt
-ENV NVIDIA_PIP_LIB_PATH_FILE=/tmp/nvidia_lib_path.txt
-
 FROM vllm_base AS vllm

 COPY model-engine/model_engine_server/inference/vllm/requirements.txt /workspace/requirements.txt
@@ -86,9 +71,11 @@ COPY model-engine/model_engine_server/inference/vllm/init_ray.sh /workspace/init

 # vLLM-specific startup instrumentation
 COPY model-engine/model_engine_server/inference/vllm/vllm_startup_wrapper.py /workspace/vllm_startup_wrapper.py
+COPY model-engine/model_engine_server/inference/vllm/entrypoint.sh /workspace/entrypoint.sh
+RUN chmod +x /workspace/entrypoint.sh

 # Need to override entrypoint from parent image
-ENTRYPOINT ["/bin/env"]
+ENTRYPOINT ["/workspace/entrypoint.sh"]

 FROM vllm_base AS vllm_omni

@@ -100,7 +87,7 @@ RUN pip uninstall -y vllm-omni || true
 COPY --from=vllm-omni-source . /tmp/vllm-omni-source/
 RUN if [ "$VLLM_OMNI_FROM_SOURCE" = "true" ]; then \
         echo "==> Installing vllm-omni from local source"; \
-        pip install --no-cache-dir /tmp/vllm-omni-source/; \
+        SETUPTOOLS_SCM_PRETEND_VERSION=${VLLM_OMNI_VERSION} pip install --no-cache-dir /tmp/vllm-omni-source/; \
     else \
         echo "==> Installing vllm-omni ${VLLM_OMNI_VERSION} from PyPI"; \
         pip install --no-cache-dir vllm-omni==${VLLM_OMNI_VERSION}; \
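A minimal sketch of how the VLLM_VERSION / VLLM_PIP_VERSION split might be exercised with a direct docker build. The version strings, tag, and target below are hypothetical placeholders; VLLM_PIP_VERSION can be omitted entirely when it matches VLLM_VERSION, since the ARG defaults to it.

    # Hypothetical example: the pip version carries '+cu129', which is not valid in a Docker tag.
    docker build -f model-engine/model_engine_server/inference/vllm/Dockerfile.vllm \
        --target vllm \
        --build-arg VLLM_VERSION=0.16.0 \
        --build-arg VLLM_PIP_VERSION=0.16.0+cu129 \
        -t vllm:0.16.0-mytag .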
diff --git a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh
index 89e40e52..18059a1f 100755
--- a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh
+++ b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh
@@ -31,7 +31,7 @@ set -eo pipefail
 #   SCCACHE_BUCKET, SCCACHE_REGION - S3 sccache config
 #
 # The image tag will be automatically constructed as:
-#   - For vllm_omni: {VLLM_VERSION}-omni-{VLLM_OMNI_VERSION}-{USER_TAG}
+#   - For vllm_omni: {VLLM_VERSION}-omni-{USER_TAG}
 #   - For others: {VLLM_VERSION}-{USER_TAG}
 #
 # Examples:
@@ -92,6 +92,7 @@ CUDA_ARCH=""
 SCCACHE_BUCKET=${SCCACHE_BUCKET:-""}
 SCCACHE_REGION=${SCCACHE_REGION:-"us-west-2"}
 VLLM_WHEEL_INDEX=""
+VLLM_PIP_VERSION=""
 VLLM_OMNI_VERSION=${VLLM_OMNI_VERSION:-"0.16.0"}
 VLLM_OMNI_SOURCE_DIR=""

@@ -125,6 +126,7 @@ declare -A FLAG_VARS=(
   ["--vllm-omni-source-dir"]="VLLM_OMNI_SOURCE_DIR"
   ["--vllm-omni-source-ref"]="VLLM_OMNI_SOURCE_REF"
   ["--vllm-wheel-index"]="VLLM_WHEEL_INDEX"
+  ["--vllm-pip-version"]="VLLM_PIP_VERSION"
 )

 # Parse keyword arguments
@@ -178,7 +180,6 @@ declare -A VLLM_WHEEL_INDEXES=(

 # Required torch version for each vllm version (when base image has a different torch)
 declare -A VLLM_TORCH_VERSIONS=(
-  ["0.16.0"]="2.10.0"
 )

 # Auto-resolve wheel index if not explicitly provided
@@ -255,7 +256,7 @@ fi

 # Construct image tag based on vllm version and user tag
 if [ "$BUILD_TARGET" == "vllm_omni" ]; then
-  IMAGE_TAG="${VLLM_VERSION}-omni-${VLLM_OMNI_VERSION}-${USER_TAG}"
+  IMAGE_TAG="${VLLM_VERSION}-omni-${USER_TAG}"
 else
   IMAGE_TAG="${VLLM_VERSION}-${USER_TAG}"
 fi
@@ -338,6 +339,11 @@ if [ -n "$VLLM_WHEEL_INDEX" ]; then
   BUILD_ARGS+=(--build-arg VLLM_WHEEL_INDEX="${VLLM_WHEEL_INDEX}")
 fi

+# Separate pip version (e.g. when version contains '+' which is invalid in Docker tags)
+if [ -n "$VLLM_PIP_VERSION" ]; then
+  BUILD_ARGS+=(--build-arg VLLM_PIP_VERSION="${VLLM_PIP_VERSION}")
+fi
+
 # Explicit torch version (needed when base image has incompatible torch)
 if [ -n "$TORCH_VERSION" ]; then
   BUILD_ARGS+=(--build-arg TORCH_VERSION="${TORCH_VERSION}")
diff --git a/model-engine/model_engine_server/inference/vllm/entrypoint.sh b/model-engine/model_engine_server/inference/vllm/entrypoint.sh
new file mode 100755
index 00000000..51937b92
--- /dev/null
+++ b/model-engine/model_engine_server/inference/vllm/entrypoint.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Prepend pip-installed NVIDIA CUDA lib paths so they take priority over stale
+# system-level libs (e.g. CUDA 12.9.x in the base image).
+if [ -n "$NVIDIA_PIP_LIB_PATH_FILE" ] && [ -f "$NVIDIA_PIP_LIB_PATH_FILE" ]; then
+  _nvidia_lib_path="$(cat "$NVIDIA_PIP_LIB_PATH_FILE")"
+  if [ -n "$_nvidia_lib_path" ]; then
+    export LD_LIBRARY_PATH="${_nvidia_lib_path}:${LD_LIBRARY_PATH:-}"
+  fi
+fi
+exec "$@"
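A rough illustration of what the new entrypoint does at container start, assuming NVIDIA_PIP_LIB_PATH_FILE is set in the environment and points at a file of colon-separated pip-installed CUDA library directories; the file path and wrapped command below are hypothetical.

    # Hypothetical path file contents: .../dist-packages/nvidia/cublas/lib:.../nvidia/cudnn/lib
    NVIDIA_PIP_LIB_PATH_FILE=/tmp/nvidia_lib_path.txt \
        /workspace/entrypoint.sh python3 -c 'import os; print(os.environ["LD_LIBRARY_PATH"])'
    # The script prepends the file's contents to LD_LIBRARY_PATH and then exec's the wrapped
    # command (python3 here), so the pip-installed libs win over the base image's system libs.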
diff --git a/model-engine/model_engine_server/inference/vllm/requirements.txt b/model-engine/model_engine_server/inference/vllm/requirements.txt
index 0a13640c..e3e5917f 100644
--- a/model-engine/model_engine_server/inference/vllm/requirements.txt
+++ b/model-engine/model_engine_server/inference/vllm/requirements.txt
@@ -1,4 +1,5 @@
 pydantic>=2.0
+transformers>=5.0.0
 opentelemetry-api>=1.39.1
 opentelemetry-sdk>=1.39.1
 opentelemetry-exporter-otlp-proto-grpc>=1.39.1
\ No newline at end of file
diff --git a/model-engine/model_engine_server/inference/vllm/utils/startup_telemetry.py b/model-engine/model_engine_server/inference/vllm/utils/startup_telemetry.py
index 4e2a1d30..7bb26916 100644
--- a/model-engine/model_engine_server/inference/vllm/utils/startup_telemetry.py
+++ b/model-engine/model_engine_server/inference/vllm/utils/startup_telemetry.py
@@ -287,4 +287,4 @@ async def with_startup_metrics(
     )
     health_thread.start()

-    return await func(*args)
+    return await func(args)
diff --git a/model-engine/model_engine_server/inference/vllm/vllm_omni_server.py b/model-engine/model_engine_server/inference/vllm/vllm_omni_server.py
index aff90413..6cbcd6ac 100644
--- a/model-engine/model_engine_server/inference/vllm/vllm_omni_server.py
+++ b/model-engine/model_engine_server/inference/vllm/vllm_omni_server.py
@@ -7,17 +7,30 @@
 # Now do heavy imports (noqa: E402 - intentional late import for startup time measurement)
 import asyncio  # noqa: E402

+from utils.resource_debug import check_unknown_startup_memory_usage  # noqa: E402
+from utils.startup_telemetry import with_startup_metrics  # noqa: E402
 from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
 from vllm.utils.argparse_utils import FlexibleArgumentParser  # noqa: E402
 from vllm_omni.entrypoints.openai.api_server import omni_run_server  # noqa: E402

-from .utils.resource_debug import check_unknown_startup_memory_usage  # noqa: E402
-from .utils.startup_telemetry import with_startup_metrics  # noqa: E402
-
 if __name__ == "__main__":
     check_unknown_startup_memory_usage()

     parser = make_arg_parser(FlexibleArgumentParser())
+    parser.add_argument(
+        "--init-timeout",
+        type=int,
+        default=600,
+        dest="init_timeout",
+        help="Timeout in seconds for waiting for all stages to initialize.",
+    )
+    parser.add_argument(
+        "--stage-init-timeout",
+        type=int,
+        default=300,
+        dest="stage_init_timeout",
+        help="Per-stage init watchdog timeout in seconds.",
+    )
     args = parser.parse_args()
     if args.attention_backend is not None:
         os.environ["VLLM_ATTENTION_BACKEND"] = args.attention_backend
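A hedged example of launching the omni server with the two new flags. The /workspace location, model name, and port are placeholders, and how omni_run_server consumes init_timeout and stage_init_timeout downstream is assumed rather than shown in this hunk.

    python3 /workspace/vllm_omni_server.py \
        --model my-org/my-omni-model \
        --port 5005 \
        --init-timeout 900 \
        --stage-init-timeout 450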
diff --git a/model-engine/model_engine_server/inference/vllm/vllm_overrides/omni/config/model.py b/model-engine/model_engine_server/inference/vllm/vllm_overrides/omni/config/model.py
index 7ac86707..a8571833 100644
--- a/model-engine/model_engine_server/inference/vllm/vllm_overrides/omni/config/model.py
+++ b/model-engine/model_engine_server/inference/vllm/vllm_overrides/omni/config/model.py
@@ -59,6 +59,7 @@ class OmniModelConfig(ModelConfig):
     )
     omni_kv_config: dict | None = None
     codec_frame_rate_hz: float | None = None
+    task_type: str | None = None

     @property
     def registry(self):
diff --git a/model-engine/model_engine_server/inference/vllm/vllm_server.py b/model-engine/model_engine_server/inference/vllm/vllm_server.py
index 1b34fc8a..a30bcc3e 100644
--- a/model-engine/model_engine_server/inference/vllm/vllm_server.py
+++ b/model-engine/model_engine_server/inference/vllm/vllm_server.py
@@ -13,12 +13,11 @@
 import threading  # noqa: E402
 from logging import Logger  # noqa: E402

+from utils.resource_debug import check_unknown_startup_memory_usage  # noqa: E402
 from vllm.entrypoints.openai.api_server import run_server  # noqa: E402
 from vllm.entrypoints.openai.cli_args import make_arg_parser  # noqa: E402
 from vllm.utils.argparse_utils import FlexibleArgumentParser  # noqa: E402

-from .utils.resource_debug import check_unknown_startup_memory_usage  # noqa: E402
-
 logger = Logger("vllm_server")
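Both servers now import the helpers as a top-level utils package rather than the package-relative .utils, which assumes the files are run as plain scripts with the directory containing utils/ on sys.path (Python adds a script's own directory automatically). A sketch under that assumption, with /workspace and the flags as placeholders:

    # Launching the file directly makes its directory the first sys.path entry, so `utils` resolves.
    python3 /workspace/vllm_server.py --model my-org/my-model --port 5005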