Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,14 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
FROM base AS vllm_base

ARG VLLM_VERSION
# VLLM_PIP_VERSION allows specifying a pip version string separately from VLLM_VERSION
# (useful when the pip version contains '+' which is invalid in Docker image tags)
ARG VLLM_PIP_VERSION=${VLLM_VERSION}
ARG VLLM_WHEEL_INDEX=""
ARG TORCH_VERSION=""
RUN pip install --no-cache-dir --pre --upgrade \
${VLLM_WHEEL_INDEX:+--extra-index-url "$VLLM_WHEEL_INDEX"} \
"vllm[audio]==${VLLM_VERSION}" \
"vllm[audio]==${VLLM_PIP_VERSION}" \
${TORCH_VERSION:+"torch==${TORCH_VERSION}"}

# Post-install fixups: ensure flashinfer and pplx-kernels match the new vllm/torch.
Expand All @@ -54,19 +57,20 @@ RUN FLASHINFER_CUDA_TAG="$(python3 -c 'import torch; print((torch.version.cuda o
# Ensure pip-installed CUDA libs (matching torch) take priority over the stale
# system-level 12.9.x libs in the base image at runtime. Write the path into a
# file at build time, then source it via ENV.
RUN python3 -c "\
RUN python3 - > /tmp/nvidia_lib_path.txt <<'PYEOF'
import os, importlib, pathlib
dirs = []
for pkg in ['nvidia.cublas','nvidia.cuda_runtime','nvidia.cuda_nvrtc',\
'nvidia.curand','nvidia.cufft','nvidia.cusolver','nvidia.cusparse',\
for pkg in ['nvidia.cublas','nvidia.cuda_runtime','nvidia.cuda_nvrtc',
'nvidia.curand','nvidia.cufft','nvidia.cusolver','nvidia.cusparse',
'nvidia.nvjitlink','nvidia.nvtx','nvidia.cudnn','nvidia.nccl','nvidia.cufile']:
try:
m = importlib.import_module(pkg)
if m.__file__ is None: continue
d = str(pathlib.Path(m.__file__).parent / 'lib')
if os.path.isdir(d): dirs.append(d)
except ImportError: pass
print(':'.join(dirs))
" > /tmp/nvidia_lib_path.txt
PYEOF
ENV NVIDIA_PIP_LIB_PATH_FILE=/tmp/nvidia_lib_path.txt

FROM vllm_base AS vllm
Expand All @@ -86,9 +90,11 @@ COPY model-engine/model_engine_server/inference/vllm/init_ray.sh /workspace/init

# vLLM-specific startup instrumentation
COPY model-engine/model_engine_server/inference/vllm/vllm_startup_wrapper.py /workspace/vllm_startup_wrapper.py
COPY model-engine/model_engine_server/inference/vllm/entrypoint.sh /workspace/entrypoint.sh
RUN chmod +x /workspace/entrypoint.sh

# Need to override entrypoint from parent image
ENTRYPOINT ["/bin/env"]
ENTRYPOINT ["/workspace/entrypoint.sh"]

FROM vllm_base AS vllm_omni

Expand All @@ -100,7 +106,7 @@ RUN pip uninstall -y vllm-omni || true
COPY --from=vllm-omni-source . /tmp/vllm-omni-source/
RUN if [ "$VLLM_OMNI_FROM_SOURCE" = "true" ]; then \
echo "==> Installing vllm-omni from local source"; \
pip install --no-cache-dir /tmp/vllm-omni-source/; \
SETUPTOOLS_SCM_PRETEND_VERSION=${VLLM_OMNI_VERSION} pip install --no-cache-dir /tmp/vllm-omni-source/; \
else \
echo "==> Installing vllm-omni ${VLLM_OMNI_VERSION} from PyPI"; \
pip install --no-cache-dir vllm-omni==${VLLM_OMNI_VERSION}; \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ set -eo pipefail
# SCCACHE_BUCKET, SCCACHE_REGION - S3 sccache config
#
# The image tag will be automatically constructed as:
# - For vllm_omni: {VLLM_VERSION}-omni-{VLLM_OMNI_VERSION}-{USER_TAG}
# - For vllm_omni: {VLLM_VERSION}-omni-{USER_TAG}
# - For others: {VLLM_VERSION}-{USER_TAG}
#
# Examples:
Expand Down Expand Up @@ -92,6 +92,7 @@ CUDA_ARCH=""
SCCACHE_BUCKET=${SCCACHE_BUCKET:-""}
SCCACHE_REGION=${SCCACHE_REGION:-"us-west-2"}
VLLM_WHEEL_INDEX=""
VLLM_PIP_VERSION=""

VLLM_OMNI_VERSION=${VLLM_OMNI_VERSION:-"0.16.0"}
VLLM_OMNI_SOURCE_DIR=""
Expand Down Expand Up @@ -125,6 +126,7 @@ declare -A FLAG_VARS=(
["--vllm-omni-source-dir"]="VLLM_OMNI_SOURCE_DIR"
["--vllm-omni-source-ref"]="VLLM_OMNI_SOURCE_REF"
["--vllm-wheel-index"]="VLLM_WHEEL_INDEX"
["--vllm-pip-version"]="VLLM_PIP_VERSION"
)

# Parse keyword arguments
Expand Down Expand Up @@ -178,7 +180,6 @@ declare -A VLLM_WHEEL_INDEXES=(

# Required torch version for each vllm version (when base image has a different torch)
declare -A VLLM_TORCH_VERSIONS=(
["0.16.0"]="2.10.0"
)

# Auto-resolve wheel index if not explicitly provided
Expand Down Expand Up @@ -255,7 +256,7 @@ fi

# Construct image tag based on vllm version and user tag
if [ "$BUILD_TARGET" == "vllm_omni" ]; then
IMAGE_TAG="${VLLM_VERSION}-omni-${VLLM_OMNI_VERSION}-${USER_TAG}"
IMAGE_TAG="${VLLM_VERSION}-omni-${USER_TAG}"
else
IMAGE_TAG="${VLLM_VERSION}-${USER_TAG}"
fi
Expand Down Expand Up @@ -338,6 +339,11 @@ if [ -n "$VLLM_WHEEL_INDEX" ]; then
BUILD_ARGS+=(--build-arg VLLM_WHEEL_INDEX="${VLLM_WHEEL_INDEX}")
fi

# Separate pip version (e.g. when version contains '+' which is invalid in Docker tags)
if [ -n "$VLLM_PIP_VERSION" ]; then
BUILD_ARGS+=(--build-arg VLLM_PIP_VERSION="${VLLM_PIP_VERSION}")
fi

# Explicit torch version (needed when base image has incompatible torch)
if [ -n "$TORCH_VERSION" ]; then
BUILD_ARGS+=(--build-arg TORCH_VERSION="${TORCH_VERSION}")
Expand Down
10 changes: 10 additions & 0 deletions model-engine/model_engine_server/inference/vllm/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
# Container entrypoint: prepend pip-installed NVIDIA CUDA lib paths so they
# take priority over stale system-level libs (e.g. CUDA 12.9.x in the base
# image), then exec the requested command.
#
# NVIDIA_PIP_LIB_PATH_FILE points at a build-time-generated file containing a
# colon-separated list of lib directories (see the Dockerfile's heredoc RUN).
if [ -n "$NVIDIA_PIP_LIB_PATH_FILE" ] && [ -f "$NVIDIA_PIP_LIB_PATH_FILE" ]; then
    _nvidia_lib_path="$(cat "$NVIDIA_PIP_LIB_PATH_FILE")"
    if [ -n "$_nvidia_lib_path" ]; then
        # Only append the previous value when it is non-empty: a trailing ':'
        # leaves an empty element in LD_LIBRARY_PATH, which the dynamic linker
        # interprets as the current working directory — surprising and a
        # potential security hazard.
        if [ -n "${LD_LIBRARY_PATH:-}" ]; then
            export LD_LIBRARY_PATH="${_nvidia_lib_path}:${LD_LIBRARY_PATH}"
        else
            export LD_LIBRARY_PATH="${_nvidia_lib_path}"
        fi
    fi
fi
# Replace the shell so the real workload becomes PID 1 and receives signals
# (SIGTERM from `docker stop`) directly.
exec "$@"
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pydantic>=2.0
transformers>=5.0.0
opentelemetry-api>=1.39.1
opentelemetry-sdk>=1.39.1
opentelemetry-exporter-otlp-proto-grpc>=1.39.1
Original file line number Diff line number Diff line change
Expand Up @@ -287,4 +287,4 @@ async def with_startup_metrics(
)
health_thread.start()

return await func(*args)
return await func(args)
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,30 @@
# Now do heavy imports (noqa: E402 - intentional late import for startup time measurement)
import asyncio # noqa: E402

from utils.resource_debug import check_unknown_startup_memory_usage # noqa: E402
from utils.startup_telemetry import with_startup_metrics # noqa: E402
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
from vllm.utils.argparse_utils import FlexibleArgumentParser # noqa: E402
from vllm_omni.entrypoints.openai.api_server import omni_run_server # noqa: E402

from .utils.resource_debug import check_unknown_startup_memory_usage # noqa: E402
from .utils.startup_telemetry import with_startup_metrics # noqa: E402

if __name__ == "__main__":
check_unknown_startup_memory_usage()

parser = make_arg_parser(FlexibleArgumentParser())
parser.add_argument(
"--init-timeout",
type=int,
default=600,
dest="init_timeout",
help="Timeout in seconds for waiting for all stages to initialize.",
)
parser.add_argument(
"--stage-init-timeout",
type=int,
default=300,
dest="stage_init_timeout",
help="Per-stage init watchdog timeout in seconds.",
)
args = parser.parse_args()
if args.attention_backend is not None:
os.environ["VLLM_ATTENTION_BACKEND"] = args.attention_backend
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class OmniModelConfig(ModelConfig):
)
omni_kv_config: dict | None = None
codec_frame_rate_hz: float | None = None
task_type: str | None = None

@property
def registry(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@
import threading # noqa: E402
from logging import Logger # noqa: E402

from utils.resource_debug import check_unknown_startup_memory_usage # noqa: E402
from vllm.entrypoints.openai.api_server import run_server # noqa: E402
from vllm.entrypoints.openai.cli_args import make_arg_parser # noqa: E402
from vllm.utils.argparse_utils import FlexibleArgumentParser # noqa: E402

from .utils.resource_debug import check_unknown_startup_memory_usage # noqa: E402

logger = Logger("vllm_server")


Expand Down