From 5ae54bba537e281c5e7d42814890a1d77e30bb4d Mon Sep 17 00:00:00 2001 From: zxy Date: Mon, 1 Jun 2026 12:26:12 +0800 Subject: [PATCH 1/5] feat: add multimodal metrics --- docs/en/advance/metrics.md | 16 +++ docs/zh_cn/advance/metrics.md | 15 +++ lmdeploy/cli/serve.py | 4 + lmdeploy/cli/utils.py | 8 ++ lmdeploy/messages.py | 4 + lmdeploy/metrics/loggers.py | 93 +++++++++++++++- lmdeploy/metrics/metrics_processor.py | 17 ++- lmdeploy/metrics/stats.py | 96 ++++++++++++++++ lmdeploy/serve/core/async_engine.py | 13 ++- lmdeploy/serve/openai/api_server.py | 4 +- lmdeploy/serve/processors/multimodal.py | 104 ++++++++++++------ .../test_lmdeploy/test_metrics_multimodal.py | 81 ++++++++++++++ 12 files changed, 413 insertions(+), 42 deletions(-) create mode 100644 tests/test_lmdeploy/test_metrics_multimodal.py diff --git a/docs/en/advance/metrics.md b/docs/en/advance/metrics.md index 1f05807d9a..fe952072cb 100644 --- a/docs/en/advance/metrics.md +++ b/docs/en/advance/metrics.md @@ -23,6 +23,22 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics Replace the model path according to your needs. By default, the metrics endpoint will be available at `http://:23333/metrics`. +For VLM serving, total multimodal preprocessing metrics are exported when the metrics system is enabled. Use +`--enable-mm-metrics` to additionally export per-stage metrics: + +``` +lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics +``` + +The multimodal metrics include: + +- `lmdeploy:multimodal_requests_total`: multimodal request count +- `lmdeploy:multimodal_items_total`: multimodal item count by modality +- `lmdeploy:multimodal_preprocess_time_seconds`: total multimodal preprocessing time +- `lmdeploy:multimodal_stage_time_seconds`: detailed stage time by stage and modality, exported with `--enable-mm-metrics` +- `lmdeploy:multimodal_item_count`: item count per request by modality, exported with `--enable-mm-metrics` +- `lmdeploy:multimodal_processing_failures_total`: processing failures by stage and modality, exported with `--enable-mm-metrics` + 2. **Navigate to the monitoring directory** ``` diff --git a/docs/zh_cn/advance/metrics.md b/docs/zh_cn/advance/metrics.md index 7eb4eec5cc..3928d5e60e 100644 --- a/docs/zh_cn/advance/metrics.md +++ b/docs/zh_cn/advance/metrics.md @@ -22,6 +22,21 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics 请根据需求替换模型路径。默认 metrics endpoint 位于 `http://:23333/metrics`。 +VLM 服务在启用指标系统后会默认导出多模态预处理总耗时指标。使用 `--enable-mm-metrics` 可额外导出分阶段指标: + +``` +lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics +``` + +多模态指标包括: + +- `lmdeploy:multimodal_requests_total`:多模态请求数量 +- `lmdeploy:multimodal_items_total`:按模态统计的多模态输入数量 +- `lmdeploy:multimodal_preprocess_time_seconds`:多模态预处理总耗时 +- `lmdeploy:multimodal_stage_time_seconds`:按阶段和模态统计的耗时,使用 `--enable-mm-metrics` 导出 +- `lmdeploy:multimodal_item_count`:按模态统计的单请求输入数量,使用 `--enable-mm-metrics` 导出 +- `lmdeploy:multimodal_processing_failures_total`:按阶段和模态统计的处理失败次数,使用 `--enable-mm-metrics` 导出 + 2. **进入监控目录** ``` diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 9ca662b961..399a17297f 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -122,6 +122,7 @@ def add_parser_api_server(): model_format = ArgumentHelper.model_format(pt_group) hf_overrides = ArgumentHelper.hf_overrides(pt_group) disable_metrics = ArgumentHelper.disable_metrics(pt_group) + enable_mm_metrics = ArgumentHelper.enable_mm_metrics(pt_group) dp = ArgumentHelper.dp(pt_group) ArgumentHelper.ep(pt_group) ArgumentHelper.enable_microbatch(pt_group) @@ -149,6 +150,7 @@ def add_parser_api_server(): tb_group._group_actions.append(node_rank_act) tb_group._group_actions.append(hf_overrides) tb_group._group_actions.append(disable_metrics) + tb_group._group_actions.append(enable_mm_metrics) tb_group._group_actions.append(dp) ArgumentHelper.cp(tb_group) ArgumentHelper.rope_scaling_factor(tb_group) @@ -239,6 +241,7 @@ def api_server(args): enable_microbatch=args.enable_microbatch, enable_eplb=args.enable_eplb, enable_metrics=not args.disable_metrics, + enable_mm_metrics=args.enable_mm_metrics, role=EngineRole[args.role], migration_backend=MigrationBackend[args.migration_backend], model_format=args.model_format, @@ -275,6 +278,7 @@ def api_server(args): async_=args.async_, communicator=args.communicator, enable_metrics=not args.disable_metrics, + enable_mm_metrics=args.enable_mm_metrics, hf_overrides=args.hf_overrides) chat_template_config = get_chat_template(args.chat_template, args.model_path) speculative_config = get_speculative_config(args) diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 2c9f8034f6..3efa66d2bb 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -677,6 +677,14 @@ def disable_metrics(parser): default=False, help='disable metrics system') + @staticmethod + def enable_mm_metrics(parser): + """Add argument enable_mm_metrics to parser.""" + return parser.add_argument('--enable-mm-metrics', + action='store_true', + default=False, + help='enable detailed multimodal metrics') + # For Disaggregation @staticmethod def role(parser): diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index aec9e08ee6..4a1682c4d2 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -269,6 +269,7 @@ class TurbomindEngineConfig: hf_overrides: Huggingface overrides for the model. It can be used to override the default config of the model enable_metrics: enable metrics system + enable_mm_metrics: enable detailed multimodal metrics """ dtype: str = 'auto' @@ -307,6 +308,7 @@ class TurbomindEngineConfig: communicator: str = 'nccl' hf_overrides: dict[str, Any] | None = None enable_metrics: bool = True + enable_mm_metrics: bool = False def __post_init__(self): """Check input validation.""" @@ -382,6 +384,7 @@ class PytorchEngineConfig: enable_microbatch: enable microbatch for specified model enable_eplb: enable eplb for specified model enable_metrics: enable metrics system + enable_mm_metrics: enable detailed multimodal metrics role: role of engin, options: ['Hybrid', 'Prefill', 'Decode']. Default to `EngineRole.Hybrid`. migration_backend: migration backend. options: ['DLSlime']. @@ -436,6 +439,7 @@ class PytorchEngineConfig: mp_engine_backend: str = 'mp' model_format: str = None enable_metrics: bool = True + enable_mm_metrics: bool = False hf_overrides: dict[str, Any] | None = None disable_vision_encoder: bool = False logprobs_mode: str = None diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py index 69e24ede4f..417dbedaf3 100644 --- a/lmdeploy/metrics/loggers.py +++ b/lmdeploy/metrics/loggers.py @@ -7,7 +7,13 @@ import numpy as np -from lmdeploy.metrics.stats import IterationStats, RequestStats, SchedulerStats, SpeculativeDecodingStats +from lmdeploy.metrics.stats import ( + IterationStats, + MultimodalStats, + RequestStats, + SchedulerStats, + SpeculativeDecodingStats, +) from lmdeploy.utils import get_logger logger = get_logger('lmdeploy') @@ -27,6 +33,9 @@ def record_iteration(self, stats: IterationStats) -> None: def record_specdecode(self, stats: SpeculativeDecodingStats) -> None: ... + def record_multimodal(self, stats: MultimodalStats) -> None: + ... + def log(self): # noqa pass @@ -42,6 +51,9 @@ def _reset(self, now): self.last_log_time = now self.total_prompt_tokens = 0 self.total_generation_tokens = 0 + self.total_multimodal_requests = 0 + self.total_multimodal_items = 0 + self.total_multimodal_preprocess_time = 0.0 # spec decode self.num_drafts: int = 0 self.num_draft_tokens: int = 0 @@ -72,6 +84,15 @@ def record_specdecode(self, stats: SpeculativeDecodingStats): def record_finish(self, stats: RequestStats): pass + def record_multimodal(self, stats: MultimodalStats) -> None: + if not stats.has_data: + return + + total_time, _, item_counts, _ = stats.snapshot() + self.total_multimodal_requests += 1 + self.total_multimodal_items += sum(item_counts.values()) + self.total_multimodal_preprocess_time += total_time + def get_spec_msg(self): """Get spec decoding logging msg.""" if self.num_drafts == 0: @@ -98,7 +119,8 @@ def log(self): now = time.perf_counter() # skip logging if no tokens were processed - if self.total_prompt_tokens == 0 and self.total_generation_tokens == 0: + if (self.total_prompt_tokens == 0 and self.total_generation_tokens == 0 + and self.total_multimodal_requests == 0): self._reset(now) return @@ -109,7 +131,8 @@ def log(self): spec_msg = self.get_spec_msg() # format and print - log_msg = (f"[{datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')} DP{self.dp_rank}] " + log_msg = (f"[{datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')} " + f'Engine {self.dp_rank:03d}] ' f'Avg thr (in/out): {prompt_throughput:.1f} / {generation_throughput:.1f} tokens/s, ' f'Server (succeeded/failed/routed/waiting): ' f'{scheduler_stats.num_succeeded_reqs} / {scheduler_stats.num_failed_reqs} / ' @@ -121,6 +144,10 @@ def log(self): if scheduler_stats.prefix_cache_hit_rate != 0: log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%, ' + if self.total_multimodal_requests: + avg_mm_time = self.total_multimodal_preprocess_time / self.total_multimodal_requests + log_msg += f'Avg MM preprocess: {avg_mm_time:.3f} s/req, ' + if spec_msg is not None: log_msg += spec_msg @@ -130,7 +157,7 @@ def log(self): class PrometheusStatLogger(StatLoggerBase): - def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0): + def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable_mm_metrics: bool = False): try: import prometheus_client prometheus_client.disable_created_metrics() # disable noisy creation timestamp gauge in prometheus @@ -139,6 +166,7 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0): 'To use metrics system , please install prometheus_client by `pip install prometheus_client`') self.dp_rank = dp_rank + self.enable_mm_metrics = enable_mm_metrics # unregister any existing lmdeploy collectors for collector in list(prometheus_client.REGISTRY._collector_to_names): @@ -151,6 +179,7 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0): labelnames = ['model_name', 'engine'] labelvalues = [model_name, str(dp_rank)] + self.labelvalues = labelvalues # # Scheduler stats @@ -350,6 +379,39 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0): buckets=request_latency_buckets, labelnames=labelnames).labels(*labelvalues) + # + # Multimodal preprocessing + # + self.counter_multimodal_requests = prometheus_client.Counter( + name='lmdeploy:multimodal_requests_total', + documentation='Count of multimodal requests.', + labelnames=labelnames).labels(*labelvalues) + self.counter_multimodal_items = prometheus_client.Counter( + name='lmdeploy:multimodal_items_total', + documentation='Count of multimodal input items by modality.', + labelnames=labelnames + ['modality']) + self.histogram_multimodal_preprocess_time = prometheus_client.Histogram( + name='lmdeploy:multimodal_preprocess_time_seconds', + documentation='Histogram of multimodal preprocessing time in seconds.', + buckets=request_latency_buckets, + labelnames=labelnames).labels(*labelvalues) + + if self.enable_mm_metrics: + self.histogram_multimodal_stage_time = prometheus_client.Histogram( + name='lmdeploy:multimodal_stage_time_seconds', + documentation='Histogram of multimodal preprocessing stage time in seconds.', + buckets=request_latency_buckets, + labelnames=labelnames + ['stage', 'modality']) + self.histogram_multimodal_item_count = prometheus_client.Histogram( + name='lmdeploy:multimodal_item_count', + documentation='Histogram of multimodal item counts per request.', + buckets=[1, 2, 4, 8, 16, 32, 64, 128, 256], + labelnames=labelnames + ['modality']) + self.counter_multimodal_failures = prometheus_client.Counter( + name='lmdeploy:multimodal_processing_failures_total', + documentation='Count of multimodal preprocessing failures.', + labelnames=labelnames + ['stage', 'modality']) + def record_schedule(self, stats: SchedulerStats) -> None: """Report schedule metrics to prometheus.""" self.gauge_scheduler_succeeded.set(stats.num_succeeded_reqs) @@ -391,6 +453,29 @@ def _get_counter_value(counter) -> float: """Get the current value from a prometheus counter child.""" return counter._value.get() + def record_multimodal(self, stats: MultimodalStats) -> None: + if not stats.has_data: + return + + total_time, stage_times, item_counts, failures = stats.snapshot() + labelvalues = self.labelvalues + self.counter_multimodal_requests.inc() + if total_time > 0: + self.histogram_multimodal_preprocess_time.observe(total_time) + + for modality, count in item_counts.items(): + self.counter_multimodal_items.labels(*(labelvalues + [modality])).inc(count) + + if not self.enable_mm_metrics: + return + + for (stage, modality), seconds in stage_times.items(): + self.histogram_multimodal_stage_time.labels(*(labelvalues + [stage, modality])).observe(seconds) + for modality, count in item_counts.items(): + self.histogram_multimodal_item_count.labels(*(labelvalues + [modality])).observe(count) + for (stage, modality), count in failures.items(): + self.counter_multimodal_failures.labels(*(labelvalues + [stage, modality])).inc(count) + def record_specdecode(self, stats: SpeculativeDecodingStats) -> None: """Report speculative decoding metrics to prometheus.""" if stats.num_drafts <= 0: diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py index 81db4fa57d..f67218153d 100644 --- a/lmdeploy/metrics/metrics_processor.py +++ b/lmdeploy/metrics/metrics_processor.py @@ -5,7 +5,7 @@ from lmdeploy.pytorch.utils import singleton from lmdeploy.utils import get_logger -from .stats import SchedulerStats +from .stats import MultimodalStats, SchedulerStats logger = get_logger('lmdeploy') @@ -17,14 +17,16 @@ class MetricsProcessor: def __init__(self): """Init metrics processor.""" self.enable_metrics: bool = False + self.enable_mm_metrics: bool = False self.scheduler_stats = SchedulerStats() self.stat_loggers = [] self.metrics_queue: asyncio.Queue = None self.metrics_handler: asyncio.Task = None - def start_metrics_handler(self, enable_metrics: bool): + def start_metrics_handler(self, enable_metrics: bool, enable_mm_metrics: bool = False): """Start metrics handler.""" self.enable_metrics = enable_metrics + self.enable_mm_metrics = enable_mm_metrics if enable_metrics and self.metrics_handler is None: self.metrics_queue = asyncio.Queue() self.metrics_handler = asyncio.create_task(self._run_metrics_handler()) @@ -93,6 +95,17 @@ def queue_update(self, update_data: tuple): return self.metrics_queue.put_nowait(update_data) + def record_multimodal(self, stats: MultimodalStats | None): + """Record multimodal preprocessing stats.""" + if not self.enable_metrics or stats is None: + return + stats.finish() + if not stats.mark_emitted(): + return + + for stat_logger in self.stat_loggers: + stat_logger.record_multimodal(stats) + def increase_total_requests(self): """Increase total requests.""" self.scheduler_stats.num_total_reqs += 1 diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py index 7aa8e9efa5..31a2a5e895 100644 --- a/lmdeploy/metrics/stats.py +++ b/lmdeploy/metrics/stats.py @@ -2,7 +2,11 @@ # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/v1/metrics/stats.py import time +from collections import defaultdict +from contextlib import contextmanager from dataclasses import dataclass +from threading import Lock +from typing import Iterator import numpy as np @@ -94,6 +98,98 @@ def update_from_schedule_metrics(self, scheduled_metrics: ScheduleMetrics): self.prefix_cache_hit_rate = scheduled_metrics.prefix_cache_hit_rate +class MultimodalStats: + """Stats associated with multimodal prompt preprocessing.""" + + def __init__(self, enabled: bool = True, detailed: bool = False): + """Initialize multimodal preprocessing stats. + + Args: + enabled (bool): Whether to record multimodal metrics. + detailed (bool): Whether to collect per-stage timing and failure + metrics in addition to total request-level metrics. + """ + self.enabled = enabled + self.detailed = detailed + self.start_time = time.perf_counter() + self.total_time: float = 0.0 + self.stage_times: dict[tuple[str, str], float] = defaultdict(float) + self.item_counts: dict[str, int] = defaultdict(int) + self.failures: dict[tuple[str, str], int] = defaultdict(int) + self.finished = False + # Avoid duplicate emission if success/error paths both try to record. + self._emitted = False + # Multimodal item parsing can update this object from executor threads. + self._lock = Lock() + + def add_stage(self, stage: str, seconds: float, modality='all') -> None: + """Add elapsed time for a multimodal preprocessing stage.""" + if not self.enabled: + return + seconds = max(float(seconds), 0.0) + with self._lock: + self.stage_times[(stage, modality)] += seconds + + def add_item(self, modality, count: int = 1) -> None: + """Record multimodal input item count by modality.""" + if not self.enabled: + return + with self._lock: + self.item_counts[modality] += count + + def record_failure(self, stage: str = 'total', modality='all') -> None: + """Record a multimodal preprocessing failure.""" + if not self.enabled: + return + with self._lock: + self.failures[(stage, modality)] += 1 + + @contextmanager + def span(self, stage: str, modality='all') -> Iterator[None]: + """Measure a multimodal stage when detailed metrics are enabled.""" + if not self.enabled or not self.detailed: + yield + return + + start = time.perf_counter() + try: + yield + finally: + self.add_stage(stage, time.perf_counter() - start, modality) + + def finish(self) -> None: + """Mark total multimodal preprocessing time.""" + if not self.enabled: + return + with self._lock: + if not self.finished: + self.total_time = time.perf_counter() - self.start_time + self.finished = True + + @property + def has_data(self) -> bool: + """Whether this object contains any multimodal metric data.""" + with self._lock: + return bool(self.item_counts or self.stage_times or self.failures) + + def mark_emitted(self) -> bool: + """Mark this stats object as emitted. + + Returns: + bool: True if this call marked it for the first time. + """ + with self._lock: + if self._emitted or not (self.item_counts or self.stage_times or self.failures): + return False + self._emitted = True + return True + + def snapshot(self) -> tuple[float, dict[tuple[str, str], float], dict[str, int], dict[tuple[str, str], int]]: + """Return a thread-safe copy of the collected stats.""" + with self._lock: + return self.total_time, dict(self.stage_times), dict(self.item_counts), dict(self.failures) + + class RequestStats: """Stats associated with a request.""" diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py index 48749bfa75..ccaa67fe32 100644 --- a/lmdeploy/serve/core/async_engine.py +++ b/lmdeploy/serve/core/async_engine.py @@ -23,7 +23,7 @@ TurbomindEngineConfig, ) from lmdeploy.metrics.metrics_processor import metrics_processor -from lmdeploy.metrics.stats import IterationStats, RequestStats, SpeculativeDecodingStats +from lmdeploy.metrics.stats import IterationStats, MultimodalStats, RequestStats, SpeculativeDecodingStats from lmdeploy.model import ChatTemplateConfig, get_chat_template from lmdeploy.pytorch.disagg.conn.protocol import ( DistServeConnectionRequest, @@ -214,11 +214,15 @@ def _build_stat_loggers(self): # currently, metrics in TM engine doesn't support dp dp_rank = self.backend_config.dp_rank if self.backend == 'pytorch' else 0 + metrics_processor.enable_mm_metrics = self.backend_config.enable_mm_metrics logger.info(f'enable metrics, with dp: {self.backend_config.dp} dp_rank: {dp_rank}') self.stat_loggers = [ LoggingStatLogger(dp_rank=dp_rank), - PrometheusStatLogger(model_name=self.model_name, max_model_len=self.session_len, dp_rank=dp_rank) + PrometheusStatLogger(model_name=self.model_name, + max_model_len=self.session_len, + dp_rank=dp_rank, + enable_mm_metrics=metrics_processor.enable_mm_metrics) ] # set stats loggers of metrics processor @@ -510,6 +514,8 @@ async def generate( logger.warning('chat_template_kwargs["enable_thinking"] is already set, ' 'the value will not be overwritten by enable_thinking') if messages: + mm_stats = MultimodalStats(enabled=metrics_processor.enable_metrics, + detailed=metrics_processor.enable_mm_metrics) try: prompt = messages self.request_logger.log_prompt(session, prompt=prompt) @@ -522,7 +528,9 @@ async def generate( chat_template_kwargs=chat_template_kwargs, media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, + mm_stats=mm_stats, **kwargs) + metrics_processor.record_multimodal(mm_stats) prompt = prompt_input.get('prompt') input_ids = prompt_input.get('input_ids') self.request_logger.log_inputs(session, @@ -532,6 +540,7 @@ async def generate( adapter_name=adapter_name) except Exception: logger.exception('[generate] error in prompt processing') + metrics_processor.record_multimodal(mm_stats) metrics_processor.increase_failed_requests('error') yield GenOut(response='in prompt processing error', history_token_len=session.step, diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index 63b63cdd2a..dcc54593d7 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -1358,7 +1358,9 @@ async def lifespan_handler(app: FastAPI): try: health_monitor.start() if getattr(backend_config, 'enable_metrics', False): - metrics_processor.start_metrics_handler(enable_metrics=True) + metrics_processor.start_metrics_handler( + enable_metrics=True, + enable_mm_metrics=backend_config.enable_mm_metrics) log_interval = 10. async def _force_log(): diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index b44b75e491..4005354799 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -1,9 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. import asyncio +from contextlib import contextmanager from typing import Any, Literal import PIL +from lmdeploy.metrics.stats import MultimodalStats from lmdeploy.model import MODELS, BaseChatTemplate from lmdeploy.tokenizer import Tokenizer from lmdeploy.utils import get_logger @@ -16,6 +18,16 @@ logger = get_logger('lmdeploy') +@contextmanager +def _mm_stage(mm_stats: MultimodalStats, stage: str, modality='all'): + try: + with mm_stats.span(stage, modality): + yield + except Exception: + mm_stats.record_failure(stage, modality) + raise + + class MultimodalProcessor: """Processor for handling prompt preprocessing, message content merging, and multimodal processing.""" @@ -91,8 +103,11 @@ def merge_message_content(msg: dict) -> dict: @staticmethod def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[dict], media_io_kwargs: dict[str, - Any]): + Any], + mm_stats: MultimodalStats | None = None): """Synchronous helper to parse a single multimodal message item.""" + media_io_kwargs = media_io_kwargs or {} + mm_stats = mm_stats or MultimodalStats(enabled=False) role = in_messages[i]['role'] content = in_messages[i]['content'] @@ -130,28 +145,39 @@ def _require_data_src(): raise ValueError(f'Invalid multimodal item at index {i}: {item}. ' f'Expected "{item_type}" to be a direct value or a dict containing "url" or "data".') + modality = None if item_type == 'image_data': modality = Modality.IMAGE + mm_stats.add_item(modality.value) data = _require_data_src() elif item_type in ('image_url', 'image'): modality = Modality.IMAGE + mm_stats.add_item(modality.value) data_src = _require_data_src() if isinstance(data_src, PIL.Image.Image): data = data_src elif isinstance(data_src, str): - data = load_from_url(data_src, ImageMediaIO(**media_io_kwargs.get('image', {}))) + with _mm_stage(mm_stats, 'media_io', modality.value): + data = load_from_url(data_src, ImageMediaIO(**media_io_kwargs.get('image', {}))) else: raise ValueError(f'Invalid multimodal image item at index {i}: {item}. ' - 'Expected a str URL/path/data URL or PIL.Image.Image.') + 'Expected a str URL/path/data URL or PIL.Image.Image.') elif item_type in ('video_url', 'video'): modality = Modality.VIDEO - data, metadata = load_from_url( - _require_data_src(), VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {}))) + mm_stats.add_item(modality.value) + with _mm_stage(mm_stats, 'media_io', modality.value): + data, metadata = load_from_url( + _require_data_src(), + VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {}))) item_params['video_metadata'] = metadata elif item_type in ('time_series_url', 'time_series'): modality = Modality.TIME_SERIES - data = load_from_url(_require_data_src(), TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {}))) + mm_stats.add_item(modality.value) + with _mm_stage(mm_stats, 'media_io', modality.value): + data = load_from_url(_require_data_src(), + TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {}))) else: + mm_stats.record_failure('media_io', 'unknown') raise NotImplementedError(f'unknown type: {item_type}') out_message['content'].append({'type': modality.value, 'data': data, **item_params}) @@ -160,7 +186,8 @@ def _require_data_src(): @staticmethod async def async_parse_multimodal_item(messages: list[dict], - media_io_kwargs: dict[str, Any] | None = None) -> list[dict]: + media_io_kwargs: dict[str, Any] | None = None, + mm_stats: MultimodalStats | None = None) -> list[dict]: """Convert user-input multimodal data into GPT4V message format.""" if isinstance(messages, dict): messages = [messages] @@ -172,7 +199,7 @@ async def async_parse_multimodal_item(messages: list[dict], await asyncio.gather(*[ loop.run_in_executor(None, MultimodalProcessor._parse_multimodal_item, i, messages, out_messages, - media_io_kwargs) for i in range(len(messages)) + media_io_kwargs, mm_stats) for i in range(len(messages)) ]) return out_messages @@ -186,6 +213,7 @@ async def get_prompt_input(self, chat_template_kwargs: dict | None = None, media_io_kwargs: dict[str, Any] | None = None, mm_processor_kwargs: dict[str, Any] | None = None, + mm_stats: MultimodalStats | None = None, **kwargs): """Process prompt and return prompt string and input_ids. @@ -244,6 +272,7 @@ async def get_prompt_input(self, chat_template_kwargs=chat_template_kwargs, media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, + mm_stats=mm_stats, **kwargs) else: raise RuntimeError(f'unsupported prompt type: {type(prompt)}') @@ -375,39 +404,48 @@ async def _get_multimodal_prompt_input(self, chat_template_kwargs: dict | None = None, media_io_kwargs: dict[str, Any] | None = None, mm_processor_kwargs: dict[str, Any] | None = None, + mm_stats: MultimodalStats | None = None, **kwargs): """Process multimodal prompt and return processed data for inference engines.""" + mm_stats = mm_stats or MultimodalStats(enabled=False) chat_template = self.chat_template if do_preprocess else BaseChatTemplate() - messages = await self.async_parse_multimodal_item(messages, media_io_kwargs) + messages = await self.async_parse_multimodal_item(messages, media_io_kwargs, mm_stats=mm_stats) if self.backend == 'turbomind': - results = await self.vl_encoder.preprocess(messages, - mm_processor_kwargs=mm_processor_kwargs) - results = await self.vl_encoder.async_infer(results) - results = await self.vl_encoder.wrap_for_turbomind(messages=results, - chat_template=chat_template, - tokenizer=self.tokenizer, - sequence_start=sequence_start, - tools=tools, - chat_template_kwargs=chat_template_kwargs) - elif self.backend == 'pytorch': - if self.vl_encoder._uses_new_preprocess: - input_prompt = self.vl_encoder.model.get_input_prompt(messages=messages, - chat_template=chat_template, - sequence_start=sequence_start, - chat_template_kwargs=chat_template_kwargs) + with _mm_stage(mm_stats, 'hf_processor'): results = await self.vl_encoder.preprocess(messages, - input_prompt=input_prompt, mm_processor_kwargs=mm_processor_kwargs) + with _mm_stage(mm_stats, 'vision_encoder'): + results = await self.vl_encoder.async_infer(results) + with _mm_stage(mm_stats, 'embedding_merge'): + results = await self.vl_encoder.wrap_for_turbomind(messages=results, + chat_template=chat_template, + tokenizer=self.tokenizer, + sequence_start=sequence_start, + tools=tools, + chat_template_kwargs=chat_template_kwargs) + elif self.backend == 'pytorch': + if self.vl_encoder._uses_new_preprocess: + with _mm_stage(mm_stats, 'prompt_template'): + input_prompt = self.vl_encoder.model.get_input_prompt(messages=messages, + chat_template=chat_template, + sequence_start=sequence_start, + chat_template_kwargs=chat_template_kwargs) + with _mm_stage(mm_stats, 'hf_processor'): + results = await self.vl_encoder.preprocess(messages, + input_prompt=input_prompt, + mm_processor_kwargs=mm_processor_kwargs) else: - results = await self.vl_encoder.preprocess(messages, - mm_processor_kwargs=mm_processor_kwargs) - results = await self.vl_encoder.wrap_for_pytorch(messages=results, - chat_template=chat_template, - tokenizer=self.tokenizer, - sequence_start=sequence_start, - tools=tools, - chat_template_kwargs=chat_template_kwargs) + with _mm_stage(mm_stats, 'hf_processor'): + results = await self.vl_encoder.preprocess(messages, + mm_processor_kwargs=mm_processor_kwargs) + with _mm_stage(mm_stats, 'embedding_merge'): + results = await self.vl_encoder.wrap_for_pytorch(messages=results, + chat_template=chat_template, + tokenizer=self.tokenizer, + sequence_start=sequence_start, + tools=tools, + chat_template_kwargs=chat_template_kwargs) return results diff --git a/tests/test_lmdeploy/test_metrics_multimodal.py b/tests/test_lmdeploy/test_metrics_multimodal.py new file mode 100644 index 0000000000..6d089425ae --- /dev/null +++ b/tests/test_lmdeploy/test_metrics_multimodal.py @@ -0,0 +1,81 @@ +import sys + +import pytest + +from lmdeploy.metrics.loggers import PrometheusStatLogger +from lmdeploy.metrics.stats import MultimodalStats +from lmdeploy.serve.processors import MultimodalProcessor +from lmdeploy.vl.constants import Modality + +multimodal_module = sys.modules[MultimodalProcessor.__module__] + + +def test_multimodal_stats_snapshot_and_emit_guard(): + stats = MultimodalStats(detailed=True) + + stats.add_item(Modality.IMAGE.value, count=2) + stats.add_stage('media_io', 0.1, Modality.IMAGE.value) + stats.record_failure('media_io', Modality.IMAGE.value) + stats.finish() + + total_time, stage_times, item_counts, failures = stats.snapshot() + + assert total_time >= 0 + assert item_counts == {'image': 2} + assert stage_times == {('media_io', 'image'): 0.1} + assert failures == {('media_io', 'image'): 1} + assert stats.mark_emitted() + assert not stats.mark_emitted() + + +def test_prometheus_logger_records_multimodal_metrics(): + pytest.importorskip('prometheus_client') + from prometheus_client import REGISTRY, generate_latest + + logger = PrometheusStatLogger('test-model', 128, dp_rank=0, enable_mm_metrics=True) + stats = MultimodalStats(detailed=True) + stats.add_item(Modality.IMAGE.value, count=2) + stats.add_stage('media_io', 0.1, Modality.IMAGE.value) + stats.record_failure('media_io', Modality.IMAGE.value) + stats.finish() + + logger.record_multimodal(stats) + metrics_text = generate_latest(REGISTRY).decode('utf-8') + + assert 'lmdeploy:multimodal_requests_total' in metrics_text + assert 'lmdeploy:multimodal_items_total' in metrics_text + assert 'lmdeploy:multimodal_preprocess_time_seconds' in metrics_text + assert 'lmdeploy:multimodal_stage_time_seconds' in metrics_text + assert 'lmdeploy:multimodal_item_count' in metrics_text + assert 'lmdeploy:multimodal_processing_failures_total' in metrics_text + assert 'modality="image"' in metrics_text + assert 'stage="media_io"' in metrics_text + + +def test_parse_multimodal_item_records_multimodal_stats(monkeypatch): + load_calls = [] + + def fake_load_from_url(data_src, media_io): + load_calls.append((data_src, type(media_io).__name__)) + return f'loaded:{data_src}' + + monkeypatch.setattr(multimodal_module, 'load_from_url', fake_load_from_url) + + messages = [{ + 'role': 'user', + 'content': [{ + 'type': 'image', + 'image': 'file:///tmp/a.png', + }] + }] + stats = MultimodalStats(detailed=True) + parsed = [None] + + MultimodalProcessor._parse_multimodal_item(0, messages, parsed, {}, stats) + _, stage_times, item_counts, failures = stats.snapshot() + + assert parsed[0]['content'][0] == {'type': Modality.IMAGE, 'data': 'loaded:file:///tmp/a.png'} + assert load_calls == [('file:///tmp/a.png', 'ImageMediaIO')] + assert item_counts == {'image': 1} + assert ('media_io', 'image') in stage_times + assert failures == {} From bafee8b8bd65c015011aa1e104d55a15da337e41 Mon Sep 17 00:00:00 2001 From: zxy Date: Mon, 1 Jun 2026 15:45:00 +0800 Subject: [PATCH 2/5] refactor: enable multimodal metrics by default --- docs/en/advance/metrics.md | 13 +++---- docs/zh_cn/advance/metrics.md | 12 +++---- lmdeploy/cli/serve.py | 4 --- lmdeploy/cli/utils.py | 8 ----- lmdeploy/messages.py | 4 --- lmdeploy/metrics/loggers.py | 36 ++++++++----------- lmdeploy/metrics/metrics_processor.py | 4 +-- lmdeploy/metrics/stats.py | 9 ++--- lmdeploy/serve/core/async_engine.py | 7 ++-- lmdeploy/serve/openai/api_server.py | 4 +-- .../test_lmdeploy/test_metrics_multimodal.py | 8 ++--- 11 files changed, 34 insertions(+), 75 deletions(-) diff --git a/docs/en/advance/metrics.md b/docs/en/advance/metrics.md index fe952072cb..1aef50bbc1 100644 --- a/docs/en/advance/metrics.md +++ b/docs/en/advance/metrics.md @@ -23,21 +23,16 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics Replace the model path according to your needs. By default, the metrics endpoint will be available at `http://:23333/metrics`. -For VLM serving, total multimodal preprocessing metrics are exported when the metrics system is enabled. Use -`--enable-mm-metrics` to additionally export per-stage metrics: - -``` -lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics -``` +For VLM serving, multimodal preprocessing metrics are exported when the metrics system is enabled: The multimodal metrics include: - `lmdeploy:multimodal_requests_total`: multimodal request count - `lmdeploy:multimodal_items_total`: multimodal item count by modality - `lmdeploy:multimodal_preprocess_time_seconds`: total multimodal preprocessing time -- `lmdeploy:multimodal_stage_time_seconds`: detailed stage time by stage and modality, exported with `--enable-mm-metrics` -- `lmdeploy:multimodal_item_count`: item count per request by modality, exported with `--enable-mm-metrics` -- `lmdeploy:multimodal_processing_failures_total`: processing failures by stage and modality, exported with `--enable-mm-metrics` +- `lmdeploy:multimodal_stage_time_seconds`: stage time by stage and modality +- `lmdeploy:multimodal_item_count`: item count per request by modality +- `lmdeploy:multimodal_processing_failures_total`: processing failures by stage and modality 2. **Navigate to the monitoring directory** diff --git a/docs/zh_cn/advance/metrics.md b/docs/zh_cn/advance/metrics.md index 3928d5e60e..64cf2cedf7 100644 --- a/docs/zh_cn/advance/metrics.md +++ b/docs/zh_cn/advance/metrics.md @@ -22,20 +22,16 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics 请根据需求替换模型路径。默认 metrics endpoint 位于 `http://:23333/metrics`。 -VLM 服务在启用指标系统后会默认导出多模态预处理总耗时指标。使用 `--enable-mm-metrics` 可额外导出分阶段指标: - -``` -lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics -``` +VLM 服务在启用指标系统后会导出多模态预处理指标: 多模态指标包括: - `lmdeploy:multimodal_requests_total`:多模态请求数量 - `lmdeploy:multimodal_items_total`:按模态统计的多模态输入数量 - `lmdeploy:multimodal_preprocess_time_seconds`:多模态预处理总耗时 -- `lmdeploy:multimodal_stage_time_seconds`:按阶段和模态统计的耗时,使用 `--enable-mm-metrics` 导出 -- `lmdeploy:multimodal_item_count`:按模态统计的单请求输入数量,使用 `--enable-mm-metrics` 导出 -- `lmdeploy:multimodal_processing_failures_total`:按阶段和模态统计的处理失败次数,使用 `--enable-mm-metrics` 导出 +- `lmdeploy:multimodal_stage_time_seconds`:按阶段和模态统计的耗时 +- `lmdeploy:multimodal_item_count`:按模态统计的单请求输入数量 +- `lmdeploy:multimodal_processing_failures_total`:按阶段和模态统计的处理失败次数 2. **进入监控目录** diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py index 399a17297f..9ca662b961 100644 --- a/lmdeploy/cli/serve.py +++ b/lmdeploy/cli/serve.py @@ -122,7 +122,6 @@ def add_parser_api_server(): model_format = ArgumentHelper.model_format(pt_group) hf_overrides = ArgumentHelper.hf_overrides(pt_group) disable_metrics = ArgumentHelper.disable_metrics(pt_group) - enable_mm_metrics = ArgumentHelper.enable_mm_metrics(pt_group) dp = ArgumentHelper.dp(pt_group) ArgumentHelper.ep(pt_group) ArgumentHelper.enable_microbatch(pt_group) @@ -150,7 +149,6 @@ def add_parser_api_server(): tb_group._group_actions.append(node_rank_act) tb_group._group_actions.append(hf_overrides) tb_group._group_actions.append(disable_metrics) - tb_group._group_actions.append(enable_mm_metrics) tb_group._group_actions.append(dp) ArgumentHelper.cp(tb_group) ArgumentHelper.rope_scaling_factor(tb_group) @@ -241,7 +239,6 @@ def api_server(args): enable_microbatch=args.enable_microbatch, enable_eplb=args.enable_eplb, enable_metrics=not args.disable_metrics, - enable_mm_metrics=args.enable_mm_metrics, role=EngineRole[args.role], migration_backend=MigrationBackend[args.migration_backend], model_format=args.model_format, @@ -278,7 +275,6 @@ def api_server(args): async_=args.async_, communicator=args.communicator, enable_metrics=not args.disable_metrics, - enable_mm_metrics=args.enable_mm_metrics, hf_overrides=args.hf_overrides) chat_template_config = get_chat_template(args.chat_template, args.model_path) speculative_config = get_speculative_config(args) diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py index 3efa66d2bb..2c9f8034f6 100644 --- a/lmdeploy/cli/utils.py +++ b/lmdeploy/cli/utils.py @@ -677,14 +677,6 @@ def disable_metrics(parser): default=False, help='disable metrics system') - @staticmethod - def enable_mm_metrics(parser): - """Add argument enable_mm_metrics to parser.""" - return parser.add_argument('--enable-mm-metrics', - action='store_true', - default=False, - help='enable detailed multimodal metrics') - # For Disaggregation @staticmethod def role(parser): diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py index 4a1682c4d2..aec9e08ee6 100644 --- a/lmdeploy/messages.py +++ b/lmdeploy/messages.py @@ -269,7 +269,6 @@ class TurbomindEngineConfig: hf_overrides: Huggingface overrides for the model. It can be used to override the default config of the model enable_metrics: enable metrics system - enable_mm_metrics: enable detailed multimodal metrics """ dtype: str = 'auto' @@ -308,7 +307,6 @@ class TurbomindEngineConfig: communicator: str = 'nccl' hf_overrides: dict[str, Any] | None = None enable_metrics: bool = True - enable_mm_metrics: bool = False def __post_init__(self): """Check input validation.""" @@ -384,7 +382,6 @@ class PytorchEngineConfig: enable_microbatch: enable microbatch for specified model enable_eplb: enable eplb for specified model enable_metrics: enable metrics system - enable_mm_metrics: enable detailed multimodal metrics role: role of engin, options: ['Hybrid', 'Prefill', 'Decode']. Default to `EngineRole.Hybrid`. migration_backend: migration backend. options: ['DLSlime']. @@ -439,7 +436,6 @@ class PytorchEngineConfig: mp_engine_backend: str = 'mp' model_format: str = None enable_metrics: bool = True - enable_mm_metrics: bool = False hf_overrides: dict[str, Any] | None = None disable_vision_encoder: bool = False logprobs_mode: str = None diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py index 417dbedaf3..002e4175b5 100644 --- a/lmdeploy/metrics/loggers.py +++ b/lmdeploy/metrics/loggers.py @@ -157,7 +157,7 @@ def log(self): class PrometheusStatLogger(StatLoggerBase): - def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable_mm_metrics: bool = False): + def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0): try: import prometheus_client prometheus_client.disable_created_metrics() # disable noisy creation timestamp gauge in prometheus @@ -166,7 +166,6 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable 'To use metrics system , please install prometheus_client by `pip install prometheus_client`') self.dp_rank = dp_rank - self.enable_mm_metrics = enable_mm_metrics # unregister any existing lmdeploy collectors for collector in list(prometheus_client.REGISTRY._collector_to_names): @@ -395,22 +394,20 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable documentation='Histogram of multimodal preprocessing time in seconds.', buckets=request_latency_buckets, labelnames=labelnames).labels(*labelvalues) - - if self.enable_mm_metrics: - self.histogram_multimodal_stage_time = prometheus_client.Histogram( - name='lmdeploy:multimodal_stage_time_seconds', - documentation='Histogram of multimodal preprocessing stage time in seconds.', - buckets=request_latency_buckets, - labelnames=labelnames + ['stage', 'modality']) - self.histogram_multimodal_item_count = prometheus_client.Histogram( - name='lmdeploy:multimodal_item_count', - documentation='Histogram of multimodal item counts per request.', - buckets=[1, 2, 4, 8, 16, 32, 64, 128, 256], - labelnames=labelnames + ['modality']) - self.counter_multimodal_failures = prometheus_client.Counter( - name='lmdeploy:multimodal_processing_failures_total', - documentation='Count of multimodal preprocessing failures.', - labelnames=labelnames + ['stage', 'modality']) + self.histogram_multimodal_stage_time = prometheus_client.Histogram( + name='lmdeploy:multimodal_stage_time_seconds', + documentation='Histogram of multimodal preprocessing stage time in seconds.', + buckets=request_latency_buckets, + labelnames=labelnames + ['stage', 'modality']) + self.histogram_multimodal_item_count = prometheus_client.Histogram( + name='lmdeploy:multimodal_item_count', + documentation='Histogram of multimodal item counts per request.', + buckets=[1, 2, 4, 8, 16, 32, 64, 128, 256], + labelnames=labelnames + ['modality']) + self.counter_multimodal_failures = prometheus_client.Counter( + name='lmdeploy:multimodal_processing_failures_total', + documentation='Count of multimodal preprocessing failures.', + labelnames=labelnames + ['stage', 'modality']) def record_schedule(self, stats: SchedulerStats) -> None: """Report schedule metrics to prometheus.""" @@ -466,9 +463,6 @@ def record_multimodal(self, stats: MultimodalStats) -> None: for modality, count in item_counts.items(): self.counter_multimodal_items.labels(*(labelvalues + [modality])).inc(count) - if not self.enable_mm_metrics: - return - for (stage, modality), seconds in stage_times.items(): self.histogram_multimodal_stage_time.labels(*(labelvalues + [stage, modality])).observe(seconds) for modality, count in item_counts.items(): diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py index f67218153d..6bac129382 100644 --- a/lmdeploy/metrics/metrics_processor.py +++ b/lmdeploy/metrics/metrics_processor.py @@ -17,16 +17,14 @@ class MetricsProcessor: def __init__(self): """Init metrics processor.""" self.enable_metrics: bool = False - self.enable_mm_metrics: bool = False self.scheduler_stats = SchedulerStats() self.stat_loggers = [] self.metrics_queue: asyncio.Queue = None self.metrics_handler: asyncio.Task = None - def start_metrics_handler(self, enable_metrics: bool, enable_mm_metrics: bool = False): + def start_metrics_handler(self, enable_metrics: bool): """Start metrics handler.""" self.enable_metrics = enable_metrics - self.enable_mm_metrics = enable_mm_metrics if enable_metrics and self.metrics_handler is None: self.metrics_queue = asyncio.Queue() self.metrics_handler = asyncio.create_task(self._run_metrics_handler()) diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py index 31a2a5e895..78d95fc539 100644 --- a/lmdeploy/metrics/stats.py +++ b/lmdeploy/metrics/stats.py @@ -101,16 +101,13 @@ def update_from_schedule_metrics(self, scheduled_metrics: ScheduleMetrics): class MultimodalStats: """Stats associated with multimodal prompt preprocessing.""" - def __init__(self, enabled: bool = True, detailed: bool = False): + def __init__(self, enabled: bool = True): """Initialize multimodal preprocessing stats. Args: enabled (bool): Whether to record multimodal metrics. - detailed (bool): Whether to collect per-stage timing and failure - metrics in addition to total request-level metrics. """ self.enabled = enabled - self.detailed = detailed self.start_time = time.perf_counter() self.total_time: float = 0.0 self.stage_times: dict[tuple[str, str], float] = defaultdict(float) @@ -146,8 +143,8 @@ def record_failure(self, stage: str = 'total', modality='all') -> None: @contextmanager def span(self, stage: str, modality='all') -> Iterator[None]: - """Measure a multimodal stage when detailed metrics are enabled.""" - if not self.enabled or not self.detailed: + """Measure a multimodal preprocessing stage.""" + if not self.enabled: yield return diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py index ccaa67fe32..195dd44e78 100644 --- a/lmdeploy/serve/core/async_engine.py +++ b/lmdeploy/serve/core/async_engine.py @@ -214,15 +214,13 @@ def _build_stat_loggers(self): # currently, metrics in TM engine doesn't support dp dp_rank = self.backend_config.dp_rank if self.backend == 'pytorch' else 0 - metrics_processor.enable_mm_metrics = self.backend_config.enable_mm_metrics logger.info(f'enable metrics, with dp: {self.backend_config.dp} dp_rank: {dp_rank}') self.stat_loggers = [ LoggingStatLogger(dp_rank=dp_rank), PrometheusStatLogger(model_name=self.model_name, max_model_len=self.session_len, - dp_rank=dp_rank, - enable_mm_metrics=metrics_processor.enable_mm_metrics) + dp_rank=dp_rank) ] # set stats loggers of metrics processor @@ -514,8 +512,7 @@ async def generate( logger.warning('chat_template_kwargs["enable_thinking"] is already set, ' 'the value will not be overwritten by enable_thinking') if messages: - mm_stats = MultimodalStats(enabled=metrics_processor.enable_metrics, - detailed=metrics_processor.enable_mm_metrics) + mm_stats = MultimodalStats(enabled=metrics_processor.enable_metrics) try: prompt = messages self.request_logger.log_prompt(session, prompt=prompt) diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py index dcc54593d7..63b63cdd2a 100644 --- a/lmdeploy/serve/openai/api_server.py +++ b/lmdeploy/serve/openai/api_server.py @@ -1358,9 +1358,7 @@ async def lifespan_handler(app: FastAPI): try: health_monitor.start() if getattr(backend_config, 'enable_metrics', False): - metrics_processor.start_metrics_handler( - enable_metrics=True, - enable_mm_metrics=backend_config.enable_mm_metrics) + metrics_processor.start_metrics_handler(enable_metrics=True) log_interval = 10. async def _force_log(): diff --git a/tests/test_lmdeploy/test_metrics_multimodal.py b/tests/test_lmdeploy/test_metrics_multimodal.py index 6d089425ae..4d6e3d4620 100644 --- a/tests/test_lmdeploy/test_metrics_multimodal.py +++ b/tests/test_lmdeploy/test_metrics_multimodal.py @@ -11,7 +11,7 @@ def test_multimodal_stats_snapshot_and_emit_guard(): - stats = MultimodalStats(detailed=True) + stats = MultimodalStats() stats.add_item(Modality.IMAGE.value, count=2) stats.add_stage('media_io', 0.1, Modality.IMAGE.value) @@ -32,8 +32,8 @@ def test_prometheus_logger_records_multimodal_metrics(): pytest.importorskip('prometheus_client') from prometheus_client import REGISTRY, generate_latest - logger = PrometheusStatLogger('test-model', 128, dp_rank=0, enable_mm_metrics=True) - stats = MultimodalStats(detailed=True) + logger = PrometheusStatLogger('test-model', 128, dp_rank=0) + stats = MultimodalStats() stats.add_item(Modality.IMAGE.value, count=2) stats.add_stage('media_io', 0.1, Modality.IMAGE.value) stats.record_failure('media_io', Modality.IMAGE.value) @@ -68,7 +68,7 @@ def fake_load_from_url(data_src, media_io): 'image': 'file:///tmp/a.png', }] }] - stats = MultimodalStats(detailed=True) + stats = MultimodalStats() parsed = [None] MultimodalProcessor._parse_multimodal_item(0, messages, parsed, {}, stats) From 4265ff9b7f12c7f4c6af87bcd4f685e8b1745bb9 Mon Sep 17 00:00:00 2001 From: zxy Date: Mon, 1 Jun 2026 16:00:36 +0800 Subject: [PATCH 3/5] style: fix metrics lint --- lmdeploy/metrics/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py index 78d95fc539..80a6480bcf 100644 --- a/lmdeploy/metrics/stats.py +++ b/lmdeploy/metrics/stats.py @@ -3,10 +3,10 @@ import time from collections import defaultdict +from collections.abc import Iterator from contextlib import contextmanager from dataclasses import dataclass from threading import Lock -from typing import Iterator import numpy as np From 18b8be342ec4b8545c189827e6e87be11ccb77ca Mon Sep 17 00:00:00 2001 From: zxy Date: Mon, 1 Jun 2026 16:20:15 +0800 Subject: [PATCH 4/5] fix: record multimodal parse failures --- lmdeploy/serve/processors/multimodal.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index 4005354799..6328122326 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -139,13 +139,15 @@ def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[d item_params = {k: v for k, v in item.items() if k not in ('type', item_type)} data_src = item_val + modality = None + def _require_data_src(): if data_src is not None: return data_src + mm_stats.record_failure('parse', modality.value) raise ValueError(f'Invalid multimodal item at index {i}: {item}. ' f'Expected "{item_type}" to be a direct value or a dict containing "url" or "data".') - modality = None if item_type == 'image_data': modality = Modality.IMAGE mm_stats.add_item(modality.value) @@ -160,22 +162,23 @@ def _require_data_src(): with _mm_stage(mm_stats, 'media_io', modality.value): data = load_from_url(data_src, ImageMediaIO(**media_io_kwargs.get('image', {}))) else: + mm_stats.record_failure('parse', modality.value) raise ValueError(f'Invalid multimodal image item at index {i}: {item}. ' 'Expected a str URL/path/data URL or PIL.Image.Image.') elif item_type in ('video_url', 'video'): modality = Modality.VIDEO mm_stats.add_item(modality.value) + data_src = _require_data_src() with _mm_stage(mm_stats, 'media_io', modality.value): data, metadata = load_from_url( - _require_data_src(), - VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {}))) + data_src, VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {}))) item_params['video_metadata'] = metadata elif item_type in ('time_series_url', 'time_series'): modality = Modality.TIME_SERIES mm_stats.add_item(modality.value) + data_src = _require_data_src() with _mm_stage(mm_stats, 'media_io', modality.value): - data = load_from_url(_require_data_src(), - TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {}))) + data = load_from_url(data_src, TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {}))) else: mm_stats.record_failure('media_io', 'unknown') raise NotImplementedError(f'unknown type: {item_type}') From c6df63de52405e00e3afa4dc701a9f8efbd97b08 Mon Sep 17 00:00:00 2001 From: zxy Date: Mon, 1 Jun 2026 16:43:46 +0800 Subject: [PATCH 5/5] refactor: simplify multimodal metrics defaults --- lmdeploy/metrics/loggers.py | 3 +-- lmdeploy/serve/processors/multimodal.py | 11 ++++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py index 002e4175b5..1661439434 100644 --- a/lmdeploy/metrics/loggers.py +++ b/lmdeploy/metrics/loggers.py @@ -119,8 +119,7 @@ def log(self): now = time.perf_counter() # skip logging if no tokens were processed - if (self.total_prompt_tokens == 0 and self.total_generation_tokens == 0 - and self.total_multimodal_requests == 0): + if (self.total_prompt_tokens == 0 and self.total_generation_tokens == 0): self._reset(now) return diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py index 6328122326..140e37358f 100644 --- a/lmdeploy/serve/processors/multimodal.py +++ b/lmdeploy/serve/processors/multimodal.py @@ -102,12 +102,12 @@ def merge_message_content(msg: dict) -> dict: return result @staticmethod - def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[dict], media_io_kwargs: dict[str, - Any], - mm_stats: MultimodalStats | None = None): + def _parse_multimodal_item(i: int, + in_messages: list[dict], + out_messages: list[dict], + media_io_kwargs: dict[str, Any], + mm_stats: MultimodalStats): """Synchronous helper to parse a single multimodal message item.""" - media_io_kwargs = media_io_kwargs or {} - mm_stats = mm_stats or MultimodalStats(enabled=False) role = in_messages[i]['role'] content = in_messages[i]['content'] @@ -198,6 +198,7 @@ async def async_parse_multimodal_item(messages: list[dict], out_messages = [None] * len(messages) media_io_kwargs = media_io_kwargs or {} + mm_stats = mm_stats or MultimodalStats(enabled=False) loop = asyncio.get_event_loop() await asyncio.gather(*[