From 5ae54bba537e281c5e7d42814890a1d77e30bb4d Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 1 Jun 2026 12:26:12 +0800
Subject: [PATCH 1/5] feat: add multimodal metrics

---
 docs/en/advance/metrics.md                    |  16 +++
 docs/zh_cn/advance/metrics.md                 |  15 +++
 lmdeploy/cli/serve.py                         |   4 +
 lmdeploy/cli/utils.py                         |   8 ++
 lmdeploy/messages.py                          |   4 +
 lmdeploy/metrics/loggers.py                   |  93 +++++++++++++++-
 lmdeploy/metrics/metrics_processor.py         |  17 ++-
 lmdeploy/metrics/stats.py                     |  96 ++++++++++++++++
 lmdeploy/serve/core/async_engine.py           |  13 ++-
 lmdeploy/serve/openai/api_server.py           |   4 +-
 lmdeploy/serve/processors/multimodal.py       | 104 ++++++++++++------
 .../test_lmdeploy/test_metrics_multimodal.py  |  81 ++++++++++++++
 12 files changed, 413 insertions(+), 42 deletions(-)
 create mode 100644 tests/test_lmdeploy/test_metrics_multimodal.py

diff --git a/docs/en/advance/metrics.md b/docs/en/advance/metrics.md
index 1f05807d9a..fe952072cb 100644
--- a/docs/en/advance/metrics.md
+++ b/docs/en/advance/metrics.md
@@ -23,6 +23,22 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics
 Replace the model path according to your needs.
 By default, the metrics endpoint will be available at `http://<lmdeploy_server_host>:23333/metrics`.
 
+For VLM serving, total multimodal preprocessing metrics are exported when the metrics system is enabled. Use
+`--enable-mm-metrics` to additionally export per-stage metrics:
+
+```
+lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics
+```
+
+The multimodal metrics include:
+
+- `lmdeploy:multimodal_requests_total`: multimodal request count
+- `lmdeploy:multimodal_items_total`: multimodal item count by modality
+- `lmdeploy:multimodal_preprocess_time_seconds`: total multimodal preprocessing time
+- `lmdeploy:multimodal_stage_time_seconds`: detailed stage time by stage and modality, exported with `--enable-mm-metrics`
+- `lmdeploy:multimodal_item_count`: item count per request by modality, exported with `--enable-mm-metrics`
+- `lmdeploy:multimodal_processing_failures_total`: processing failures by stage and modality, exported with `--enable-mm-metrics`
+
 2. **Navigate to the monitoring directory**
 
 ```
diff --git a/docs/zh_cn/advance/metrics.md b/docs/zh_cn/advance/metrics.md
index 7eb4eec5cc..3928d5e60e 100644
--- a/docs/zh_cn/advance/metrics.md
+++ b/docs/zh_cn/advance/metrics.md
@@ -22,6 +22,21 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics
 
 请根据需求替换模型路径。默认 metrics endpoint 位于 `http://<lmdeploy_server_host>:23333/metrics`。
 
+VLM 服务在启用指标系统后会默认导出多模态预处理总耗时指标。使用 `--enable-mm-metrics` 可额外导出分阶段指标：
+
+```
+lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics
+```
+
+多模态指标包括：
+
+- `lmdeploy:multimodal_requests_total`：多模态请求数量
+- `lmdeploy:multimodal_items_total`：按模态统计的多模态输入数量
+- `lmdeploy:multimodal_preprocess_time_seconds`：多模态预处理总耗时
+- `lmdeploy:multimodal_stage_time_seconds`：按阶段和模态统计的耗时，使用 `--enable-mm-metrics` 导出
+- `lmdeploy:multimodal_item_count`：按模态统计的单请求输入数量，使用 `--enable-mm-metrics` 导出
+- `lmdeploy:multimodal_processing_failures_total`：按阶段和模态统计的处理失败次数，使用 `--enable-mm-metrics` 导出
+
 2. **进入监控目录**
 
 ```
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 9ca662b961..399a17297f 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -122,6 +122,7 @@ def add_parser_api_server():
         model_format = ArgumentHelper.model_format(pt_group)
         hf_overrides = ArgumentHelper.hf_overrides(pt_group)
         disable_metrics = ArgumentHelper.disable_metrics(pt_group)
+        enable_mm_metrics = ArgumentHelper.enable_mm_metrics(pt_group)
         dp = ArgumentHelper.dp(pt_group)
         ArgumentHelper.ep(pt_group)
         ArgumentHelper.enable_microbatch(pt_group)
@@ -149,6 +150,7 @@ def add_parser_api_server():
         tb_group._group_actions.append(node_rank_act)
         tb_group._group_actions.append(hf_overrides)
         tb_group._group_actions.append(disable_metrics)
+        tb_group._group_actions.append(enable_mm_metrics)
         tb_group._group_actions.append(dp)
         ArgumentHelper.cp(tb_group)
         ArgumentHelper.rope_scaling_factor(tb_group)
@@ -239,6 +241,7 @@ def api_server(args):
                 enable_microbatch=args.enable_microbatch,
                 enable_eplb=args.enable_eplb,
                 enable_metrics=not args.disable_metrics,
+                enable_mm_metrics=args.enable_mm_metrics,
                 role=EngineRole[args.role],
                 migration_backend=MigrationBackend[args.migration_backend],
                 model_format=args.model_format,
@@ -275,6 +278,7 @@ def api_server(args):
                                                    async_=args.async_,
                                                    communicator=args.communicator,
                                                    enable_metrics=not args.disable_metrics,
+                                                   enable_mm_metrics=args.enable_mm_metrics,
                                                    hf_overrides=args.hf_overrides)
         chat_template_config = get_chat_template(args.chat_template, args.model_path)
         speculative_config = get_speculative_config(args)
diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 2c9f8034f6..3efa66d2bb 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -677,6 +677,14 @@ def disable_metrics(parser):
                                    default=False,
                                    help='disable metrics system')
 
+    @staticmethod
+    def enable_mm_metrics(parser):
+        """Add argument enable_mm_metrics to parser."""
+        return parser.add_argument('--enable-mm-metrics',
+                                   action='store_true',
+                                   default=False,
+                                   help='enable detailed multimodal metrics')
+
     # For Disaggregation
     @staticmethod
     def role(parser):
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index aec9e08ee6..4a1682c4d2 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -269,6 +269,7 @@ class TurbomindEngineConfig:
         hf_overrides: Huggingface overrides for the model.
             It can be used to override the default config of the model
         enable_metrics: enable metrics system
+        enable_mm_metrics: enable detailed multimodal metrics
     """
 
     dtype: str = 'auto'
@@ -307,6 +308,7 @@ class TurbomindEngineConfig:
     communicator: str = 'nccl'
     hf_overrides: dict[str, Any] | None = None
     enable_metrics: bool = True
+    enable_mm_metrics: bool = False
 
     def __post_init__(self):
         """Check input validation."""
@@ -382,6 +384,7 @@ class PytorchEngineConfig:
         enable_microbatch: enable microbatch for specified model
         enable_eplb: enable eplb for specified model
         enable_metrics: enable metrics system
+        enable_mm_metrics: enable detailed multimodal metrics
         role: role of engin, options: ['Hybrid', 'Prefill',
             'Decode']. Default to `EngineRole.Hybrid`.
         migration_backend: migration backend. options: ['DLSlime'].
@@ -436,6 +439,7 @@ class PytorchEngineConfig:
     mp_engine_backend: str = 'mp'
     model_format: str = None
     enable_metrics: bool = True
+    enable_mm_metrics: bool = False
     hf_overrides: dict[str, Any] | None = None
     disable_vision_encoder: bool = False
     logprobs_mode: str = None
diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
index 69e24ede4f..417dbedaf3 100644
--- a/lmdeploy/metrics/loggers.py
+++ b/lmdeploy/metrics/loggers.py
@@ -7,7 +7,13 @@
 
 import numpy as np
 
-from lmdeploy.metrics.stats import IterationStats, RequestStats, SchedulerStats, SpeculativeDecodingStats
+from lmdeploy.metrics.stats import (
+    IterationStats,
+    MultimodalStats,
+    RequestStats,
+    SchedulerStats,
+    SpeculativeDecodingStats,
+)
 from lmdeploy.utils import get_logger
 
 logger = get_logger('lmdeploy')
@@ -27,6 +33,9 @@ def record_iteration(self, stats: IterationStats) -> None:
     def record_specdecode(self, stats: SpeculativeDecodingStats) -> None:
         ...
 
+    def record_multimodal(self, stats: MultimodalStats) -> None:
+        ...
+
     def log(self):  # noqa
         pass
 
@@ -42,6 +51,9 @@ def _reset(self, now):
         self.last_log_time = now
         self.total_prompt_tokens = 0
         self.total_generation_tokens = 0
+        self.total_multimodal_requests = 0
+        self.total_multimodal_items = 0
+        self.total_multimodal_preprocess_time = 0.0
         # spec decode
         self.num_drafts: int = 0
         self.num_draft_tokens: int = 0
@@ -72,6 +84,15 @@ def record_specdecode(self, stats: SpeculativeDecodingStats):
     def record_finish(self, stats: RequestStats):
         pass
 
+    def record_multimodal(self, stats: MultimodalStats) -> None:
+        if not stats.has_data:
+            return
+
+        total_time, _, item_counts, _ = stats.snapshot()
+        self.total_multimodal_requests += 1
+        self.total_multimodal_items += sum(item_counts.values())
+        self.total_multimodal_preprocess_time += total_time
+
     def get_spec_msg(self):
         """Get spec decoding logging msg."""
         if self.num_drafts == 0:
@@ -98,7 +119,8 @@ def log(self):
         now = time.perf_counter()
 
         # skip logging if no tokens were processed
-        if self.total_prompt_tokens == 0 and self.total_generation_tokens == 0:
+        if (self.total_prompt_tokens == 0 and self.total_generation_tokens == 0
+                and self.total_multimodal_requests == 0):
             self._reset(now)
             return
 
@@ -109,7 +131,8 @@ def log(self):
         spec_msg = self.get_spec_msg()
 
         # format and print
-        log_msg = (f"[{datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')} DP{self.dp_rank}] "
+        log_msg = (f"[{datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')} "
+                   f'Engine {self.dp_rank:03d}] '
                    f'Avg thr (in/out): {prompt_throughput:.1f} / {generation_throughput:.1f} tokens/s, '
                    f'Server (succeeded/failed/routed/waiting): '
                    f'{scheduler_stats.num_succeeded_reqs} / {scheduler_stats.num_failed_reqs} / '
@@ -121,6 +144,10 @@ def log(self):
         if scheduler_stats.prefix_cache_hit_rate != 0:
             log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%, '
 
+        if self.total_multimodal_requests:
+            avg_mm_time = self.total_multimodal_preprocess_time / self.total_multimodal_requests
+            log_msg += f'Avg MM preprocess: {avg_mm_time:.3f} s/req, '
+
         if spec_msg is not None:
             log_msg += spec_msg
 
@@ -130,7 +157,7 @@ def log(self):
 
 class PrometheusStatLogger(StatLoggerBase):
 
-    def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
+    def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable_mm_metrics: bool = False):
         try:
             import prometheus_client
             prometheus_client.disable_created_metrics()  # disable noisy creation timestamp gauge in prometheus
@@ -139,6 +166,7 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
                 'To use metrics system , please install prometheus_client by `pip install prometheus_client`')
 
         self.dp_rank = dp_rank
+        self.enable_mm_metrics = enable_mm_metrics
 
         # unregister any existing lmdeploy collectors
         for collector in list(prometheus_client.REGISTRY._collector_to_names):
@@ -151,6 +179,7 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
 
         labelnames = ['model_name', 'engine']
         labelvalues = [model_name, str(dp_rank)]
+        self.labelvalues = labelvalues
 
         #
         # Scheduler stats
@@ -350,6 +379,39 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
                 buckets=request_latency_buckets,
                 labelnames=labelnames).labels(*labelvalues)
 
+        #
+        # Multimodal preprocessing
+        #
+        self.counter_multimodal_requests = prometheus_client.Counter(
+            name='lmdeploy:multimodal_requests_total',
+            documentation='Count of multimodal requests.',
+            labelnames=labelnames).labels(*labelvalues)
+        self.counter_multimodal_items = prometheus_client.Counter(
+            name='lmdeploy:multimodal_items_total',
+            documentation='Count of multimodal input items by modality.',
+            labelnames=labelnames + ['modality'])
+        self.histogram_multimodal_preprocess_time = prometheus_client.Histogram(
+            name='lmdeploy:multimodal_preprocess_time_seconds',
+            documentation='Histogram of multimodal preprocessing time in seconds.',
+            buckets=request_latency_buckets,
+            labelnames=labelnames).labels(*labelvalues)
+
+        if self.enable_mm_metrics:
+            self.histogram_multimodal_stage_time = prometheus_client.Histogram(
+                name='lmdeploy:multimodal_stage_time_seconds',
+                documentation='Histogram of multimodal preprocessing stage time in seconds.',
+                buckets=request_latency_buckets,
+                labelnames=labelnames + ['stage', 'modality'])
+            self.histogram_multimodal_item_count = prometheus_client.Histogram(
+                name='lmdeploy:multimodal_item_count',
+                documentation='Histogram of multimodal item counts per request.',
+                buckets=[1, 2, 4, 8, 16, 32, 64, 128, 256],
+                labelnames=labelnames + ['modality'])
+            self.counter_multimodal_failures = prometheus_client.Counter(
+                name='lmdeploy:multimodal_processing_failures_total',
+                documentation='Count of multimodal preprocessing failures.',
+                labelnames=labelnames + ['stage', 'modality'])
+
     def record_schedule(self, stats: SchedulerStats) -> None:
         """Report schedule metrics to prometheus."""
         self.gauge_scheduler_succeeded.set(stats.num_succeeded_reqs)
@@ -391,6 +453,29 @@ def _get_counter_value(counter) -> float:
         """Get the current value from a prometheus counter child."""
         return counter._value.get()
 
+    def record_multimodal(self, stats: MultimodalStats) -> None:
+        if not stats.has_data:
+            return
+
+        total_time, stage_times, item_counts, failures = stats.snapshot()
+        labelvalues = self.labelvalues
+        self.counter_multimodal_requests.inc()
+        if total_time > 0:
+            self.histogram_multimodal_preprocess_time.observe(total_time)
+
+        for modality, count in item_counts.items():
+            self.counter_multimodal_items.labels(*(labelvalues + [modality])).inc(count)
+
+        if not self.enable_mm_metrics:
+            return
+
+        for (stage, modality), seconds in stage_times.items():
+            self.histogram_multimodal_stage_time.labels(*(labelvalues + [stage, modality])).observe(seconds)
+        for modality, count in item_counts.items():
+            self.histogram_multimodal_item_count.labels(*(labelvalues + [modality])).observe(count)
+        for (stage, modality), count in failures.items():
+            self.counter_multimodal_failures.labels(*(labelvalues + [stage, modality])).inc(count)
+
     def record_specdecode(self, stats: SpeculativeDecodingStats) -> None:
         """Report speculative decoding metrics to prometheus."""
         if stats.num_drafts <= 0:
diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py
index 81db4fa57d..f67218153d 100644
--- a/lmdeploy/metrics/metrics_processor.py
+++ b/lmdeploy/metrics/metrics_processor.py
@@ -5,7 +5,7 @@
 from lmdeploy.pytorch.utils import singleton
 from lmdeploy.utils import get_logger
 
-from .stats import SchedulerStats
+from .stats import MultimodalStats, SchedulerStats
 
 logger = get_logger('lmdeploy')
 
@@ -17,14 +17,16 @@ class MetricsProcessor:
     def __init__(self):
         """Init metrics processor."""
         self.enable_metrics: bool = False
+        self.enable_mm_metrics: bool = False
         self.scheduler_stats = SchedulerStats()
         self.stat_loggers = []
         self.metrics_queue: asyncio.Queue = None
         self.metrics_handler: asyncio.Task = None
 
-    def start_metrics_handler(self, enable_metrics: bool):
+    def start_metrics_handler(self, enable_metrics: bool, enable_mm_metrics: bool = False):
         """Start metrics handler."""
         self.enable_metrics = enable_metrics
+        self.enable_mm_metrics = enable_mm_metrics
         if enable_metrics and self.metrics_handler is None:
             self.metrics_queue = asyncio.Queue()
             self.metrics_handler = asyncio.create_task(self._run_metrics_handler())
@@ -93,6 +95,17 @@ def queue_update(self, update_data: tuple):
             return
         self.metrics_queue.put_nowait(update_data)
 
+    def record_multimodal(self, stats: MultimodalStats | None):
+        """Record multimodal preprocessing stats."""
+        if not self.enable_metrics or stats is None:
+            return
+        stats.finish()
+        if not stats.mark_emitted():
+            return
+
+        for stat_logger in self.stat_loggers:
+            stat_logger.record_multimodal(stats)
+
     def increase_total_requests(self):
         """Increase total requests."""
         self.scheduler_stats.num_total_reqs += 1
diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py
index 7aa8e9efa5..31a2a5e895 100644
--- a/lmdeploy/metrics/stats.py
+++ b/lmdeploy/metrics/stats.py
@@ -2,7 +2,11 @@
 # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/v1/metrics/stats.py
 
 import time
+from collections import defaultdict
+from contextlib import contextmanager
 from dataclasses import dataclass
+from threading import Lock
+from typing import Iterator
 
 import numpy as np
 
@@ -94,6 +98,98 @@ def update_from_schedule_metrics(self, scheduled_metrics: ScheduleMetrics):
         self.prefix_cache_hit_rate = scheduled_metrics.prefix_cache_hit_rate
 
 
+class MultimodalStats:
+    """Stats associated with multimodal prompt preprocessing."""
+
+    def __init__(self, enabled: bool = True, detailed: bool = False):
+        """Initialize multimodal preprocessing stats.
+
+        Args:
+            enabled (bool): Whether to record multimodal metrics.
+            detailed (bool): Whether to collect per-stage timing and failure
+                metrics in addition to total request-level metrics.
+        """
+        self.enabled = enabled
+        self.detailed = detailed
+        self.start_time = time.perf_counter()
+        self.total_time: float = 0.0
+        self.stage_times: dict[tuple[str, str], float] = defaultdict(float)
+        self.item_counts: dict[str, int] = defaultdict(int)
+        self.failures: dict[tuple[str, str], int] = defaultdict(int)
+        self.finished = False
+        # Avoid duplicate emission if success/error paths both try to record.
+        self._emitted = False
+        # Multimodal item parsing can update this object from executor threads.
+        self._lock = Lock()
+
+    def add_stage(self, stage: str, seconds: float, modality='all') -> None:
+        """Add elapsed time for a multimodal preprocessing stage."""
+        if not self.enabled:
+            return
+        seconds = max(float(seconds), 0.0)
+        with self._lock:
+            self.stage_times[(stage, modality)] += seconds
+
+    def add_item(self, modality, count: int = 1) -> None:
+        """Record multimodal input item count by modality."""
+        if not self.enabled:
+            return
+        with self._lock:
+            self.item_counts[modality] += count
+
+    def record_failure(self, stage: str = 'total', modality='all') -> None:
+        """Record a multimodal preprocessing failure."""
+        if not self.enabled:
+            return
+        with self._lock:
+            self.failures[(stage, modality)] += 1
+
+    @contextmanager
+    def span(self, stage: str, modality='all') -> Iterator[None]:
+        """Measure a multimodal stage when detailed metrics are enabled."""
+        if not self.enabled or not self.detailed:
+            yield
+            return
+
+        start = time.perf_counter()
+        try:
+            yield
+        finally:
+            self.add_stage(stage, time.perf_counter() - start, modality)
+
+    def finish(self) -> None:
+        """Mark total multimodal preprocessing time."""
+        if not self.enabled:
+            return
+        with self._lock:
+            if not self.finished:
+                self.total_time = time.perf_counter() - self.start_time
+                self.finished = True
+
+    @property
+    def has_data(self) -> bool:
+        """Whether this object contains any multimodal metric data."""
+        with self._lock:
+            return bool(self.item_counts or self.stage_times or self.failures)
+
+    def mark_emitted(self) -> bool:
+        """Mark this stats object as emitted.
+
+        Returns:
+            bool: True if this call marked it for the first time.
+        """
+        with self._lock:
+            if self._emitted or not (self.item_counts or self.stage_times or self.failures):
+                return False
+            self._emitted = True
+            return True
+
+    def snapshot(self) -> tuple[float, dict[tuple[str, str], float], dict[str, int], dict[tuple[str, str], int]]:
+        """Return a thread-safe copy of the collected stats."""
+        with self._lock:
+            return self.total_time, dict(self.stage_times), dict(self.item_counts), dict(self.failures)
+
+
 class RequestStats:
     """Stats associated with a request."""
 
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index 48749bfa75..ccaa67fe32 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -23,7 +23,7 @@
     TurbomindEngineConfig,
 )
 from lmdeploy.metrics.metrics_processor import metrics_processor
-from lmdeploy.metrics.stats import IterationStats, RequestStats, SpeculativeDecodingStats
+from lmdeploy.metrics.stats import IterationStats, MultimodalStats, RequestStats, SpeculativeDecodingStats
 from lmdeploy.model import ChatTemplateConfig, get_chat_template
 from lmdeploy.pytorch.disagg.conn.protocol import (
     DistServeConnectionRequest,
@@ -214,11 +214,15 @@ def _build_stat_loggers(self):
 
             # currently, metrics in TM engine doesn't support dp
             dp_rank = self.backend_config.dp_rank if self.backend == 'pytorch' else 0
+            metrics_processor.enable_mm_metrics = self.backend_config.enable_mm_metrics
 
             logger.info(f'enable metrics, with dp: {self.backend_config.dp} dp_rank: {dp_rank}')
             self.stat_loggers = [
                 LoggingStatLogger(dp_rank=dp_rank),
-                PrometheusStatLogger(model_name=self.model_name, max_model_len=self.session_len, dp_rank=dp_rank)
+                PrometheusStatLogger(model_name=self.model_name,
+                                     max_model_len=self.session_len,
+                                     dp_rank=dp_rank,
+                                     enable_mm_metrics=metrics_processor.enable_mm_metrics)
             ]
 
             # set stats loggers of metrics processor
@@ -510,6 +514,8 @@ async def generate(
                 logger.warning('chat_template_kwargs["enable_thinking"] is already set, '
                                'the value will not be overwritten by enable_thinking')
         if messages:
+            mm_stats = MultimodalStats(enabled=metrics_processor.enable_metrics,
+                                       detailed=metrics_processor.enable_mm_metrics)
             try:
                 prompt = messages
                 self.request_logger.log_prompt(session, prompt=prompt)
@@ -522,7 +528,9 @@ async def generate(
                                                                             chat_template_kwargs=chat_template_kwargs,
                                                                             media_io_kwargs=media_io_kwargs,
                                                                             mm_processor_kwargs=mm_processor_kwargs,
+                                                                            mm_stats=mm_stats,
                                                                             **kwargs)
+                metrics_processor.record_multimodal(mm_stats)
                 prompt = prompt_input.get('prompt')
                 input_ids = prompt_input.get('input_ids')
                 self.request_logger.log_inputs(session,
@@ -532,6 +540,7 @@ async def generate(
                                             adapter_name=adapter_name)
             except Exception:
                 logger.exception('[generate] error in prompt processing')
+                metrics_processor.record_multimodal(mm_stats)
                 metrics_processor.increase_failed_requests('error')
                 yield GenOut(response='in prompt processing error',
                              history_token_len=session.step,
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 63b63cdd2a..dcc54593d7 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -1358,7 +1358,9 @@ async def lifespan_handler(app: FastAPI):
         try:
             health_monitor.start()
             if getattr(backend_config, 'enable_metrics', False):
-                metrics_processor.start_metrics_handler(enable_metrics=True)
+                metrics_processor.start_metrics_handler(
+                    enable_metrics=True,
+                    enable_mm_metrics=backend_config.enable_mm_metrics)
                 log_interval = 10.
 
                 async def _force_log():
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index b44b75e491..4005354799 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -1,9 +1,11 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import asyncio
+from contextlib import contextmanager
 from typing import Any, Literal
 
 import PIL
 
+from lmdeploy.metrics.stats import MultimodalStats
 from lmdeploy.model import MODELS, BaseChatTemplate
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
@@ -16,6 +18,16 @@
 logger = get_logger('lmdeploy')
 
 
+@contextmanager
+def _mm_stage(mm_stats: MultimodalStats, stage: str, modality='all'):
+    try:
+        with mm_stats.span(stage, modality):
+            yield
+    except Exception:
+        mm_stats.record_failure(stage, modality)
+        raise
+
+
 class MultimodalProcessor:
     """Processor for handling prompt preprocessing, message content merging,
     and multimodal processing."""
@@ -91,8 +103,11 @@ def merge_message_content(msg: dict) -> dict:
 
     @staticmethod
     def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[dict], media_io_kwargs: dict[str,
-                                                                                                                Any]):
+                                                                                                                Any],
+                               mm_stats: MultimodalStats | None = None):
         """Synchronous helper to parse a single multimodal message item."""
+        media_io_kwargs = media_io_kwargs or {}
+        mm_stats = mm_stats or MultimodalStats(enabled=False)
         role = in_messages[i]['role']
         content = in_messages[i]['content']
 
@@ -130,28 +145,39 @@ def _require_data_src():
                 raise ValueError(f'Invalid multimodal item at index {i}: {item}. '
                                  f'Expected "{item_type}" to be a direct value or a dict containing "url" or "data".')
 
+            modality = None
             if item_type == 'image_data':
                 modality = Modality.IMAGE
+                mm_stats.add_item(modality.value)
                 data = _require_data_src()
             elif item_type in ('image_url', 'image'):
                 modality = Modality.IMAGE
+                mm_stats.add_item(modality.value)
                 data_src = _require_data_src()
                 if isinstance(data_src, PIL.Image.Image):
                     data = data_src
                 elif isinstance(data_src, str):
-                    data = load_from_url(data_src, ImageMediaIO(**media_io_kwargs.get('image', {})))
+                    with _mm_stage(mm_stats, 'media_io', modality.value):
+                        data = load_from_url(data_src, ImageMediaIO(**media_io_kwargs.get('image', {})))
                 else:
                     raise ValueError(f'Invalid multimodal image item at index {i}: {item}. '
-                                     'Expected a str URL/path/data URL or PIL.Image.Image.')
+                                        'Expected a str URL/path/data URL or PIL.Image.Image.')
             elif item_type in ('video_url', 'video'):
                 modality = Modality.VIDEO
-                data, metadata = load_from_url(
-                    _require_data_src(), VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {})))
+                mm_stats.add_item(modality.value)
+                with _mm_stage(mm_stats, 'media_io', modality.value):
+                    data, metadata = load_from_url(
+                        _require_data_src(),
+                        VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {})))
                 item_params['video_metadata'] = metadata
             elif item_type in ('time_series_url', 'time_series'):
                 modality = Modality.TIME_SERIES
-                data = load_from_url(_require_data_src(), TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {})))
+                mm_stats.add_item(modality.value)
+                with _mm_stage(mm_stats, 'media_io', modality.value):
+                    data = load_from_url(_require_data_src(),
+                                            TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {})))
             else:
+                mm_stats.record_failure('media_io', 'unknown')
                 raise NotImplementedError(f'unknown type: {item_type}')
 
             out_message['content'].append({'type': modality.value, 'data': data, **item_params})
@@ -160,7 +186,8 @@ def _require_data_src():
 
     @staticmethod
     async def async_parse_multimodal_item(messages: list[dict],
-                                          media_io_kwargs: dict[str, Any] | None = None) -> list[dict]:
+                                          media_io_kwargs: dict[str, Any] | None = None,
+                                          mm_stats: MultimodalStats | None = None) -> list[dict]:
         """Convert user-input multimodal data into GPT4V message format."""
         if isinstance(messages, dict):
             messages = [messages]
@@ -172,7 +199,7 @@ async def async_parse_multimodal_item(messages: list[dict],
 
         await asyncio.gather(*[
             loop.run_in_executor(None, MultimodalProcessor._parse_multimodal_item, i, messages, out_messages,
-                                 media_io_kwargs) for i in range(len(messages))
+                                 media_io_kwargs, mm_stats) for i in range(len(messages))
         ])
         return out_messages
 
@@ -186,6 +213,7 @@ async def get_prompt_input(self,
                                chat_template_kwargs: dict | None = None,
                                media_io_kwargs: dict[str, Any] | None = None,
                                mm_processor_kwargs: dict[str, Any] | None = None,
+                               mm_stats: MultimodalStats | None = None,
                                **kwargs):
         """Process prompt and return prompt string and input_ids.
 
@@ -244,6 +272,7 @@ async def get_prompt_input(self,
                                                            chat_template_kwargs=chat_template_kwargs,
                                                            media_io_kwargs=media_io_kwargs,
                                                            mm_processor_kwargs=mm_processor_kwargs,
+                                                           mm_stats=mm_stats,
                                                            **kwargs)
         else:
             raise RuntimeError(f'unsupported prompt type: {type(prompt)}')
@@ -375,39 +404,48 @@ async def _get_multimodal_prompt_input(self,
                                            chat_template_kwargs: dict | None = None,
                                            media_io_kwargs: dict[str, Any] | None = None,
                                            mm_processor_kwargs: dict[str, Any] | None = None,
+                                           mm_stats: MultimodalStats | None = None,
                                            **kwargs):
         """Process multimodal prompt and return processed data for inference
         engines."""
+        mm_stats = mm_stats or MultimodalStats(enabled=False)
         chat_template = self.chat_template if do_preprocess else BaseChatTemplate()
-        messages = await self.async_parse_multimodal_item(messages, media_io_kwargs)
+        messages = await self.async_parse_multimodal_item(messages, media_io_kwargs, mm_stats=mm_stats)
 
         if self.backend == 'turbomind':
-            results = await self.vl_encoder.preprocess(messages,
-                                                       mm_processor_kwargs=mm_processor_kwargs)
-            results = await self.vl_encoder.async_infer(results)
-            results = await self.vl_encoder.wrap_for_turbomind(messages=results,
-                                                               chat_template=chat_template,
-                                                               tokenizer=self.tokenizer,
-                                                               sequence_start=sequence_start,
-                                                               tools=tools,
-                                                               chat_template_kwargs=chat_template_kwargs)
-        elif self.backend == 'pytorch':
-            if self.vl_encoder._uses_new_preprocess:
-                input_prompt = self.vl_encoder.model.get_input_prompt(messages=messages,
-                                                                      chat_template=chat_template,
-                                                                      sequence_start=sequence_start,
-                                                                      chat_template_kwargs=chat_template_kwargs)
+            with _mm_stage(mm_stats, 'hf_processor'):
                 results = await self.vl_encoder.preprocess(messages,
-                                                           input_prompt=input_prompt,
                                                            mm_processor_kwargs=mm_processor_kwargs)
+            with _mm_stage(mm_stats, 'vision_encoder'):
+                results = await self.vl_encoder.async_infer(results)
+            with _mm_stage(mm_stats, 'embedding_merge'):
+                results = await self.vl_encoder.wrap_for_turbomind(messages=results,
+                                                                   chat_template=chat_template,
+                                                                   tokenizer=self.tokenizer,
+                                                                   sequence_start=sequence_start,
+                                                                   tools=tools,
+                                                                   chat_template_kwargs=chat_template_kwargs)
+        elif self.backend == 'pytorch':
+            if self.vl_encoder._uses_new_preprocess:
+                with _mm_stage(mm_stats, 'prompt_template'):
+                    input_prompt = self.vl_encoder.model.get_input_prompt(messages=messages,
+                                                                          chat_template=chat_template,
+                                                                          sequence_start=sequence_start,
+                                                                          chat_template_kwargs=chat_template_kwargs)
+                with _mm_stage(mm_stats, 'hf_processor'):
+                    results = await self.vl_encoder.preprocess(messages,
+                                                               input_prompt=input_prompt,
+                                                               mm_processor_kwargs=mm_processor_kwargs)
             else:
-                results = await self.vl_encoder.preprocess(messages,
-                                                           mm_processor_kwargs=mm_processor_kwargs)
-                results = await self.vl_encoder.wrap_for_pytorch(messages=results,
-                                                                 chat_template=chat_template,
-                                                                 tokenizer=self.tokenizer,
-                                                                 sequence_start=sequence_start,
-                                                                 tools=tools,
-                                                                 chat_template_kwargs=chat_template_kwargs)
+                with _mm_stage(mm_stats, 'hf_processor'):
+                    results = await self.vl_encoder.preprocess(messages,
+                                                               mm_processor_kwargs=mm_processor_kwargs)
+                with _mm_stage(mm_stats, 'embedding_merge'):
+                    results = await self.vl_encoder.wrap_for_pytorch(messages=results,
+                                                                     chat_template=chat_template,
+                                                                     tokenizer=self.tokenizer,
+                                                                     sequence_start=sequence_start,
+                                                                     tools=tools,
+                                                                     chat_template_kwargs=chat_template_kwargs)
 
         return results
diff --git a/tests/test_lmdeploy/test_metrics_multimodal.py b/tests/test_lmdeploy/test_metrics_multimodal.py
new file mode 100644
index 0000000000..6d089425ae
--- /dev/null
+++ b/tests/test_lmdeploy/test_metrics_multimodal.py
@@ -0,0 +1,81 @@
+import sys
+
+import pytest
+
+from lmdeploy.metrics.loggers import PrometheusStatLogger
+from lmdeploy.metrics.stats import MultimodalStats
+from lmdeploy.serve.processors import MultimodalProcessor
+from lmdeploy.vl.constants import Modality
+
+multimodal_module = sys.modules[MultimodalProcessor.__module__]
+
+
+def test_multimodal_stats_snapshot_and_emit_guard():
+    stats = MultimodalStats(detailed=True)
+
+    stats.add_item(Modality.IMAGE.value, count=2)
+    stats.add_stage('media_io', 0.1, Modality.IMAGE.value)
+    stats.record_failure('media_io', Modality.IMAGE.value)
+    stats.finish()
+
+    total_time, stage_times, item_counts, failures = stats.snapshot()
+
+    assert total_time >= 0
+    assert item_counts == {'image': 2}
+    assert stage_times == {('media_io', 'image'): 0.1}
+    assert failures == {('media_io', 'image'): 1}
+    assert stats.mark_emitted()
+    assert not stats.mark_emitted()
+
+
+def test_prometheus_logger_records_multimodal_metrics():
+    pytest.importorskip('prometheus_client')
+    from prometheus_client import REGISTRY, generate_latest
+
+    logger = PrometheusStatLogger('test-model', 128, dp_rank=0, enable_mm_metrics=True)
+    stats = MultimodalStats(detailed=True)
+    stats.add_item(Modality.IMAGE.value, count=2)
+    stats.add_stage('media_io', 0.1, Modality.IMAGE.value)
+    stats.record_failure('media_io', Modality.IMAGE.value)
+    stats.finish()
+
+    logger.record_multimodal(stats)
+    metrics_text = generate_latest(REGISTRY).decode('utf-8')
+
+    assert 'lmdeploy:multimodal_requests_total' in metrics_text
+    assert 'lmdeploy:multimodal_items_total' in metrics_text
+    assert 'lmdeploy:multimodal_preprocess_time_seconds' in metrics_text
+    assert 'lmdeploy:multimodal_stage_time_seconds' in metrics_text
+    assert 'lmdeploy:multimodal_item_count' in metrics_text
+    assert 'lmdeploy:multimodal_processing_failures_total' in metrics_text
+    assert 'modality="image"' in metrics_text
+    assert 'stage="media_io"' in metrics_text
+
+
+def test_parse_multimodal_item_records_multimodal_stats(monkeypatch):
+    load_calls = []
+
+    def fake_load_from_url(data_src, media_io):
+        load_calls.append((data_src, type(media_io).__name__))
+        return f'loaded:{data_src}'
+
+    monkeypatch.setattr(multimodal_module, 'load_from_url', fake_load_from_url)
+
+    messages = [{
+        'role': 'user',
+        'content': [{
+            'type': 'image',
+            'image': 'file:///tmp/a.png',
+        }]
+    }]
+    stats = MultimodalStats(detailed=True)
+    parsed = [None]
+
+    MultimodalProcessor._parse_multimodal_item(0, messages, parsed, {}, stats)
+    _, stage_times, item_counts, failures = stats.snapshot()
+
+    assert parsed[0]['content'][0] == {'type': Modality.IMAGE, 'data': 'loaded:file:///tmp/a.png'}
+    assert load_calls == [('file:///tmp/a.png', 'ImageMediaIO')]
+    assert item_counts == {'image': 1}
+    assert ('media_io', 'image') in stage_times
+    assert failures == {}

From bafee8b8bd65c015011aa1e104d55a15da337e41 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 1 Jun 2026 15:45:00 +0800
Subject: [PATCH 2/5] refactor: enable multimodal metrics by default

---
 docs/en/advance/metrics.md                    | 13 +++----
 docs/zh_cn/advance/metrics.md                 | 12 +++----
 lmdeploy/cli/serve.py                         |  4 ---
 lmdeploy/cli/utils.py                         |  8 -----
 lmdeploy/messages.py                          |  4 ---
 lmdeploy/metrics/loggers.py                   | 36 ++++++++-----------
 lmdeploy/metrics/metrics_processor.py         |  4 +--
 lmdeploy/metrics/stats.py                     |  9 ++---
 lmdeploy/serve/core/async_engine.py           |  7 ++--
 lmdeploy/serve/openai/api_server.py           |  4 +--
 .../test_lmdeploy/test_metrics_multimodal.py  |  8 ++---
 11 files changed, 34 insertions(+), 75 deletions(-)

diff --git a/docs/en/advance/metrics.md b/docs/en/advance/metrics.md
index fe952072cb..1aef50bbc1 100644
--- a/docs/en/advance/metrics.md
+++ b/docs/en/advance/metrics.md
@@ -23,21 +23,16 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics
 Replace the model path according to your needs.
 By default, the metrics endpoint will be available at `http://<lmdeploy_server_host>:23333/metrics`.
 
-For VLM serving, total multimodal preprocessing metrics are exported when the metrics system is enabled. Use
-`--enable-mm-metrics` to additionally export per-stage metrics:
-
-```
-lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics
-```
+For VLM serving, multimodal preprocessing metrics are exported when the metrics system is enabled:
 
 The multimodal metrics include:
 
 - `lmdeploy:multimodal_requests_total`: multimodal request count
 - `lmdeploy:multimodal_items_total`: multimodal item count by modality
 - `lmdeploy:multimodal_preprocess_time_seconds`: total multimodal preprocessing time
-- `lmdeploy:multimodal_stage_time_seconds`: detailed stage time by stage and modality, exported with `--enable-mm-metrics`
-- `lmdeploy:multimodal_item_count`: item count per request by modality, exported with `--enable-mm-metrics`
-- `lmdeploy:multimodal_processing_failures_total`: processing failures by stage and modality, exported with `--enable-mm-metrics`
+- `lmdeploy:multimodal_stage_time_seconds`: stage time by stage and modality
+- `lmdeploy:multimodal_item_count`: item count per request by modality
+- `lmdeploy:multimodal_processing_failures_total`: processing failures by stage and modality
 
 2. **Navigate to the monitoring directory**
 
diff --git a/docs/zh_cn/advance/metrics.md b/docs/zh_cn/advance/metrics.md
index 3928d5e60e..64cf2cedf7 100644
--- a/docs/zh_cn/advance/metrics.md
+++ b/docs/zh_cn/advance/metrics.md
@@ -22,20 +22,16 @@ lmdeploy serve api_server Qwen/Qwen2.5-7B-Instruct --enable-metrics
 
 请根据需求替换模型路径。默认 metrics endpoint 位于 `http://<lmdeploy_server_host>:23333/metrics`。
 
-VLM 服务在启用指标系统后会默认导出多模态预处理总耗时指标。使用 `--enable-mm-metrics` 可额外导出分阶段指标：
-
-```
-lmdeploy serve api_server OpenGVLab/InternVL3-8B --enable-mm-metrics
-```
+VLM 服务在启用指标系统后会导出多模态预处理指标：
 
 多模态指标包括：
 
 - `lmdeploy:multimodal_requests_total`：多模态请求数量
 - `lmdeploy:multimodal_items_total`：按模态统计的多模态输入数量
 - `lmdeploy:multimodal_preprocess_time_seconds`：多模态预处理总耗时
-- `lmdeploy:multimodal_stage_time_seconds`：按阶段和模态统计的耗时，使用 `--enable-mm-metrics` 导出
-- `lmdeploy:multimodal_item_count`：按模态统计的单请求输入数量，使用 `--enable-mm-metrics` 导出
-- `lmdeploy:multimodal_processing_failures_total`：按阶段和模态统计的处理失败次数，使用 `--enable-mm-metrics` 导出
+- `lmdeploy:multimodal_stage_time_seconds`：按阶段和模态统计的耗时
+- `lmdeploy:multimodal_item_count`：按模态统计的单请求输入数量
+- `lmdeploy:multimodal_processing_failures_total`：按阶段和模态统计的处理失败次数
 
 2. **进入监控目录**
 
diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
index 399a17297f..9ca662b961 100644
--- a/lmdeploy/cli/serve.py
+++ b/lmdeploy/cli/serve.py
@@ -122,7 +122,6 @@ def add_parser_api_server():
         model_format = ArgumentHelper.model_format(pt_group)
         hf_overrides = ArgumentHelper.hf_overrides(pt_group)
         disable_metrics = ArgumentHelper.disable_metrics(pt_group)
-        enable_mm_metrics = ArgumentHelper.enable_mm_metrics(pt_group)
         dp = ArgumentHelper.dp(pt_group)
         ArgumentHelper.ep(pt_group)
         ArgumentHelper.enable_microbatch(pt_group)
@@ -150,7 +149,6 @@ def add_parser_api_server():
         tb_group._group_actions.append(node_rank_act)
         tb_group._group_actions.append(hf_overrides)
         tb_group._group_actions.append(disable_metrics)
-        tb_group._group_actions.append(enable_mm_metrics)
         tb_group._group_actions.append(dp)
         ArgumentHelper.cp(tb_group)
         ArgumentHelper.rope_scaling_factor(tb_group)
@@ -241,7 +239,6 @@ def api_server(args):
                 enable_microbatch=args.enable_microbatch,
                 enable_eplb=args.enable_eplb,
                 enable_metrics=not args.disable_metrics,
-                enable_mm_metrics=args.enable_mm_metrics,
                 role=EngineRole[args.role],
                 migration_backend=MigrationBackend[args.migration_backend],
                 model_format=args.model_format,
@@ -278,7 +275,6 @@ def api_server(args):
                                                    async_=args.async_,
                                                    communicator=args.communicator,
                                                    enable_metrics=not args.disable_metrics,
-                                                   enable_mm_metrics=args.enable_mm_metrics,
                                                    hf_overrides=args.hf_overrides)
         chat_template_config = get_chat_template(args.chat_template, args.model_path)
         speculative_config = get_speculative_config(args)
diff --git a/lmdeploy/cli/utils.py b/lmdeploy/cli/utils.py
index 3efa66d2bb..2c9f8034f6 100644
--- a/lmdeploy/cli/utils.py
+++ b/lmdeploy/cli/utils.py
@@ -677,14 +677,6 @@ def disable_metrics(parser):
                                    default=False,
                                    help='disable metrics system')
 
-    @staticmethod
-    def enable_mm_metrics(parser):
-        """Add argument enable_mm_metrics to parser."""
-        return parser.add_argument('--enable-mm-metrics',
-                                   action='store_true',
-                                   default=False,
-                                   help='enable detailed multimodal metrics')
-
     # For Disaggregation
     @staticmethod
     def role(parser):
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
index 4a1682c4d2..aec9e08ee6 100644
--- a/lmdeploy/messages.py
+++ b/lmdeploy/messages.py
@@ -269,7 +269,6 @@ class TurbomindEngineConfig:
         hf_overrides: Huggingface overrides for the model.
             It can be used to override the default config of the model
         enable_metrics: enable metrics system
-        enable_mm_metrics: enable detailed multimodal metrics
     """
 
     dtype: str = 'auto'
@@ -308,7 +307,6 @@ class TurbomindEngineConfig:
     communicator: str = 'nccl'
     hf_overrides: dict[str, Any] | None = None
     enable_metrics: bool = True
-    enable_mm_metrics: bool = False
 
     def __post_init__(self):
         """Check input validation."""
@@ -384,7 +382,6 @@ class PytorchEngineConfig:
         enable_microbatch: enable microbatch for specified model
         enable_eplb: enable eplb for specified model
         enable_metrics: enable metrics system
-        enable_mm_metrics: enable detailed multimodal metrics
         role: role of engin, options: ['Hybrid', 'Prefill',
             'Decode']. Default to `EngineRole.Hybrid`.
         migration_backend: migration backend. options: ['DLSlime'].
@@ -439,7 +436,6 @@ class PytorchEngineConfig:
     mp_engine_backend: str = 'mp'
     model_format: str = None
     enable_metrics: bool = True
-    enable_mm_metrics: bool = False
     hf_overrides: dict[str, Any] | None = None
     disable_vision_encoder: bool = False
     logprobs_mode: str = None
diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
index 417dbedaf3..002e4175b5 100644
--- a/lmdeploy/metrics/loggers.py
+++ b/lmdeploy/metrics/loggers.py
@@ -157,7 +157,7 @@ def log(self):
 
 class PrometheusStatLogger(StatLoggerBase):
 
-    def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable_mm_metrics: bool = False):
+    def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
         try:
             import prometheus_client
             prometheus_client.disable_created_metrics()  # disable noisy creation timestamp gauge in prometheus
@@ -166,7 +166,6 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable
                 'To use metrics system , please install prometheus_client by `pip install prometheus_client`')
 
         self.dp_rank = dp_rank
-        self.enable_mm_metrics = enable_mm_metrics
 
         # unregister any existing lmdeploy collectors
         for collector in list(prometheus_client.REGISTRY._collector_to_names):
@@ -395,22 +394,20 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0, enable
             documentation='Histogram of multimodal preprocessing time in seconds.',
             buckets=request_latency_buckets,
             labelnames=labelnames).labels(*labelvalues)
-
-        if self.enable_mm_metrics:
-            self.histogram_multimodal_stage_time = prometheus_client.Histogram(
-                name='lmdeploy:multimodal_stage_time_seconds',
-                documentation='Histogram of multimodal preprocessing stage time in seconds.',
-                buckets=request_latency_buckets,
-                labelnames=labelnames + ['stage', 'modality'])
-            self.histogram_multimodal_item_count = prometheus_client.Histogram(
-                name='lmdeploy:multimodal_item_count',
-                documentation='Histogram of multimodal item counts per request.',
-                buckets=[1, 2, 4, 8, 16, 32, 64, 128, 256],
-                labelnames=labelnames + ['modality'])
-            self.counter_multimodal_failures = prometheus_client.Counter(
-                name='lmdeploy:multimodal_processing_failures_total',
-                documentation='Count of multimodal preprocessing failures.',
-                labelnames=labelnames + ['stage', 'modality'])
+        self.histogram_multimodal_stage_time = prometheus_client.Histogram(
+            name='lmdeploy:multimodal_stage_time_seconds',
+            documentation='Histogram of multimodal preprocessing stage time in seconds.',
+            buckets=request_latency_buckets,
+            labelnames=labelnames + ['stage', 'modality'])
+        self.histogram_multimodal_item_count = prometheus_client.Histogram(
+            name='lmdeploy:multimodal_item_count',
+            documentation='Histogram of multimodal item counts per request.',
+            buckets=[1, 2, 4, 8, 16, 32, 64, 128, 256],
+            labelnames=labelnames + ['modality'])
+        self.counter_multimodal_failures = prometheus_client.Counter(
+            name='lmdeploy:multimodal_processing_failures_total',
+            documentation='Count of multimodal preprocessing failures.',
+            labelnames=labelnames + ['stage', 'modality'])
 
     def record_schedule(self, stats: SchedulerStats) -> None:
         """Report schedule metrics to prometheus."""
@@ -466,9 +463,6 @@ def record_multimodal(self, stats: MultimodalStats) -> None:
         for modality, count in item_counts.items():
             self.counter_multimodal_items.labels(*(labelvalues + [modality])).inc(count)
 
-        if not self.enable_mm_metrics:
-            return
-
         for (stage, modality), seconds in stage_times.items():
             self.histogram_multimodal_stage_time.labels(*(labelvalues + [stage, modality])).observe(seconds)
         for modality, count in item_counts.items():
diff --git a/lmdeploy/metrics/metrics_processor.py b/lmdeploy/metrics/metrics_processor.py
index f67218153d..6bac129382 100644
--- a/lmdeploy/metrics/metrics_processor.py
+++ b/lmdeploy/metrics/metrics_processor.py
@@ -17,16 +17,14 @@ class MetricsProcessor:
     def __init__(self):
         """Init metrics processor."""
         self.enable_metrics: bool = False
-        self.enable_mm_metrics: bool = False
         self.scheduler_stats = SchedulerStats()
         self.stat_loggers = []
         self.metrics_queue: asyncio.Queue = None
         self.metrics_handler: asyncio.Task = None
 
-    def start_metrics_handler(self, enable_metrics: bool, enable_mm_metrics: bool = False):
+    def start_metrics_handler(self, enable_metrics: bool):
         """Start metrics handler."""
         self.enable_metrics = enable_metrics
-        self.enable_mm_metrics = enable_mm_metrics
         if enable_metrics and self.metrics_handler is None:
             self.metrics_queue = asyncio.Queue()
             self.metrics_handler = asyncio.create_task(self._run_metrics_handler())
diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py
index 31a2a5e895..78d95fc539 100644
--- a/lmdeploy/metrics/stats.py
+++ b/lmdeploy/metrics/stats.py
@@ -101,16 +101,13 @@ def update_from_schedule_metrics(self, scheduled_metrics: ScheduleMetrics):
 class MultimodalStats:
     """Stats associated with multimodal prompt preprocessing."""
 
-    def __init__(self, enabled: bool = True, detailed: bool = False):
+    def __init__(self, enabled: bool = True):
         """Initialize multimodal preprocessing stats.
 
         Args:
             enabled (bool): Whether to record multimodal metrics.
-            detailed (bool): Whether to collect per-stage timing and failure
-                metrics in addition to total request-level metrics.
         """
         self.enabled = enabled
-        self.detailed = detailed
         self.start_time = time.perf_counter()
         self.total_time: float = 0.0
         self.stage_times: dict[tuple[str, str], float] = defaultdict(float)
@@ -146,8 +143,8 @@ def record_failure(self, stage: str = 'total', modality='all') -> None:
 
     @contextmanager
     def span(self, stage: str, modality='all') -> Iterator[None]:
-        """Measure a multimodal stage when detailed metrics are enabled."""
-        if not self.enabled or not self.detailed:
+        """Measure a multimodal preprocessing stage."""
+        if not self.enabled:
             yield
             return
 
diff --git a/lmdeploy/serve/core/async_engine.py b/lmdeploy/serve/core/async_engine.py
index ccaa67fe32..195dd44e78 100644
--- a/lmdeploy/serve/core/async_engine.py
+++ b/lmdeploy/serve/core/async_engine.py
@@ -214,15 +214,13 @@ def _build_stat_loggers(self):
 
             # currently, metrics in TM engine doesn't support dp
             dp_rank = self.backend_config.dp_rank if self.backend == 'pytorch' else 0
-            metrics_processor.enable_mm_metrics = self.backend_config.enable_mm_metrics
 
             logger.info(f'enable metrics, with dp: {self.backend_config.dp} dp_rank: {dp_rank}')
             self.stat_loggers = [
                 LoggingStatLogger(dp_rank=dp_rank),
                 PrometheusStatLogger(model_name=self.model_name,
                                      max_model_len=self.session_len,
-                                     dp_rank=dp_rank,
-                                     enable_mm_metrics=metrics_processor.enable_mm_metrics)
+                                     dp_rank=dp_rank)
             ]
 
             # set stats loggers of metrics processor
@@ -514,8 +512,7 @@ async def generate(
                 logger.warning('chat_template_kwargs["enable_thinking"] is already set, '
                                'the value will not be overwritten by enable_thinking')
         if messages:
-            mm_stats = MultimodalStats(enabled=metrics_processor.enable_metrics,
-                                       detailed=metrics_processor.enable_mm_metrics)
+            mm_stats = MultimodalStats(enabled=metrics_processor.enable_metrics)
             try:
                 prompt = messages
                 self.request_logger.log_prompt(session, prompt=prompt)
diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index dcc54593d7..63b63cdd2a 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -1358,9 +1358,7 @@ async def lifespan_handler(app: FastAPI):
         try:
             health_monitor.start()
             if getattr(backend_config, 'enable_metrics', False):
-                metrics_processor.start_metrics_handler(
-                    enable_metrics=True,
-                    enable_mm_metrics=backend_config.enable_mm_metrics)
+                metrics_processor.start_metrics_handler(enable_metrics=True)
                 log_interval = 10.
 
                 async def _force_log():
diff --git a/tests/test_lmdeploy/test_metrics_multimodal.py b/tests/test_lmdeploy/test_metrics_multimodal.py
index 6d089425ae..4d6e3d4620 100644
--- a/tests/test_lmdeploy/test_metrics_multimodal.py
+++ b/tests/test_lmdeploy/test_metrics_multimodal.py
@@ -11,7 +11,7 @@
 
 
 def test_multimodal_stats_snapshot_and_emit_guard():
-    stats = MultimodalStats(detailed=True)
+    stats = MultimodalStats()
 
     stats.add_item(Modality.IMAGE.value, count=2)
     stats.add_stage('media_io', 0.1, Modality.IMAGE.value)
@@ -32,8 +32,8 @@ def test_prometheus_logger_records_multimodal_metrics():
     pytest.importorskip('prometheus_client')
     from prometheus_client import REGISTRY, generate_latest
 
-    logger = PrometheusStatLogger('test-model', 128, dp_rank=0, enable_mm_metrics=True)
-    stats = MultimodalStats(detailed=True)
+    logger = PrometheusStatLogger('test-model', 128, dp_rank=0)
+    stats = MultimodalStats()
     stats.add_item(Modality.IMAGE.value, count=2)
     stats.add_stage('media_io', 0.1, Modality.IMAGE.value)
     stats.record_failure('media_io', Modality.IMAGE.value)
@@ -68,7 +68,7 @@ def fake_load_from_url(data_src, media_io):
             'image': 'file:///tmp/a.png',
         }]
     }]
-    stats = MultimodalStats(detailed=True)
+    stats = MultimodalStats()
     parsed = [None]
 
     MultimodalProcessor._parse_multimodal_item(0, messages, parsed, {}, stats)

From 4265ff9b7f12c7f4c6af87bcd4f685e8b1745bb9 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 1 Jun 2026 16:00:36 +0800
Subject: [PATCH 3/5] style: fix metrics lint

---
 lmdeploy/metrics/stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py
index 78d95fc539..80a6480bcf 100644
--- a/lmdeploy/metrics/stats.py
+++ b/lmdeploy/metrics/stats.py
@@ -3,10 +3,10 @@
 
 import time
 from collections import defaultdict
+from collections.abc import Iterator
 from contextlib import contextmanager
 from dataclasses import dataclass
 from threading import Lock
-from typing import Iterator
 
 import numpy as np
 

From 18b8be342ec4b8545c189827e6e87be11ccb77ca Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 1 Jun 2026 16:20:15 +0800
Subject: [PATCH 4/5] fix: record multimodal parse failures

---
 lmdeploy/serve/processors/multimodal.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 4005354799..6328122326 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -139,13 +139,15 @@ def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[d
                 item_params = {k: v for k, v in item.items() if k not in ('type', item_type)}
                 data_src = item_val
 
+            modality = None
+
             def _require_data_src():
                 if data_src is not None:
                     return data_src
+                mm_stats.record_failure('parse', modality.value)
                 raise ValueError(f'Invalid multimodal item at index {i}: {item}. '
                                  f'Expected "{item_type}" to be a direct value or a dict containing "url" or "data".')
 
-            modality = None
             if item_type == 'image_data':
                 modality = Modality.IMAGE
                 mm_stats.add_item(modality.value)
@@ -160,22 +162,23 @@ def _require_data_src():
                     with _mm_stage(mm_stats, 'media_io', modality.value):
                         data = load_from_url(data_src, ImageMediaIO(**media_io_kwargs.get('image', {})))
                 else:
+                    mm_stats.record_failure('parse', modality.value)
                     raise ValueError(f'Invalid multimodal image item at index {i}: {item}. '
                                         'Expected a str URL/path/data URL or PIL.Image.Image.')
             elif item_type in ('video_url', 'video'):
                 modality = Modality.VIDEO
                 mm_stats.add_item(modality.value)
+                data_src = _require_data_src()
                 with _mm_stage(mm_stats, 'media_io', modality.value):
                     data, metadata = load_from_url(
-                        _require_data_src(),
-                        VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {})))
+                        data_src, VideoMediaIO(image_io=ImageMediaIO(), **media_io_kwargs.get('video', {})))
                 item_params['video_metadata'] = metadata
             elif item_type in ('time_series_url', 'time_series'):
                 modality = Modality.TIME_SERIES
                 mm_stats.add_item(modality.value)
+                data_src = _require_data_src()
                 with _mm_stage(mm_stats, 'media_io', modality.value):
-                    data = load_from_url(_require_data_src(),
-                                            TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {})))
+                    data = load_from_url(data_src, TimeSeriesMediaIO(**media_io_kwargs.get('time_series', {})))
             else:
                 mm_stats.record_failure('media_io', 'unknown')
                 raise NotImplementedError(f'unknown type: {item_type}')

From c6df63de52405e00e3afa4dc701a9f8efbd97b08 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Mon, 1 Jun 2026 16:43:46 +0800
Subject: [PATCH 5/5] refactor: simplify multimodal metrics defaults

---
 lmdeploy/metrics/loggers.py             |  3 +--
 lmdeploy/serve/processors/multimodal.py | 11 ++++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lmdeploy/metrics/loggers.py b/lmdeploy/metrics/loggers.py
index 002e4175b5..1661439434 100644
--- a/lmdeploy/metrics/loggers.py
+++ b/lmdeploy/metrics/loggers.py
@@ -119,8 +119,7 @@ def log(self):
         now = time.perf_counter()
 
         # skip logging if no tokens were processed
-        if (self.total_prompt_tokens == 0 and self.total_generation_tokens == 0
-                and self.total_multimodal_requests == 0):
+        if (self.total_prompt_tokens == 0 and self.total_generation_tokens == 0):
             self._reset(now)
             return
 
diff --git a/lmdeploy/serve/processors/multimodal.py b/lmdeploy/serve/processors/multimodal.py
index 6328122326..140e37358f 100644
--- a/lmdeploy/serve/processors/multimodal.py
+++ b/lmdeploy/serve/processors/multimodal.py
@@ -102,12 +102,12 @@ def merge_message_content(msg: dict) -> dict:
         return result
 
     @staticmethod
-    def _parse_multimodal_item(i: int, in_messages: list[dict], out_messages: list[dict], media_io_kwargs: dict[str,
-                                                                                                                Any],
-                               mm_stats: MultimodalStats | None = None):
+    def _parse_multimodal_item(i: int,
+                               in_messages: list[dict],
+                               out_messages: list[dict],
+                               media_io_kwargs: dict[str, Any],
+                               mm_stats: MultimodalStats):
         """Synchronous helper to parse a single multimodal message item."""
-        media_io_kwargs = media_io_kwargs or {}
-        mm_stats = mm_stats or MultimodalStats(enabled=False)
         role = in_messages[i]['role']
         content = in_messages[i]['content']
 
@@ -198,6 +198,7 @@ async def async_parse_multimodal_item(messages: list[dict],
 
         out_messages = [None] * len(messages)
         media_io_kwargs = media_io_kwargs or {}
+        mm_stats = mm_stats or MultimodalStats(enabled=False)
         loop = asyncio.get_event_loop()
 
         await asyncio.gather(*[