Commit b6cc4cd

Merge branch 'release/2.5' into release/2.5
2 parents: d409eda + 5307838

16 files changed: 242 additions & 138 deletions


docs/features/weight_update.md

Lines changed: 4 additions & 8 deletions
```diff
@@ -50,7 +50,7 @@ In FastDeploy >= 2.6, the underlying control-signal communication path is optimized
 | `/v1/is_paused` | `GET` | none | Return `{"is_paused": bool}`. |
 | `/v1/sleep` | `POST` | `?tags=weight,kv_cache` | Offload selected GPU memory objects. Supported tags are `weight` and `kv_cache`. If omitted, both are used. |
 | `/v1/wakeup` | `POST` | `?tags=weight,kv_cache` | Reload previously offloaded weights and/or KV cache. On success, the engine resumes automatically. |
-| `/v1/update_weights` | `POST` | JSON `{"version":"...", "rsync_config": {...}}` | Refresh weights in place through the worker control path. This API is intended for remote versioned updates, especially `load_strategy=rsync`. |
+| `/v1/update_weights` | `POST` | JSON `{"version":"...", "verify_checksum": false}` | Refresh weights in place through the worker control path. This API is intended for remote versioned updates, especially `load_strategy=rsync`. |

 ### Compatibility Notes
```

```diff
@@ -114,7 +114,7 @@ After `wakeup` succeeds, FastDeploy automatically calls `resume`.
 Current request fields:

 - `version`: optional string. Used to choose a target checkpoint version.
-- `rsync_config`: optional dictionary. Must contain `etcd_server` when provided.
+- `verify_checksum`: optional boolean. Defaults to `false`. Set to `true` to verify data integrity during weight synchronization.

 Important semantics:
```

````diff
@@ -186,9 +186,7 @@ curl -X POST http://127.0.0.1:8000/v1/update_weights \
   -H "Content-Type: application/json" \
   -d '{
     "version": "global_step_1200",
-    "rsync_config": {
-      "etcd_server": "127.0.0.1:2379"
-    }
+    "verify_checksum": false
   }'
 ```
````

```diff
@@ -261,9 +259,7 @@ curl -X POST http://127.0.0.1:8000/v1/update_weights \
   -H "Content-Type: application/json" \
   -d '{
     "version": "global_step_1200",
-    "rsync_config": {
-      "etcd_server": "127.0.0.1:2379"
-    }
+    "verify_checksum": false
   }'

 # Resume the service after the update completes
```
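The curl examples above translate directly to Python. A client-side sketch using only the standard library (the helper name is ours; the host and port mirror the examples):

```python
import json
import urllib.request

def build_update_weights_request(base_url, version=None, verify_checksum=None):
    """Assemble a POST /v1/update_weights request with the documented fields."""
    body = {}
    if version is not None:
        body["version"] = version
    if verify_checksum is not None:
        if not isinstance(verify_checksum, bool):
            raise TypeError("verify_checksum must be a boolean")
        body["verify_checksum"] = verify_checksum
    return urllib.request.Request(
        f"{base_url}/v1/update_weights",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

req = build_update_weights_request(
    "http://127.0.0.1:8000", version="global_step_1200", verify_checksum=False
)
# Not sent here; pass req to urllib.request.urlopen(...) against a running server.
```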

docs/zh/features/weight_update.md

Lines changed: 4 additions & 8 deletions
```diff
@@ -50,7 +50,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
 | `/v1/is_paused` | `GET` | none | Returns `{"is_paused": bool}`. |
 | `/v1/sleep` | `POST` | `?tags=weight,kv_cache` | Offload the specified GPU memory objects. Supports `weight` and `kv_cache`; if omitted, both are handled by default. |
 | `/v1/wakeup` | `POST` | `?tags=weight,kv_cache` | Reload previously offloaded weights and/or KV cache. `resume` is called automatically on success. |
-| `/v1/update_weights` | `POST` | JSON `{"version":"...", "rsync_config": {...}}` | Refresh model weights in place through the worker control path. This API mainly targets remote versioned updates with `load_strategy=rsync`. |
+| `/v1/update_weights` | `POST` | JSON `{"version":"...", "verify_checksum": false}` | Refresh model weights in place through the worker control path. This API mainly targets remote versioned updates with `load_strategy=rsync`. |

 ### Compatibility Notes
```

```diff
@@ -113,7 +113,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
 Currently supported request fields:

 - `version`: optional string used to specify the target checkpoint version.
-- `rsync_config`: optional dictionary; when provided, it must contain `etcd_server`.
+- `verify_checksum`: optional boolean; defaults to `false`. When set to `true`, data integrity is verified during weight synchronization.

 Key semantics:
```

````diff
@@ -185,9 +185,7 @@ curl -X POST http://127.0.0.1:8000/v1/update_weights \
   -H "Content-Type: application/json" \
   -d '{
     "version": "global_step_1200",
-    "rsync_config": {
-      "etcd_server": "127.0.0.1:2379"
-    }
+    "verify_checksum": false
   }'
 ```
````

```diff
@@ -260,9 +258,7 @@ curl -X POST http://127.0.0.1:8000/v1/update_weights \
   -H "Content-Type: application/json" \
   -d '{
     "version": "global_step_1200",
-    "rsync_config": {
-      "etcd_server": "127.0.0.1:2379"
-    }
+    "verify_checksum": false
   }'

 # Resume the service after the update completes
```

fastdeploy/__init__.py

Lines changed: 0 additions & 8 deletions
```diff
@@ -15,19 +15,11 @@
 """

 import os
-import uuid

 # suppress warning log from paddlepaddle
 os.environ["GLOG_minloglevel"] = "2"
 # suppress log from aistudio
 os.environ["AISTUDIO_LOG"] = "critical"
-# set prometheus dir
-if os.getenv("PROMETHEUS_MULTIPROC_DIR", "") == "":
-    prom_dir = f"/tmp/fd_prom_{str(uuid.uuid4())}"
-    os.environ["PROMETHEUS_MULTIPROC_DIR"] = prom_dir
-    if os.path.exists(prom_dir):
-        os.rmdir(prom_dir)
-    os.mkdir(prom_dir)

 import typing
```

fastdeploy/config.py

Lines changed: 34 additions & 3 deletions
```diff
@@ -1215,6 +1215,37 @@ def update_enable_early_stop(self, argument: bool):
         argument = self.enable_early_stop


+class DeployModality(str, Enum):
+    """Modality mode for the serving engine deployment.
+
+    Determines which input modalities the serving engine should handle:
+    - TEXT: Text-only deployment. The engine only processes text inputs,
+      skipping multimodal preprocessing (e.g., vision encoder, audio
+      encoder). This reduces GPU memory usage and startup time when
+      multimodal capabilities are not needed.
+    - MIXED: Multimodal deployment (default). The engine handles mixed-modality
+      inputs including text, images, audio, and video. All modality-specific
+      encoders and preprocessing pipelines are initialized at startup.
+
+    Usage:
+        --deploy-modality text   # text-only, lower resource footprint
+        --deploy-modality mixed  # full multimodal support (default)
+    """
+
+    TEXT = "text"
+    MIXED = "mixed"
+
+    @classmethod
+    def from_str(cls, value: str) -> "DeployModality":
+        """Parse a string into a DeployModality enum, with validation."""
+        value = value.strip().lower()
+        try:
+            return cls(value)
+        except ValueError:
+            valid = ", ".join(f"'{m.value}'" for m in cls)
+            raise ValueError(f"Invalid deploy_modality '{value}'. Must be one of: {valid}")
+
+
 class LoadChoices(str, Enum):
     """LoadChoices"""
```

```diff
@@ -1697,6 +1728,7 @@ def __init__(
         tool_parser: str = None,
         test_mode=False,
         routing_replay_config: Optional[RoutingReplayConfig] = None,
+        deploy_modality: "DeployModality" = None,
     ):
         self.model_config: ModelConfig = model_config  # type: ignore
         self.cache_config: CacheConfig = cache_config  # type: ignore
```
```diff
@@ -1713,8 +1745,7 @@ def __init__(
         self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
         self.router_config: RouterConfig = router_config
         self.routing_replay_config = routing_replay_config
-
-        # Initialize cuda graph capture list
+        self.deploy_modality: DeployModality = deploy_modality if deploy_modality is not None else DeployModality.MIXED
         max_capture_shape = self.scheduler_config.max_num_seqs
         if self.speculative_config is not None and self.speculative_config.method in ["mtp", "suffix"]:
             max_capture_shape = self.scheduler_config.max_num_seqs * (
```
```diff
@@ -2209,7 +2240,7 @@ def get_max_chunk_tokens(self, mm_max_tokens_per_item=None):
             num_tokens = self.scheduler_config.max_num_seqs
         else:
             num_tokens = self.scheduler_config.max_num_batched_tokens
-        if mm_max_tokens_per_item is not None:
+        if mm_max_tokens_per_item is not None and self.deploy_modality != DeployModality.TEXT:
             max_mm_tokens = max(
                 mm_max_tokens_per_item.get("image", 0),
                 mm_max_tokens_per_item.get("video", 0),
```
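The `from_str` helper added in this file normalizes case and whitespace before validating against the enum values. Extracted into a standalone sketch (mirroring the code above, runnable outside FastDeploy):

```python
from enum import Enum

class DeployModality(str, Enum):
    """Mirror of the enum added in fastdeploy/config.py (sketch for illustration)."""

    TEXT = "text"
    MIXED = "mixed"

    @classmethod
    def from_str(cls, value: str) -> "DeployModality":
        # Normalize before validating, so " Text " and "MIXED" both parse
        value = value.strip().lower()
        try:
            return cls(value)
        except ValueError:
            valid = ", ".join(f"'{m.value}'" for m in cls)
            raise ValueError(f"Invalid deploy_modality '{value}'. Must be one of: {valid}")

m = DeployModality.from_str("  Text ")  # returns DeployModality.TEXT
```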

fastdeploy/entrypoints/openai/api_server.py

Lines changed: 5 additions & 10 deletions
```diff
@@ -459,19 +459,14 @@ async def update_weights(request: Request) -> Response:
         )
     args["version"] = request_data["version"]

-    # Validate and extract rsync_config parameter
-    if "rsync_config" in request_data and request_data["rsync_config"] is not None:
-        if not isinstance(request_data["rsync_config"], dict):
+    # Validate and extract verify_checksum parameter
+    if "verify_checksum" in request_data and request_data["verify_checksum"] is not None:
+        if not isinstance(request_data["verify_checksum"], bool):
             return JSONResponse(
                 status_code=400,
-                content={"error": "Invalid parameter type", "message": "rsync_config must be a dictionary"},
+                content={"error": "Invalid parameter type", "message": "verify_checksum must be a boolean"},
             )
-        if "etcd_server" not in request_data["rsync_config"]:
-            return JSONResponse(
-                status_code=400,
-                content={"error": "Invalid parameter type", "message": "rsync_config must contain etcd_server"},
-            )
-        args["rsync_config"] = request_data["rsync_config"]
+        args["verify_checksum"] = request_data["verify_checksum"]

     control_request = ControlRequest(request_id, "update_weights", args)
     control_response = await app.state.engine_client.run_control_method(control_request)
```
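The boolean check above is stricter than it may look: since Python's `bool` is a subclass of `int` but not vice versa, JSON values like `0` or `1` fail `isinstance(..., bool)` and are rejected rather than coerced. A hypothetical standalone version of the same validation (not FastDeploy's actual helper):

```python
def extract_verify_checksum(request_data: dict):
    """Validate the optional verify_checksum field; return (value, error_message)."""
    value = request_data.get("verify_checksum")
    if value is None:
        # Field absent or explicitly null: nothing to forward
        return None, None
    if not isinstance(value, bool):
        # isinstance(1, bool) is False, so numeric "truthy" JSON values are rejected
        return None, "verify_checksum must be a boolean"
    return value, None
```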

fastdeploy/entrypoints/openai/multi_api_server.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -107,6 +107,15 @@ def start_servers(
         env = os.environ.copy()
         env["FD_ENABLE_MULTI_API_SERVER"] = "1"
         env["FD_LOG_DIR"] = env.get("FD_LOG_DIR", "log") + f"/log_{i}"
+        if "PROMETHEUS_MULTIPROC_DIR" in env:
+            prom_dir = env.get("PROMETHEUS_MULTIPROC_DIR")
+            prom_dir_i = os.path.join(os.path.dirname(prom_dir), os.path.basename(prom_dir) + f"_dp{i}")
+            # Create the directory if it doesn't exist
+            if not os.path.exists(prom_dir_i):
+                os.makedirs(prom_dir_i, exist_ok=True)
+            env["PROMETHEUS_MULTIPROC_DIR"] = prom_dir_i
+            logger.info(f"Set PROMETHEUS_MULTIPROC_DIR for DP {i}: {prom_dir_i}")

         cmd = [
             sys.executable,
             "-m",
```
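The path manipulation above gives each data-parallel rank its own Prometheus multiprocess directory by suffixing `_dp{i}` onto the configured path. The derivation in isolation (the helper name is ours; directory creation omitted):

```python
import os

def per_dp_prom_dir(prom_dir: str, i: int) -> str:
    """Derive the per-rank directory, e.g. /tmp/fd_prom -> /tmp/fd_prom_dp0."""
    return os.path.join(os.path.dirname(prom_dir), os.path.basename(prom_dir) + f"_dp{i}")

print(per_dp_prom_dir("/tmp/fd_prom", 0))  # /tmp/fd_prom_dp0
```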

fastdeploy/metrics/metrics.py

Lines changed: 41 additions & 13 deletions
```diff
@@ -65,7 +65,7 @@ def collect(self):
             Metric: Prometheus Metric objects that are not excluded.
         """
         for metric in self.base_registry.collect():
-            if not any(name.startswith(metric.name) for name in self.exclude_names):
+            if not any(metric.name.startswith(name) for name in self.exclude_names):
                 yield metric
```
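The one-line change in this hunk swaps the arguments of `startswith`: the filter must ask whether the collected metric's name begins with an exclude prefix, not whether the (shorter) prefix begins with the metric name. A minimal demonstration of the two directions, with an illustrative prefix:

```python
exclude_names = {"fastdeploy:num_requests"}      # illustrative exclude prefix
metric_name = "fastdeploy:num_requests_running"  # a collected metric's name

# Buggy direction (before the fix): the short prefix never starts with the long name
buggy = any(name.startswith(metric_name) for name in exclude_names)

# Fixed direction: the metric name does start with the exclude prefix
fixed = any(metric_name.startswith(name) for name in exclude_names)
```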

```diff
@@ -83,11 +83,15 @@ def get_filtered_metrics() -> str:
     multiprocess.MultiProcessCollector(base_registry)

     filtered_registry = CollectorRegistry()
-    # Register a new collector to filter gauge metrics
-    filtered_registry.register(SimpleCollector(base_registry, EXCLUDE_LABELS))
+    # Dynamically fetch the list of gauge metrics to exclude
+    exclude_labels = main_process_metrics.get_excluded_metrics()
+    # Register a new collector to filter gauge metrics
+    filtered_registry.register(SimpleCollector(base_registry, exclude_labels))

     # Re-register gauge metrics into filtered_registry, reading values from memory
     main_process_metrics.re_register_gauge(filtered_registry)
+    # Also re-register the gauge metrics from speculative decoding
+    main_process_metrics.re_register_speculative_gauge(filtered_registry)

     return generate_latest(filtered_registry).decode("utf-8")
```

```diff
@@ -195,7 +199,7 @@ class MetricsManager:
         "type": Gauge,
         "name": "fastdeploy:num_requests_running",
         "description": "Number of requests currently running",
-        "kwargs": {"multiprocess_mode": "sum"},
+        "kwargs": {},
     },
     "num_requests_waiting": {
         "type": Gauge,
```
```diff
@@ -625,19 +629,22 @@ def __init__(self):
         # Set the Prometheus environment variables before metrics are registered at module load
         setup_multiprocess_prometheus()

-        # Dynamically create all metrics
+        # Dynamically create all non-gauge metrics
         for metric_name, config in self.METRICS.items():
             setattr(
                 self,
                 metric_name,
                 config["type"](config["name"], config["description"], **config["kwargs"]),
             )
-        # Dynamically create all metrics
+        # Dynamically create all gauge metrics, defaulting multiprocess_mode to livesum
         for metric_name, config in self.GAUGE_METRICS.items():
+            kwargs = config["kwargs"].copy()
+            if "multiprocess_mode" not in kwargs:
+                kwargs["multiprocess_mode"] = "livesum"
             setattr(
                 self,
                 metric_name,
-                config["type"](config["name"], config["description"], **config["kwargs"]),
+                config["type"](config["name"], config["description"], **kwargs),
             )
         # Dynamically create server metrics
         for metric_name, config in self.SERVER_METRICS.items():
```
```diff
@@ -695,17 +702,22 @@ def _init_speculative_metrics(self, speculative_method, num_speculative_tokens):
                     Gauge(
                         f"{config['name']}_{i}",
                         f"{config['description']} (head {i})",
+                        multiprocess_mode="livesum",
                     )
                 )
             setattr(self, metric_name, gauges)
         else:
+            # For Gauge metrics, automatically add multiprocess_mode="livesum"
+            kwargs = config["kwargs"].copy()
+            if config["type"] == Gauge and "multiprocess_mode" not in kwargs:
+                kwargs["multiprocess_mode"] = "livesum"
             setattr(
                 self,
                 metric_name,
                 config["type"](
                     config["name"],
                     config["description"],
-                    **config["kwargs"],
+                    **kwargs,
                 ),
             )
```

```diff
@@ -766,6 +778,19 @@ def register_speculative_metrics(self, registry: CollectorRegistry):
         else:
             registry.register(getattr(self, metric_name))

+    def re_register_speculative_gauge(self, registry: CollectorRegistry):
+        """Re-register gauge metrics from SPECULATIVE_METRICS to the specified registry"""
+        # Check if SPECULATIVE_METRICS was initialized in this process
+        # (it's an instance attribute set by _init_speculative_metrics, not the class-level empty dict)
+        if not hasattr(self, "spec_decode_draft_acceptance_rate"):
+            return
+        for metric_name, config in self.SPECULATIVE_METRICS.items():
+            if metric_name == "spec_decode_draft_single_head_acceptance_rate":
+                for gauge in getattr(self, metric_name):
+                    registry.register(gauge)
+            elif config["type"] == Gauge:
+                registry.register(getattr(self, metric_name))
+
     def re_register_gauge(self, registry: CollectorRegistry):
         """Re-register gauge to the specified registry"""
         for metric_name in self.GAUGE_METRICS:
```
```diff
@@ -789,16 +814,19 @@ def register_all(self, registry: CollectorRegistry):
         if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
             self.register_speculative_metrics(registry)

-    @classmethod
-    def get_excluded_metrics(cls) -> Set[str]:
+    def get_excluded_metrics(self) -> Set[str]:
         """Get the set of indicator names that need to be excluded"""
-        return {config["name"] for config in cls.GAUGE_METRICS.values()}
+        excluded = {config["name"] for config in self.GAUGE_METRICS.values()}
+        # Also add gauge metrics from SPECULATIVE_METRICS (if initialized)
+        if hasattr(self, "SPECULATIVE_METRICS"):
+            for config in self.SPECULATIVE_METRICS.values():
+                if config["type"] == Gauge or config["type"] == list[Gauge]:
+                    excluded.add(config["name"])
+        return excluded


 main_process_metrics = MetricsManager()

 # Recording zmq metrics is expensive, so it is disabled by default; enable it via the DEBUG flag
 if envs.FD_DEBUG:
     main_process_metrics.init_zmq_metrics()
-
-EXCLUDE_LABELS = MetricsManager.get_excluded_metrics()
```
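The gauge-construction changes in this file all follow the same pattern: copy the per-metric kwargs and fill in `multiprocess_mode="livesum"` only when the config did not choose a mode itself, leaving explicit settings untouched. The defaulting logic in isolation:

```python
def gauge_kwargs(config: dict) -> dict:
    """Copy a metric config's kwargs, defaulting multiprocess_mode to livesum."""
    kwargs = config["kwargs"].copy()
    if "multiprocess_mode" not in kwargs:
        kwargs["multiprocess_mode"] = "livesum"
    return kwargs

explicit = {"kwargs": {"multiprocess_mode": "max"}}  # keeps its own mode
empty = {"kwargs": {}}                               # gets the livesum default
```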

fastdeploy/model_executor/models/qwen3_vl/qwen3_vl.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -381,6 +381,10 @@ def forward(

         return hidden_states

+    def clear_grpah_opt_backend(self):
+        """Clear graph optimization backend, the captured cuda graph will be cleaned"""
+        self.model.clear_grpah_opt_backend(fd_config=self.fd_config)
+

 class Qwen3VLPretrainedModel(PretrainedModel):
     """Utilities for tensor-parallel weight splitting."""
```
