From 99d3d4dc10c018b1f864be7a8fe701cf489d5351 Mon Sep 17 00:00:00 2001 From: qwes5s5 <1522419171@qq.com> Date: Mon, 26 Jan 2026 15:36:06 +0000 Subject: [PATCH] add token ratio metrics --- docs/online_serving/metrics.md | 1 + docs/zh/online_serving/metrics.md | 1 + fastdeploy/metrics/metrics.py | 51 ++++++++++++++++++++++++++++ fastdeploy/output/token_processor.py | 12 +++---- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/docs/online_serving/metrics.md b/docs/online_serving/metrics.md index a538db548aa..d7fe4f2a6a2 100644 --- a/docs/online_serving/metrics.md +++ b/docs/online_serving/metrics.md @@ -20,6 +20,7 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo | Token | `fastdeploy:prompt_tokens_total` | Counter | Total number of processed prompt tokens | count | | Token | `fastdeploy:generation_tokens_total` | Counter | Total number of generated tokens | count | | Token | `fastdeploy:request_prompt_tokens` | Histogram | Prompt token count per request | count | +| Token | `fastdeploy:request_token_ratio` | Histogram | Token generation rate per request (generated tokens per second) | tokens/s | | Token | `fastdeploy:request_generation_tokens` | Histogram | Generation token count per request | count | | Token | `fastdeploy:request_params_max_tokens` | Histogram | Distribution of `max_tokens` per request | count | | Batch | `fastdeploy:available_batch_size` | Gauge | Number of additional requests that can be inserted during Decode | count | diff --git a/docs/zh/online_serving/metrics.md b/docs/zh/online_serving/metrics.md index 75576995ebb..618e8700ab9 100644 --- a/docs/zh/online_serving/metrics.md +++ b/docs/zh/online_serving/metrics.md @@ -20,6 +20,7 @@ | Token | `fastdeploy:prompt_tokens_total` | Counter | 已处理的 prompt token 总数 | 个 | | Token | `fastdeploy:generation_tokens_total` | Counter | 已生成的 token 总数 | 个 | | Token | `fastdeploy:request_prompt_tokens` | Histogram | 每个请求的 prompt token 数量 | 个 | +| Token | `fastdeploy:request_token_ratio` | Histogram 
| 每个请求的 token 生成速率（每秒生成的 token 数） | token/s | | Token | `fastdeploy:request_generation_tokens` | Histogram | 每个请求的 generation token 数量 | 个 | | Token | `fastdeploy:request_params_max_tokens` | Histogram | 请求的 max_tokens 分布 | 个 | | 批处理 | `fastdeploy:available_batch_size` | Gauge | Decode 阶段还可以插入的请求数量 | 个 | diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py index 4da49a9666d..719ef4a887e 100644 --- a/fastdeploy/metrics/metrics.py +++ b/fastdeploy/metrics/metrics.py @@ -184,6 +184,7 @@ class MetricsManager: request_params_max_tokens: "Histogram" prompt_tokens_total: "Counter" request_prompt_tokens: "Histogram" + request_token_ratio: "Histogram" # 定义所有指标配置 @@ -559,6 +560,56 @@ class MetricsManager: "description": "Number of prefill tokens processed", "kwargs": {"buckets": build_1_2_5_buckets(33792)}, }, + "request_token_ratio": { + "type": Histogram, + "name": "fastdeploy:request_token_ratio", + "description": "Token generation rate per request (generated tokens / inference seconds)", + "kwargs": { + "buckets": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80, + 85, + 90, + 95, + 100, + 105, + 110, + 115, + 120, + 125, + 130, + 135, + 140, + 145, + 150, + 155, + 160, + 165, + 170, + 175, + 180, + 185, + 190, + 195, + 200, + ] + }, + }, } def __init__(self): diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 00eeb04dc76..8461fde41d6 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -206,9 +206,9 @@ def _process_per_token(self, task, batch_id: int, token_ids: np.ndarray, result: llm_logger.info( f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}." 
) - llm_logger.info( - f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}" - ) + token_ratio = self.tokens_counter[task_id] / (time.time() - task.inference_start_time) + llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}") + main_process_metrics.request_token_ratio.observe(token_ratio) llm_logger.info(f"{self.resource_manager.info()}") if self.cfg.speculative_config.method: self._compute_speculative_status() @@ -823,9 +823,9 @@ def _process_batch_output(self): f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}, token_id:{token_id},is_prefill:{is_prefill},recovery_stop:{recovery_stop}" ) - llm_logger.info( - f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}" - ) + token_ratio = self.tokens_counter[task_id] / (time.time() - task.inference_start_time) + llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}") + main_process_metrics.request_token_ratio.observe(token_ratio) llm_logger.info(f"{self.resource_manager.info()}") if self.cfg.speculative_config.method: self._compute_speculative_status(result)