From 99d3d4dc10c018b1f864be7a8fe701cf489d5351 Mon Sep 17 00:00:00 2001 From: qwes5s5 <1522419171@qq.com> Date: Mon, 26 Jan 2026 15:36:06 +0000 Subject: [PATCH] add token ratio metrics --- docs/online_serving/metrics.md | 1 + docs/zh/online_serving/metrics.md | 1 + fastdeploy/metrics/metrics.py | 51 ++++++++++++++++++++++++++++ fastdeploy/output/token_processor.py | 12 +++---- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/docs/online_serving/metrics.md b/docs/online_serving/metrics.md index a538db548aa..d7fe4f2a6a2 100644 --- a/docs/online_serving/metrics.md +++ b/docs/online_serving/metrics.md @@ -20,6 +20,7 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo | Token | `fastdeploy:prompt_tokens_total` | Counter | Total number of processed prompt tokens | count | | Token | `fastdeploy:generation_tokens_total` | Counter | Total number of generated tokens | count | | Token | `fastdeploy:request_prompt_tokens` | Histogram | Prompt token count per request | count | +| Token | `fastdeploy:request_token_ratio` | Histogram | Token generation rate per request (generated tokens per second) | tokens/s | | Token | `fastdeploy:request_generation_tokens` | Histogram | Generation token count per request | count | | Token | `fastdeploy:request_params_max_tokens` | Histogram | Distribution of `max_tokens` per request | count | | Batch | `fastdeploy:available_batch_size` | Gauge | Number of additional requests that can be inserted during Decode | count | diff --git a/docs/zh/online_serving/metrics.md b/docs/zh/online_serving/metrics.md index 75576995ebb..618e8700ab9 100644 --- a/docs/zh/online_serving/metrics.md +++ b/docs/zh/online_serving/metrics.md @@ -20,6 +20,7 @@ | Token | `fastdeploy:prompt_tokens_total` | Counter | 已处理的 prompt token 总数 | 个 | | Token | `fastdeploy:generation_tokens_total` | Counter | 已生成的 token 总数 | 个 | | Token | `fastdeploy:request_prompt_tokens` | Histogram | 每个请求的 prompt token 数量 | 个 | +| Token | `fastdeploy:request_token_ratio` | Histogram 
| 每个请求的 token 生成速率（每秒生成的 token 数） | token/s | | Token | `fastdeploy:request_generation_tokens` | Histogram | 每个请求的 generation token 数量 | 个 | | Token | `fastdeploy:request_params_max_tokens` | Histogram | 请求的 max_tokens 分布 | 个 | | 批处理 | `fastdeploy:available_batch_size` | Gauge | Decode 阶段还可以插入的请求数量 | 个 | diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py index 4da49a9666d..719ef4a887e 100644 --- a/fastdeploy/metrics/metrics.py +++ b/fastdeploy/metrics/metrics.py @@ -184,6 +184,7 @@ class MetricsManager: request_params_max_tokens: "Histogram" prompt_tokens_total: "Counter" request_prompt_tokens: "Histogram" + request_token_ratio: "Histogram" # 定义所有指标配置 @@ -559,6 +560,56 @@ class MetricsManager: "description": "Number of prefill tokens processed", "kwargs": {"buckets": build_1_2_5_buckets(33792)}, }, + "request_token_ratio": { + "type": Histogram, + "name": "fastdeploy:request_token_ratio", + "description": "Token generation rate per request (generated tokens / inference seconds)", + "kwargs": { + "buckets": [ + 0, + 5, + 10, + 15, + 20, + 25, + 30, + 35, + 40, + 45, + 50, + 55, + 60, + 65, + 70, + 75, + 80, + 85, + 90, + 95, + 100, + 105, + 110, + 115, + 120, + 125, + 130, + 135, + 140, + 145, + 150, + 155, + 160, + 165, + 170, + 175, + 180, + 185, + 190, + 195, + 200, + ] + }, + }, } def __init__(self): diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 00eeb04dc76..8461fde41d6 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -206,9 +206,9 @@ def _process_per_token(self, task, batch_id: int, token_ids: np.ndarray, result: llm_logger.info( f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}." 
) - llm_logger.info( - f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}" - ) + token_ratio = self.tokens_counter[task_id] / (time.time() - task.inference_start_time) + llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}") + main_process_metrics.request_token_ratio.observe(token_ratio) llm_logger.info(f"{self.resource_manager.info()}") if self.cfg.speculative_config.method: self._compute_speculative_status() @@ -823,9 +823,9 @@ def _process_batch_output(self): f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}, token_id:{token_id},is_prefill:{is_prefill},recovery_stop:{recovery_stop}" ) - llm_logger.info( - f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}" - ) + token_ratio = self.tokens_counter[task_id] / (time.time() - task.inference_start_time) + llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}") + main_process_metrics.request_token_ratio.observe(token_ratio) llm_logger.info(f"{self.resource_manager.info()}") if self.cfg.speculative_config.method: self._compute_speculative_status(result)