command: ["/bin/bash", "-c"]
args:
- |
# Launch script: export the SGLang environment switches, then start
# `sglang serve` in the foreground with a fixed set of CLI flags.
#
# NOTE(review): the meaning of each SGLANG_* variable and each CLI flag is
# defined by SGLang itself and is not visible here; confirm against the
# SGLang version in the image before changing any value.

# All switches are exported so the server process (and anything it spawns)
# inherits them from this shell.
export \
  SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 \
  SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1 \
  SGLANG_OPT_USE_JIT_NORM=1 \
  SGLANG_OPT_USE_JIT_INDEXER_METADATA=1 \
  SGLANG_OPT_USE_TOPK_V2=1 \
  SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1 \
  SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1 \
  SGLANG_OPT_FIX_HASH_MEGA_MOE=1 \
  SGLANG_OPT_USE_FAST_MASK_EP=1 \
  SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1 \
  SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096 \
  SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1 \
  SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0

# Server flags, kept in an array so every flag/value pair sits on its own
# line without backslash continuations. Order matches the command line that
# is ultimately passed to `sglang serve`.
serve_args=(
  # Model location and listen address.
  --model-path /data/models/DeepSeek-V4-Pro
  --host 0.0.0.0
  --port 30000
  --trust-remote-code

  # Parallelism layout.
  --tp 8
  --dp-size 8
  --enable-dp-attention

  # MoE all-to-all backend and its JSON config. Single quotes keep the JSON
  # as one literal argument.
  --moe-a2a-backend deepep
  --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

  # Batching, memory and cache settings.
  --chunked-prefill-size 32768
  --max-running-requests 768
  --mem-fraction-static 0.90
  --swa-full-tokens-ratio 0.1
  --disable-radix-cache
)

# PYTHONNOUSERSITE is set for this one command only; it is not exported into
# the surrounding shell.
PYTHONNOUSERSITE=1 sglang serve "${serve_args[@]}"
With this configuration, throughput (tokens per second) is low and TTFT (time to first token) is high:
============ Serving Benchmark Result ============
Backend: sglang
Traffic request rate: inf
Max request concurrency: 512
Successful requests: 1024
Benchmark duration (s): 222.49
Total input tokens: 4128823
Total input text tokens: 4128823
Total generated tokens: 528804
Total generated tokens (retokenized): 528796
Request throughput (req/s): 4.60
Input token throughput (tok/s): 18557.33
Output token throughput (tok/s): 2376.75
Peak output token throughput (tok/s): 13620.00
Peak concurrent requests: 527
Total token throughput (tok/s): 20934.08
Concurrency: 469.42
----------------End-to-End Latency----------------
Mean E2E Latency (ms): 101994.44
Median E2E Latency (ms): 91578.25
P90 E2E Latency (ms): 198349.32
P99 E2E Latency (ms): 203565.58
---------------Time to First Token----------------
Mean TTFT (ms): 14212.86
Median TTFT (ms): 5328.88
P99 TTFT (ms): 61155.86
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 330.72
Median TPOT (ms): 179.35
P99 TPOT (ms): 3376.99
---------------Inter-Token Latency----------------
Mean ITL (ms): 170.31
Median ITL (ms): 43.84
P95 ITL (ms): 465.11
P99 ITL (ms): 1127.14
Max ITL (ms): 32249.67