From cfc31b78e39c33b7257aca160ac7eef8cc902e32 Mon Sep 17 00:00:00 2001
From: Chenhan Yu
Date: Sat, 30 May 2026 20:18:56 -0700
Subject: [PATCH] =?UTF-8?q?[OMNIML-4869]=20author=5Fyaml=20=E2=80=94=20pen?=
 =?UTF-8?q?sieve-intern=20agent=20draft?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Chenhan Yu
---
 .../Qwen3.5-4B/specdec_bench_mtp_vllm.yaml | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm.yaml

diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm.yaml
new file mode 100644
index 00000000000..93b36fe74d0
--- /dev/null
+++ b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm.yaml
@@ -0,0 +1,67 @@
+# SPEED-bench MTP speculative-decoding run for Qwen3.5-4B via vLLM.
+#
+# The qwen3_5 model_type needs transformers >= 4.58, which is NOT in
+# vllm/vllm-openai:latest yet — use the qwen3_5-cu130 tag instead.
+#
+# Slurm run on cw_dfw:
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm.yaml --yes
+
+job_name: Qwen3.5-4B_specdec_bench_mtp_vllm
+
+pipeline:
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-4B  # checkpoint path shared by both tasks via HF_MODEL_CKPT
+
+  # task_0: SPEED qualitative split (concurrency 32, no request cap)
+  task_0:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative
+      - --engine VLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 1
+      - --ep_size 1
+      - --concurrency 32
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/qualitative
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: vllm/vllm-openai:qwen3_5-cu130  # not :latest; see the transformers note in the file header
+
+  # task_1: SPEED throughput_32k split (concurrency 8, 80 requests, extra runtime params file)
+  task_1:
+    script: common/specdec_bench/run.sh
+    args:
+      - --dataset speed
+      - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k
+      - --engine VLLM
+      - --speculative_algorithm MTP
+      - --draft_length 3
+      - --tp_size 1
+      - --ep_size 1
+      - --concurrency 8
+      - --num_requests 80
+      - --runtime_params common/specdec_bench/runtime_params_throughput_32k.yaml
+      - --output_length 4096
+      - --aa_timing
+      - --show_progress
+      - --save_dir /scratchspace/{sweep_name_default}/throughput_32k
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      container: vllm/vllm-openai:qwen3_5-cu130  # same image as task_0