From 8e8b8b975882db24bf0c5669ae9f12c0c201dfc3 Mon Sep 17 00:00:00 2001 From: pensieve-intern Date: Mon, 1 Jun 2026 18:49:30 +0000 Subject: [PATCH] =?UTF-8?q?[OMNIML-4887]=20cell=5Ft0=5Fd7=20=E2=80=94=20pe?= =?UTF-8?q?nsieve-intern=20agent=20draft?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../_cells/qwen35_4b_mtp_vllm_t0_d7.yaml | 4 ++ .../specdec_bench_mtp_vllm_t0_d7.yaml | 58 +++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 tools/launcher/common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d7.yaml create mode 100644 tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm_t0_d7.yaml diff --git a/tools/launcher/common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d7.yaml b/tools/launcher/common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d7.yaml new file mode 100644 index 00000000000..fcf893c989e --- /dev/null +++ b/tools/launcher/common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d7.yaml @@ -0,0 +1,4 @@ +sampling_kwargs: + temperature: 0 +engine_args: + max_model_len: 40960 diff --git a/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm_t0_d7.yaml b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm_t0_d7.yaml new file mode 100644 index 00000000000..69cb877d848 --- /dev/null +++ b/tools/launcher/examples/Qwen/Qwen3.5-4B/specdec_bench_mtp_vllm_t0_d7.yaml @@ -0,0 +1,58 @@ +job_name: Qwen3.5-4B_specdec_bench_mtp_vllm_t0_d7 + +pipeline: + global_vars: + hf_model: /hf-local/Qwen/Qwen3.5-4B + + task_0: + script: common/specdec_bench/run.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/qualitative + - --engine VLLM + - --speculative_algorithm MTP + - --draft_length 7 + - --runtime_params common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d7.yaml + - --tp_size 2 + - --ep_size 1 + - --concurrency 32 + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/qwen35_4b_mtp_vllm_t0_d7/qualitative + 
environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 2 + container: vllm/vllm-openai:qwen3_5-cu130 + + task_1: + script: common/specdec_bench/run.sh + args: + - --dataset speed + - --dataset_path /hf-local/nvidia/SPEED-Bench-Internal/throughput_32k + - --engine VLLM + - --speculative_algorithm MTP + - --draft_length 7 + - --runtime_params common/specdec_bench/_cells/qwen35_4b_mtp_vllm_t0_d7.yaml + - --tp_size 2 + - --ep_size 1 + - --concurrency 8 + - --num_requests 80 + - --output_length 4096 + - --aa_timing + - --show_progress + - --save_dir /scratchspace/qwen35_4b_mtp_vllm_t0_d7/throughput_32k + environment: + - HF_MODEL_CKPT: <<global_vars.hf_model>> + - HF_LOCAL: /hf-local + slurm_config: + _factory_: "slurm_factory" + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 2 + container: vllm/vllm-openai:qwen3_5-cu130