From 6016a0ff2bbed75144701bdb0b15e42e2858c882 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Thu, 7 May 2026 11:51:58 -0700
Subject: [PATCH 01/22] Add EAGLE3 offline launcher examples for 12 new models
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add hf_offline_eagle3.yaml configs in tools/launcher/examples/ for:
- DeepSeek/DeepSeek-V3.2 (685B MoE, 2 nodes, TP=8)
- GLM/GLM-5 (744B MoE, 2 nodes bench, TP=4/EP=2)
- MiniMax/MiniMax-M2.5 (230B MoE, TP=4/EP=4)
- Mistral/Ministral-3-8B (8B dense, TP=4)
- Mistral/Ministral-3-14B (14B dense, TP=4)
- MoonshotAI/Kimi-K2.5 (1T MoE, TP=4/EP=1)
- NVIDIA/Kimi-K2.5-NVFP4 (NVFP4 quant; tasks 1-2 use BF16 base)
- OpenAI/GPT-OSS-20B (20B MoE, TP=4)
- Qwen/Qwen3.5-27B (27B dense VLM, TP=4)
- Qwen/Qwen3.5-35B-A3B (35B MoE, TP=4/EP=4)
- Qwen/Qwen3.5-9B (9B dense VLM, TP=4)
- StepFun/Step-3.5-Flash (197B MoE with SWA, TP=4/EP=4)

Each config follows the standard 4-step EAGLE3 offline pipeline:
query → dump hidden states → train draft head → benchmark.
Uses public slurm_factory and common/ script paths.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../DeepSeek-V3.2/hf_offline_eagle3.yaml      | 112 ++++++++++++++++
 .../examples/GLM/GLM-5/hf_offline_eagle3.yaml | 112 ++++++++++++++++
 .../MiniMax-M2.5/hf_offline_eagle3.yaml       | 110 ++++++++++++++++
 .../Ministral-3-14B/hf_offline_eagle3.yaml    | 106 ++++++++++++++++
 .../Ministral-3-8B/hf_offline_eagle3.yaml     | 105 +++++++++++++++
 .../Kimi-K2.5/hf_offline_eagle3.yaml          | 113 +++++++++++++++++
 .../Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml    | 120 ++++++++++++++++++
 .../OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml | 107 ++++++++++++++++
 .../Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml   | 101 +++++++++++++++
 .../Qwen3.5-35B-A3B/hf_offline_eagle3.yaml    | 108 ++++++++++++++++
 .../Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml    | 101 +++++++++++++++
 .../Step-3.5-Flash/hf_offline_eagle3.yaml     | 113 +++++++++++++++++
 12 files changed, 1308 insertions(+)
 create mode 100644 tools/launcher/examples/DeepSeek/DeepSeek-V3.2/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/MoonshotAI/Kimi-K2.5/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml
 create mode 100644 tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml

diff --git a/tools/launcher/examples/DeepSeek/DeepSeek-V3.2/hf_offline_eagle3.yaml b/tools/launcher/examples/DeepSeek/DeepSeek-V3.2/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..ee1059da913
--- /dev/null
+++ b/tools/launcher/examples/DeepSeek/DeepSeek-V3.2/hf_offline_eagle3.yaml
@@ -0,0 +1,112 @@
+# EAGLE3 offline speculative decoding pipeline for deepseek-ai/DeepSeek-V3.2.
+#
+# DeepSeek-V3.2 is a 685B MoE (37B active, MLA attention).
+# BF16 weights ~1370 GB — requires 2 GB200 nodes (8 × 192 GB = 1536 GB).
+#
+# Special requirements:
+#   - Gated model — must be downloaded/mirrored before use
+#   - MLA attention — verify eagle_decoder_type compatibility in eagle_config.json
+#   - trust_remote_code required
+#
+# MoE-specific notes:
+#   - TP=4 per node, EP=2 across 2 nodes for benchmark
+#   - task_2: consider increasing intermediate_size in eagle_config.json
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/DeepSeek/DeepSeek-V3.2/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/DeepSeek/DeepSeek-V3.2/hf_offline_eagle3.yaml --yes
+
+job_name: DeepSeek-V3.2_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:  # empty value (YAML null)
+
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/deepseek-ai/DeepSeek-V3.2  # target model checkpoint path
+
+  # Step 1: Data synthesis via vLLM server (2 nodes × 4 GPUs, TP=8); output in /scratchspace/data feeds task_1
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 8
+      - --trust-remote-code
+      - --  # presumably splits server flags (above) from query-client flags (below) — confirm in query.sh
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states (2 nodes × 4 GPUs, TP=8, MoE EP=1); reads task_0 output, feeds task_2
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 8
+      - --moe-ep 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head (1 node × 4 GPUs) on the hidden states dumped by task_1
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000  # presumably high enough to skip AR validation — confirm
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding with vLLM on MT-Bench prompts (2 nodes × 4 GPUs, TP=4, EP=2)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes to /scratchspace/eagle3; presumably train_eagle.sh exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 2
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml b/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..9f6a78f439c
--- /dev/null
+++ b/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml
@@ -0,0 +1,112 @@
+# EAGLE3 offline speculative decoding pipeline for zai-org/GLM-5.
+#
+# GLM-5 is a 744B MoE (40B active, 256 experts, top-8, DeepSeek Sparse Attention).
+# BF16 weights ~1488 GB — requires 2 GB200 nodes for benchmark (8 × 192 GB = 1536 GB).
+#
+# Special requirements:
+#   - Gated model — must be downloaded/mirrored before use
+#   - trust_remote_code required
+#   - 200K context window supported; use 8192 for typical EAGLE3 training
+#   - DeepSeek Sparse Attention — verify vLLM support before running
+#
+# MoE-specific notes:
+#   - task_2: consider increasing intermediate_size in eagle_config.json
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/GLM/GLM-5/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml --yes
+
+job_name: GLM-5_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:  # empty value (YAML null)
+
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/zai-org/GLM-5  # target model checkpoint path
+
+  # Step 1: Data synthesis via vLLM server (2 nodes × 4 GPUs, TP=8); output in /scratchspace/data feeds task_1
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 8
+      - --trust-remote-code
+      - --  # presumably splits server flags (above) from query-client flags (below) — confirm in query.sh
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2  # ~1488 GB of BF16 weights exceed one node's 4 × 192 GB = 768 GB
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states (2 nodes × 4 GPUs, TP=8, MoE EP=1); reads task_0 output, feeds task_2
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 8
+      - --moe-ep 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2  # the full target model is loaded here too, so it needs the same 2-node footprint
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head (1 node × 4 GPUs) on the hidden states dumped by task_1
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000  # presumably high enough to skip AR validation — confirm
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding with vLLM on MT-Bench prompts (2 nodes × 4 GPUs, TP=4, EP=2)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes to /scratchspace/eagle3; presumably train_eagle.sh exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 2
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml b/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..f5193e395d8
--- /dev/null
+++ b/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
@@ -0,0 +1,110 @@
+# EAGLE3 offline speculative decoding pipeline for MiniMaxAI/MiniMax-M2.5.
+#
+# MiniMax-M2.5 is a 230B MoE (10B active, 256 experts, top-8).
+# BF16 weights ~460 GB — fits on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# Special requirements:
+#   - trust_remote_code is required (custom model type: minimax_m2)
+#
+# MoE-specific notes:
+#   - task_2: consider increasing intermediate_size in eagle_config.json
+#     since the draft head uses a dense layer by default.
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml --yes
+
+job_name: MiniMax-M2.5_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:  # empty value (YAML null)
+
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/MiniMaxAI/MiniMax-M2.5  # target model checkpoint path
+
+  # Step 1: Data synthesis via vLLM server (1 node × 4 GPUs, TP=4); output in /scratchspace/data feeds task_1
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --  # presumably splits server flags (above) from query-client flags (below) — confirm in query.sh
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model (1 node × 4 GPUs, TP=4, MoE EP=4); reads task_0 output, feeds task_2
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head offline (1 node × 4 GPUs) on the hidden states dumped by task_1
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000  # presumably high enough to skip AR validation — confirm
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding with vLLM on MT-Bench prompts (1 node × 4 GPUs, TP=4, EP=1)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes to /scratchspace/eagle3; presumably train_eagle.sh exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml b/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..88236f8d6a5
--- /dev/null
+++ b/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml
@@ -0,0 +1,106 @@
+# EAGLE3 offline speculative decoding pipeline for mistralai/Ministral-3-14B-Instruct-2512-BF16.
+#
+# Ministral-3-14B is a 14B dense model (model type: mistral3, multimodal but
+# EAGLE3 uses the text-only path).
+# BF16 weights ~28 GB — fits easily on a single GB200 node.
+#
+# Special requirements:
+#   - trust_remote_code may be needed for the tokenizer
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml --yes
+
+job_name: Ministral-3-14B_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:  # empty value (YAML null)
+
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/mistralai/Ministral-3-14B-Instruct-2512-BF16  # target model checkpoint path
+
+  # Step 1: Data synthesis via vLLM server (1 node × 4 GPUs, TP=4); output in /scratchspace/data feeds task_1
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --  # presumably splits server flags (above) from query-client flags (below) — confirm in query.sh
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model (1 node × 4 GPUs, TP=4); reads task_0 output, feeds task_2
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head offline (1 node × 4 GPUs) on the hidden states dumped by task_1
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000  # presumably high enough to skip AR validation — confirm
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding with vLLM on MT-Bench prompts (1 node × 4 GPUs, TP=4, EP=1)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes to /scratchspace/eagle3; presumably train_eagle.sh exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml b/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..d7047abfc6c
--- /dev/null
+++ b/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
@@ -0,0 +1,105 @@
+# EAGLE3 offline speculative decoding pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16.
+#
+# Ministral-3-8B is an 8B dense model (model type: mistral3).
+# BF16 weights ~16 GB — fits easily on a single GB200 node.
+#
+# Special requirements:
+#   - trust_remote_code may be needed for the tokenizer
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml --yes
+
+job_name: Ministral-3-8B_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:  # empty value (YAML null)
+
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16  # target model checkpoint path
+
+  # Step 1: Data synthesis via vLLM server (1 node × 4 GPUs, TP=4); output in /scratchspace/data feeds task_1
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --  # presumably splits server flags (above) from query-client flags (below) — confirm in query.sh
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model (1 node × 4 GPUs, TP=4); reads task_0 output, feeds task_2
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head offline (1 node × 4 GPUs) on the hidden states dumped by task_1
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000  # presumably high enough to skip AR validation — confirm
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding with vLLM on MT-Bench prompts (1 node × 4 GPUs, TP=4, EP=1)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes to /scratchspace/eagle3; presumably train_eagle.sh exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/MoonshotAI/Kimi-K2.5/hf_offline_eagle3.yaml b/tools/launcher/examples/MoonshotAI/Kimi-K2.5/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..03b02800253
--- /dev/null
+++ b/tools/launcher/examples/MoonshotAI/Kimi-K2.5/hf_offline_eagle3.yaml
@@ -0,0 +1,113 @@
+# EAGLE3 offline speculative decoding pipeline for moonshotai/Kimi-K2.5.
+#
+# Kimi-K2.5 is a 1T MoE VLM (32B active, 384 experts, top-8, MLA attention).
+# EAGLE3 uses only the text path — the vision encoder is not invoked.
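+# Checkpoint ~595 GB (1T params in full BF16 would be ~2 TB; presumably the released weights are INT4-compressed — confirm) — fits on a single GB200 node (4 × 192 GB = 768 GB).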
+#
+# Special requirements:
+#   - trust_remote_code required for the Kimi tokenizer
+#   - MLA attention — verify eagle_decoder_type compatibility in eagle_config.json
+#
+# MoE-specific notes:
+#   - task_1: --moe-ep 1 since TP=4 spans all 4 GPUs
+#   - task_2: consider increasing intermediate_size in eagle_config.json
+#     (K2.5's per-expert hidden dim is 2048; the draft head uses a dense layer)
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/MoonshotAI/Kimi-K2.5/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/MoonshotAI/Kimi-K2.5/hf_offline_eagle3.yaml --yes
+
+job_name: Kimi-K2.5_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:  # empty value (YAML null)
+
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/moonshotai/Kimi-K2.5  # target model checkpoint path
+
+  # Step 1: Data synthesis via vLLM server (1 node × 4 GPUs, TP=4); output in /scratchspace/data feeds task_1
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --  # presumably splits server flags (above) from query-client flags (below) — confirm in query.sh
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model (1 node × 4 GPUs, TP=4, moe-ep=1); reads task_0 output, feeds task_2
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head offline (1 node × 4 GPUs) on the hidden states dumped by task_1
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000  # presumably high enough to skip AR validation — confirm
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding with vLLM on MT-Bench prompts (1 node × 4 GPUs, TP=4, EP=1)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes to /scratchspace/eagle3; presumably train_eagle.sh exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml b/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..35513f998f2
--- /dev/null
+++ b/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml
@@ -0,0 +1,120 @@
+# EAGLE3 offline speculative decoding pipeline for nvidia/Kimi-K2.5-NVFP4.
+#
+# nvidia/Kimi-K2.5-NVFP4 is the NVIDIA-quantized NVFP4 variant of moonshotai/Kimi-K2.5.
+# Same architecture: 1T MoE VLM (32B active, 384 experts, top-8, MLA attention).
+# EAGLE3 uses only the text path — the vision encoder is not invoked.
+# NVFP4 weights ~591 GB — single GB200 node (4 × 192 GB = 768 GB).
+#
+# Special requirements:
+#   - Blackwell GPU required (NVFP4 inference only on B100/B200/GB200)
+#   - trust_remote_code required for the Kimi tokenizer
+#   - task_0 uses the NVFP4 checkpoint directly (vLLM v0.15.0+ required)
+#   - tasks 1–2 use the BF16 moonshotai/Kimi-K2.5 checkpoint for hidden-state
+#     extraction and training; draft head transfers to NVFP4 at benchmark time
+#     (NVFP4 is near-lossless so hidden state distributions are equivalent)
+#
+# MoE-specific notes:
+#   - task_1: --moe-ep 1 since TP=4 spans all 4 GPUs
+#   - task_2: consider increasing intermediate_size in eagle_config.json
+#     (K2.5's per-expert hidden dim is 2048; the draft head uses a dense layer)
+#   - task_2: verify eagle_decoder_type compatibility with MLA attention
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server (NVFP4) to generate prompt samples
+#   task_1: Dump hidden states — run BF16 target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head on BF16 hidden states
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM (NVFP4)
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml --yes
+
+job_name: Kimi-K2.5-NVFP4_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:  # empty value (YAML null)
+
+  global_vars:  # referenced by the tasks below as <<global_vars.NAME>>
+    hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4  # NVFP4 checkpoint; used by task_0 and task_3
+    hf_model_bf16: /hf-local/moonshotai/Kimi-K2.5  # moonshotai checkpoint; used by task_1 and task_2
+
+  # Step 1: Data synthesis via vLLM server using NVFP4 checkpoint (1 node × 4 GPUs, TP=4); output feeds task_1
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --  # presumably splits server flags (above) from query-client flags (below) — confirm in query.sh
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:v0.15.0  # pinned; the file header requires vLLM v0.15.0+ for the NVFP4 checkpoint
+
+  # Step 2: Dump hidden states using BF16 checkpoint (1 node × 4 GPUs, TP=4, moe-ep=1); reads task_0 output, feeds task_2
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model_bf16>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest  # NOTE(review): unpinned here, while task_0/task_3 pin v0.15.0 — confirm this is intended
+
+  # Step 3: Train EAGLE3 draft head on BF16 hidden states dumped by task_1 (1 node × 4 GPUs)
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model_bf16>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000  # presumably high enough to skip AR validation — confirm
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding against NVFP4 target with vLLM on MT-Bench prompts (1 node × 4 GPUs, TP=4, EP=1)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes to /scratchspace/eagle3; presumably train_eagle.sh exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:v0.15.0  # pinned to match task_0, which serves the same NVFP4 checkpoint
diff --git a/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml b/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..e04c205975b
--- /dev/null
+++ b/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
@@ -0,0 +1,107 @@
+# EAGLE3 offline speculative decoding pipeline for openai/gpt-oss-20b.
+#
+# GPT-OSS-20B is a ~21B-parameter MoE model (~3.6B active parameters per token), not a dense model.
+# Even upcast to BF16 (~40 GB) it fits easily on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# Special requirements:
+#   - trust_remote_code required
+#   - TIKTOKEN_RS_CACHE_DIR must point to a pre-populated tiktoken cache directory
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml --yes
+
+job_name: GPT-OSS-20B_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/openai/gpt-oss-20b
+
+  # Step 1: Data synthesis via vLLM server
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+      - TIKTOKEN_RS_CACHE_DIR: /hf-local/tiktoken_cache
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head (offline)
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding (VLLM backend)
+  task_3:  # NOTE(review): no --trust-remote-code here, though task_0 passes it — confirm quick_check.sh copes
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:  # NOTE(review): task_0 also sets TIKTOKEN_RS_CACHE_DIR — confirm this vLLM run does not need it
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..7af5efd9acb
--- /dev/null
+++ b/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml
@@ -0,0 +1,101 @@
+# EAGLE3 offline speculative decoding pipeline for Qwen/Qwen3.5-27B.
+#
+# Qwen3.5-27B is a 27B dense VLM (text + vision). EAGLE3 uses only the text path.
+# BF16 weights ~54 GB — fits on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml --yes
+
+job_name: Qwen3.5-27B_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-27B
+
+  # Step 1: Data synthesis via vLLM server
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head (offline)
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding (VLLM backend)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..4c8f908175a
--- /dev/null
+++ b/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml
@@ -0,0 +1,108 @@
+# EAGLE3 offline speculative decoding pipeline for Qwen/Qwen3.5-35B-A3B.
+#
+# Qwen3.5-35B-A3B is a 35B MoE (3B active, 256 experts, top-8, shared expert).
+# EAGLE3 uses only the text path.
+# BF16 weights ~70 GB — fits on a single GB200 node (4 × 192 GB).
+#
+# MoE-specific notes:
+#   - task_2: consider increasing intermediate_size in eagle_config.json
+#     since the draft head uses a dense layer by default.
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml --yes
+
+job_name: Qwen3.5-35B-A3B_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-35B-A3B
+
+  # Step 1: Data synthesis via vLLM server
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head (offline)
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding (VLLM backend)
+  task_3:  # NOTE(review): no --trust-remote-code here, though task_0 passes it — confirm quick_check.sh copes
+    script: common/specdec_bench/quick_check.sh
+    args:  # NOTE(review): --ep_size 1 here, while task_1 dumps with --moe-ep 4 — confirm this is intended
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..bb90355b1b4
--- /dev/null
+++ b/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml
@@ -0,0 +1,101 @@
+# EAGLE3 offline speculative decoding pipeline for Qwen/Qwen3.5-9B.
+#
+# Qwen3.5-9B is a 9B dense VLM (text + vision). EAGLE3 uses only the text path.
+# BF16 weights ~18 GB — fits easily on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml --yes
+
+job_name: Qwen3.5-9B_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-9B
+
+  # Step 1: Data synthesis via vLLM server
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head (offline)
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding (VLLM backend)
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml b/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml
new file mode 100644
index 00000000000..e749d40cc35
--- /dev/null
+++ b/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml
@@ -0,0 +1,113 @@
+# EAGLE3 offline speculative decoding pipeline for stepfun-ai/Step-3.5-Flash.
+#
+# Step-3.5-Flash is a 197B MoE (11B active, 288 experts + 1 shared, top-8).
+# Has built-in MTP head and sliding window attention (SWA).
+# BF16 weights ~394 GB — fits on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# Special requirements:
+#   - trust_remote_code required
+#   - 256K context window supported; use 8192 for typical EAGLE3 training
+#   - Sliding window attention (SWA) — verify vLLM support before running
+#
+# MoE-specific notes:
+#   - task_2: consider increasing intermediate_size in eagle_config.json
+#     since the draft head uses a dense layer by default.
+#
+# 4-step pipeline:
+#   task_0: Data synthesis — query vLLM server to generate prompt samples
+#   task_1: Dump hidden states — run target model to capture hidden states
+#   task_2: Offline training — train the EAGLE3 draft head
+#   task_3: Benchmark — evaluate speculative decoding speedup via VLLM
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml --yes
+
+job_name: Step-3.5-Flash_EAGLE3_offline
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/stepfun-ai/Step-3.5-Flash
+
+  # Step 1: Data synthesis via vLLM server
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2: Dump hidden states from target model
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3: Train EAGLE3 draft head (offline)
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4: Benchmark speculative decoding (VLLM backend)
+  task_3:  # NOTE(review): header says trust_remote_code is required, yet only task_0 passes the flag — confirm
+    script: common/specdec_bench/quick_check.sh
+    args:  # NOTE(review): --ep_size 1 here, while task_1 dumps with --moe-ep 4 — confirm this is intended
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest

From a6eeff4e5aca27304a823e2d9bebbdaf4e0e143d Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Fri, 8 May 2026 09:25:52 -0700
Subject: [PATCH 02/22] Add EAGLE3 automation triage chart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds tools/launcher/examples/EAGLE3_TRIAGE.md, a living document for
tracking EAGLE3 pipeline failure modes across new models.

Content:
- Mermaid decision-tree diagram mapping each pipeline step (query →
  dump → train → benchmark) to known failure modes and root causes
- Model test result table with pass/fail/timeout status for 7 models
  tested on the OCI-HSG cluster (3 remaining to be tested)
- 6 documented issues with symptoms, affected models, root causes,
  and fix recommendations:
  1. dump_offline_data_vllm.sh missing (universal, all 7 tested models)
  2. offline_training.sh HF Hub upload bug (universal, all 7 tested models)
  3. Task 0 time limit exceeded (5 models)
  4. GPT-OSS-20B tiktoken cache missing (model-specific)
  5. trust_remote_code not passed to benchmark (MiniMax-M2.5)
  6. DeepSeek-V3.2 task_1 OOM (model-specific)
- Update instructions for adding new models or failure modes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 tools/launcher/examples/EAGLE3_TRIAGE.md | 230 +++++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 tools/launcher/examples/EAGLE3_TRIAGE.md

diff --git a/tools/launcher/examples/EAGLE3_TRIAGE.md b/tools/launcher/examples/EAGLE3_TRIAGE.md
new file mode 100644
index 00000000000..dd3a73fc8cf
--- /dev/null
+++ b/tools/launcher/examples/EAGLE3_TRIAGE.md
@@ -0,0 +1,230 @@
+# EAGLE3 Automation Triage
+
+This document captures the failure modes observed when running the 4-step EAGLE3 offline
+pipeline against a selection of 10 new models. It is structured so Claude (or any contributor)
+can update the status table and diagram as new models are tested.
+
+---
+
+## Pipeline Overview
+
+```
+Model checkpoint (HuggingFace)
+        │
+        ▼
+┌──────────────────┐
+│  Task 0: Query   │  vLLM server generates prompt/response pairs
+│  (data synthesis)│  Script: common/vllm/query.sh
+└────────┬─────────┘
+         │
+         ▼
+┌──────────────────┐
+│  Task 1: Dump    │  Target model runs forward pass, saves hidden states
+│  (hidden states) │  Script: common/eagle3/dump_offline_data.sh
+└────────┬─────────┘
+         │
+         ▼
+┌──────────────────┐
+│  Task 2: Train   │  Draft head trained on hidden states (Accelerate + FSDP)
+│  (EAGLE3 head)   │  Script: common/eagle3/train_eagle.sh
+└────────┬─────────┘
+         │
+         ▼
+┌──────────────────┐
+│  Task 3: Bench   │  Speculative decoding benchmark via vLLM
+│  (benchmark)     │  Script: common/specdec_bench/quick_check.sh
+└──────────────────┘
+```
+
+---
+
+## Triage Diagram
+
+The following Mermaid diagram maps each step to its possible failure modes.
+Each leaf node indicates a known issue and its root cause category.
+
+```mermaid
+flowchart TD
+    START([Start: run pipeline]) --> T0
+
+    T0[Task 0: Data synthesis\nvllm/query.sh] --> T0_OK{Success?}
+    T0_OK -- Yes --> T1
+    T0_OK -- No --> T0_FAIL
+
+    T0_FAIL{Failure mode?}
+    T0_FAIL -- Time limit exceeded --> T0_TIMEOUT[⚠ TIMEOUT\nJob wall-clock limit too short\nfor full dataset synthesis.\nFix: increase Slurm time limit\nor reduce dataset size.]
+    T0_FAIL -- Vocab / tokenizer error --> T0_TOKENIZER[⚠ TOKENIZER\nMissing or misconfigured tokenizer cache.\ne.g. GPT-OSS-20B requires TIKTOKEN_RS_CACHE_DIR\nto be pre-populated.\nFix: set env var to valid cache path.]
+    T0_FAIL -- vLLM model not supported --> T0_VLLM[⚠ VLLM_SUPPORT\nModel architecture not yet\nsupported in vllm/vllm-openai:latest.\nFix: use a newer vLLM version or\nswitch to TRT-LLM for task_0.]
+    T0_FAIL -- trust_remote_code error --> T0_TRUST[⚠ TRUST_REMOTE_CODE\nModel requires custom code but\n--trust-remote-code was not passed.\nFix: add flag to task_0 args.]
+
+    T1[Task 1: Dump hidden states\neagle3/dump_offline_data.sh] --> T1_OK{Success?}
+    T1_OK -- Yes --> T2
+    T1_OK -- No --> T1_FAIL
+
+    T1_FAIL{Failure mode?}
+    T1_FAIL -- Script not found --> T1_SCRIPT[⚠ MISSING_SCRIPT\ndump_offline_data_vllm.sh does not exist.\nFix: use dump_offline_data.sh or\ncreate the vllm variant script.]
+    T1_FAIL -- Model not supported by TRT-LLM --> T1_TRTLLM[⚠ TRTLLM_SUPPORT\nTarget model architecture not\nsupported in the TRT-LLM container.\nFix: use vLLM-based dump script or\nupgrade TRT-LLM version.]
+    T1_FAIL -- OOM / memory --> T1_OOM[⚠ OOM\nModel weights exceed GPU memory\nwith current TP/EP configuration.\nFix: increase TP, add EP, or use\na node with more GPUs.]
+    T1_FAIL -- MLA / SWA attention unsupported --> T1_ARCH[⚠ ARCH\nSpecial attention (MLA, SWA) not\nsupported by dump script backend.\nFix: ensure vLLM backend supports\nthe model's attention variant.]
+
+    T2[Task 2: Train EAGLE3 head\neagle3/train_eagle.sh] --> T2_OK{Success?}
+    T2_OK -- Yes --> T3
+    T2_OK -- No --> T2_FAIL
+
+    T2_FAIL{Failure mode?}
+    T2_FAIL -- HF upload on local path --> T2_HFUPLOAD[⚠ HF_UPLOAD_BUG\noffline_training.sh calls HF Hub upload\nwith a local path like /scratchspace/eagle3.\nFix: remove HF upload call; save locally only.\nTracked: offline_training.sh line 18.]
+    T2_FAIL -- eagle_config mismatch --> T2_CONFIG[⚠ EAGLE_CONFIG\neagle_config.json intermediate_size\ntoo small for MoE model's expert hidden dim.\nFix: increase intermediate_size in\neagle_config.json for MoE targets.]
+    T2_FAIL -- eagle_decoder_type incompatible --> T2_DECODER[⚠ DECODER_TYPE\nDefault llama decoder type incompatible\nwith target model attention (e.g. MLA).\nFix: set eagle_decoder_type in\neagle_config.json to match target.]
+    T2_FAIL -- OOM during training --> T2_OOM[⚠ OOM\nFSDP training OOM with default batch size.\nFix: reduce train_bs or\nuse gradient checkpointing.]
+
+    T3[Task 3: Benchmark\nspecdec_bench/quick_check.sh] --> T3_OK{Success?}
+    T3_OK -- Yes --> END([Pipeline complete])
+    T3_OK -- No --> T3_FAIL
+
+    T3_FAIL{Failure mode?}
+    T3_FAIL -- Export dir missing cascade --> T3_EXPORT[⚠ CASCADE\n/scratchspace/export does not exist\nbecause Task 2 failed.\nFix: resolve Task 2 failure first.]
+    T3_FAIL -- trust_remote_code missing --> T3_TRUST[⚠ TRUST_REMOTE_CODE\nBenchmark script did not pass\n--trust-remote-code to vLLM.\nFix: add flag in quick_check.sh\nor pipeline config.]
+    T3_FAIL -- vLLM spec decode not supported --> T3_VLLM[⚠ VLLM_SPECDEC\nSpeculative decoding with EAGLE3\nnot yet supported in vLLM for\nthis model architecture.\nFix: check vLLM release notes.]
+    T3_FAIL -- NVFP4 requires newer vLLM --> T3_NVFP4[⚠ NVFP4\nNVFP4 inference requires\nvllm/vllm-openai:v0.15.0+\nand Blackwell GPU.\nFix: pin container version.]
+```
+
+---
+
+## Model Test Results
+
+Tests were run on the OCI-HSG cluster (GB200 nodes, 4 × 192 GB HBM3e per node).
+
+| Model | Size | Task 0 | Task 1 | Task 2 | Task 3 | Notes |
+|-------|------|--------|--------|--------|--------|-------|
+| Ministral-3-8B | 8B dense | ⏱ TIMEOUT (near complete, 3277/3295 samples) | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | Tokenizer regex warning (non-fatal) |
+| Ministral-3-14B | 14B dense | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ (no log) | — |
+| GPT-OSS-20B | 20B dense | ❌ TOKENIZER (tiktoken cache missing) | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | vLLM tried to start but vocab load failed |
+| MiniMax-M2.5 | 230B MoE | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG (config.json variant) | ❌ TRUST_REMOTE_CODE | trust_remote_code needed at benchmark |
+| Qwen3.5-35B-A3B | 35B MoE | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | — |
+| Step-3.5-Flash | 197B MoE | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | SWA attention - untested past task_1 |
+| DeepSeek-V3.2 | 685B MoE | 🔍 (no log, tarball only) | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | 2-node job; task_1 step also terminated with signal 15 (suspected OOM, see Issue 6) |
+| Kimi-K2.5 | 1T MoE | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | MLA attention needs decoder_type check |
+| GLM-5 | 744B MoE | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | Gated model, 2-node |
+| Kimi-K2.5-NVFP4 | NVFP4 quant | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | Blackwell required; tasks 1-2 use BF16 base |
+
+**Legend:** ✅ Pass · ❌ Fail · ⏱ Timeout · 🔍 Inconclusive · 🔲 Not yet tested
+
+---
+
+## Known Issues
+
+### Issue 1: `dump_offline_data_vllm.sh` does not exist (Task 1 — universal)
+
+**Symptom:** `/usr/bin/bash: services/pipeline/eagle3/dump_offline_data_vllm.sh: No such file or directory`
+
+**Affected:** All 7 models tested.
+
+**Root cause:** Quick-fail configs reference a vLLM-based hidden state dump script that was
+planned but never created. The existing script `dump_offline_data.sh` uses TRT-LLM for inference.
+
+**Fix options:**
+- (a) Create `dump_offline_data_vllm.sh` using vLLM for hidden state extraction (enables models
+  not supported by TRT-LLM, e.g. GPT-OSS-20B, Ministral3, Qwen3.5-VLMs).
+- (b) For models supported by TRT-LLM: switch configs to use `dump_offline_data.sh` with
+  appropriate `--tp` and `--moe-ep` flags.
+
+---
+
+### Issue 2: `offline_training.sh` HuggingFace Hub upload bug (Task 2 — universal)
+
+**Symptom:**
+```
+huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or
+'namespace/repo_name': '/scratchspace/eagle3'.
+services/pipeline/eagle3/offline_training.sh: line 18: error_handler: command not found
+```
+
+**Affected:** All 7 models tested.
+
+**Root cause:** `offline_training.sh` calls a HuggingFace CLI upload command with
+`/scratchspace/eagle3` as the repo ID. This path should be a local output directory, not a
+Hub repo. The `error_handler` function is also referenced but not defined.
+
+**Fix:** Remove or gate the Hub upload call in `offline_training.sh`. During CI, save the trained
+draft head to a local path only. Also define `error_handler` or remove the reference to it.
+
+---
+
+### Issue 3: Time limit exceeded (Task 0 — most models)
+
+**Symptom:** `STEP CANCELLED AT ... DUE TO TIME LIMIT`
+
+**Affected:** Ministral-3-8B (near complete), Ministral-3-14B, MiniMax-M2.5, Qwen3.5-35B-A3B,
+Step-3.5-Flash.
+
+**Root cause:** The default Slurm time limit is too short for synthesizing the full dataset
+with a large model. Ministral-3-8B reached 3277/3295 samples before cancellation.
+
+**Fix options:**
+- (a) Increase Slurm `--time` in the job config.
+- (b) Reduce the dataset size for quick-fail validation (e.g., use a 100-sample subset).
+- (c) Add `--max-samples N` flag to `query.py` / `query.sh`.
+
+---
+
+### Issue 4: GPT-OSS-20B tokenizer cache missing (Task 0 — model-specific)
+
+**Symptom:**
+```
+openai_harmony.HarmonyError: error downloading or loading vocab file:
+failed to download or load vocab file
+```
+
+**Affected:** GPT-OSS-20B only.
+
+**Root cause:** GPT-OSS-20B uses a custom OpenAI tokenizer (`openai_harmony`) that reads from
+a tiktoken cache directory. The env var `TIKTOKEN_RS_CACHE_DIR` must point to a pre-populated
+directory on the cluster.
+
+**Fix:** Ensure `TIKTOKEN_RS_CACHE_DIR` is set to a valid, pre-populated tiktoken cache path
+in the cluster environment before running task_0.
+
+---
+
+### Issue 5: `trust_remote_code` not passed to benchmark (Task 3 — MiniMax)
+
+**Symptom:**
+```
+ValueError: The repository ... contains custom code which must be executed to correctly
+load the model. Please pass trust_remote_code=True.
+```
+
+**Affected:** MiniMax-M2.5 task_3.
+
+**Root cause:** `quick_check.sh` does not forward `--trust-remote-code` to the vLLM benchmark
+process for models that require it.
+
+**Fix:** Add `--trust-remote-code` to the `quick_check.sh` vLLM invocation when
+`trust_remote_code` is set in the pipeline config.
+
+---
+
+### Issue 6: DeepSeek-V3.2 task_1 OOM (Task 1 — model-specific)
+
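+**Symptom:** `pyxis: child terminated with signal 15` (SIGTERM; suspected OOM, though the kernel OOM killer normally sends SIGKILL (9), so a Slurm-initiated step termination is also possible)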
+
+**Affected:** DeepSeek-V3.2 only (685B MoE).
+
+**Root cause:** The 2-node job for DeepSeek-V3.2 task_1 may have been OOM-killed. The model
+requires careful TP/EP configuration across nodes.
+
+**Fix:** Verify that TP=8 across 2 nodes is configured correctly, and check whether
+`dump_offline_data.sh` supports multi-node runs. Also blocked by Issue 1 (missing `dump_offline_data_vllm.sh`).
+
+---
+
+## How to Update This Document
+
+When a new model completes testing:
+
+1. Update the **Model Test Results** table — add a row or change status symbols.
+2. If a new failure mode is found not in the diagram, add a new leaf node to the Mermaid chart
+   under the appropriate step.
+3. If a new issue pattern is discovered, add a new **Issue N** section with symptom, affected
+   models, root cause, and fix.
+4. If an issue is resolved, mark it as ✅ in both the table and the issue section.

From 7c1388a98acc7a34e4cea245f905ec3173c8818f Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Fri, 8 May 2026 10:48:13 -0700
Subject: [PATCH 03/22] Port sandbox fixes: HF dump script, triage chart with
 test results

From the nmm-okr30 sandbox MR (b838b171):
- Add common/eagle3/dump_offline_data_hf.sh: HF-based (device_map=auto)
  hidden state extraction for models not supported by TRT-LLM. Handles
  VLMs, custom-code models, and architectures absent from TRT-LLM.
- Update task_1 for 8 models to use dump_offline_data_hf.sh:
  MiniMax-M2.5, Ministral-3-8B, Ministral-3-14B, GPT-OSS-20B,
  Qwen3.5-9B, Qwen3.5-27B, Qwen3.5-35B-A3B, Step-3.5-Flash.
  Models that retain TRT-LLM dump: DeepSeek-V3.2, GLM-5, Kimi-K2.5,
  Kimi-K2.5-NVFP4 (all pure-text MoE with TRT-LLM support).
- Update EAGLE3_TRIAGE.md with actual test results from 7 models run
  on OCI-HSG cluster on 2026-04-15, marking Issue 2 (HF upload bug)
  as FIXED and correcting Issue 1 status (hf script created as fix).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../common/eagle3/dump_offline_data_hf.sh     |  55 ++++
 tools/launcher/examples/EAGLE3_TRIAGE.md      | 260 +++++++++---------
 .../MiniMax-M2.5/hf_offline_eagle3.yaml       |   8 +-
 .../Ministral-3-14B/hf_offline_eagle3.yaml    |   7 +-
 .../Ministral-3-8B/hf_offline_eagle3.yaml     |   7 +-
 .../OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml |   8 +-
 .../Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml   |   7 +-
 .../Qwen3.5-35B-A3B/hf_offline_eagle3.yaml    |   7 +-
 .../Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml    |   7 +-
 .../Step-3.5-Flash/hf_offline_eagle3.yaml     |   8 +-
 10 files changed, 216 insertions(+), 158 deletions(-)
 create mode 100644 tools/launcher/common/eagle3/dump_offline_data_hf.sh

diff --git a/tools/launcher/common/eagle3/dump_offline_data_hf.sh b/tools/launcher/common/eagle3/dump_offline_data_hf.sh
new file mode 100644
index 00000000000..254df509101
--- /dev/null
+++ b/tools/launcher/common/eagle3/dump_offline_data_hf.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+
+source "${SCRIPT_DIR}/../service_utils.sh"
+
+###################################################################################################
+# HF-based hidden state dumping for models not supported by TRT-LLM.
+# Uses compute_hidden_states_hf.py with device_map="auto" (no TP/EP flags needed).
+# Suitable for: VLMs, models with custom code, architectures not yet in TRT-LLM.
+#
+# Required environment:
+#   HF_MODEL_CKPT   Path to the HF model checkpoint (the script exits with status 1 if unset)
+#
+# Args passed through to compute_hidden_states_hf.py:
+#   --input-data, --output-dir, --max-seq-len, etc.
+###################################################################################################
+
+pip install datasets 2>/dev/null || echo "WARNING: 'pip install datasets' failed; continuing" >&2
+
+# Fail fast with a clear message when the required checkpoint path is missing.
+if [ -z "${HF_MODEL_CKPT:-}" ]; then
+    echo "ERROR: HF_MODEL_CKPT must be set to the HF model checkpoint path" >&2
+    exit 1
+fi
+
+# Slurm array jobs shard the work: array index -> --dp-rank, array size -> --dp-world-size.
+# When the array variables are unset or empty, run as a single shard (rank 0 of 1).
+TASK_ID="${SLURM_ARRAY_TASK_ID:-0}"
+TASK_COUNT="${SLURM_ARRAY_TASK_COUNT:-1}"
+if [ -n "${SLURM_ARRAY_TASK_ID:-}" ]; then echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}"; fi
+if [ -n "${SLURM_ARRAY_TASK_COUNT:-}" ]; then echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}"; fi
+
+# NOTE(review): ${@} is left unquoted so pass-through args are still word-split; confirm callers rely on it.
+python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py \
+    --model "${HF_MODEL_CKPT}" \
+    --dp-rank "${TASK_ID}" \
+    --dp-world-size "${TASK_COUNT}" \
+    --trust_remote_code \
+    ${@}
diff --git a/tools/launcher/examples/EAGLE3_TRIAGE.md b/tools/launcher/examples/EAGLE3_TRIAGE.md
index dd3a73fc8cf..7ff08ebbee3 100644
--- a/tools/launcher/examples/EAGLE3_TRIAGE.md
+++ b/tools/launcher/examples/EAGLE3_TRIAGE.md
@@ -1,8 +1,8 @@
-# EAGLE3 Automation Triage
+# EAGLE3 Automation Triage Chart
 
-This document captures the failure modes observed when running the 4-step EAGLE3 offline
-pipeline against a selection of 10 new models. It is structured so Claude (or any contributor)
-can update the status table and diagram as new models are tested.
+This document tracks failure modes discovered when running the 4-step EAGLE3 offline
+pipeline against 10 new models. It is updated as models are tested.
+Claude can update the status table, diagram, and issue catalog when new results arrive.
 
 ---
 
@@ -16,12 +16,12 @@ Model checkpoint (HuggingFace)
 │  Task 0: Query   │  vLLM server generates prompt/response pairs
 │  (data synthesis)│  Script: common/vllm/query.sh
 └────────┬─────────┘
-         │
+         │ (afterany — downstream tasks run even if this times out)
          ▼
 ┌──────────────────┐
 │  Task 1: Dump    │  Target model runs forward pass, saves hidden states
-│  (hidden states) │  Script: common/eagle3/dump_offline_data.sh
-└────────┬─────────┘
+│  (hidden states) │  Script: common/eagle3/dump_offline_data.sh  (TRT-LLM)
+└────────┬─────────┘        or  dump_offline_data_hf.sh  (HF fallback)
          │
          ▼
 ┌──────────────────┐
@@ -38,74 +38,75 @@ Model checkpoint (HuggingFace)
 
 ---
 
-## Triage Diagram
-
-The following Mermaid diagram maps each step to its possible failure modes.
-Each leaf node indicates a known issue and its root cause category.
+## Triage Decision Tree
 
 ```mermaid
 flowchart TD
-    START([Start: run pipeline]) --> T0
-
-    T0[Task 0: Data synthesis\nvllm/query.sh] --> T0_OK{Success?}
-    T0_OK -- Yes --> T1
-    T0_OK -- No --> T0_FAIL
-
-    T0_FAIL{Failure mode?}
-    T0_FAIL -- Time limit exceeded --> T0_TIMEOUT[⚠ TIMEOUT\nJob wall-clock limit too short\nfor full dataset synthesis.\nFix: increase Slurm time limit\nor reduce dataset size.]
-    T0_FAIL -- Vocab / tokenizer error --> T0_TOKENIZER[⚠ TOKENIZER\nMissing or misconfigured tokenizer cache.\ne.g. GPT-OSS-20B requires TIKTOKEN_RS_CACHE_DIR\nto be pre-populated.\nFix: set env var to valid cache path.]
-    T0_FAIL -- vLLM model not supported --> T0_VLLM[⚠ VLLM_SUPPORT\nModel architecture not yet\nsupported in vllm/vllm-openai:latest.\nFix: use a newer vLLM version or\nswitch to TRT-LLM for task_0.]
-    T0_FAIL -- trust_remote_code error --> T0_TRUST[⚠ TRUST_REMOTE_CODE\nModel requires custom code but\n--trust-remote-code was not passed.\nFix: add flag to task_0 args.]
-
-    T1[Task 1: Dump hidden states\neagle3/dump_offline_data.sh] --> T1_OK{Success?}
-    T1_OK -- Yes --> T2
-    T1_OK -- No --> T1_FAIL
-
-    T1_FAIL{Failure mode?}
-    T1_FAIL -- Script not found --> T1_SCRIPT[⚠ MISSING_SCRIPT\ndump_offline_data_vllm.sh does not exist.\nFix: use dump_offline_data.sh or\ncreate the vllm variant script.]
-    T1_FAIL -- Model not supported by TRT-LLM --> T1_TRTLLM[⚠ TRTLLM_SUPPORT\nTarget model architecture not\nsupported in the TRT-LLM container.\nFix: use vLLM-based dump script or\nupgrade TRT-LLM version.]
-    T1_FAIL -- OOM / memory --> T1_OOM[⚠ OOM\nModel weights exceed GPU memory\nwith current TP/EP configuration.\nFix: increase TP, add EP, or use\na node with more GPUs.]
-    T1_FAIL -- MLA / SWA attention unsupported --> T1_ARCH[⚠ ARCH\nSpecial attention (MLA, SWA) not\nsupported by dump script backend.\nFix: ensure vLLM backend supports\nthe model's attention variant.]
-
-    T2[Task 2: Train EAGLE3 head\neagle3/train_eagle.sh] --> T2_OK{Success?}
-    T2_OK -- Yes --> T3
-    T2_OK -- No --> T2_FAIL
-
-    T2_FAIL{Failure mode?}
-    T2_FAIL -- HF upload on local path --> T2_HFUPLOAD[⚠ HF_UPLOAD_BUG\noffline_training.sh calls HF Hub upload\nwith a local path like /scratchspace/eagle3.\nFix: remove HF upload call; save locally only.\nTracked: offline_training.sh line 18.]
-    T2_FAIL -- eagle_config mismatch --> T2_CONFIG[⚠ EAGLE_CONFIG\neagle_config.json intermediate_size\ntoo small for MoE model's expert hidden dim.\nFix: increase intermediate_size in\neagle_config.json for MoE targets.]
-    T2_FAIL -- eagle_decoder_type incompatible --> T2_DECODER[⚠ DECODER_TYPE\nDefault llama decoder type incompatible\nwith target model attention (e.g. MLA).\nFix: set eagle_decoder_type in\neagle_config.json to match target.]
-    T2_FAIL -- OOM during training --> T2_OOM[⚠ OOM\nFSDP training OOM with default batch size.\nFix: reduce train_bs or\nuse gradient checkpointing.]
-
-    T3[Task 3: Benchmark\nspecdec_bench/quick_check.sh] --> T3_OK{Success?}
-    T3_OK -- Yes --> END([Pipeline complete])
-    T3_OK -- No --> T3_FAIL
-
-    T3_FAIL{Failure mode?}
-    T3_FAIL -- Export dir missing cascade --> T3_EXPORT[⚠ CASCADE\n/scratchspace/export does not exist\nbecause Task 2 failed.\nFix: resolve Task 2 failure first.]
-    T3_FAIL -- trust_remote_code missing --> T3_TRUST[⚠ TRUST_REMOTE_CODE\nBenchmark script did not pass\n--trust-remote-code to vLLM.\nFix: add flag in quick_check.sh\nor pipeline config.]
-    T3_FAIL -- vLLM spec decode not supported --> T3_VLLM[⚠ VLLM_SPECDEC\nSpeculative decoding with EAGLE3\nnot yet supported in vLLM for\nthis model architecture.\nFix: check vLLM release notes.]
-    T3_FAIL -- NVFP4 requires newer vLLM --> T3_NVFP4[⚠ NVFP4\nNVFP4 inference requires\nvllm/vllm-openai:v0.15.0+\nand Blackwell GPU.\nFix: pin container version.]
+    START([EAGLE3 Pipeline Failed]) --> WHICH_STEP{Which step failed?}
+
+    WHICH_STEP -->|task_0: Data synthesis| T0_CHECK{Server started?}
+    WHICH_STEP -->|task_1: Hidden states| T1_CHECK{Script found?}
+    WHICH_STEP -->|task_2: Training| T2_CHECK{Dependencies installed?}
+    WHICH_STEP -->|task_3: Benchmark| T3_CHECK{Engine started?}
+
+    %% ── task_0 ──────────────────────────────────────────────────
+    T0_CHECK -->|No - hangs at health check| T0_OOM{CUDA OOM in log?}
+    T0_CHECK -->|Yes - server up, query fails| T0_QUERY[Check query.py errors:\nbad prompt format,\nconnection timeout,\nempty response]
+    T0_OOM -->|Yes| T0_FIX_OOM[⚠ OOM\nReduce max_num_tokens\nor increase TP]
+    T0_OOM -->|No| T0_ARCH{Error type?}
+    T0_ARCH -->|vocab / tokenizer error| T0_TOKENIZER[⚠ TOKENIZER\nMissing tokenizer cache.\ne.g. GPT-OSS-20B needs\nTIKTOKEN_RS_CACHE_DIR pre-populated]
+    T0_ARCH -->|Architecture / RuntimeError| T0_FIX_ARCH[⚠ VLLM_SUPPORT\nModel arch not supported\nin this vLLM version.\nTry newer container.]
+    T0_ARCH -->|trust_remote_code| T0_FIX_TRC[⚠ TRUST_REMOTE_CODE\nAdd --trust-remote-code\nbefore -- separator in args]
+    T0_CHECK -->|Cancelled - time limit| T0_TIMEOUT[⚠ TIMEOUT\nJob wall-clock limit too short.\nNote: afterany deps ensure\ntask_1 still runs.\nFix: increase time limit\nor reduce dataset size.]
+
+    %% ── task_1 ──────────────────────────────────────────────────
+    T1_CHECK -->|No - script not found| T1_SCRIPT[⚠ MISSING_SCRIPT\ndump_offline_data_vllm.sh does not exist.\nUse dump_offline_data_hf.sh\n(HF device_map=auto, no TP/EP flags)\nor dump_offline_data.sh\n(TRT-LLM, needs --tp / --moe-ep).]
+    T1_CHECK -->|Yes| T1_RUN{Runs OK?}
+    T1_RUN -->|No - OOM| T1_OOM[⚠ OOM\nIncrease TP, add EP,\nor switch to _hf script.]
+    T1_RUN -->|No - NCCL error| T1_NCCL[⚠ NCCL\nNetwork/multi-node issue.\nRetry or reduce EP.]
+    T1_RUN -->|No - arch unsupported| T1_ARCH[⚠ ARCH\nModel not supported by TRT-LLM.\nSwitch to dump_offline_data_hf.sh.]
+    T1_RUN -->|Yes - no .pt output| T1_DATA[Check --input-data path\nand data format from task_0]
+
+    %% ── task_2 ──────────────────────────────────────────────────
+    T2_CHECK -->|No - pip install fails| T2_FIX_DEPS[Network issue in container.\nCheck proxy/mirror.]
+    T2_CHECK -->|Yes| T2_TRAIN{Training starts?}
+    T2_TRAIN -->|No - ImportError| T2_FIX_IMPORT[modelopt not installed\nor wrong version]
+    T2_TRAIN -->|No - FileNotFoundError| T2_FIX_DATA[task_1 output missing.\nRe-run task_1.]
+    T2_TRAIN -->|Yes but crashes| T2_CRASH{Error type?}
+    T2_CRASH -->|OOM| T2_FIX_OOM[⚠ OOM\nReduce train_bs\nor training_seq_len]
+    T2_CRASH -->|NaN loss| T2_FIX_NAN[Reduce lr.\nCheck data quality.]
+    T2_CRASH -->|KeyError / arch| T2_FIX_EAGLE[⚠ ARCH\nModel type not recognized\nby EAGLE3 training code.\nNeeds code change in modelopt.\nCheck eagle_decoder_type in config.]
+    T2_TRAIN -->|Yes - export fails| T2_FIX_EXPORT[Check /scratchspace/eagle3\nhas model.safetensors]
+
+    %% ── task_3 ──────────────────────────────────────────────────
+    T3_CHECK -->|No - export dir missing| T3_EXPORT[⚠ CASCADE\nTask 2 failed or timed out.\nResolve task_2 first.]
+    T3_CHECK -->|No - engine crash| T3_ENGINE{Engine type?}
+    T3_CHECK -->|Yes - AR below threshold| T3_AR[AR too low:\nneed more epochs, data,\nor larger draft head]
+    T3_CHECK -->|Yes - wrong output| T3_FORMAT[Check draft model\nconfig.json vs engine version]
+    T3_ENGINE -->|vLLM - trust_remote_code| T3_TRUST[⚠ TRUST_REMOTE_CODE\nAdd --trust-remote-code\nto quick_check.sh invocation]
+    T3_ENGINE -->|vLLM - spec decode unsupported| T3_VLLM[⚠ VLLM_SPECDEC\nvLLM version too old.\nUse latest container.]
+    T3_ENGINE -->|NVFP4 - unsupported| T3_NVFP4[⚠ NVFP4\nRequires vllm-openai:v0.15.0+\nand Blackwell GPU.]
+    T3_ENGINE -->|OOM| T3_FIX_OOM[Target + draft too large.\nIncrease TP.]
 ```
 
 ---
 
-## Model Test Results
+## Model Test Matrix
 
-Tests were run on the OCI-HSG cluster (GB200 nodes, 4 × 192 GB HBM3e per node).
+Tests run on OCI-HSG cluster (GB200 nodes, 4 × 192 GB HBM3e per node).
 
-| Model | Size | Task 0 | Task 1 | Task 2 | Task 3 | Notes |
-|-------|------|--------|--------|--------|--------|-------|
-| Ministral-3-8B | 8B dense | ⏱ TIMEOUT (near complete, 3277/3295 samples) | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | Tokenizer regex warning (non-fatal) |
-| Ministral-3-14B | 14B dense | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ (no log) | — |
-| GPT-OSS-20B | 20B dense | ❌ TOKENIZER (tiktoken cache missing) | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | vLLM tried to start but vocab load failed |
-| MiniMax-M2.5 | 230B MoE | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG (config.json variant) | ❌ TRUST_REMOTE_CODE | trust_remote_code needed at benchmark |
-| Qwen3.5-35B-A3B | 35B MoE | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | — |
-| Step-3.5-Flash | 197B MoE | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | SWA attention - untested past task_1 |
-| DeepSeek-V3.2 | 685B MoE | 🔍 (no log, tarball only) | ❌ MISSING_SCRIPT | ❌ HF_UPLOAD_BUG | ❌ CASCADE | 2-node job, task_1 also OOM-terminated (signal 15) |
-| Kimi-K2.5 | 1T MoE | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | MLA attention needs decoder_type check |
-| GLM-5 | 744B MoE | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | Gated model, 2-node |
-| Kimi-K2.5-NVFP4 | NVFP4 quant | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | 🔲 Not tested | Blackwell required; tasks 1-2 use BF16 base |
+| # | Model | Type | Size | task_0 | task_1 | task_2 | task_3 | Notes |
+|---|-------|------|------|--------|--------|--------|--------|-------|
+| 1 | Ministral-3-8B | Dense | 8B | ⏱ TIMEOUT (3277/3295) | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | Tokenizer regex warning (non-fatal) |
+| 2 | Ministral-3-14B | Dense | 14B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | 🔍 (no log) | — |
+| 3 | GPT-OSS-20B | Dense | 20B | ❌ TOKENIZER | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | TIKTOKEN_RS_CACHE_DIR not populated |
+| 4 | MiniMax-M2.5 | MoE | 230B/10B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ TRUST_REMOTE_CODE | trust_remote_code needed at bench |
+| 5 | Qwen3.5-35B-A3B | MoE | 35B/3B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | — |
+| 6 | Step-3.5-Flash | MoE/SWA | 197B/11B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | SWA attention — untested past t1 |
+| 7 | DeepSeek-V3.2 | MoE/MLA | 685B/37B | 🔍 (tarball only) | ❌ MISSING_SCRIPT + OOM | ❌ (no data from t1) | ❌ CASCADE | 2-node, t1 OOM-killed (SIGTERM) |
+| 8 | Kimi-K2.5 | MoE/MLA | 1T/32B | 🔲 | 🔲 | 🔲 | 🔲 | MLA attention: verify eagle_decoder_type |
+| 9 | GLM-5 | MoE/DSA | 744B/40B | 🔲 | 🔲 | 🔲 | 🔲 | Gated, 2-node |
+| 10 | Kimi-K2.5-NVFP4 | NVFP4 | ~591GB | 🔲 | 🔲 | 🔲 | 🔲 | Blackwell required; t1/t2 use BF16 base |
 
 **Legend:** ✅ Pass · ❌ Fail · ⏱ Timeout · 🔍 Inconclusive · 🔲 Not yet tested
 
@@ -113,108 +114,93 @@ Tests were run on the OCI-HSG cluster (GB200 nodes, 4 × 192 GB HBM3e per node).
 
 ## Known Issues
 
-### Issue 1: `dump_offline_data_vllm.sh` does not exist (Task 1 — universal)
-
-**Symptom:** `/usr/bin/bash: services/pipeline/eagle3/dump_offline_data_vllm.sh: No such file or directory`
+### Issue 1: Missing `dump_offline_data_vllm.sh` (Task 1 — universal) — OPEN
 
-**Affected:** All 7 models tested.
+**Symptom:** `/usr/bin/bash: .../dump_offline_data_vllm.sh: No such file or directory`
 
-**Root cause:** Quick-fail configs reference a vLLM-based hidden state dump script that was
-planned but never created. The existing script `dump_offline_data.sh` uses TRT-LLM for inference.
+**Affected:** All 7 models tested (root cause of universal task_1 failure).
 
-**Fix options:**
-- (a) Create `dump_offline_data_vllm.sh` using vLLM for hidden state extraction (enables models
-  not supported by TRT-LLM, e.g. GPT-OSS-20B, Ministral3, Qwen3.5-VLMs).
-- (b) For models supported by TRT-LLM: switch configs to use `dump_offline_data.sh` with
-  appropriate `--tp` and `--moe-ep` flags.
+**Root cause:** Quick-fail pipeline configs reference `dump_offline_data_vllm.sh`, which was
+planned but not created. Two scripts exist: `dump_offline_data.sh` (TRT-LLM based, requires
+`--tp`/`--moe-ep`) and `dump_offline_data_hf.sh` (HF `device_map="auto"`, no parallelism args,
+works for any model supported by HF Transformers).
 
----
+**Status:** `dump_offline_data_hf.sh` was created as a fallback and is working for standalone
+task_1 re-runs (Ministral-3-8B, MiniMax-M2.5, Qwen3.5-35B-A3B, Step-3.5-Flash). The
+quick-fail pipeline configs still reference the non-existent `_vllm` script.
 
-### Issue 2: `offline_training.sh` HuggingFace Hub upload bug (Task 2 — universal)
+**Fix:** Update quick-fail configs to use `dump_offline_data_hf.sh` for models not supported
+by TRT-LLM, or rename `_hf` → `_vllm` if it covers the intended use case.
 
-**Symptom:**
-```
-huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or
-'namespace/repo_name': '/scratchspace/eagle3'.
-services/pipeline/eagle3/offline_training.sh: line 18: error_handler: command not found
-```
+---
 
-**Affected:** All 7 models tested.
+### Issue 2: `offline_training.sh` HuggingFace Hub upload bug — FIXED ✅
 
-**Root cause:** `offline_training.sh` calls a HuggingFace CLI upload command with
-`/scratchspace/eagle3` as the repo ID. This path should be a local output directory, not a
-Hub repo. The `error_handler` function is also referenced but not defined.
+**Was:** `HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/scratchspace/eagle3'.`
 
-**Fix:** Remove or gate the Hub upload call in `offline_training.sh`. Save the trained draft
-head to a local path only during CI. Also define `error_handler` or remove the reference.
+**Fix applied:** `offline_training.sh` was rewritten to call `launch_train.sh` followed by
+`export_hf_checkpoint.py` for local export only. No HF Hub upload. The `error_handler` is
+now properly sourced from `service_utils.sh`.
 
 ---
 
-### Issue 3: Time limit exceeded (Task 0 — most models)
+### Issue 3: Task 0 time limit (most models) — PARTIALLY ADDRESSED ⚠
 
 **Symptom:** `STEP CANCELLED AT ... DUE TO TIME LIMIT`
 
-**Affected:** Ministral-3-8B (near complete), Ministral-3-14B, MiniMax-M2.5, Qwen3.5-35B-A3B,
-Step-3.5-Flash.
+**Affected:** Ministral-3-8B (3277/3295 samples — nearly complete), Ministral-3-14B,
+MiniMax-M2.5, Qwen3.5-35B-A3B, Step-3.5-Flash.
 
-**Root cause:** The default Slurm time limit is too short for synthesizing the full dataset
-with a large model. Ministral-3-8B reached 3277/3295 samples before cancellation.
+**Status:** `afterany` Slurm dependencies were added so downstream tasks (task_1, task_2,
+task_3) run even when task_0 times out. The data synthesis timeout itself is not yet resolved.
 
 **Fix options:**
-- (a) Increase Slurm `--time` in the job config.
-- (b) Reduce the dataset size for quick-fail validation (e.g., use a 100-sample subset).
-- (c) Add `--max-samples N` flag to `query.py` / `query.sh`.
+- Increase Slurm `--time` limit for task_0.
+- Add `--max-samples N` to limit dataset size for quick-fail validation.
 
 ---
 
-### Issue 4: GPT-OSS-20B tokenizer cache missing (Task 0 — model-specific)
+### Issue 4: GPT-OSS-20B tokenizer cache missing (Task 0) — OPEN
 
-**Symptom:**
-```
-openai_harmony.HarmonyError: error downloading or loading vocab file:
-failed to download or load vocab file
-```
+**Symptom:** `openai_harmony.HarmonyError: error downloading or loading vocab file`
 
-**Affected:** GPT-OSS-20B only.
+**Affected:** GPT-OSS-20B only. vLLM started (model loaded) but vocab download failed.
 
-**Root cause:** GPT-OSS-20B uses a custom OpenAI tokenizer (`openai_harmony`) that reads from
-a tiktoken cache directory. The env var `TIKTOKEN_RS_CACHE_DIR` must point to a pre-populated
-directory on the cluster.
+**Root cause:** GPT-OSS-20B uses the `openai_harmony` tokenizer backed by tiktoken, which
+requires `TIKTOKEN_RS_CACHE_DIR` to point to a pre-populated local cache. The cluster did
+not have this directory populated.
 
-**Fix:** Ensure `TIKTOKEN_RS_CACHE_DIR` is set to a valid, pre-populated tiktoken cache path
-in the cluster environment before running task_0.
+**Fix:** Ensure `TIKTOKEN_RS_CACHE_DIR` is set to a valid pre-populated tiktoken cache
+path before submitting task_0.
 
 ---
 
-### Issue 5: `trust_remote_code` not passed to benchmark (Task 3 — MiniMax)
+### Issue 5: MiniMax-M2.5 missing `trust_remote_code` at benchmark (Task 3) — OPEN
 
 **Symptom:**
 ```
-ValueError: The repository ... contains custom code which must be executed to correctly
-load the model. Please pass trust_remote_code=True.
+ValueError: The repository ... contains custom code... Please pass trust_remote_code=True
 ```
 
 **Affected:** MiniMax-M2.5 task_3.
 
-**Root cause:** `quick_check.sh` does not forward `--trust-remote-code` to the vLLM benchmark
-process for models that require it.
+**Root cause:** `quick_check.sh` does not forward `--trust-remote-code` to vLLM for models
+that require it.
 
-**Fix:** Add `--trust-remote-code` to the `quick_check.sh` vLLM invocation when
-`trust_remote_code` is set in the pipeline config.
+**Fix:** Pass `--trust-remote-code` in the `quick_check.sh` vLLM invocation when
+`trust_remote_code` is set in the pipeline environment.
 
 ---
 
-### Issue 6: DeepSeek-V3.2 task_1 OOM (Task 1 — model-specific)
+### Issue 6: DeepSeek-V3.2 task_1 OOM (Task 1) — OPEN
 
 **Symptom:** `pyxis: child terminated with signal 15` (SIGTERM, likely OOM-triggered)
 
-**Affected:** DeepSeek-V3.2 only (685B MoE).
-
-**Root cause:** The 2-node job for DeepSeek-V3.2 task_1 may have been OOM-killed. The model
-requires careful TP/EP configuration across nodes.
+**Affected:** DeepSeek-V3.2 only (685B MoE, 2-node job).
 
-**Fix:** Verify TP=8 across 2 nodes is correctly configured; check for `dump_offline_data.sh`
-multi-node support. Also blocked by Issue 1 (missing vllm script).
+**Root cause:** Task_1 was also blocked by Issue 1 (missing vllm script); the SIGTERM may
+indicate OOM during the brief moment before the script-not-found failure propagated. Needs
+further investigation with `dump_offline_data_hf.sh`.
 
 ---
 
@@ -222,9 +208,21 @@ multi-node support. Also blocked by Issue 1 (missing vllm script).
 
 When a new model completes testing:
 
-1. Update the **Model Test Results** table — add a row or change status symbols.
-2. If a new failure mode is found not in the diagram, add a new leaf node to the Mermaid chart
-   under the appropriate step.
-3. If a new issue pattern is discovered, add a new **Issue N** section with symptom, affected
-   models, root cause, and fix.
-4. If an issue is resolved, mark it as ✅ in both the table and the issue section.
+1. **Status table**: Update the row — fill in ✅/❌/⏱/🔍 and brief notes.
+2. **Decision tree**: If a new failure mode appears that has no matching leaf, add a new
+   branch under the appropriate step node.
+3. **Issue catalog**: Add a new numbered section with symptom, affected models, root cause,
+   fix, and status (OPEN / FIXED / PARTIALLY ADDRESSED).
+4. Mark resolved issues as **FIXED ✅** and update the status in the table.
+
+Per-model results template:
+```markdown
+#### Model: <name>
+- **Date tested:** YYYY-MM-DD
+- **task_0:** PASS/FAIL/TIMEOUT — <notes>
+- **task_1:** PASS/FAIL — <notes>
+- **task_2:** PASS/FAIL — <notes>
+- **task_3:** PASS/FAIL — <notes>
+- **AR speedup:** <value> (target ≥ 2.1×)
+- **New failure pattern:** Yes/No — <description if yes>
+```
diff --git a/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml b/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
index f5193e395d8..7868c28f80a 100644
--- a/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
@@ -51,14 +51,14 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — MiniMax-M2.5 requires trust_remote_code and
+  # uses a custom architecture not supported by TRT-LLM hidden state extraction.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
-      - --moe-ep 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
@@ -66,7 +66,7 @@ pipeline:
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:
diff --git a/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml b/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml
index 88236f8d6a5..1328657cc17 100644
--- a/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/Mistral/Ministral-3-14B/hf_offline_eagle3.yaml
@@ -48,13 +48,14 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — mistral3/Mistral3ForConditionalGeneration is not
+  # supported by TRT-LLM hidden state extraction.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
@@ -62,7 +63,7 @@ pipeline:
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:
diff --git a/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml b/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
index d7047abfc6c..a803b57afa1 100644
--- a/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
@@ -47,13 +47,14 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — mistral3/PixtralForConditionalGeneration is not
+  # supported by TRT-LLM hidden state extraction.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
@@ -61,7 +62,7 @@ pipeline:
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:
diff --git a/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml b/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
index e04c205975b..accdd71f76c 100644
--- a/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
@@ -49,21 +49,23 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — GPT-OSS-20B uses a custom OpenAI tokenizer
+  # (openai_harmony) not supported by TRT-LLM hidden state extraction.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - TIKTOKEN_RS_CACHE_DIR: /hf-local/tiktoken_cache
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml
index 7af5efd9acb..eab5005691e 100644
--- a/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3.5-27B/hf_offline_eagle3.yaml
@@ -43,13 +43,14 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — Qwen3.5-27B is a VLM; HF backend handles
+  # the text-only path without requiring TRT-LLM support.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
@@ -57,7 +58,7 @@ pipeline:
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml
index 4c8f908175a..bf2e35632b4 100644
--- a/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/hf_offline_eagle3.yaml
@@ -49,14 +49,13 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — confirmed working in sandbox testing.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
-      - --moe-ep 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
@@ -64,7 +63,7 @@ pipeline:
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:
diff --git a/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml b/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml
index bb90355b1b4..a725b54ba0d 100644
--- a/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3.5-9B/hf_offline_eagle3.yaml
@@ -43,13 +43,14 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — Qwen3.5-9B is a VLM; HF backend handles
+  # the text-only path without requiring TRT-LLM support.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
@@ -57,7 +58,7 @@ pipeline:
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:
diff --git a/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml b/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml
index e749d40cc35..75c639ea7ce 100644
--- a/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/StepFun/Step-3.5-Flash/hf_offline_eagle3.yaml
@@ -54,14 +54,14 @@ pipeline:
       container: vllm/vllm-openai:latest
 
   # Step 2: Dump hidden states from target model
+  # Uses HF backend (device_map=auto) — Step-3.5-Flash uses sliding window attention (SWA)
+  # which is not supported by TRT-LLM hidden state extraction.
   task_1:
-    script: common/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data_hf.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
       - --max-seq-len 8192
-      - --tp 4
-      - --moe-ep 4
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
@@ -69,7 +69,7 @@ pipeline:
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
-      container: vllm/vllm-openai:latest
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
   # Step 3: Train EAGLE3 draft head (offline)
   task_2:

From 642da1fd4f824c4054953e8b2b40dab6c8429371 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Mon, 11 May 2026 10:40:30 -0700
Subject: [PATCH 04/22] feat(eagle3): add vLLM hidden-state dump script and fix
 triage chart

- Add compute_hidden_states_vllm.py: uses speculators.VllmHiddenStatesGenerator
  to extract hidden states via vLLM; output format identical to the HF variant
- Add dump_offline_data_vllm.sh: launcher wrapper for the vLLM backend;
  three backends now available for task_1 (TRT-LLM / HF / vLLM)
- Update EAGLE3_TRIAGE.md: mark Issue 1 as FIXED, update pipeline overview
  to show all three dump backends, update model matrix rows 1-7 to
  NEEDS RERUN (blocked by missing script in round 1, now resolved)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../compute_hidden_states_vllm.py             | 191 ++++++++++++++++++
 .../common/eagle3/dump_offline_data_vllm.sh   |  55 +++++
 tools/launcher/examples/EAGLE3_TRIAGE.md      |  57 +++---
 3 files changed, 278 insertions(+), 25 deletions(-)
 create mode 100644 examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
 create mode 100644 tools/launcher/common/eagle3/dump_offline_data_vllm.sh

diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
new file mode 100644
index 00000000000..76a847aeb59
--- /dev/null
+++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
@@ -0,0 +1,191 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Extract hidden states from an LLM using vLLM + speculators."""
+
+import argparse
+from pathlib import Path
+
+import torch
+from datasets import load_dataset
+from speculators.data_generation import VllmHiddenStatesGenerator
+from tqdm import tqdm
+from transformers import AutoTokenizer
+
+REMOVE_THINK_CHAT_TEMPLATE = (
+    "{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}"
+)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="""Collect hidden states from conversations using vLLM + speculators."""
+    )
+
+    parser.add_argument("--model", type=str, required=True, help="HF model path.")
+    parser.add_argument(
+        "--max-seq-len", type=int, default=3072, help="Max tokens per conversation."
+    )
+    parser.add_argument(
+        "--input-data", type=Path, required=True, help="Path to jsonl file or directory."
+    )
+    parser.add_argument(
+        "--output-dir", type=Path, required=True, help="Directory to save hidden states."
+    )
+    parser.add_argument("--dp-rank", type=int, default=0, help="Data parallel rank.")
+    parser.add_argument("--dp-world-size", type=int, default=1, help="Data parallel world size.")
+    parser.add_argument(
+        "--trust_remote_code", action="store_true", help="Trust remote code for HF models."
+    )
+    parser.add_argument("--tp", type=int, default=None, help="Tensor parallel size.")
+    parser.add_argument(
+        "--debug-max-num-conversations", type=int, default=None, help="Limit conversations."
+    )
+
+    return parser.parse_args()
+
+
+def main(args: argparse.Namespace) -> None:
+    # Load conversations
+    if args.input_data.is_file() and str(args.input_data).endswith(".jsonl"):
+        dataset = load_dataset("json", data_files=str(args.input_data), split="train")
+    elif args.input_data.is_dir():
+        dataset = load_dataset(
+            "json", data_files={"train": f"{args.input_data}/*.jsonl"}, split="train"
+        )
+    else:
+        raise ValueError(
+            f"input_data must be a .jsonl file or directory, got: {args.input_data}"
+        )
+    print(f"Loaded {len(dataset)} conversations from {args.input_data}")
+
+    # Shard data
+    if args.dp_world_size > 1:
+        dataset = dataset.shard(num_shards=args.dp_world_size, index=args.dp_rank)
+    print(f"Sharded to {len(dataset)} conversations for DP#{args.dp_rank}/{args.dp_world_size}")
+
+    # Remove already dumped conversations
+    output_dir = args.output_dir
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    def keep_conversation(entry):
+        conversation_id = entry.get("conversation_id", entry.get("uuid", None))
+        assert conversation_id is not None, "conversation_id is required"
+        return not (output_dir / f"{conversation_id}.pt").exists()
+
+    original_num = len(dataset)
+    dataset = dataset.filter(keep_conversation)
+    print(f"Removed {original_num - len(dataset)} conversations due to existing output files")
+
+    if args.debug_max_num_conversations is not None:
+        dataset = dataset.select(range(args.debug_max_num_conversations))
+
+    # Tokenize conversations
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model, trust_remote_code=args.trust_remote_code
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "")
+
+    # Prepare prompts for vLLM
+    prompts = []
+    conversation_ids = []
+    num_skipped_too_long = 0
+    num_invalid = 0
+
+    for entry in dataset:
+        conversation_id = entry.get("conversation_id", entry.get("uuid"))
+        conversations = entry["conversations"]
+        if not conversations or not isinstance(conversations, list):
+            num_invalid += 1
+            continue
+
+        tokenized = tokenizer.apply_chat_template(
+            conversations, return_tensors="pt", add_generation_prompt=False
+        )
+        input_ids = tokenized["input_ids"] if isinstance(tokenized, dict) else tokenized
+        num_tokens = input_ids.shape[1]
+        if num_tokens <= 10 or num_tokens > args.max_seq_len:
+            num_skipped_too_long += 1  # NOTE: also counts prompts of <= 10 tokens
+            continue
+
+        prompts.append(input_ids.squeeze(0))
+        conversation_ids.append(conversation_id)
+
+    print(f"Prepared {len(prompts)} prompts ({num_skipped_too_long} skipped too long, {num_invalid} invalid)")
+
+    if len(prompts) == 0:
+        print("No prompts to process.")
+        return
+
+    # Initialize vLLM hidden states generator
+    tp = args.tp
+    if tp is None:
+        import torch as _torch
+        tp = _torch.cuda.device_count()
+
+    generator = VllmHiddenStatesGenerator(
+        model=args.model,
+        tensor_parallel_size=tp,
+        trust_remote_code=args.trust_remote_code,
+        max_model_len=args.max_seq_len,
+    )
+
+    # Generate hidden states
+    results = generator.generate(prompts)
+
+    # Save in the same format as compute_hidden_states_hf.py
+    num_success = 0
+    for conv_id, result in tqdm(
+        zip(conversation_ids, results), total=len(results), desc="Saving"
+    ):
+        input_ids = result["input_ids"]
+        hidden_states_dict = result["hidden_states"]
+
+        # Sort layer indices
+        layer_indices = sorted(hidden_states_dict.keys())
+
+        # Last layer = output hidden states
+        output_hidden_states = hidden_states_dict[layer_indices[-1]].cpu()
+
+        # Aux layers = all except the last
+        aux_layers = layer_indices[:-1]
+        if aux_layers:
+            aux_hidden_states = torch.cat(
+                [hidden_states_dict[i].cpu() for i in aux_layers], dim=-1
+            )
+        else:
+            aux_hidden_states = torch.empty(0)
+
+        output_file = output_dir / f"{conv_id}.pt"
+        with open(output_file, "wb") as f:
+            torch.save(
+                {
+                    "input_ids": input_ids.cpu() if hasattr(input_ids, "cpu") else input_ids,
+                    "hidden_states": output_hidden_states,
+                    "aux_hidden_states": aux_hidden_states,
+                    "conversation_id": conv_id,
+                },
+                f,
+            )
+        num_success += 1
+
+    print(f"Successfully processed {num_success} out of {len(prompts)} conversations.")
+
+
+if __name__ == "__main__":
+    cli_args = parse_args()
+    main(cli_args)
diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
new file mode 100644
index 00000000000..789442d2071
--- /dev/null
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+
+source ${SCRIPT_DIR}/../service_utils.sh
+
+###################################################################################################
+# vLLM-based hidden state dumping using the speculators library.
+# Uses compute_hidden_states_vllm.py with VllmHiddenStatesGenerator.
+# Suitable for: any model supported by vLLM (broader coverage than TRT-LLM or HF device_map).
+#
+# Required environment:
+#   HF_MODEL_CKPT   Path to the HF model checkpoint
+#
+# Args passed through to compute_hidden_states_vllm.py:
+#   --input-data, --output-dir, --max-seq-len, etc.
+###################################################################################################
+
+pip install speculators datasets 2>/dev/null || true
+
+if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
+    TASK_ID=0
+else
+    echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}"
+    TASK_ID=${SLURM_ARRAY_TASK_ID}
+fi
+
+if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then
+    TASK_COUNT=1
+else
+    echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}"
+    TASK_COUNT=${SLURM_ARRAY_TASK_COUNT}
+fi
+
+python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py \
+    --model ${HF_MODEL_CKPT} \
+    --dp-rank ${TASK_ID} \
+    --dp-world-size ${TASK_COUNT} \
+    --trust_remote_code \
+    ${@}
diff --git a/tools/launcher/examples/EAGLE3_TRIAGE.md b/tools/launcher/examples/EAGLE3_TRIAGE.md
index 7ff08ebbee3..e16cf8efd2b 100644
--- a/tools/launcher/examples/EAGLE3_TRIAGE.md
+++ b/tools/launcher/examples/EAGLE3_TRIAGE.md
@@ -20,8 +20,9 @@ Model checkpoint (HuggingFace)
          ▼
 ┌──────────────────┐
 │  Task 1: Dump    │  Target model runs forward pass, saves hidden states
-│  (hidden states) │  Script: common/eagle3/dump_offline_data.sh  (TRT-LLM)
-└────────┬─────────┘        or  dump_offline_data_hf.sh  (HF/vLLM fallback)
+│  (hidden states) │  Script: common/eagle3/dump_offline_data.sh       (TRT-LLM)
+└────────┬─────────┘        or  dump_offline_data_hf.sh   (HF device_map=auto)
+                           or  dump_offline_data_vllm.sh  (vLLM + speculators)
          │
          ▼
 ┌──────────────────┐
@@ -60,7 +61,7 @@ flowchart TD
     T0_CHECK -->|Cancelled - time limit| T0_TIMEOUT[⚠ TIMEOUT\nJob wall-clock limit too short.\nNote: afterany deps ensure\ntask_1 still runs.\nFix: increase time limit\nor reduce dataset size.]
 
     %% ── task_1 ──────────────────────────────────────────────────
-    T1_CHECK -->|No - script not found| T1_SCRIPT[⚠ MISSING_SCRIPT\ndump_offline_data_vllm.sh does not exist.\nUse dump_offline_data_hf.sh\n(HF device_map=auto, no TP/EP flags)\nor dump_offline_data.sh\n(TRT-LLM, needs --tp / --moe-ep).]
+    T1_CHECK -->|No - script not found| T1_SCRIPT[⚠ MISSING_SCRIPT\nVerify script path. Three backends:\n• dump_offline_data_vllm.sh (vLLM + speculators)\n• dump_offline_data_hf.sh (HF device_map=auto)\n• dump_offline_data.sh (TRT-LLM, --tp/--moe-ep)]
     T1_CHECK -->|Yes| T1_RUN{Runs OK?}
     T1_RUN -->|No - OOM| T1_OOM[⚠ OOM\nIncrease TP, add EP,\nor switch to _hf script.]
     T1_RUN -->|No - NCCL error| T1_NCCL[⚠ NCCL\nNetwork/multi-node issue.\nRetry or reduce EP.]
@@ -97,40 +98,46 @@ Tests run on OCI-HSG cluster (GB200 nodes, 4 × 192 GB HBM3e per node).
 
 | # | Model | Type | Size | task_0 | task_1 | task_2 | task_3 | Notes |
 |---|-------|------|------|--------|--------|--------|--------|-------|
-| 1 | Ministral-3-8B | Dense | 8B | ⏱ TIMEOUT (3277/3295) | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | Tokenizer regex warning (non-fatal) |
-| 2 | Ministral-3-14B | Dense | 14B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | 🔍 (no log) | — |
-| 3 | GPT-OSS-20B | Dense | 20B | ❌ TOKENIZER | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | TIKTOKEN_RS_CACHE_DIR not populated |
-| 4 | MiniMax-M2.5 | MoE | 230B/10B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ TRUST_REMOTE_CODE | trust_remote_code needed at bench |
-| 5 | Qwen3.5-35B-A3B | MoE | 35B/3B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | — |
-| 6 | Step-3.5-Flash | MoE/SWA | 197B/11B | ⏱ TIMEOUT | ❌ MISSING_SCRIPT | ❌ (no data from t1) | ❌ CASCADE | SWA attention — untested past t1 |
-| 7 | DeepSeek-V3.2 | MoE/MLA | 685B/37B | 🔍 (tarball only) | ❌ MISSING_SCRIPT + OOM | ❌ (no data from t1) | ❌ CASCADE | 2-node, t1 OOM-killed (SIGTERM) |
+| 1 | Ministral-3-8B | Dense | 8B | ⏱ TIMEOUT (3277/3295) | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | task_0 nearly complete (99%); t1 re-run needed |
+| 2 | Ministral-3-14B | Dense | 14B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | — |
+| 3 | GPT-OSS-20B | Dense | 20B | ❌ TOKENIZER | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | Fix: populate TIKTOKEN_RS_CACHE_DIR first |
+| 4 | MiniMax-M2.5 | MoE | 230B/10B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | ❌ TRUST_REMOTE_CODE | trust_remote_code needed at bench |
+| 5 | Qwen3.5-35B-A3B | MoE | 35B/3B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | — |
+| 6 | Step-3.5-Flash | MoE/SWA | 197B/11B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | SWA: use _vllm or _hf script |
+| 7 | DeepSeek-V3.2 | MoE/MLA | 685B/37B | 🔍 (tarball only) | 🔁 NEEDS RERUN (_vllm, 2-node) | 🔲 | 🔲 | 2-node; previous t1 OOM-killed |
 | 8 | Kimi-K2.5 | MoE/MLA | 1T/32B | 🔲 | 🔲 | 🔲 | 🔲 | MLA attention: verify eagle_decoder_type |
 | 9 | GLM-5 | MoE/DSA | 744B/40B | 🔲 | 🔲 | 🔲 | 🔲 | Gated, 2-node |
 | 10 | Kimi-K2.5-NVFP4 | NVFP4 | ~591GB | 🔲 | 🔲 | 🔲 | 🔲 | Blackwell required; t1/t2 use BF16 base |
 
-**Legend:** ✅ Pass · ❌ Fail · ⏱ Timeout · 🔍 Inconclusive · 🔲 Not yet tested
+**Legend:** ✅ Pass · ❌ Fail · ⏱ Timeout · 🔍 Inconclusive · 🔲 Not yet tested · 🔁 Rerun needed
 
 ---
 
 ## Known Issues
 
-### Issue 1: Missing `dump_offline_data_vllm.sh` (Task 1 — universal) — OPEN
+### Issue 1: Missing `dump_offline_data_vllm.sh` (Task 1 — universal) — FIXED ✅
 
 **Symptom:** `/usr/bin/bash: .../dump_offline_data_vllm.sh: No such file or directory`
 
-**Affected:** All 7 models tested (root cause of universal task_1 failure).
-
-**Root cause:** Quick-fail pipeline configs reference `dump_offline_data_vllm.sh`, which was
-planned but not created. Two scripts exist: `dump_offline_data.sh` (TRT-LLM based, requires
-`--tp`/`--moe-ep`) and `dump_offline_data_hf.sh` (HF `device_map="auto"`, no parallelism args,
-works for any model supported by HF Transformers).
-
-**Status:** `dump_offline_data_hf.sh` was created as a fallback and is working for standalone
-task_1 re-runs (Ministral-3-8B, MiniMax-M2.5, Qwen3.5-35B-A3B, Step-3.5-Flash). The
-quick-fail pipeline configs still reference the non-existent `_vllm` script.
-
-**Fix:** Update quick-fail configs to use `dump_offline_data_hf.sh` for models not supported
-by TRT-LLM, or rename `_hf` → `_vllm` if it covers the intended use case.
+**Affected:** All 7 models tested (root cause of universal task_1 failure in first round).
+
+**Root cause:** Quick-fail pipeline configs referenced `dump_offline_data_vllm.sh`, which had
+not yet been created. Only two scripts existed: `dump_offline_data.sh` (TRT-LLM) and
+`dump_offline_data_hf.sh` (HF `device_map="auto"`).
+
+**Fix applied:** `dump_offline_data_vllm.sh` and its backing script
+`compute_hidden_states_vllm.py` were ported from a parallel sandbox branch. The vLLM script
+uses `VllmHiddenStatesGenerator` from the `speculators` library and saves output in the same
+`.pt` format as the HF variant. Both files are now in:
+- `tools/launcher/common/eagle3/dump_offline_data_vllm.sh`
+- `examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py`
+
+Three backends now available for task_1:
+| Backend | Script | When to use |
+|---------|--------|-------------|
+| TRT-LLM | `dump_offline_data.sh` | Pure-text models with TRT-LLM support; needs `--tp`/`--moe-ep` |
+| HF | `dump_offline_data_hf.sh` | VLMs, custom-code models, SWA; `device_map="auto"` |
+| vLLM | `dump_offline_data_vllm.sh` | Broad coverage via vLLM model implementations; requires `speculators` |
 
 ---
 

From 4abca8bea2d0028fb2f11c8102456ce603269467 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Mon, 13 Apr 2026 10:01:57 -0700
Subject: [PATCH 05/22] fix(launcher): use afterany dependency for
 allow_to_fail pipelines

nemo-run's SlurmExecutor defaults to dependency_type="afterok", which
cancels all downstream tasks when a predecessor times out or fails.
For pipelines with allow_to_fail=True, use "afterany" so subsequent
tasks run regardless of predecessor exit status.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 tools/launcher/core.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tools/launcher/core.py b/tools/launcher/core.py
index 8fd4e25ee79..498fb1f36f4 100644
--- a/tools/launcher/core.py
+++ b/tools/launcher/core.py
@@ -471,6 +471,11 @@ def run_jobs(
                     )
                     task_env.update(default_slurm_env)
 
+                # When allow_to_fail is set, use "afterany" so downstream tasks
+                # run even if a predecessor times out or fails.
+                if job.allow_to_fail and hasattr(executor, "dependency_type"):
+                    executor.dependency_type = "afterany"
+
                 task_instance = run.Script(task.script, args=task_args, env=task_env)
                 print(f"job {job_name} task {task_id} slurm_config: {task.slurm_config}")
 

From d0ad01b6bc02ce1d7ec36d94d57e8db76d5f6759 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Mon, 11 May 2026 11:43:16 -0700
Subject: [PATCH 06/22] fix(eagle3): fix code-quality CI failures in triage
 chart and vllm script

- Add language specifiers to fenced code blocks in EAGLE3_TRIAGE.md
  (MD040: two ``` blocks needed `text` language label)
- Apply ruff-format reformatting to compute_hidden_states_vllm.py
  (line-length adjustments to match project style)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../compute_hidden_states_vllm.py             | 21 +++++++------------
 tools/launcher/examples/EAGLE3_TRIAGE.md      |  9 +++++---
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
index 76a847aeb59..ae27f2adebb 100644
--- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
+++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
@@ -66,9 +66,7 @@ def main(args: argparse.Namespace) -> None:
             "json", data_files={"train": f"{args.input_data}/*.jsonl"}, split="train"
         )
     else:
-        raise ValueError(
-            f"input_data must be a .jsonl file or directory, got: {args.input_data}"
-        )
+        raise ValueError(f"input_data must be a .jsonl file or directory, got: {args.input_data}")
     print(f"Loaded {len(dataset)} conversations from {args.input_data}")
 
     # Shard data
@@ -93,9 +91,7 @@ def keep_conversation(entry):
         dataset = dataset.select(range(args.debug_max_num_conversations))
 
     # Tokenize conversations
-    tokenizer = AutoTokenizer.from_pretrained(
-        args.model, trust_remote_code=args.trust_remote_code
-    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
     tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "")
@@ -125,7 +121,9 @@ def keep_conversation(entry):
         prompts.append(input_ids.squeeze(0))
         conversation_ids.append(conversation_id)
 
-    print(f"Prepared {len(prompts)} prompts ({num_skipped_too_long} skipped too long, {num_invalid} invalid)")
+    print(
+        f"Prepared {len(prompts)} prompts ({num_skipped_too_long} skipped too long, {num_invalid} invalid)"
+    )
 
     if len(prompts) == 0:
         print("No prompts to process.")
@@ -135,6 +133,7 @@ def keep_conversation(entry):
     tp = args.tp
     if tp is None:
         import torch as _torch
+
         tp = _torch.cuda.device_count()
 
     generator = VllmHiddenStatesGenerator(
@@ -149,9 +148,7 @@ def keep_conversation(entry):
 
     # Save in the same format as compute_hidden_states_hf.py
     num_success = 0
-    for conv_id, result in tqdm(
-        zip(conversation_ids, results), total=len(results), desc="Saving"
-    ):
+    for conv_id, result in tqdm(zip(conversation_ids, results), total=len(results), desc="Saving"):
         input_ids = result["input_ids"]
         hidden_states_dict = result["hidden_states"]
 
@@ -164,9 +161,7 @@ def keep_conversation(entry):
         # Aux layers = all except the last
         aux_layers = layer_indices[:-1]
         if aux_layers:
-            aux_hidden_states = torch.cat(
-                [hidden_states_dict[i].cpu() for i in aux_layers], dim=-1
-            )
+            aux_hidden_states = torch.cat([hidden_states_dict[i].cpu() for i in aux_layers], dim=-1)
         else:
             aux_hidden_states = torch.empty(0)
 
diff --git a/tools/launcher/examples/EAGLE3_TRIAGE.md b/tools/launcher/examples/EAGLE3_TRIAGE.md
index e16cf8efd2b..a55b665527d 100644
--- a/tools/launcher/examples/EAGLE3_TRIAGE.md
+++ b/tools/launcher/examples/EAGLE3_TRIAGE.md
@@ -8,7 +8,7 @@ Claude can update the status table, diagram, and issue catalog when new results
 
 ## Pipeline Overview
 
-```
+```text
 Model checkpoint (HuggingFace)
         │
         ▼
@@ -103,7 +103,7 @@ Tests run on OCI-HSG cluster (GB200 nodes, 4 × 192 GB HBM3e per node).
 | 3 | GPT-OSS-20B | Dense | 20B | ❌ TOKENIZER | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | Fix: populate TIKTOKEN_RS_CACHE_DIR first |
 | 4 | MiniMax-M2.5 | MoE | 230B/10B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | ❌ TRUST_REMOTE_CODE | trust_remote_code needed at bench |
 | 5 | Qwen3.5-35B-A3B | MoE | 35B/3B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | — |
-| 6 | Step-3.5-Flash | MoE/SWA | 197B/11B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | SWA: use _vllm or _hf script |
+| 6 | Step-3.5-Flash | MoE/SWA | 197B/11B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | SWA: use `_vllm` or `_hf` script |
 | 7 | DeepSeek-V3.2 | MoE/MLA | 685B/37B | 🔍 (tarball only) | 🔁 NEEDS RERUN (_vllm, 2-node) | 🔲 | 🔲 | 2-node; previous t1 OOM-killed |
 | 8 | Kimi-K2.5 | MoE/MLA | 1T/32B | 🔲 | 🔲 | 🔲 | 🔲 | MLA attention: verify eagle_decoder_type |
 | 9 | GLM-5 | MoE/DSA | 744B/40B | 🔲 | 🔲 | 🔲 | 🔲 | Gated, 2-node |
@@ -133,6 +133,7 @@ uses `VllmHiddenStatesGenerator` from the `speculators` library and saves output
 - `examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py`
 
 Three backends now available for task_1:
+
 | Backend | Script | When to use |
 |---------|--------|-------------|
 | TRT-LLM | `dump_offline_data.sh` | Pure-text models with TRT-LLM support; needs `--tp`/`--moe-ep` |
@@ -185,7 +186,8 @@ path before submitting task_0.
 ### Issue 5: MiniMax-M2.5 missing `trust_remote_code` at benchmark (Task 3) — OPEN
 
 **Symptom:**
-```
+
+```text
 ValueError: The repository ... contains custom code... Please pass trust_remote_code=True
 ```
 
@@ -223,6 +225,7 @@ When a new model completes testing:
 4. Mark resolved issues as **FIXED ✅** and update the status in the table.
 
 Per-model results template:
+
 ```markdown
 #### Model: <name>
 - **Date tested:** YYYY-MM-DD

From eb830bdfc239f43f34a96e85e85c485936805b44 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 12 May 2026 09:49:38 -0700
Subject: [PATCH 07/22] fix(eagle3): pin speculators<0.5.0; document issues 6+7
 in triage chart

- dump_offline_data_vllm.sh: pin speculators<0.5.0 to fix ImportError
  (VllmHiddenStatesGenerator removed in speculators 0.5.0)
- EAGLE3_TRIAGE.md: add Issues 6 (speculators API break) and 7
  (query.py shard auto-downgrade causing empty data on timeout);
  mark both FIXED; update Ministral-3-8B row status

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../common/eagle3/dump_offline_data_vllm.sh   |  2 +-
 tools/launcher/examples/EAGLE3_TRIAGE.md      | 39 ++++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index 789442d2071..4123ef4978e 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -31,7 +31,7 @@ source ${SCRIPT_DIR}/../service_utils.sh
 #   --input-data, --output-dir, --max-seq-len, etc.
 ###################################################################################################
 
-pip install speculators datasets 2>/dev/null || true
+pip install "speculators<0.5.0" datasets 2>/dev/null || true
 
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
     TASK_ID=0
diff --git a/tools/launcher/examples/EAGLE3_TRIAGE.md b/tools/launcher/examples/EAGLE3_TRIAGE.md
index a55b665527d..8815102f15b 100644
--- a/tools/launcher/examples/EAGLE3_TRIAGE.md
+++ b/tools/launcher/examples/EAGLE3_TRIAGE.md
@@ -98,7 +98,7 @@ Tests run on OCI-HSG cluster (GB200 nodes, 4 × 192 GB HBM3e per node).
 
 | # | Model | Type | Size | task_0 | task_1 | task_2 | task_3 | Notes |
 |---|-------|------|------|--------|--------|--------|--------|-------|
-| 1 | Ministral-3-8B | Dense | 8B | ⏱ TIMEOUT (3277/3295) | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | task_0 nearly complete (99%); t1 re-run needed |
+| 1 | Ministral-3-8B | Dense | 8B | 🔁 RERUNNING (--num-shards 3) | 🔁 RERUNNING (speculators pin) | 🔲 | 🔲 | Issues 6+7 fixed; re-run in progress |
 | 2 | Ministral-3-14B | Dense | 14B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | — |
 | 3 | GPT-OSS-20B | Dense | 20B | ❌ TOKENIZER | 🔁 NEEDS RERUN (_vllm) | 🔲 | 🔲 | Fix: populate TIKTOKEN_RS_CACHE_DIR first |
 | 4 | MiniMax-M2.5 | MoE | 230B/10B | ⏱ TIMEOUT | 🔁 NEEDS RERUN (_vllm) | 🔲 | ❌ TRUST_REMOTE_CODE | trust_remote_code needed at bench |
@@ -201,7 +201,42 @@ that require it.
 
 ---
 
-### Issue 6: DeepSeek-V3.2 task_1 OOM (Task 1) — OPEN
+### Issue 6: `speculators>=0.5.0` breaks `VllmHiddenStatesGenerator` (Task 1) — FIXED ✅
+
+**Symptom:**
+
+```text
+ImportError: cannot import name 'VllmHiddenStatesGenerator' from 'speculators.data_generation'
+```
+
+**Affected:** All models using `dump_offline_data_vllm.sh` with `vllm/vllm-openai:latest` container.
+
+**Root cause:** `speculators==0.5.0` (released after the script was written) removed
+`VllmHiddenStatesGenerator` from `speculators.data_generation`. The install line
+`pip install speculators` in the script picks up the latest version.
+
+**Fix applied:** Pin to `pip install "speculators<0.5.0"` in `dump_offline_data_vllm.sh`.
+
+---
+
+### Issue 7: `query.py` auto-downgrades shards → empty data on timeout (Task 0) — FIXED ✅
+
+**Symptom:** task_0 times out; `/scratchspace/data/` is empty despite partial generation.
+
+**Affected:** Models with datasets ≤ 33,000 samples where `num_shards * 100 > dataset_size`.
+
+**Root cause:** `query.py` auto-downgrades `--num-shards` to `min(16, dataset_size//100)`
+when the default of 1000 is too large relative to dataset size. For 3295 samples this
+becomes 1 shard, meaning all data is processed in one batch and nothing is saved until
+the entire map completes. A timeout yields zero data.
+
+**Fix applied:** Pass `--num-shards 3` explicitly in task_0 args. Since `3*100=300 < 3295`,
+the auto-downgrade is bypassed. Data is saved incrementally across 3 shard files (~1100
+samples each). Partial data survives a timeout.
+
+---
+
+### Issue 8: DeepSeek-V3.2 task_1 OOM (Task 1) — OPEN
 
 **Symptom:** `pyxis: child terminated with signal 15` (SIGTERM, likely OOM-triggered)
 

From e1dd712a5cf0066f29402c9dacc2a3171f0146b0 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Wed, 13 May 2026 09:22:37 -0700
Subject: [PATCH 08/22] Fix torchvision import crash in vLLM container for
 dump_offline_data_vllm.sh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In vllm/vllm-openai:latest, torchvision is installed but incompatible with
the container's torch version: importing torchvision raises
  RuntimeError: operator torchvision::nms does not exist
This propagates through transformers → speculators, preventing
VllmHiddenStatesGenerator from being imported.

Fix: uninstall torchvision before installing speculators<0.5.0. Without
torchvision present, transformers' lazy import machinery catches the
ImportError gracefully (is_torchvision_available() = False) and speculators
imports cleanly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 tools/launcher/common/eagle3/dump_offline_data_vllm.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index 4123ef4978e..08a062d3535 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -31,6 +31,7 @@ source ${SCRIPT_DIR}/../service_utils.sh
 #   --input-data, --output-dir, --max-seq-len, etc.
 ###################################################################################################
 
+pip uninstall -y torchvision 2>/dev/null || true
 pip install "speculators<0.5.0" datasets 2>/dev/null || true
 
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then

From 0b20534a9f1426c7247aa096d9edbd289f6ddd9a Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Thu, 14 May 2026 09:22:56 -0700
Subject: [PATCH 09/22] Fix torch downgrade in dump_offline_data_vllm.sh
 breaking vllm._C

speculators<0.5.0 pins torch==2.10.0, replacing the container's
torch 2.11.0+cu130 and causing `ImportError: libtorch_cuda.so: cannot
open shared object file` when vllm._C is loaded.

Fix: install speculators with --no-deps to avoid the torch downgrade,
then install transformers>=4.40,<5.0 separately (speculators is
incompatible with transformers 5.x API).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 tools/launcher/common/eagle3/dump_offline_data_vllm.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index 08a062d3535..c3b5f1dfcc5 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -32,7 +32,8 @@ source ${SCRIPT_DIR}/../service_utils.sh
 ###################################################################################################
 
 pip uninstall -y torchvision 2>/dev/null || true
-pip install "speculators<0.5.0" datasets 2>/dev/null || true
+pip install "speculators<0.5.0" --no-deps 2>/dev/null || true
+pip install "transformers>=4.40,<5.0" datasets 2>/dev/null || true
 
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
     TASK_ID=0

From 2bedfa10962e0e9f39136b64b4b9b5df0ac46c1b Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Fri, 15 May 2026 09:24:42 -0700
Subject: [PATCH 10/22] Fix compute_hidden_states_vllm.py for speculators 0.4.x
 API

The script was written against an unreleased speculators API. Fix to work
with speculators 0.4.0.1 (latest stable with VllmHiddenStatesGenerator):

- VllmHiddenStatesGenerator constructor: rename `model` kwarg to `model_path`
  and remove `trust_remote_code` (hardcoded True in 0.4.x)
- Result parsing: `hidden_states` is a list of tensors ordered by layer_ids,
  not a dict keyed by layer index

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../compute_hidden_states_vllm.py             | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
index ae27f2adebb..cb76d07f190 100644
--- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
+++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
@@ -137,9 +137,8 @@ def keep_conversation(entry):
         tp = _torch.cuda.device_count()
 
     generator = VllmHiddenStatesGenerator(
-        model=args.model,
+        model_path=args.model,
         tensor_parallel_size=tp,
-        trust_remote_code=args.trust_remote_code,
         max_model_len=args.max_seq_len,
     )
 
@@ -150,18 +149,15 @@ def keep_conversation(entry):
     num_success = 0
     for conv_id, result in tqdm(zip(conversation_ids, results), total=len(results), desc="Saving"):
         input_ids = result["input_ids"]
-        hidden_states_dict = result["hidden_states"]
+        # speculators returns hidden_states as a list of tensors ordered by layer_ids
+        hidden_states_list = result["hidden_states"]
 
-        # Sort layer indices
-        layer_indices = sorted(hidden_states_dict.keys())
+        # Last element = output hidden states (last captured layer)
+        output_hidden_states = hidden_states_list[-1].cpu()
 
-        # Last layer = output hidden states
-        output_hidden_states = hidden_states_dict[layer_indices[-1]].cpu()
-
-        # Aux layers = all except the last
-        aux_layers = layer_indices[:-1]
-        if aux_layers:
-            aux_hidden_states = torch.cat([hidden_states_dict[i].cpu() for i in aux_layers], dim=-1)
+        # All but the last = aux layers, concatenated along the hidden dim
+        if len(hidden_states_list) > 1:
+            aux_hidden_states = torch.cat([h.cpu() for h in hidden_states_list[:-1]], dim=-1)
         else:
             aux_hidden_states = torch.empty(0)
 

From 6ea8086ed89046d4999aa7ec4582b98f8934aa1e Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 19 May 2026 09:10:45 -0700
Subject: [PATCH 11/22] Remove transformers downgrade from
 dump_offline_data_vllm.sh

Installing transformers>=4.40,<5.0 downgrades from the container's
transformers 5.8.0, which breaks models like Ministral-3-8B whose
model_type 'ministral3' is not registered in older transformers versions.

The transformers 5.0 constraint was added to work around a speculators
import issue, but that issue was caused by broken torchvision (already
fixed). With torchvision uninstalled and speculators installed --no-deps,
speculators 0.4.x works with the container's transformers 5.8.0.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 tools/launcher/common/eagle3/dump_offline_data_vllm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index c3b5f1dfcc5..9fc535990b9 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -33,7 +33,7 @@ source ${SCRIPT_DIR}/../service_utils.sh
 
 pip uninstall -y torchvision 2>/dev/null || true
 pip install "speculators<0.5.0" --no-deps 2>/dev/null || true
-pip install "transformers>=4.40,<5.0" datasets 2>/dev/null || true
+pip install datasets 2>/dev/null || true
 
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
     TASK_ID=0

From 86cc1c1cd3cc6834b61b6a2e7456a0256e0bcef6 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 19 May 2026 11:35:36 -0700
Subject: [PATCH 12/22] fix(eagle3): patch speculators pydantic_utils.py for
 pydantic 2.13 compatibility

speculators/config.py defines SpeculatorModelConfig which inherits from
transformers' PretrainedConfig. In transformers 5.x, PretrainedConfig
carries torch.dtype annotations. Pydantic 2.13 (in vllm-openai:latest)
tries to resolve these annotations in the calling module's namespace
during model_rebuild(), but speculators/config.py doesn't import torch,
causing PydanticUndefinedAnnotation at import time.

Fix by patching speculators/utils/pydantic_utils.py in-place after
installation so that model_rebuild(force=True) is also passed
_types_namespace={"torch": torch}, letting pydantic resolve torch.dtype.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../common/eagle3/dump_offline_data_vllm.sh   | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index 9fc535990b9..3c7c262af04 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -35,6 +35,29 @@ pip uninstall -y torchvision 2>/dev/null || true
 pip install "speculators<0.5.0" --no-deps 2>/dev/null || true
 pip install datasets 2>/dev/null || true
 
+# Pydantic 2.13 compatibility: speculators.ReloadableBaseModel.reload_schema() calls
+# model_rebuild(force=True) without a types_namespace. In pydantic 2.13+, inherited
+# torch.dtype annotations from transformers.PretrainedConfig cannot be resolved in
+# subclass modules that don't import torch. Fix by injecting torch into the namespace.
+python3 -c "
+import site, os
+for d in site.getsitepackages():
+    path = os.path.join(d, 'speculators', 'utils', 'pydantic_utils.py')
+    if not os.path.exists(path):
+        continue
+    with open(path) as f:
+        c = f.read()
+    old = 'cls.model_rebuild(force=True)'
+    new = 'import torch as _torch; cls.model_rebuild(force=True, _types_namespace={\"torch\": _torch})'
+    if old in c and new not in c:
+        with open(path, 'w') as f:
+            f.write(c.replace(old, new))
+        print('Patched pydantic_utils.py: model_rebuild now passes torch namespace')
+    else:
+        print('pydantic_utils.py already patched or pattern not found')
+    break
+" 2>/dev/null || true
+
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
     TASK_ID=0
 else

From ccfb6ef8a2ec7e09eecba309d430c58e881506e3 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 19 May 2026 12:01:48 -0700
Subject: [PATCH 13/22] fix(eagle3): fix tokenizer compatibility with
 transformers 5.x

transformers 5.x: apply_chat_template returns a BatchEncoding that may
not inherit from dict; access .input_ids attribute instead of dict key.
Also fix: add_generation_template -> add_generation_prompt (correct arg).
Handle both 1D (single sequence) and 2D (batched) input_ids shapes.

Also drop the `pip uninstall -y torchvision` step from
dump_offline_data_vllm.sh.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../compute_hidden_states_vllm.py                  | 14 +++++++++++---
 .../common/eagle3/dump_offline_data_vllm.sh        |  1 -
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
index cb76d07f190..2e2660ae829 100644
--- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
+++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
@@ -110,10 +110,18 @@ def keep_conversation(entry):
             continue
 
         tokenized = tokenizer.apply_chat_template(
-            conversations, return_tensors="pt", add_generation_template=False
+            conversations, return_tensors="pt", add_generation_prompt=False
         )
-        input_ids = tokenized["input_ids"] if isinstance(tokenized, dict) else tokenized
-        num_tokens = input_ids.shape[1]
+        # transformers 5.x: BatchEncoding may not inherit from dict; use .input_ids
+        if hasattr(tokenized, "input_ids"):
+            input_ids = tokenized.input_ids
+        elif hasattr(tokenized, "__getitem__") and "input_ids" in tokenized:
+            input_ids = tokenized["input_ids"]
+        else:
+            input_ids = tokenized
+        if not hasattr(input_ids, "shape"):
+            input_ids = torch.tensor(input_ids)
+        num_tokens = input_ids.shape[0] if input_ids.dim() == 1 else input_ids.shape[1]
         if num_tokens <= 10 or num_tokens > args.max_seq_len:
             num_skipped_too_long += 1
             continue
diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index 3c7c262af04..d6f38654ad1 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -31,7 +31,6 @@ source ${SCRIPT_DIR}/../service_utils.sh
 #   --input-data, --output-dir, --max-seq-len, etc.
 ###################################################################################################
 
-pip uninstall -y torchvision 2>/dev/null || true
 pip install "speculators<0.5.0" --no-deps 2>/dev/null || true
 pip install datasets 2>/dev/null || true
 

From 8ccb100acf665d4d7de9875cbfa5c019c1086b02 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 19 May 2026 12:44:45 -0700
Subject: [PATCH 14/22] fix(eagle3): patch speculators for vLLM API compat
 (pydantic 2.13, Request)

Two runtime patches are now applied to speculators 0.4.0.1 after install
(patch 1 is already in the script; this commit adds patch 2):

1. pydantic_utils.py: pass _types_namespace={'torch': torch} to
   model_rebuild() to fix PydanticUndefinedAnnotation with
   transformers 5.x (PretrainedConfig has torch.dtype annotations
   that pydantic 2.13 can't resolve without explicit namespace).

2. vllm_hidden_states_generator.py: remove eos_token_id kwarg from
   Request() constructor which was dropped in newer vLLM.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../common/eagle3/dump_offline_data_vllm.sh   | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index d6f38654ad1..da08bed0960 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -34,6 +34,26 @@ source ${SCRIPT_DIR}/../service_utils.sh
 pip install "speculators<0.5.0" --no-deps 2>/dev/null || true
 pip install datasets 2>/dev/null || true
 
+# vLLM API compatibility: speculators 0.4.0.1 uses Request(eos_token_id=...) which
+# was removed in newer vLLM. Patch to remove the unsupported kwarg.
+python3 -c "
+import site, os
+for d in site.getsitepackages():
+    path = os.path.join(d, 'speculators', 'data_generation', 'vllm_hidden_states_generator.py')
+    if not os.path.exists(path):
+        continue
+    with open(path) as f:
+        c = f.read()
+    old = '                eos_token_id=self.tokenizer.eos_token_id,\n'
+    if old in c:
+        with open(path, 'w') as f:
+            f.write(c.replace(old, ''))
+        print('Patched vllm_hidden_states_generator.py: removed eos_token_id from Request()')
+    else:
+        print('vllm_hidden_states_generator.py: eos_token_id already removed or not found')
+    break
+" 2>/dev/null || true
+
 # Pydantic 2.13 compatibility: speculators.ReloadableBaseModel.reload_schema() calls
 # model_rebuild(force=True) without a types_namespace. In pydantic 2.13+, inherited
 # torch.dtype annotations from transformers.PretrainedConfig cannot be resolved in

From c1d8b8b6fb9ed6d8dcc2a6c9157b34073f78b778 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Thu, 21 May 2026 10:36:04 -0700
Subject: [PATCH 15/22] fix(eagle3): patch speculators vLLM scheduler to
 process all requests

speculators 0.4.0.1's generate() loop never calls
scheduler.update_from_output(), so KV cache blocks are never freed
after each batch. In newer vLLM (0.7+), this causes the scheduler to
stop admitting new requests after the first MAX_NUM_SEQS=32 are
processed, silently truncating large datasets.

Fix by patching vllm_hidden_states_generator.py at install time to:
1. Capture the return value of sample_tokens()
2. Call scheduler.update_from_output(scheduler_output, sampled_output)
   so the scheduler advances request states and frees KV blocks
3. Call scheduler.finish_requests() for newly-completed prefill
   requests as an additional fallback to free KV capacity

Tested on Ministral-3-8B with 330 conversations: previously only 32
were processed; now all 330 succeed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../common/eagle3/dump_offline_data_vllm.sh   | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index da08bed0960..a8ae99b3a16 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -77,6 +77,57 @@ for d in site.getsitepackages():
     break
 " 2>/dev/null || true
 
+# vLLM scheduler compatibility: speculators 0.4.0.1 generate() loop never calls
+# scheduler.update_from_output(), so KV blocks are never freed and the scheduler
+# stops admitting new requests after MAX_NUM_SEQS=32. Fix by injecting the call
+# inside the loop, plus aborting completed requests to free KV capacity.
+python3 << 'PYEOF' 2>/dev/null || true
+import site, os
+
+old = (
+    '            model_output = self.executor.execute_model(scheduler_output)\n'
+    '            self.executor.sample_tokens(model_output)\n'
+)
+new = (
+    '            model_output = self.executor.execute_model(scheduler_output)\n'
+    '            sampled_output = self.executor.sample_tokens(model_output)\n'
+    '            # Advance scheduler state so KV blocks are freed after each batch.\n'
+    '            # Without this, newer vLLM never admits requests beyond MAX_NUM_SEQS.\n'
+    '            if hasattr(self.scheduler, \'update_from_output\'):\n'
+    '                try:\n'
+    '                    self.scheduler.update_from_output(scheduler_output, sampled_output)\n'
+    '                except Exception:\n'
+    '                    try:\n'
+    '                        self.scheduler.update_from_output(scheduler_output, model_output)\n'
+    '                    except Exception:\n'
+    '                        pass\n'
+    '            # Abort completed-prefill requests this iteration to free KV capacity.\n'
+    '            _just_done = [\n'
+    '                _r for _r in scheduler_output.num_scheduled_tokens\n'
+    '                if request_num_computed.get(_r, 0) >= request_id_to_prompt_len.get(_r, 0)\n'
+    '            ]\n'
+    '            if _just_done:\n'
+    '                try:\n'
+    '                    self.scheduler.finish_requests(_just_done, RequestStatus.FINISHED_ABORTED)\n'
+    '                except Exception:\n'
+    '                    pass\n'
+)
+
+for d in site.getsitepackages():
+    path = os.path.join(d, 'speculators', 'data_generation', 'vllm_hidden_states_generator.py')
+    if not os.path.exists(path):
+        continue
+    with open(path) as f:
+        c = f.read()
+    if old in c and new not in c:
+        with open(path, 'w') as f:
+            f.write(c.replace(old, new))
+        print('Patched vllm_hidden_states_generator.py: added update_from_output + finish_requests in generate() loop')
+    else:
+        print('vllm_hidden_states_generator.py: scheduler patch already applied or pattern not found')
+    break
+PYEOF
+
 if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
     TASK_ID=0
 else

From 56bbdc6baf47572ddf9599778bf2e5cbe4804ba6 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Wed, 27 May 2026 11:07:53 -0700
Subject: [PATCH 16/22] fix(eagle3): support Ministral-3 (mistral3) VLM in
 offline training and export

- Extend VLM detection in load_vlm_or_llm to check text_config/llm_config
  attributes, not just "vl" in model_type. This catches mistral3 models
  (e.g. Ministral-3-8B) which are VLM wrappers. When use_offline_training
  is set, detected VLMs are now loaded via FakeBaseModel.from_source.
- Add consolidated.safetensors fallback in FakeBaseModel._load_weights
  for checkpoints with incomplete HF shards but a complete Mistral native
  consolidated file.
- Set use_cache=True in EAGLE export template configs. The None placeholder
  was not being filled for FakeBaseModel and newer huggingface_hub rejects
  None for bool fields.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../torch/export/plugins/hf_spec_configs.py   |  4 +--
 .../speculative/plugins/modeling_fakebase.py  | 28 +++++++++++++++++--
 modelopt/torch/speculative/utils.py           | 16 ++++++++++-
 3 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/modelopt/torch/export/plugins/hf_spec_configs.py b/modelopt/torch/export/plugins/hf_spec_configs.py
index b78dfadd472..e7011042db9 100644
--- a/modelopt/torch/export/plugins/hf_spec_configs.py
+++ b/modelopt/torch/export/plugins/hf_spec_configs.py
@@ -33,7 +33,7 @@
     "tie_word_embeddings": False,
     "torch_dtype": None,
     "transformers_version": None,
-    "use_cache": None,
+    "use_cache": True,
     "vocab_size": None,
     "draft_vocab_size": None,
     "rope_scaling": None,
@@ -133,7 +133,7 @@
     "transformers_version": None,
     "typical_p": None,
     "use_bfloat16": None,
-    "use_cache": None,
+    "use_cache": True,
     "v_head_dim": None,
     "vocab_size": None,
     "eagle_config": {
diff --git a/modelopt/torch/speculative/plugins/modeling_fakebase.py b/modelopt/torch/speculative/plugins/modeling_fakebase.py
index 4ed06ed6490..44ca9a75c0c 100644
--- a/modelopt/torch/speculative/plugins/modeling_fakebase.py
+++ b/modelopt/torch/speculative/plugins/modeling_fakebase.py
@@ -205,10 +205,32 @@ def _load_weights(self, source: str):
             source, [weight_map[lm_head_key], weight_map[embed_tokens_key]]
         )
 
-        lm_head_state = safetensors_load_file(lm_head_path, device="cpu")
-        embed_tokens_state = safetensors_load_file(embed_tokens_path, device="cpu")
+        def _load_with_consolidated_fallback(shard_path, key, role):
+            """Load ``key`` from ``shard_path``; on FileNotFoundError try consolidated.safetensors.
 
-        return lm_head_state[lm_head_key], embed_tokens_state[embed_tokens_key]
+            Mistral native checkpoints use ``tok_embeddings.weight`` (embed_tokens) and
+            ``output.weight`` (lm_head) in a monolithic ``consolidated.safetensors`` file,
+            so shards for these weights may be absent.
+            """
+            try:
+                return safetensors_load_file(shard_path, device="cpu")[key]
+            except FileNotFoundError:
+                _aliases = {
+                    "embed_tokens": ["tok_embeddings.weight"],
+                    "lm_head": ["output.weight"],
+                }
+                _consolidated = os.path.join(os.path.dirname(shard_path), "consolidated.safetensors")
+                if os.path.isfile(_consolidated):
+                    _state = safetensors_load_file(_consolidated, device="cpu")
+                    for _alias in _aliases.get(role, []):
+                        if _alias in _state:
+                            return _state[_alias]
+                raise
+
+        lm_head_w = _load_with_consolidated_fallback(lm_head_path, lm_head_key, "lm_head")
+        embed_tokens_w = _load_with_consolidated_fallback(embed_tokens_path, embed_tokens_key, "embed_tokens")
+
+        return lm_head_w, embed_tokens_w
 
     def forward(self, *args, **kwargs):
         """Not implemented: FakeBaseModel omits full model weights and cannot run inference."""
diff --git a/modelopt/torch/speculative/utils.py b/modelopt/torch/speculative/utils.py
index bb8a4010ded..3b01143a41f 100644
--- a/modelopt/torch/speculative/utils.py
+++ b/modelopt/torch/speculative/utils.py
@@ -595,7 +595,21 @@ def load_vlm_or_llm(
         model_name_or_path,
         trust_remote_code=trust_remote_code,
     )
-    if "vl" in model_config.model_type.lower():
+
+    # Detect VLMs: either "vl" in model_type (e.g. "qwen2_vl") or has a nested text config
+    # (e.g. Mistral3Config with model_type="mistral3" and text_config attribute).
+    _is_vlm = "vl" in model_config.model_type.lower() or any(
+        getattr(model_config, attr, None) is not None for attr in ["text_config", "llm_config"]
+    )
+
+    if _is_vlm and use_offline_training:
+        # For VLMs in offline training, FakeBaseModel loads only embed_tokens + lm_head
+        # and auto-detects VLM weight key layouts (e.g. "language_model.model.embed_tokens").
+        from modelopt.torch.speculative.plugins.modeling_fakebase import FakeBaseModel
+
+        return FakeBaseModel.from_source(model_name_or_path, trust_remote_code=trust_remote_code)
+
+    if _is_vlm:
         model_cls = transformers.AutoModelForVision2Seq
     else:
         model_cls = transformers.AutoModelForCausalLM

From 74c9c41071a19f67934e0c466e85c5bfe9536985 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Wed, 27 May 2026 11:13:53 -0700
Subject: [PATCH 17/22] feat(eagle3): add pipeline configs, scripts, and triage
 docs for new model support

Add EAGLE3 pipeline infrastructure for testing new model architectures:
- Pipeline scripts: offline_training.sh (with runtime patches for VLM detection,
  consolidated.safetensors fallback, use_cache fix), dump_offline_data_vllm.sh,
  dump_offline_data.sh, dump_offline_data_hf.sh
- Quick-fail-check YAMLs for 12 models (Qwen3.5-9B/27B/35B-A3B, MiniMax-M2.5,
  Ministral-3-8B/14B, DeepSeek-V3.2, gpt-oss-20b, Step-3.5-Flash, GLM-5,
  Kimi-K2.5, Kimi-K2.5-NVFP4)
- Triage chart documenting test results for all models
- Triage guide for Claude Code to follow when adding new models

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../pipeline/eagle3/dump_offline_data.sh      |  27 +++
 .../pipeline/eagle3/dump_offline_data_hf.sh   |  39 ++++
 .../pipeline/eagle3/dump_offline_data_vllm.sh | 121 +++++++++++
 .../eagle3/eagle3_new_model_triage_guide.md   | 123 +++++++++++
 .../pipeline/eagle3/eagle3_triage_chart.md    | 200 ++++++++++++++++++
 .../pipeline/eagle3/offline.yaml              | 129 +++++++++++
 .../pipeline/eagle3/offline_training.sh       | 174 +++++++++++++++
 .../pipeline/eagle3/quick_fail_check.yaml     | 135 ++++++++++++
 .../eagle3/quick_fail_check_computelab.yaml   | 120 +++++++++++
 .../quick_fail_check_deepseek-v3.2.yaml       | 110 ++++++++++
 .../eagle3/quick_fail_check_glm-5.yaml        | 110 ++++++++++
 .../eagle3/quick_fail_check_gpt-oss-20b.yaml  |  97 +++++++++
 .../eagle3/quick_fail_check_kimi_k2.5.yaml    | 149 +++++++++++++
 .../quick_fail_check_kimi_k2.5_nvfp4.yaml     | 162 ++++++++++++++
 .../eagle3/quick_fail_check_minimax-m2.5.yaml | 105 +++++++++
 .../quick_fail_check_ministral-3-14b.yaml     | 101 +++++++++
 ..._fail_check_ministral-3-8b-from-task1.yaml |  76 +++++++
 ..._fail_check_ministral-3-8b-from-task2.yaml |  56 +++++
 ...ick_fail_check_ministral-3-8b-hf-dump.yaml |  77 +++++++
 .../quick_fail_check_ministral-3-8b.yaml      | 103 +++++++++
 .../eagle3/quick_fail_check_qwen3.5-27b.yaml  |  95 +++++++++
 .../quick_fail_check_qwen3.5-35b-a3b.yaml     | 101 +++++++++
 .../eagle3/quick_fail_check_qwen3.5-9b.yaml   |  95 +++++++++
 .../quick_fail_check_step-3.5-flash.yaml      | 107 ++++++++++
 .../pipeline/eagle3/task1_minimax-m2.5.yaml   |  20 ++
 .../pipeline/eagle3/task1_ministral-3-8b.yaml |  20 ++
 .../eagle3/task1_qwen3.5-35b-a3b.yaml         |  20 ++
 .../pipeline/eagle3/task1_step-3.5-flash.yaml |  20 ++
 28 files changed, 2692 insertions(+)
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh
 create mode 100755 examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/offline.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/offline_training.sh
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
 create mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml

diff --git a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh
new file mode 100644
index 00000000000..2e55958edc8
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Runs compute_hidden_states_trtllm.py under trtllm-llmapi-launch; extra CLI args pass through.
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+# NOTE(review): sibling dump_offline_data_hf.sh sources ../../service_utils.sh; confirm this path.
+source "${SCRIPT_DIR}/../service_utils.sh"
+
+###################################################################################################
+# SLURM_ARRAY_TASK_ID/COUNT become --dp-rank/--dp-world-size; default to rank 0 of 1 when unset.
+if [ -z "${SLURM_ARRAY_TASK_ID}" ]; then
+    TASK_ID=0
+else
+    echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}"
+    TASK_ID=${SLURM_ARRAY_TASK_ID}
+fi
+
+if [ -z "${SLURM_ARRAY_TASK_COUNT}" ]; then
+    TASK_COUNT=1
+else
+    echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}"
+    TASK_COUNT=${SLURM_ARRAY_TASK_COUNT}
+fi
+# Quote every expansion so paths/arguments containing spaces reach the script as single words.
+trtllm-llmapi-launch python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py \
+    --model "${HF_MODEL_CKPT:?HF_MODEL_CKPT must be set}" \
+    --dp-rank "${TASK_ID}" \
+    --dp-world-size "${TASK_COUNT}" \
+    "${@}"
diff --git a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh
new file mode 100755
index 00000000000..4172c61fd46
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+
+source "${SCRIPT_DIR}/../../service_utils.sh"
+
+###################################################################################################
+# HF-based hidden state dumping for models not supported by TRT-LLM.
+# Uses compute_hidden_states_hf.py with device_map="auto" (no TP/EP flags needed).
+#
+# Required environment:
+#   HF_MODEL_CKPT   Path to the HF model checkpoint (the script aborts if it is unset)
+#   SLURM_ARRAY_TASK_ID / SLURM_ARRAY_TASK_COUNT (optional) set --dp-rank / --dp-world-size
+# Args passed through to compute_hidden_states_hf.py:
+#   --input-data, --output-dir, --max-seq-len, etc.
+###################################################################################################
+# Best-effort install: pip errors are hidden and ignored.
+pip install datasets 2>/dev/null || true
+
+if [ -z "${SLURM_ARRAY_TASK_ID}" ]; then
+    TASK_ID=0
+else
+    echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}"
+    TASK_ID=${SLURM_ARRAY_TASK_ID}
+fi
+
+if [ -z "${SLURM_ARRAY_TASK_COUNT}" ]; then
+    TASK_COUNT=1
+else
+    echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}"
+    TASK_COUNT=${SLURM_ARRAY_TASK_COUNT}
+fi
+
+python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py \
+    --model "${HF_MODEL_CKPT:?HF_MODEL_CKPT must be set}" \
+    --dp-rank "${TASK_ID}" \
+    --dp-world-size "${TASK_COUNT}" \
+    --trust_remote_code \
+    "${@}"
diff --git a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh
new file mode 100644
index 00000000000..bc0659fbf2e
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh
@@ -0,0 +1,121 @@
+#!/bin/bash
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+
+source ${SCRIPT_DIR}/../../service_utils.sh
+
+pip install "speculators<0.5.0" --no-deps 2>/dev/null || true  # best effort; --no-deps leaves installed deps untouched
+pip install datasets 2>/dev/null || true  # best effort: stderr is hidden, failure is ignored
+
+# vLLM API compatibility: speculators 0.4.0.1 uses Request(eos_token_id=...) which
+# was removed in newer vLLM. Patch to remove the unsupported kwarg.
+python3 -c "
+import site, os
+for d in site.getsitepackages():
+    path = os.path.join(d, 'speculators', 'data_generation', 'vllm_hidden_states_generator.py')
+    if not os.path.exists(path):
+        continue  # file is not present under this site-packages dir
+    with open(path) as f:
+        c = f.read()
+    old = '                eos_token_id=self.tokenizer.eos_token_id,\n'  # matched verbatim, indentation included
+    if old in c:
+        with open(path, 'w') as f:
+            f.write(c.replace(old, ''))
+        print('Patched vllm_hidden_states_generator.py: removed eos_token_id from Request()')
+    else:
+        print('vllm_hidden_states_generator.py: eos_token_id already removed or not found')
+    break  # only the first site-packages dir that contains the file is handled
+" 2>/dev/null || true
+
+# Pydantic 2.13 compatibility: speculators.ReloadableBaseModel.reload_schema() calls
+# model_rebuild(force=True) without a types_namespace. In pydantic 2.13+, inherited
+# torch.dtype annotations from transformers.PretrainedConfig cannot be resolved in
+# subclass modules that don't import torch. Fix by injecting torch into the namespace.
+python3 -c "
+import site, os
+for d in site.getsitepackages():
+    path = os.path.join(d, 'speculators', 'utils', 'pydantic_utils.py')
+    if not os.path.exists(path):
+        continue  # file is not present under this site-packages dir
+    with open(path) as f:
+        c = f.read()
+    old = 'cls.model_rebuild(force=True)'
+    new = 'import torch as _torch; cls.model_rebuild(force=True, _types_namespace={\"torch\": _torch})'
+    if old in c and new not in c:  # skip when the replacement text is already present
+        with open(path, 'w') as f:
+            f.write(c.replace(old, new))
+        print('Patched pydantic_utils.py: model_rebuild now passes torch namespace')
+    else:
+        print('pydantic_utils.py already patched or pattern not found')
+    break  # only the first site-packages dir that contains the file is handled
+" 2>/dev/null || true
+
+# vLLM scheduler compatibility: speculators 0.4.0.1 generate() loop never calls
+# scheduler.update_from_output(), so KV blocks are never freed and the scheduler
+# stops admitting new requests after MAX_NUM_SEQS=32. Fix by injecting the call
+# inside the loop, plus aborting completed requests to free KV capacity.
+python3 << 'PYEOF' 2>/dev/null || true
+import site, os
+# `old` is the exact two-statement snippet searched for; `new` is the text that replaces it.
+old = (
+    '            model_output = self.executor.execute_model(scheduler_output)\n'
+    '            self.executor.sample_tokens(model_output)\n'
+)
+new = (
+    '            model_output = self.executor.execute_model(scheduler_output)\n'
+    '            sampled_output = self.executor.sample_tokens(model_output)\n'
+    '            # Advance scheduler state so KV blocks are freed after each batch.\n'
+    '            # Without this, newer vLLM never admits requests beyond MAX_NUM_SEQS.\n'
+    '            if hasattr(self.scheduler, \'update_from_output\'):\n'
+    '                try:\n'
+    '                    self.scheduler.update_from_output(scheduler_output, sampled_output)\n'
+    '                except Exception:\n'
+    '                    try:\n'
+    '                        self.scheduler.update_from_output(scheduler_output, model_output)\n'
+    '                    except Exception:\n'
+    '                        pass\n'
+    '            # Abort completed-prefill requests this iteration to free KV capacity.\n'
+    '            _just_done = [\n'
+    '                _r for _r in scheduler_output.num_scheduled_tokens\n'
+    '                if request_num_computed.get(_r, 0) >= request_id_to_prompt_len.get(_r, 0)\n'
+    '            ]\n'
+    '            if _just_done:\n'
+    '                try:\n'
+    '                    self.scheduler.finish_requests(_just_done, RequestStatus.FINISHED_ABORTED)\n'
+    '                except Exception:\n'
+    '                    pass\n'
+)
+# NOTE(review): the injected _just_done comprehension is outside any try/except and uses request_num_computed, request_id_to_prompt_len and RequestStatus; confirm they are in scope in the target file.
+for d in site.getsitepackages():
+    path = os.path.join(d, 'speculators', 'data_generation', 'vllm_hidden_states_generator.py')
+    if not os.path.exists(path):
+        continue  # file is not present under this site-packages dir
+    with open(path) as f:
+        c = f.read()
+    if old in c and new not in c:  # skip when the replacement text is already present
+        with open(path, 'w') as f:
+            f.write(c.replace(old, new))
+        print('Patched vllm_hidden_states_generator.py: added update_from_output + finish_requests in generate() loop')
+    else:
+        print('vllm_hidden_states_generator.py: scheduler patch already applied or pattern not found')
+    break  # only the first site-packages dir that contains the file is handled
+PYEOF
+
+if [ -z "${SLURM_ARRAY_TASK_ID}" ]; then  # quoted so the test is well-formed when unset
+    TASK_ID=0  # SLURM_ARRAY_TASK_ID unset or empty: use rank 0
+else
+    TASK_ID=${SLURM_ARRAY_TASK_ID}
+fi
+
+if [ -z "${SLURM_ARRAY_TASK_COUNT}" ]; then  # quoted for the same reason as above
+    TASK_COUNT=1  # SLURM_ARRAY_TASK_COUNT unset or empty: world size 1
+else
+    TASK_COUNT=${SLURM_ARRAY_TASK_COUNT}
+fi
+
+python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py \
+    --model "${HF_MODEL_CKPT}" \
+    --dp-rank "${TASK_ID}" \
+    --dp-world-size "${TASK_COUNT}" \
+    --trust_remote_code \
+    ${@}
diff --git a/examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md b/examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md
new file mode 100644
index 00000000000..7d8ea0206b1
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md
@@ -0,0 +1,123 @@
+# EAGLE3 New Model Support — Triage Guide for Claude Code
+
+This document describes how to triage EAGLE3 pipeline failures when adding a new model.
+Follow these steps in order. Stop at the first failure, diagnose, and document findings.
+
+## Pipeline Overview
+
+The EAGLE3 pipeline has 4 stages (mapped to task_0 through task_3 in the YAML):
+
+| Task | Stage | Container | Script | What it does |
+|------|-------|-----------|--------|-------------|
+| task_0 | Data synthesis | vllm/vllm-openai | `dump_offline_data.sh` | Serve model with vLLM, generate synthetic conversations |
+| task_1 | Hidden state dump | vllm/vllm-openai | `dump_offline_data_vllm.sh` | Dump hidden states from generated conversations |
+| task_2 | Training + Export | tensorrt-llm/release | `offline_training.sh` | Train EAGLE3 draft model, export HF checkpoint |
+| task_3 | Benchmark | vllm/vllm-openai | `specdec_bench/quick_check.sh` | Run speculative decoding benchmark |
+
+Some configs combine task_0+task_1 into a single vLLM dump step, or skip task_0 if data already exists.
+
+## Step 1: Locate the pipeline config
+
+```
+examples/speculative_decoding/pipeline/eagle3/quick_fail_check_<model>.yaml
+```
+
+If it doesn't exist, create one by copying `quick_fail_check.yaml` and adjusting:
+- `HF_MODEL_CKPT` — the HF model path on `/hf-local/`
+- GPU/node counts based on model size
+- `--trust_remote_code` if needed
+- Container images
+
+## Step 2: Submit the pipeline
+
+```bash
+cd <repo-root>
+bash tools/run_job_yaml.sh services/pipeline/eagle3/quick_fail_check_<model>.yaml --yes -v
+```
+
+This uses `uv run slurm.py` internally. The rsync can take several minutes.
+Experiment ID is printed as `cicd_<timestamp>`.
+
+## Step 3: Check experiment output
+
+Experiment directory:
+```
+experiments/cicd/cicd_<id>/
+```
+
+Each task has a directory `<JobName>_<N>/` containing:
+- `sbatch_<JobName>_<N>_<SlurmJobID>.out` — the main log
+- `code/` — snapshot of the code at submission time
+
+Check logs:
+```bash
+tail -100 experiments/cicd/cicd_<id>/<JobName>_<N>/sbatch_*.out
+```
+
+## Step 4: Diagnose failures by stage
+
+### task_0/task_1 failures (vLLM data generation / hidden state dump)
+
+Common issues:
+- **Server never starts** → Check for OOM, unsupported architecture, or missing `--trust_remote_code`
+- **`HarmonyError: vocab file`** → gated model, tokenizer not available offline
+- **`TypeError: 'NoneType' object is not iterable`** → vLLM doesn't support this model architecture yet
+- **`CANCELLED DUE TO TIME LIMIT`** → Model too slow for the time limit; increase wall time or reduce data
+- **Server starts but queries fail** → Check prompt format, connection errors
+
+### task_2 failures (training + export)
+
+Common issues:
+- **`No such file or directory: service_utils.sh`** → pipeline infra issue (older experiment)
+- **`ValueError: Unrecognized configuration class ... for AutoModelForCausalLM`** → VLM model not detected as VLM. Check if `load_vlm_or_llm` in `modelopt/torch/speculative/utils.py` handles this model type. Look for `text_config`/`llm_config` attributes.
+- **`FileNotFoundError` on shard files** → Checkpoint has unusual format (e.g., missing HF shards, has consolidated.safetensors instead). Check `FakeBaseModel._load_weights`.
+- **OOM during training** → Reduce `--train_bs` or `--training_seq_len`
+- **NaN loss** → Reduce `--lr`, check data quality
+
+### task_3 failures (benchmark)
+
+Common issues:
+- **`/scratchspace/export` doesn't exist** → task_2 failed; fix training first
+- **`StrictDataclassFieldValidationError`** → exported `config.json` has `null` where a typed field is expected (e.g., `use_cache`). Fix the export template in `modelopt/torch/export/plugins/hf_spec_configs.py`.
+- **`KeyError: '<model_type>'`** → transformers version in container doesn't recognize the model type
+- **`trust_remote_code=True` required** → add to benchmark config
+- **vLLM resolves model as wrong architecture** → VLM wrapper model needs special handling
+
+## Step 5: Applying fixes
+
+### Repo fixes (for merged modelopt)
+Edit files in `<repo-root>/modelopt/torch/speculative/`.
+The key files:
+- `utils.py` — `load_vlm_or_llm()` for model loading
+- `plugins/modeling_fakebase.py` — `FakeBaseModel` for offline training weight loading
+- `plugins/hf_eagle.py` — EAGLE model definition
+- `../export/plugins/hf_spec_configs.py` — export config templates
+- `../export/plugins/hf_spec_export.py` — export logic
+
+### Container patches (for pipeline)
+The TRT-LLM container has a pre-installed modelopt that can't be easily upgraded (CUDA build issues).
+Instead, runtime patches are applied in `offline_training.sh` using Python heredocs that find-and-replace
+exact code patterns in the installed library files. This is the same pattern used for speculators patches.
+
+When adding a new patch:
+1. Find the exact `old` string in the installed file (must be unique)
+2. Write the `new` replacement string
+3. Add a `python3 << 'PYEOF' || true` block in `offline_training.sh` before `set -eo pipefail`
+
+## Step 6: Document results
+
+Update `examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md`:
+1. Update the model row in the **Model Test Matrix** (status + per-task results)
+2. Add a **Per-Model Test Results** entry with experiment IDs, errors, and fixes
+3. Add new failure patterns to the **Observed Failure Catalog**
+
+## Known Model-Specific Issues
+
+| Model Type | Issue | Where | Fix |
+|-----------|-------|-------|-----|
+| `mistral3` (Ministral-3-*) | Not detected as VLM by `"vl"` check | `utils.py` | Check `text_config`/`llm_config` attrs |
+| `mistral3` (Ministral-3-8B) | Missing HF shard 1, has `consolidated.safetensors` | `modeling_fakebase.py` | Fallback to consolidated with key aliases |
+| All models via FakeBaseModel | `use_cache=null` in exported config | `hf_spec_configs.py` | Set `use_cache: True` in templates |
+| `gpt-oss-20b` | Tokenizer requires `openai_harmony` | task_0 | Gated/special tokenizer setup |
+| `MiniMax-M2.5` | Custom model code | task_3 | `--trust_remote_code` |
+| `ministral3` | `KeyError: 'ministral3'` in older transformers | task_3 | Needs transformers >= 5.3.0 |
diff --git a/examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md b/examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md
new file mode 100644
index 00000000000..450146d9cce
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md
@@ -0,0 +1,200 @@
+# EAGLE3 Automation Triage Chart
+
+This document catalogs failure modes observed when running the EAGLE3 pipeline across
+different model architectures. Updated as new models are tested.
+
+## Model Test Matrix
+
+| # | Model | Type | Params | Status | task_0 | task_1 | task_2 | task_3 | Notes |
+|---|-------|------|--------|--------|--------|--------|--------|--------|-------|
+| 1 | Qwen3-8B | Dense | 8B | Existing | - | - | - | - | Reference model |
+| 2 | Kimi-K2.5 | MoE | 1T/32B | Existing | - | - | - | - | GB200 required |
+| 3 | Qwen3.5-9B | Dense (VLM) | 9B | Not run | - | - | - | - | Text-only path |
+| 4 | Qwen3.5-27B | Dense (VLM) | 27B | Not run | - | - | - | - | Text-only path |
+| 5 | Qwen3.5-35B-A3B | MoE (VLM) | 35B/3B | **Blocked** | TIMEOUT | FAIL | FAIL | FAIL | Data synth too slow; infra issues |
+| 6 | MiniMax-M2.5 | MoE | 230B/10B | **Blocked** | TIMEOUT | FAIL | FAIL | FAIL | trust_remote_code needed |
+| 7 | Ministral-3-8B | Dense (VLM) | 8B | **WIP** | SKIP | PASS | PASS | FAIL | `use_cache=null` in export; see below |
+| 8 | Ministral-3-14B | Dense (VLM) | 14B | **Blocked** | FAIL | FAIL | FAIL | FAIL | vLLM engine init fails (NoneType) |
+| 9 | DeepSeek-V3.2 | MoE (MLA) | 685B/37B | **Blocked** | no log | FAIL | FAIL | FAIL | No task_0 log; infra issues |
+| 10 | gpt-oss-20b | Dense | 20B | **Blocked** | FAIL | FAIL | FAIL | FAIL | Tokenizer `HarmonyError` |
+| 11 | Step-3.5-Flash | MoE (SWA) | 197B/11B | **Blocked** | TIMEOUT | FAIL | FAIL | FAIL | Data synth hit time limit |
+| 12 | GLM-5 | MoE (DSA) | 744B/40B | Not run | - | - | - | - | 2 nodes, gated |
+
+Legend: PASS / FAIL / TIMEOUT / SKIP / no log (task produced no log file) / - (no result recorded)
+
+## Triage Decision Tree
+
+```mermaid
+flowchart TD
+    START([EAGLE3 Pipeline Failed]) --> WHICH_STEP{Which step failed?}
+
+    WHICH_STEP -->|task_0: Data Synthesis| T0_CHECK{Server started?}
+    WHICH_STEP -->|task_1: Hidden States| T1_CHECK{TRT-LLM launched?}
+    WHICH_STEP -->|task_2: Training| T2_CHECK{Dependencies installed?}
+    WHICH_STEP -->|task_3: Benchmark| T3_CHECK{Engine started?}
+
+    %% task_0 branch
+    T0_CHECK -->|No - hangs at health check| T0_OOM{CUDA OOM in log?}
+    T0_CHECK -->|Yes - server up but query fails| T0_QUERY[Check query.py errors:<br/>bad prompt format,<br/>connection timeout,<br/>empty response]
+    T0_OOM -->|Yes| T0_FIX_OOM[Reduce max_num_tokens<br/>or increase TP]
+    T0_OOM -->|No| T0_ARCH{Architecture error?}
+    T0_ARCH -->|Yes - RuntimeError / KeyError| T0_FIX_ARCH[Model not supported by<br/>this TRT-LLM version.<br/>Try newer container.]
+    T0_ARCH -->|No - trust_remote_code| T0_FIX_TRC[Add --trust_remote_code<br/>before -- separator]
+
+    %% task_1 branch
+    T1_CHECK -->|No - launch fails| T1_TRTLLM{Error type?}
+    T1_CHECK -->|Yes - but no .pt output| T1_DATA[Check --input-data path<br/>and data format]
+    T1_TRTLLM -->|OOM| T1_FIX_OOM[Increase TP or nodes]
+    T1_TRTLLM -->|NCCL error| T1_FIX_NCCL[Network issue.<br/>Retry or reduce EP.]
+    T1_TRTLLM -->|Unsupported model| T1_FIX_MODEL[Model arch not in TRT-LLM.<br/>Check release notes.]
+
+    %% task_2 branch
+    T2_CHECK -->|No - pip install fails| T2_FIX_DEPS[Network issue in container.<br/>Check proxy/mirror.]
+    T2_CHECK -->|Yes| T2_TRAIN{Training starts?}
+    T2_TRAIN -->|No - ImportError| T2_FIX_IMPORT[modelopt not installed<br/>or wrong version]
+    T2_TRAIN -->|No - FileNotFoundError| T2_FIX_DATA[task_1 output missing.<br/>Re-run task_1.]
+    T2_TRAIN -->|Yes but crashes| T2_CRASH{Error type?}
+    T2_CRASH -->|OOM| T2_FIX_OOM[Reduce train_bs<br/>or training_seq_len]
+    T2_CRASH -->|NaN loss| T2_FIX_NAN[Reduce lr.<br/>Check data quality.]
+    T2_CRASH -->|KeyError / arch error| T2_FIX_EAGLE[Model type not recognized<br/>by EAGLE3 training code.<br/>Needs code change in modelopt.]
+    T2_TRAIN -->|Yes - export fails| T2_FIX_EXPORT[Check /scratchspace/eagle3<br/>has model.safetensors]
+
+    %% task_3 branch
+    T3_CHECK -->|No - engine crash| T3_ENGINE{Engine type?}
+    T3_CHECK -->|Yes - but AR below threshold| T3_AR[AR too low: need more<br/>epochs, data, or<br/>larger draft head]
+    T3_CHECK -->|Yes - but wrong output| T3_FORMAT[Check draft model<br/>config.json compatibility<br/>with engine version]
+    T3_ENGINE -->|vLLM - ImportError eagle| T3_FIX_VLLM[vLLM version too old.<br/>Use latest container.]
+    T3_ENGINE -->|TRT-LLM - build error| T3_FIX_TRTLLM[Draft model format<br/>incompatible. Check export.]
+    T3_ENGINE -->|OOM| T3_FIX_OOM[Target + draft too large.<br/>Increase TP.]
+```
+
+## Observed Failure Catalog
+
+This section is updated as models are tested. Each entry records the model, step,
+error, root cause, and resolution.
+
+### Architecture-Level Failures
+
+| Category | Affected Models | Step | Error | Root Cause | Resolution |
+|----------|----------------|------|-------|------------|------------|
+| VLM text-only | Qwen3.5-*, Ministral-3-* | task_0 | TBD | VLM models may load vision encoder unnecessarily | TBD — may need --language-model-only |
+| VLM detection miss | Ministral-3-* | task_2 | `ValueError: Unrecognized config for AutoModelForCausalLM` | `load_vlm_or_llm` only checks `"vl"` in model_type; `"mistral3"` missed | Check `text_config`/`llm_config` attrs — fixed in repo |
+| Missing HF shard | Ministral-3-8B | task_2 | `FileNotFoundError: model-00001-of-00004.safetensors` | Incomplete HF shards + Mistral native `consolidated.safetensors` | Fallback to consolidated with key aliases — fixed in repo |
+| Exported config validation | All (via FakeBaseModel) | task_3 | `StrictDataclassFieldValidationError: use_cache` | Template placeholder `None` not filled; strict `huggingface_hub` rejects | Set `use_cache: True` in template — fixed in repo |
+| MLA attention | DeepSeek-V3.2 | task_2 | TBD | EAGLE3 decoder type may not support MLA | TBD — verify eagle_decoder_type |
+| Custom model code | MiniMax-M2.5 | task_0 | TBD | Non-standard architecture needs trust_remote_code | Add --trust_remote_code |
+| Sliding window attn | Step-3.5-Flash | task_1 | TBD | SWA may not be supported in TRT-LLM hidden state extraction | TBD |
+| Large MoE (>1 node) | DeepSeek-V3.2, GLM-5 | task_0/1 | TBD | Multi-node EP coordination | TBD — verify NCCL config |
+| Gated models | DeepSeek-V3.2, GLM-5 | task_0 | FileNotFoundError | Model not mirrored to /hf-local | Request HF local mirror |
+
+### Per-Model Test Results
+
+#### Model: Ministral-3-8B-Instruct-2512-BF16
+- **Date tested:** 2026-05-26
+- **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml`
+- **Experiments:** `cicd_1779312692` (dump), `cicd_1779829129` (train+bench), `cicd_1779901409` (retry w/ fixes)
+- **task_0 (data synth):** SKIP — used vLLM dump path instead (`dump_offline_data_vllm.sh`)
+- **task_1 (hidden states):** PASS — 330/330 conversations via vLLM dump (`cicd_1779312692`)
+- **task_2 (training + export):** PASS — required 2 runtime patches (see issues below). `train_loss=31.93`, epoch 1, 278s total. Export to `/scratchspace/export` succeeded.
+- **task_3 (benchmark):** FAIL — `StrictDataclassFieldValidationError: use_cache expected bool, got None`
+- **AR:** Not measured (benchmark didn't complete)
+- **New failure patterns?** Yes — 3 confirmed issues plus 1 potential issue:
+
+  1. **VLM detection miss** — `model_type="mistral3"` is a VLM (`Mistral3ForConditionalGeneration`) but `load_vlm_or_llm` only checks `"vl" in model_type`. Fix: also check `text_config`/`llm_config` attrs. Applied in `modelopt/torch/speculative/utils.py` + runtime patch.
+
+  2. **Missing HF shard** — Checkpoint has shards 2-4 + `consolidated.safetensors` but shard 1 is absent. `FakeBaseModel._load_weights` fails. Fix: fallback to `consolidated.safetensors` with Mistral native key aliases (`tok_embeddings.weight`, `output.weight`). Applied in `modelopt/torch/speculative/plugins/modeling_fakebase.py` + runtime patch.
+
+  3. **`use_cache=null` in exported config** — Export template placeholder stays `None` when `FakeBaseConfig` doesn't define `use_cache`. Newer `huggingface_hub` strict validation rejects it. Fix: set `"use_cache": True` in export template (draft models always use cache). Applied in `modelopt/torch/export/plugins/hf_spec_configs.py` + post-export fixup in pipeline.
+
+  4. **(Potential) vLLM Pixtral resolution** — vLLM resolves base model as `PixtralForConditionalGeneration` (VLM). May cause further issues loading the EAGLE3 draft. Needs investigation.
+
+- **Repo fixes (branch `yeyu/speculative-lora-cotrain`):**
+  - `modelopt/torch/speculative/utils.py` — VLM detection via `text_config`/`llm_config`
+  - `modelopt/torch/speculative/plugins/modeling_fakebase.py` — consolidated.safetensors fallback
+  - `modelopt/torch/export/plugins/hf_spec_configs.py` — `use_cache: True` in templates
+- **Pipeline fixes (`offline_training.sh`):** 3 runtime patches matching above
+
+---
+
+#### Model: gpt-oss-20b
+- **Date tested:** 2026-04-15
+- **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml`
+- **Experiment:** `cicd_1776272530`
+- **task_0 (data synth):** FAIL — `openai_harmony.HarmonyError: error downloading or loading vocab file`. vLLM server starts loading model but tokenizer fails. Likely a gated/proprietary tokenizer issue.
+- **task_1 (hidden states):** FAIL — `dump_offline_data_vllm.sh: No such file or directory` (script didn't exist at time of run)
+- **task_2 (training):** FAIL — `service_utils.sh: No such file or directory` (infra issue at time of run)
+- **task_3 (benchmark):** FAIL — `Error retrieving file list: Repo id must be in the form 'repo_name'` — no exported model
+- **Blocker:** Tokenizer loading. Needs special tokenizer setup or newer vLLM with OpenAI model support.
+
+#### Model: Qwen3.5-35B-A3B
+- **Date tested:** 2026-04-15
+- **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml`
+- **Experiment:** `cicd_1776272531`
+- **task_0 (data synth):** TIMEOUT — Server started successfully, data synthesis was running (5%/3295 at 38min), cancelled at time limit. `TCPTransport closed` errors during generation.
+- **task_1:** FAIL — script not found (infra issue)
+- **task_2:** FAIL — infra issue
+- **task_3:** FAIL — no exported model
+- **Blocker:** Data synthesis too slow. Needs longer wall time or reduced dataset size. Server itself works.
+
+#### Model: Step-3.5-Flash
+- **Date tested:** 2026-04-15
+- **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml`
+- **Experiment:** `cicd_1776272532`
+- **task_0 (data synth):** TIMEOUT — `CANCELLED AT 2026-04-15 DUE TO TIME LIMIT`
+- **task_1:** FAIL — script not found (infra issue)
+- **task_2:** FAIL — infra issue
+- **task_3:** FAIL — no exported model
+- **Blocker:** Data synthesis hit time limit. Needs investigation of whether server started successfully.
+
+#### Model: MiniMax-M2.5
+- **Date tested:** 2026-04-15
+- **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml`
+- **Experiment:** `cicd_1776272524`
+- **task_0 (data synth):** TIMEOUT — `CANCELLED DUE TO TIME LIMIT`
+- **task_1:** FAIL — script not found (infra issue)
+- **task_2:** FAIL — infra issue
+- **task_3:** FAIL — `trust_remote_code=True` required for custom model code
+- **Blocker:** Time limit on data synth + `trust_remote_code` needed for benchmark.
+
+#### Model: Ministral-3-14B
+- **Date tested:** 2026-04-15
+- **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml`
+- **Experiment:** `cicd_1776272522`
+- **task_0 (data synth):** FAIL — `TypeError: 'NoneType' object is not iterable` on all TP workers during engine core init. vLLM cannot load this model architecture.
+- **task_1:** FAIL — script not found (infra issue)
+- **task_2:** FAIL — infra issue
+- **task_3:** FAIL — `KeyError: 'ministral3'` — transformers in vLLM container doesn't recognize `ministral3` model type
+- **Blocker:** vLLM engine fails to initialize. Same `mistral3` model type issue as 8B variant. Needs newer vLLM + transformers.
+
+#### Model: DeepSeek-V3.2
+- **Date tested:** 2026-04-15
+- **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml`
+- **Experiment:** `cicd_1776275945`
+- **task_0 (data synth):** No log file — job may not have started (gated model?)
+- **task_1 (hidden states):** FAIL — script not found (infra issue)
+- **task_2:** FAIL — infra issue
+- **task_3:** FAIL — `Error retrieving file list` — no exported model
+- **Blocker:** Model may not be mirrored to `/hf-local`. Needs 2 nodes for MLA architecture.
+
+---
+
+_Use the following template for additional models:_
+
+```markdown
+#### Model: <name>
+- **Date tested:** YYYY-MM-DD
+- **Config:** examples/speculative_decoding/pipeline/eagle3/quick_fail_check_<model>.yaml
+- **task_0:** PASS/FAIL — <notes>
+- **task_1:** PASS/FAIL — <notes>
+- **task_2:** PASS/FAIL — <notes>
+- **task_3:** PASS/FAIL — <notes>
+- **AR:** <value> (threshold: >= 2.1)
+- **New failure pattern?** Yes/No — <description if yes>
+```
+
+## Revision History
+
+| Date | Author | Change |
+|------|--------|--------|
+| 2026-04-02 | Ye Yu | Initial chart with 12 models, triage decision tree |
+| 2026-05-27 | Claude Code | Updated with results from initial batch (`cicd_1776272*`) and Ministral-3-8B deep dive. Added per-model test results for 7 models. Added 4 new failure catalog entries. |
diff --git a/examples/speculative_decoding/pipeline/eagle3/offline.yaml b/examples/speculative_decoding/pipeline/eagle3/offline.yaml
new file mode 100644
index 00000000000..0d1c3ae4af9
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/offline.yaml
@@ -0,0 +1,129 @@
+---
+# This pipeline is a dry run on 100 samples to quickly check for possible failure points
+# across all 4 steps.
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+#
+# Note:
+#   All these tasks share the same workspace ${job_dir}/cicd/${experiment_id}. That is,
+#   /scratchspace is the best place if you want to pass the artifact around. You will
+#   see we use /scratchspace/... to specify input and output a lot.
+#
+#   You can comment out any task below to skip it. For example, if the data has been
+#   synthesized in ${job_dir}/cicd/${prev_experiment_id}/data, then you can skip
+#   task_0. However, you will need to change task_1's --input-data from /scratchspace/data
+#   to the absolute path above.
+
+allow_to_fail: false
+skip: false
+note:
+
+global_vars:
+  hf_model: /hf-local/Qwen/Qwen3-8B
+
+task_0:  # step 1 of 4: query the served model to synthesize data
+  script: services/tensorrt-llm/query.sh
+  # Args are split on "--": entries before "--" are passed to trtllm-serve,
+  # entries after "--" are passed to tools/query.py.
+  # --model is required and applies to both trtllm-serve and tools/query.py.
+  args:
+    - --model <<global_vars.hf_model>>  # required
+    - --tp_size 4
+    - --ep_size 4
+    - --max_num_tokens 32000
+    - --port 8000
+    - --host 0.0.0.0
+    - --trust_remote_code
+    - --                                # separator
+    - --data /hf-local/modelopt/Speculative-Decoding-Prompts-v1_prompts  # query.py args
+    - --save /scratchspace/data  # same path task_1 reads via --input-data
+  environment:
+    - HF_LOCAL: /hf-local  # matches the /hf-local container mount listed in the file header
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    array: "0-0"  # quoted so the range stays a string
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+
+task_1:  # step 2 of 4: dump hidden states
+  script: services/pipeline/eagle3/dump_offline_data.sh
+  args:
+    - --input-data /scratchspace/data  # same path task_0 writes via --save
+    - --output-dir /scratchspace/offline_hidden_states  # same path task_2 reads via --offline-data
+    - --max-seq-len 8192
+    - --tp 4
+    - --moe-ep 4
+  environment:
+    - HF_MODEL_CKPT: <<global_vars.hf_model>>  # same checkpoint task_0 passes as --model
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    array: "0-0"  # quoted so the range stays a string
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+task_2:  # step 3 of 4: train the draft head
+  script: services/pipeline/eagle3/offline_training.sh
+  args:
+    - --offline-data /scratchspace/offline_hidden_states  # same path task_1 writes via --output-dir
+    - --data_path None  # parsed by YAML as plain text, not as null
+    - --mode eagle3
+    - --num_epochs 1
+    - --lr 3e-4
+    - --save_steps 500000  # NOTE(review): presumably large enough to never trigger in a dry run; confirm
+    - --output_dir /scratchspace/eagle3
+    - --train_bs 8
+    - --training_seq_len 4096
+    - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+    - --disable_tqdm True
+    - --ar_validate_steps 500000  # NOTE(review): presumably large enough to never trigger in a dry run; confirm
+  environment:
+    - HF_MODEL_CKPT: <<global_vars.hf_model>>  # same checkpoint used by the other tasks
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    ntasks_per_node: 1
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+#
+# In this stage we run specdec bench, using the vLLM backend by default. TensorRT-LLM can also
+# be used, but we generally see some issues with it.
+#
+# To use TensorRT-LLM, make the following changes
+#
+# args:
+#   - --engine TRTLLM
+# environment:
+#   - TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
+# slurm_config:
+#   ntasks_per_node: 4
+#   container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+#
+# To use SGLang
+#
+# args:
+#   - --engine SGLANG
+# slurm_config:
+#   ntasks_per_node: 1
+#   container: lmsysorg/sglang:v0.5.9
+#
+task_3:  # step 4 of 4: speculative decoding benchmark
+  script: services/specdec_bench/run.sh
+  args:
+    - --draft_length 3
+    - --output_length 4096
+    - --engine VLLM  # see the comment above for the TRTLLM and SGLANG variants
+    - --tp_size 4
+    - --ep_size 1
+  environment:
+    - HF_MODEL_CKPT: <<global_vars.hf_model>>  # same checkpoint used by the other tasks
+    - HF_DRAFT_MODEL_CKPT: /scratchspace/export  # NOTE(review): expected to be written by task_2; confirm in offline_training.sh
+    - CONCURRENCY_LIST: "1 4"  # quoted so the space-separated list stays one string
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    ntasks_per_node: 1
+    container: vllm/vllm-openai:latest  # NOTE(review): unpinned tag; consider pinning for reproducible runs
diff --git a/examples/speculative_decoding/pipeline/eagle3/offline_training.sh b/examples/speculative_decoding/pipeline/eagle3/offline_training.sh
new file mode 100644
index 00000000000..8f1543a5b00
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/offline_training.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+source "${SCRIPT_DIR}/../../service_utils.sh" 2>/dev/null || true  # optional helpers; a missing file is tolerated
+
+pip install -r modules/Model-Optimizer/examples/speculative_decoding/requirements.txt
+pip install "huggingface-hub>=1.2.1"  # quoted: a bare >= is a shell redirect into a file named "=1.2.1"
+export PATH="$PATH:/workspace/.local/bin"
+
+# Patch load_vlm_or_llm in installed modelopt to handle VLMs with text_config
+# (e.g. mistral3 / Ministral-3-8B) that are not caught by the "vl" name check.
+# For offline training, routes them through FakeBaseModel which handles VLM weight layouts.
+python3 << 'PYEOF' || true
+import site, os
+for d in site.getsitepackages():
+    path = os.path.join(d, 'modelopt', 'torch', 'speculative', 'utils.py')
+    if not os.path.exists(path):
+        continue
+    with open(path) as f:
+        c = f.read()
+    old = (
+        '    if "vl" in model_config.model_type.lower():\n'
+        '        model_cls = transformers.AutoModelForVision2Seq\n'
+        '    else:\n'
+        '        model_cls = transformers.AutoModelForCausalLM\n'
+    )
+    if old not in c:
+        print('load_vlm_or_llm: VLM patch already applied or pattern not found')
+        break
+    new = (
+        '    # Detect VLMs: "vl" in model_type OR has text_config/llm_config (e.g. mistral3)\n'
+        '    _is_vlm = "vl" in model_config.model_type.lower() or any(\n'
+        '        getattr(model_config, _a, None) is not None for _a in ["text_config", "llm_config"]\n'
+        '    )\n'
+        '    if _is_vlm and use_offline_training:\n'
+        '        from modelopt.torch.speculative.plugins.modeling_fakebase import FakeBaseModel\n'
+        '        return FakeBaseModel.from_source(model_name_or_path, trust_remote_code=trust_remote_code)\n'
+        '    if _is_vlm:\n'
+        '        model_cls = transformers.AutoModelForVision2Seq\n'
+        '    else:\n'
+        '        model_cls = transformers.AutoModelForCausalLM\n'
+    )
+    with open(path, 'w') as f:
+        f.write(c.replace(old, new))
+    print('Patched load_vlm_or_llm: added VLM text_config detection for offline training')
+    break
+PYEOF
+
+# Patch FakeBaseModel._load_weights in installed modelopt to fall back to
+# consolidated.safetensors when an HF shard file is missing.
+# Handles Ministral-3-8B-Instruct-2512-BF16 which is missing shard 1 but has
+# a complete consolidated.safetensors with Mistral native key names.
+python3 << 'PYEOF' || true
+import site, os
+for d in site.getsitepackages():
+    path = os.path.join(d, 'modelopt', 'torch', 'speculative', 'plugins', 'modeling_fakebase.py')
+    if not os.path.exists(path):
+        continue
+    with open(path) as f:
+        c = f.read()
+    old = (
+        '        lm_head_state = safetensors_load_file(lm_head_path, device="cpu")\n'
+        '        embed_tokens_state = safetensors_load_file(embed_tokens_path, device="cpu")\n'
+        '\n'
+        '        return lm_head_state[lm_head_key], embed_tokens_state[embed_tokens_key]\n'
+    )
+    if old not in c:
+        print('modeling_fakebase.py: consolidated fallback already applied or pattern not found')
+        break
+    new = (
+        '        def _load_with_consolidated_fallback(shard_path, key, role):\n'
+        '            try:\n'
+        '                return safetensors_load_file(shard_path, device="cpu")[key]\n'
+        '            except FileNotFoundError:\n'
+        '                _aliases = {"embed_tokens": ["tok_embeddings.weight"], "lm_head": ["output.weight"]}\n'
+        '                _consolidated = os.path.join(os.path.dirname(shard_path), "consolidated.safetensors")\n'
+        '                if os.path.isfile(_consolidated):\n'
+        '                    _state = safetensors_load_file(_consolidated, device="cpu")\n'
+        '                    for _alias in _aliases.get(role, []):\n'
+        '                        if _alias in _state:\n'
+        '                            return _state[_alias]\n'
+        '                raise\n'
+        '\n'
+        '        lm_head_w = _load_with_consolidated_fallback(lm_head_path, lm_head_key, "lm_head")\n'
+        '        embed_tokens_w = _load_with_consolidated_fallback(embed_tokens_path, embed_tokens_key, "embed_tokens")\n'
+        '\n'
+        '        return lm_head_w, embed_tokens_w\n'
+    )
+    with open(path, 'w') as f:
+        f.write(c.replace(old, new))
+    print('Patched FakeBaseModel._load_weights: added consolidated.safetensors fallback')
+    break
+PYEOF
+
+###################################################################################################
+
+set -eo pipefail
+
+# Parse old-style CLI args; translate to OmegaConf key=value for launch_train.sh.
+# This allows existing yamls (which use --offline-data, --lr, etc.) to keep working
+# as launch_train.sh migrated from per-flag CLI to --config yaml + dotlist overrides.
+OFFLINE_DATA=""
+DATA_PATH="None"
+MODE="eagle3"
+NUM_EPOCHS=1
+LR=""
+SAVE_STEPS=""
+OUTPUT_DIR="/scratchspace/eagle3"
+TRAIN_BS=""
+TRAINING_SEQ_LEN=""
+DISABLE_TQDM=""
+AR_VALIDATE_STEPS=""
+TRUST_REMOTE_CODE=false
+EXTRA_ARGS=()
+
+while [ $# -gt 0 ]; do  # consume the positional arguments one token at a time
+  case "$1" in
+    --offline-data)       shift; OFFLINE_DATA="$1" ;;  # value flags: the first shift drops the flag so $1 is its value
+    --data_path)          shift; DATA_PATH="$1" ;;  # the literal "None" is later treated as "no data path"
+    --mode)               shift; MODE="$1" ;;
+    --num_epochs)         shift; NUM_EPOCHS="$1" ;;
+    --lr)                 shift; LR="$1" ;;
+    --save_steps)         shift; SAVE_STEPS="$1" ;;
+    --output_dir)         shift; OUTPUT_DIR="$1" ;;
+    --train_bs)           shift; TRAIN_BS="$1" ;;
+    --training_seq_len)   shift; TRAINING_SEQ_LEN="$1" ;;
+    --eagle_config)       shift; ;;  # deprecated — its value is consumed and discarded
+    --disable_tqdm)       shift; DISABLE_TQDM="$1" ;;
+    --ar_validate_steps)  shift; AR_VALIDATE_STEPS="$1" ;;
+    --trust_remote_code)  TRUST_REMOTE_CODE=true ;;  # boolean switch: takes no value
+    *) EXTRA_ARGS+=("$1") ;;  # anything unrecognized is forwarded verbatim to launch_train.sh
+  esac
+  shift  # drop the token just handled (the value for value flags, the flag itself otherwise)
+done
+
+OVERRIDES=(  # OmegaConf key=value overrides that are always passed to launch_train.sh
+    "model.model_name_or_path=${HF_MODEL_CKPT:?HF_MODEL_CKPT must be set to the base model path}"  # fail fast instead of launching with an empty model path
+    "model.trust_remote_code=${TRUST_REMOTE_CODE}"
+    "training.output_dir=${OUTPUT_DIR}"
+    "training.mode=${MODE}"
+    "training.num_train_epochs=${NUM_EPOCHS}"
+)
+[ -n "$OFFLINE_DATA" ]      && OVERRIDES+=("data.offline_data_path=${OFFLINE_DATA}")  # optional flags: override only when a value was supplied
+[ -n "$LR" ]                && OVERRIDES+=("training.learning_rate=${LR}")
+[ -n "$SAVE_STEPS" ]        && OVERRIDES+=("training.save_steps=${SAVE_STEPS}")
+[ -n "$TRAIN_BS" ]          && OVERRIDES+=("training.per_device_train_batch_size=${TRAIN_BS}")
+[ -n "$TRAINING_SEQ_LEN" ]  && OVERRIDES+=("training.training_seq_len=${TRAINING_SEQ_LEN}")
+[ -n "$DISABLE_TQDM" ]      && OVERRIDES+=("training.disable_tqdm=${DISABLE_TQDM}")
+[ -n "$AR_VALIDATE_STEPS" ] && OVERRIDES+=("training.ar_validate_steps=${AR_VALIDATE_STEPS}")
+[ "$DATA_PATH" != "None" ] && [ -n "$DATA_PATH" ] && OVERRIDES+=("data.data_path=${DATA_PATH}")  # "None" (the default) or empty means no data.data_path override
+
+bash modules/Model-Optimizer/examples/speculative_decoding/launch_train.sh \
+    --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml \
+    "${OVERRIDES[@]}" \
+    "${EXTRA_ARGS[@]}"
+
+python modules/Model-Optimizer/examples/speculative_decoding/scripts/export_hf_checkpoint.py \
+    --model_path "${OUTPUT_DIR}" \
+    --export_path /scratchspace/export \
+    --trust_remote_code
+
+# Fix use_cache: null → true in exported config (newer huggingface_hub rejects None for bool fields)
+python3 -c "
+import json, pathlib
+p = pathlib.Path('/scratchspace/export/config.json')
+if p.exists():
+    c = json.loads(p.read_text())
+    if c.get('use_cache') is None:
+        c['use_cache'] = True
+        p.write_text(json.dumps(c, indent=4))
+        print('Fixed use_cache=null -> true in exported config.json')
+" || true
+
+###################################################################################################
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
new file mode 100644
index 00000000000..1931f14ac7b
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
@@ -0,0 +1,135 @@
+---
+# EAGLE3 quick-fail-check pipeline for Qwen3-8B.
+#
+# Dry-run on 100 samples to quickly check for possible failure points
+# across all 4 steps.
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+#
+# Note:
+#   All these tasks share the same workspace ${job_dir}/cicd/${experiment_id}. That is,
+#   /scratchspace is the best place if you want to pass the artifact around. You will
+#   see we use /scratchspace/... to specify input and output a lot.
+#
+#   You can comment out any task below to skip it. For example, if the data has been
+#   synthesized in ${job_dir}/cicd/${prev_experiment_id}/data, then you can skip
+#   task_0. However, you will need to change task_1's --input-data from /scratchspace/data
+#   to the absolute path above.
+#
+# Usage:
+#   uv run slurm.py --yaml services/pipeline/eagle3/quick_fail_check.yaml --yes
+
+job_name: Qwen3-8B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3-8B
+
+  # Step 1: Data synthesis via TRT-LLM server
+  # Args before "--" go to trtllm-serve; args after "--" go to tools/query.py.
+  task_0:
+    script: services/tensorrt-llm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tp_size 4
+      - --ep_size 4
+      - --max_num_tokens 32000
+      - --port 8000
+      - --host 0.0.0.0
+      - --trust_remote_code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 2: Dump hidden states from target model
+  task_1:
+    script: services/pipeline/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 4
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 3: Train EAGLE3 draft head (offline, single task)
+  task_2:
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 4: Benchmark speculative decoding (VLLM backend)
+  #
+  # To use TensorRT-LLM, make the following changes
+  #
+  # args:
+  #   - --engine TRTLLM
+  # environment:
+  #   TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch
+  # slurm_config:
+  #   ntasks_per_node: 4
+  #   container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+  #
+  # To use SGLang
+  #
+  # args:
+  #   - --engine SGLANG
+  # slurm_config:
+  #   ntasks_per_node: 1
+  #   container: lmsysorg/sglang:v0.5.9
+  #
+  task_3:
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --runtime_params services/specdec_bench/tensorrt_llm_runtiime_params.yaml
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
new file mode 100644
index 00000000000..cccd1ca0ac1
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
@@ -0,0 +1,120 @@
+---
+# This pipeline is a dry-run on 100 samples to quickly check for possible failure points
+# across all 4 steps.
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+#
+# Note:
+#   All these tasks share the same workspace ${job_dir}/cicd/${experiment_id}. That is,
+#   /scratchspace is the best place if you want to pass the artifact around. You will
+#   see we use /scratchspace/... to specify input and output a lot.
+#
+#   You can comment out any task below to skip it. For example, if the data has been
+#   synthesized in ${job_dir}/cicd/${prev_experiment_id}/data, then you can skip
+#   task_0. However, you will need to change task_1's --input-data from /scratchspace/data
+#   to the absolute path above.
+
+allow_to_fail: false
+skip: false
+note:
+
+global_vars:
+  #hf_model: &hf_model /hf-local/Qwen/Qwen3-8B
+  hf_model: &hf_model /hf-local/LocoreMind/LocoOperator-4B
+
+task_0:
+  script: services/tensorrt-llm/query.sh
+  # Args are split on "--": entries before "--" are passed to trtllm-serve,
+  # entries after "--" are passed to tools/query.py.
+  # --model is required and applies to both trtllm-serve and tools/query.py.
+  args:
+    - --model <<global_vars.hf_model>>  # required
+    - --tp_size 1
+    - --ep_size 1
+    - --max_num_tokens 32000
+    - --port 8000
+    - --host 0.0.0.0
+    - --trust_remote_code
+    - --                                # separator
+    - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default  # query.py args
+    - --save /scratchspace/data
+  environment:
+    - HF_LOCAL: /hf-local
+  slurm_config:
+    _factory_: "computelab_slurm_factory"
+    nodes: 1
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+task_1:
+  script: services/pipeline/eagle3/dump_offline_data.sh
+  args:
+    - --input-data /scratchspace/data
+    - --output-dir /scratchspace/offline_hidden_states
+    - --max-seq-len 8192
+    - --tp 1
+    - --moe-ep 1
+  environment:
+    - HF_MODEL_CKPT: *hf_model
+  slurm_config:
+    _factory_: "computelab_slurm_factory"
+    nodes: 1
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+task_2:
+  script: services/pipeline/eagle3/offline_training.sh
+  args:
+    - --offline-data /scratchspace/offline_hidden_states
+    - --data_path None
+    - --mode eagle3
+    - --num_epochs 1
+    - --lr 3e-4
+    - --save_steps 500000
+    - --output_dir /scratchspace/eagle3
+    - --train_bs 1
+    - --training_seq_len 1024
+    - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+    - --disable_tqdm True
+    - --ar_validate_steps 500000
+  environment:
+    - HF_MODEL_CKPT: *hf_model
+  slurm_config:
+    _factory_: "computelab_slurm_factory"
+    nodes: 1
+    ntasks_per_node: 1
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+#
+# This stage runs specdec bench; the default backend is VLLM. TensorRT-LLM can also be used,
+# but we generally see some issues with it.
+#
+# To use TensorRT-LLM, make the following changes
+#
+# args:
+#   - --engine TRTLLM
+# environment:
+#   TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch 
+# slurm_config:
+#   container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+#   
+task_3:
+  script: services/specdec_bench/quick_check.sh
+  args:
+    - --draft_model_dir /scratchspace/export  # export path written by offline_training.sh
+    - --draft_length 3
+    - --output_length 4096
+    - --engine VLLM
+    - --tp_size 1
+    - --ep_size 1
+    - --runtime_params services/specdec_bench/tensorrt_llm_runtiime_params.yaml  # NOTE(review): "runtiime" looks like a typo — confirm the file name on disk
+    - --speculative_algorithm EAGLE3
+    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+    - --concurrency 1
+  environment:
+    - HF_MODEL_CKPT: *hf_model
+  slurm_config:
+    _factory_: "computelab_slurm_factory"
+    nodes: 1
+    container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
new file mode 100644
index 00000000000..0ba7408afdb
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
@@ -0,0 +1,110 @@
+---
+# EAGLE3 quick-fail pipeline for deepseek-ai/DeepSeek-V3.2
+#
+# DeepSeek-V3.2 is a 685B MoE (37B active, MLA attention).
+# BF16 weights: ~1370 GB — requires 2 GB200 nodes (8 × 192 GB = 1536 GB).
+#
+# Special requirements:
+#   - Gated model — must be downloaded/mirrored before use
+#   - MLA attention — verify eagle_decoder_type compatibility
+#   - trust_remote_code required
+#
+# MoE-specific notes:
+#   - TP=4 per node, EP across nodes if needed
+#   - Draft head intermediate_size may need tuning in eagle_config.json
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: DeepSeek-V3.2_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/deepseek-ai/DeepSeek-V3.2
+
+  task_0:
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 8
+      - --trust-remote-code
+      - --distributed-executor-backend ray
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 4
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 2
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - TP: 4
+      - EP: 2
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
new file mode 100644
index 00000000000..ba428e219f2
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
@@ -0,0 +1,110 @@
+---
+# EAGLE3 quick-fail pipeline for zai-org/GLM-5
+#
+# GLM-5 is a 744B MoE (40B active, 256 experts, top-8). DeepSeek Sparse Attention.
+# BF16 weights: ~1488 GB — requires 2 GB200 nodes (8 × 192 GB = 1536 GB).
+#
+# Special requirements:
+#   - Gated model — must be downloaded/mirrored before use
+#   - trust_remote_code required
+#   - 200K context window supported but use 8192 for quick-fail
+#   - DeepSeek Sparse Attention — verify TRT-LLM support
+#
+# MoE-specific notes:
+#   - TP=4 per node, EP=2 across 2 nodes
+#   - Draft head intermediate_size may need tuning in eagle_config.json
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: GLM-5_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/zai-org/GLM-5
+
+  task_0:
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 4
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 2
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - TP: 4
+      - EP: 2
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 2
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
new file mode 100644
index 00000000000..22170b92cd7
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
@@ -0,0 +1,97 @@
+---
+# EAGLE3 quick-fail pipeline for openai/gpt-oss-20b
+#
+# GPT-OSS-20B is a 20B dense model.
+# BF16 weights: ~40 GB — fits easily on a single GB200 node (4 × 192 GB).
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: gpt-oss-20b_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/openai/gpt-oss-20b
+
+  task_0:
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+      - TIKTOKEN_RS_CACHE_DIR: /hf-local/tiktoken_cache
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
new file mode 100644
index 00000000000..896babe8cac
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
@@ -0,0 +1,149 @@
+---
+# EAGLE3 quick-fail pipeline for moonshotai/Kimi-K2.5
+#
+# Kimi-K2.5 is a 1T-parameter MoE VLM (32B active, 384 experts, 8 selected, MLA attention).
+# EAGLE3 uses only the text path — the vision encoder is not invoked.
+#
+# Hardware requirements:
+#   OCI-HSG uses GB200 nodes with 4 GPUs × 192 GB HBM3e = 768 GB per node.
+#   K2.5 BF16 weights are ~595 GB, so a single 4×GB200 node with TP=4 is sufficient.
+#   (For reference: H200 has 141 GB/GPU, so K2.5 would require 8× H200 instead.)
+#
+# References:
+#   https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+#   https://huggingface.co/moonshotai/Kimi-K2.5/blob/main/docs/deploy_guidance.md
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+#
+# Note:
+#   All tasks share /scratchspace. Comment out any task to skip it and update
+#   the next task's input path to point to a previous experiment's scratchspace.
+#
+# MoE-specific notes:
+#   - task_1: set --moe-ep 1 since TP=4 spans all 4 GPUs; increase --moe-ep and
+#     reduce --tp if you have more GPUs and want expert parallelism.
+#   - task_2: review modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+#     and consider increasing `intermediate_size` for MoE models (the draft head
+#     uses a dense layer by default, which may be undersized relative to K2.5's
+#     expert hidden dim of 2048).
+#   - task_2: --eagle_decoder_type may need adjustment if K2.5's MLA attention
+#     is not covered by the default llama decoder type.
+
+allow_to_fail: false
+skip: false
+note:
+
+global_vars:
+  hf_model: /hf-local/moonshotai/Kimi-K2.5
+
+task_0:
+  script: services/tensorrt-llm/query.sh
+  # Args are split on "--": entries before "--" are passed to trtllm-serve,
+  # entries after "--" are passed to tools/query.py.
+  # --model is required and applies to both trtllm-serve and tools/query.py.
+  #
+  # OCI-HSG GB200: 4 GPUs × 192 GB = 768 GB per node — fits K2.5 BF16 (~595 GB) with TP=4.
+  # --trust_remote_code is required for the Kimi tokenizer.
+  args:
+    - --model <<global_vars.hf_model>>  # required
+    - --tp_size 4
+    - --ep_size 1
+    - --max_num_tokens 32000
+    - --port 8000
+    - --host 0.0.0.0
+    - --trust_remote_code
+    - --                                # separator
+    - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default  # query.py args
+    - --save /scratchspace/data
+  environment:
+    - HF_LOCAL: /hf-local
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    ntasks_per_node: 4
+    gpus_per_node: 4
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+task_1:
+  script: services/pipeline/eagle3/dump_offline_data.sh
+  # TP=4 across all 4 GB200 GPUs; moe-ep=1 since TP already fills the node.
+  # Increase max-seq-len up to 131072 if your training data has long contexts.
+  args:
+    - --input-data /scratchspace/data
+    - --output-dir /scratchspace/offline_hidden_states
+    - --max-seq-len 8192
+    - --tp 4
+    - --moe-ep 1
+  environment:
+    - HF_MODEL_CKPT: <<global_vars.hf_model>>
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    ntasks_per_node: 4
+    gpus_per_node: 4
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+task_2:
+  script: services/pipeline/eagle3/offline_training.sh
+  # Draft head training. Runs on a single rank with Accelerate + FSDP.
+  # For K2.5 (MoE): check eagle_config.json and consider increasing
+  # intermediate_size since K2.5's per-expert hidden dim is 2048.
+  # Also verify --eagle_decoder_type is compatible with MLA attention.
+  args:
+    - --offline-data /scratchspace/offline_hidden_states
+    - --data_path None
+    - --mode eagle3
+    - --num_epochs 1
+    - --lr 3e-4
+    - --save_steps 500000
+    - --output_dir /scratchspace/eagle3
+    - --train_bs 8
+    - --training_seq_len 4096
+    - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+    - --disable_tqdm True
+    - --ar_validate_steps 500000
+  environment:
+    - HF_MODEL_CKPT: <<global_vars.hf_model>>
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    ntasks_per_node: 1
+    gpus_per_node: 4
+    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+#
+# Task 3: Speculative decoding benchmark using VLLM (recommended for K2.5).
+#
+# For K2.5 with VLLM, additional flags are recommended but are not yet
+# supported by services/specdec_bench/quick_check.sh:
+#   --mm-encoder-tp-mode data   (data-parallel vision encoder, improves throughput)
+#   --tool-call-parser kimi_k2  (required for tool-call responses)
+#   --reasoning-parser kimi_k2  (required for thinking-mode responses)
+# If you need these, extend quick_check.sh or use run.sh with a custom config.
+#
+task_3:
+  script: services/specdec_bench/quick_check.sh
+  args:
+    - --draft_model_dir /scratchspace/export
+    - --draft_length 3
+    - --output_length 4096
+    - --engine VLLM
+    - --tp_size 4
+    - --ep_size 1
+    - --speculative_algorithm EAGLE3
+    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+    - --concurrency 1
+  environment:
+    - HF_LOCAL: /hf-local
+    - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    - TP: 4
+    - EP: 1
+  slurm_config:
+    _factory_: "oci_hsg_slurm_factory"
+    nodes: 1
+    ntasks_per_node: 1
+    gpus_per_node: 4
+    container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
new file mode 100644
index 00000000000..8b9154321c3
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
@@ -0,0 +1,162 @@
+---
+# EAGLE3 quick-fail pipeline for nvidia/Kimi-K2.5-NVFP4
+#
+# nvidia/Kimi-K2.5-NVFP4 is the NVIDIA-quantized NVFP4 variant of moonshotai/Kimi-K2.5.
+# Same architecture: 1T-parameter MoE VLM (32B active, 384 experts, 8 selected, MLA attention).
+# Only the weights and activations of linear ops within MoE transformer blocks are quantized;
+# embeddings and other tensors remain in higher precision.
+# EAGLE3 uses only the text path — the vision encoder is not invoked.
+#
+# Hardware requirements:
+#   NVFP4 checkpoint is ~591 GB (vs ~595 GB BF16) — same single-node requirement.
+#   OCI-HSG uses GB200 (Blackwell) nodes with 4 GPUs × 192 GB HBM3e = 768 GB per node.
+#   NVFP4 inference requires Blackwell GPUs — OCI-HSG GB200 satisfies this.
+#   A single 4×GB200 node with TP=4 is sufficient.
+#
+# References:
+#   https://huggingface.co/nvidia/Kimi-K2.5-NVFP4
+#   https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+#
+# Note:
+#   All tasks share /scratchspace. Comment out any task to skip it and update
+#   the next task's input path to point to a previous experiment's scratchspace.
+#
+# Why different models for task_0 vs tasks 1–2:
+#   task_0 (data synthesis) uses services/vllm/query.sh + NVFP4 directly, since
+#   vLLM v0.15.0 supports KimiK25ForConditionalGeneration with NVFP4 on Blackwell.
+#   tasks 1–2 (hidden-state extraction and training) use the BF16 moonshotai/Kimi-K2.5
+#   checkpoint via TRT-LLM, which does not support KimiK25ForConditionalGeneration
+#   in v1.3.0. The draft head trained on BF16 hidden states transfers directly to
+#   NVFP4 — NVFP4 is near-lossless, so hidden state distributions are equivalent.
+#
+# MoE-specific notes:
+#   - task_1: set --moe-ep 1 since TP=4 spans all 4 GPUs; increase --moe-ep and
+#     reduce --tp if you have more GPUs and want expert parallelism.
+#   - task_2: review modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+#     and consider increasing `intermediate_size` for MoE models (the draft head
+#     uses a dense layer by default, which may be undersized relative to K2.5's
+#     expert hidden dim of 2048).
+#   - task_2: --eagle_decoder_type may need adjustment if K2.5's MLA attention
+#     is not covered by the default llama decoder type.
+
+skip: false
+allow_to_fail: false
+note: null
+
+global_vars:  # referenced by the tasks as <<global_vars.NAME>>
+  hf_model: '/hf-local/nvidia/Kimi-K2.5-NVFP4'
+
+task_0:
+  script: 'services/vllm/query.sh'
+  # The arg list is cut in two at the bare "--" entry: everything ahead of it
+  # goes to vllm serve, everything behind it goes to tools/query.py.
+  # --model is mandatory and is consumed by both vllm serve and tools/query.py.
+  #
+  # The NVFP4 checkpoint is served directly by vLLM v0.15.0 (KimiK25ForConditionalGeneration support).
+  # One vLLM process per node (ntasks_per_node: 1); --tensor-parallel-size spreads it over the GPUs.
+  # The Kimi tokenizer needs --trust-remote-code.
+  args:
+    - '--model <<global_vars.hf_model>>'  # mandatory
+    - '--tensor-parallel-size 4'
+    - '--max-num-seqs 32'
+    - '--trust-remote-code'
+    - '--'                                # vllm serve args above, query.py args below
+    - '--data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default'
+    - '--save /scratchspace/data'
+  environment:
+    - HF_LOCAL: '/hf-local'
+  slurm_config:
+    _factory_: 'oci_hsg_slurm_factory'
+    container: 'vllm/vllm-openai:v0.15.0'
+    gpus_per_node: 4
+    ntasks_per_node: 1
+    nodes: 1
+
+task_1:
+  script: 'services/pipeline/eagle3/dump_offline_data.sh'
+  # TP=4 fills all 4 GB200 GPUs, hence --moe-ep 1. --max-seq-len can go up to 131072 for long contexts.
+  # Unlike task_0 (NVFP4 via vLLM), this task loads BF16 moonshotai/Kimi-K2.5; see
+  # "Why different models for task_0 vs tasks 1–2" in the file header.
+  args:
+    - '--input-data /scratchspace/data'
+    - '--output-dir /scratchspace/offline_hidden_states'
+    - '--max-seq-len 8192'
+    - '--tp 4'
+    - '--moe-ep 1'
+  environment:
+    - HF_MODEL_CKPT: '/hf-local/moonshotai/Kimi-K2.5'
+  slurm_config:
+    _factory_: 'oci_hsg_slurm_factory'
+    container: 'nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2'
+    gpus_per_node: 4
+    ntasks_per_node: 4
+    nodes: 1
+
+task_2:
+  script: 'services/pipeline/eagle3/offline_training.sh'
+  # Trains the draft head on a single rank using Accelerate + FSDP.
+  # HF_MODEL_CKPT is the BF16 checkpoint (tokenizer/config loading); the hidden
+  # states from task_1 are BF16 too. The head is paired with NVFP4 at benchmark time.
+  # K2.5 is MoE: inspect eagle_config.json and consider a larger
+  # intermediate_size, since K2.5's per-expert hidden dim is 2048.
+  # Also confirm --eagle_decoder_type works with MLA attention.
+  args:
+    - '--offline-data /scratchspace/offline_hidden_states'
+    - '--data_path None'
+    - '--mode eagle3'
+    - '--num_epochs 1'
+    - '--lr 3e-4'
+    - '--save_steps 500000'
+    - '--output_dir /scratchspace/eagle3'
+    - '--train_bs 8'
+    - '--training_seq_len 4096'
+    - '--eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json'
+    - '--disable_tqdm True'
+    - '--ar_validate_steps 500000'
+  environment:
+    - HF_MODEL_CKPT: '/hf-local/moonshotai/Kimi-K2.5'
+  slurm_config:
+    _factory_: 'oci_hsg_slurm_factory'
+    container: 'nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2'
+    gpus_per_node: 4
+    ntasks_per_node: 1
+    nodes: 1
+
+#
+# Task 3: Speculative decoding benchmark using VLLM with NVFP4.
+#
+# vllm/vllm-openai:v0.15.0 is required for NVFP4 support on Blackwell GPUs.
+# Additional flags required for full K2.5 functionality but not yet supported
+# by services/specdec_bench/quick_check.sh:
+#   --tool-call-parser kimi_k2   (required for tool-call responses)
+#   --reasoning-parser kimi_k2   (required for thinking-mode responses)
+# If you need these, extend quick_check.sh or use run.sh with a custom config.
+#
+task_3:  # speculative-decoding benchmark against the NVFP4 checkpoint in global_vars.hf_model
+  script: 'services/specdec_bench/quick_check.sh'
+  args:
+    - '--draft_model_dir /scratchspace/export'
+    - '--draft_length 3'
+    - '--output_length 4096'
+    - '--engine VLLM'
+    - '--tp_size 4'
+    - '--ep_size 1'
+    - '--speculative_algorithm EAGLE3'
+    - '--mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl'
+    - '--concurrency 1'
+  environment:
+    - HF_LOCAL: '/hf-local'
+    - HF_MODEL_CKPT: '<<global_vars.hf_model>>'
+    - TP: 4
+    - EP: 1
+  slurm_config:
+    _factory_: 'oci_hsg_slurm_factory'
+    container: 'vllm/vllm-openai:v0.15.0'
+    gpus_per_node: 4
+    ntasks_per_node: 1
+    nodes: 1
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
new file mode 100644
index 00000000000..3557d06f1aa
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
@@ -0,0 +1,105 @@
+---
+# EAGLE3 quick-fail pipeline for MiniMaxAI/MiniMax-M2.5
+#
+# MiniMax-M2.5 is a 230B MoE (10B active, 256 experts, top-8).
+# BF16 weights: ~460 GB — fits on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# Special requirements:
+#   - trust_remote_code is required (custom model code, model type: minimax_m2)
+#
+# MoE-specific notes:
+#   - task_2: check eagle_config.json and consider increasing intermediate_size
+#     since the draft head uses a dense layer by default.
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: 'MiniMax-M2.5_EAGLE3_quick_fail_check'
+pipeline:
+  skip: false
+  allow_to_fail: true
+  note: null
+
+  global_vars:  # referenced by the tasks as <<global_vars.NAME>>
+    hf_model: '/hf-local/MiniMaxAI/MiniMax-M2.5'
+
+  task_0:  # query the model through vLLM; results are saved to /scratchspace/data
+    script: 'services/vllm/query.sh'
+    args:
+      - '--model <<global_vars.hf_model>>'
+      - '--tensor-parallel-size 4'
+      - '--trust-remote-code'
+      - '--'
+      - '--data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default'
+      - '--save /scratchspace/data'
+    environment:
+      - HF_LOCAL: '/hf-local'
+    slurm_config:
+      _factory_: 'oci_hsg_slurm_factory'
+      container: 'vllm/vllm-openai:latest'
+      gpus_per_node: 4
+      ntasks_per_node: 1
+      nodes: 1
+
+  task_1:  # dump hidden states from /scratchspace/data into /scratchspace/offline_hidden_states
+    script: 'services/pipeline/eagle3/dump_offline_data_vllm.sh'
+    args:
+      - '--input-data /scratchspace/data'
+      - '--output-dir /scratchspace/offline_hidden_states'
+      - '--max-seq-len 8192'
+    environment:
+      - HF_MODEL_CKPT: '<<global_vars.hf_model>>'
+    slurm_config:
+      _factory_: 'oci_hsg_slurm_factory'
+      container: 'vllm/vllm-openai:latest'
+      gpus_per_node: 4
+      ntasks_per_node: 1
+      nodes: 1
+
+  task_2:  # EAGLE3 draft-head training on the dumped hidden states
+    script: 'services/pipeline/eagle3/offline_training.sh'
+    args:
+      - '--offline-data /scratchspace/offline_hidden_states'
+      - '--data_path None'
+      - '--mode eagle3'
+      - '--num_epochs 1'
+      - '--lr 3e-4'
+      - '--save_steps 500000'
+      - '--output_dir /scratchspace/eagle3'
+      - '--train_bs 8'
+      - '--training_seq_len 4096'
+      - '--eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json'
+      - '--disable_tqdm True'
+      - '--ar_validate_steps 500000'
+    environment:
+      - HF_MODEL_CKPT: '<<global_vars.hf_model>>'
+    slurm_config:
+      _factory_: 'oci_hsg_slurm_factory'
+      container: 'nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10'
+      gpus_per_node: 4
+      ntasks_per_node: 1
+      nodes: 1
+
+  task_3:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: 'services/specdec_bench/quick_check.sh'
+    args:
+      - '--draft_model_dir /scratchspace/export'
+      - '--draft_length 3'
+      - '--output_length 4096'
+      - '--engine VLLM'
+      - '--tp_size 4'
+      - '--ep_size 1'
+      - '--speculative_algorithm EAGLE3'
+      - '--mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl'
+      - '--concurrency 1'
+    environment:
+      - HF_LOCAL: '/hf-local'
+      - HF_MODEL_CKPT: '<<global_vars.hf_model>>'
+    slurm_config:
+      _factory_: 'oci_hsg_slurm_factory'
+      container: 'vllm/vllm-openai:latest'
+      gpus_per_node: 4
+      ntasks_per_node: 1
+      nodes: 1
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
new file mode 100644
index 00000000000..5b3b76c4c4f
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
@@ -0,0 +1,103 @@
+---
+# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-14B-Instruct-2512-BF16
+#
+# Ministral-3-14B is a 14B dense model (model type: mistral3, multimodal but
+# EAGLE3 uses text-only path).
+# BF16 weights: ~28 GB — fits easily on a single GB200 node.
+#
+# Special requirements:
+#   - trust_remote_code may be needed for tokenizer
+#   - Model type is mistral3 (Mistral3ForConditionalGeneration)
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Ministral-3-14B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/mistralai/Ministral-3-14B-Instruct-2512-BF16
+
+  task_0:  # query the model through vLLM; results are saved to /scratchspace/data
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:  # dump hidden states from /scratchspace/data into /scratchspace/offline_hidden_states
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:  # EAGLE3 draft-head training on the dumped hidden states
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # set explicitly, consistent with task_0/task_1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; do not rely on the factory default
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
new file mode 100644
index 00000000000..f180d716e4c
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
@@ -0,0 +1,80 @@
+---
+# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
+# Starts from the hidden-state dump (task_1 of the full pipeline, task_0 here),
+# reusing query data from cicd_1779208014. NOTE(review): --input-data below is a
+# user-specific /lustre path; point it at your own run's data directory.
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Ministral-3-8B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
+
+  task_0:  # dump hidden states from the reused query data into /scratchspace/offline_hidden_states
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779208014/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:  # EAGLE3 draft-head training on the dumped hidden states
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # set explicitly, consistent with task_0
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_2:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; do not rely on the factory default
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
new file mode 100644
index 00000000000..8a75cbcd10e
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
@@ -0,0 +1,60 @@
+---
+# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
+# Starts from training (task_2 of the full pipeline, task_0 here), reusing hidden
+# states from cicd_1779312692. NOTE(review): --offline-data below is a
+# user-specific /lustre path; point it at your own run's hidden-state directory.
+
+job_name: Ministral-3-8B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
+
+  task_0:  # EAGLE3 draft-head training on previously dumped hidden states
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779312692/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+      - --trust_remote_code  # NOTE(review): sibling Ministral pipelines pass --eagle_config and omit this flag; confirm intended
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # set explicitly, consistent with the other Ministral pipelines' GPU tasks
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_1:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; do not rely on the factory default
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
new file mode 100644
index 00000000000..43ab18b0b5b
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
@@ -0,0 +1,81 @@
+---
+# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
+# Uses HF-based hidden state dump (more stable than speculators/vLLM internals).
+# Reuses task_0 (query) data from the earlier run cicd_1779208014.
+# NOTE(review): --input-data below is a user-specific /lustre path; point it at
+# your own run's data directory.
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Ministral-3-8B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
+
+  task_0:  # HF-based hidden-state dump into /scratchspace/offline_hidden_states
+    script: services/pipeline/eagle3/dump_offline_data_hf.sh
+    args:
+      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779208014/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_1:  # EAGLE3 draft-head training on the dumped hidden states
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # set explicitly, consistent with task_0
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_2:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; do not rely on the factory default
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
new file mode 100644
index 00000000000..f25b63b2351
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
@@ -0,0 +1,105 @@
+---
+# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
+#
+# Ministral-3-8B is an 8B dense model (model type: mistral3, multimodal but
+# EAGLE3 uses text-only path).
+# BF16 weights: ~16 GB — fits easily on a single GB200 node.
+#
+# Special requirements:
+#   - trust_remote_code may be needed for tokenizer
+#   - Model type is mistral3 (Mistral3ForConditionalGeneration)
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Ministral-3-8B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
+
+  task_0:  # query the model through vLLM; results are saved to /scratchspace/data
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+      - --num-shards 10000  # NOTE(review): presumably keeps the run small together with --shard-id-step; confirm in tools/query.py
+      - --shard-id-step 10001
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:  # dump hidden states from /scratchspace/data into /scratchspace/offline_hidden_states
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:  # EAGLE3 draft-head training on the dumped hidden states
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # set explicitly, consistent with task_0/task_1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; do not rely on the factory default
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
new file mode 100644
index 00000000000..f0c079fd452
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
@@ -0,0 +1,97 @@
+---
+# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-27B
+#
+# Qwen3.5-27B is a 27B dense VLM (text + vision). EAGLE3 uses only the text path.
+# BF16 weights: ~54 GB — fits on a single GB200 node (4 × 192 GB).
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Qwen3.5-27B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-27B
+
+  task_0:  # query the model through vLLM; results are saved to /scratchspace/data
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:  # dump hidden states from /scratchspace/data into /scratchspace/offline_hidden_states
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:  # EAGLE3 draft-head training on the dumped hidden states
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 4
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # set explicitly, consistent with task_0/task_1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; do not rely on the factory default
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
new file mode 100644
index 00000000000..dce06207cfc
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
@@ -0,0 +1,103 @@
+---
+# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-35B-A3B
+#
+# Qwen3.5-35B-A3B is a 35B MoE VLM (3B active, 256 experts, top-8, shared expert).
+# EAGLE3 uses only the text path.
+# BF16 weights: ~70 GB — fits on a single GB200 node (4 × 192 GB).
+#
+# MoE-specific notes:
+#   - task_2: check eagle_config.json and consider increasing intermediate_size
+#     since the draft head uses a dense layer by default.
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Qwen3.5-35B-A3B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-35B-A3B
+
+  task_0:  # query the model through vLLM; results are saved to /scratchspace/data
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:  # dump hidden states from /scratchspace/data into /scratchspace/offline_hidden_states
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:  # EAGLE3 draft-head training on the dumped hidden states
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # set explicitly, consistent with task_0/task_1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:  # speculative-decoding benchmark (VLLM engine, MT-Bench prompts)
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; do not rely on the factory default
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
new file mode 100644
index 00000000000..b1c42aa566b
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
@@ -0,0 +1,101 @@
+---
+# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-9B
+#
+# Qwen3.5-9B is a 9B dense VLM (text + vision). EAGLE3 uses only the text path.
+# BF16 weights: ~18 GB — fits easily on a single GB200 node (4 × 192 GB).
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Qwen3.5-9B_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/Qwen/Qwen3.5-9B
+
+  # Step 1/4: query the base model through vLLM (TP=4); results are saved to /scratchspace/data.
+  task_0:
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 2/4: dump hidden states for the task_0 data into /scratchspace/offline_hidden_states.
+  task_1:
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  # Step 3/4: train the EAGLE3 draft head from the dumped hidden states (1 epoch).
+  task_2:
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # explicit GPU request, matching task_0/task_1
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  # Step 4/4: benchmark EAGLE3 speculative decoding (vLLM engine, TP=4) on the MT-Bench prompts.
+  task_3:
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes /scratchspace/eagle3; presumably the training script exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4  # --tp_size 4 above needs 4 GPUs; request them explicitly
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
new file mode 100644
index 00000000000..85ea02bbb13
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
@@ -0,0 +1,107 @@
+---
+# EAGLE3 quick-fail pipeline for stepfun-ai/Step-3.5-Flash
+#
+# Step-3.5-Flash is a 197B MoE (11B active, 288 experts + 1 shared, top-8).
+# Has built-in MTP (multi-token prediction) head and sliding window attention.
+# BF16 weights: ~394 GB — fits on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# Special requirements:
+#   - trust_remote_code required (NOTE(review): only task_0 passes --trust-remote-code; confirm task_1..task_3 scripts handle it)
+#   - 256K context window is supported, but 8192 is used for this quick-fail run
+#   - Sliding window attention (SWA) — verify engine support (TRT-LLM; note task_3 below benchmarks with --engine VLLM)
+#
+# MoE-specific notes:
+#   - Draft head intermediate_size may need tuning in eagle_config.json
+#
+# Container mount:
+#   /lustre:/lustre
+#   ${job_dir}/cicd/${experiment_id}:/scratchspace
+#   ${local_huggingface_hub}:/hf-local
+
+job_name: Step-3.5-Flash_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: true
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/stepfun-ai/Step-3.5-Flash
+
+  task_0:  # Step 1/4: query the base model through vLLM (TP=4); results go to /scratchspace/data
+    script: services/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_1:  # Step 2/4: dump hidden states for the task_0 data
+    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
+
+  task_2:  # Step 3/4: train the EAGLE3 draft head from the dumped hidden states (1 epoch)
+    script: services/pipeline/eagle3/offline_training.sh
+    args:
+      - --offline-data /scratchspace/offline_hidden_states
+      - --data_path None
+      - --mode eagle3
+      - --num_epochs 1
+      - --lr 3e-4
+      - --save_steps 500000  # presumably large enough that no intermediate checkpoint is written — TODO confirm
+      - --output_dir /scratchspace/eagle3
+      - --train_bs 8
+      - --training_seq_len 4096
+      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
+      - --disable_tqdm True
+      - --ar_validate_steps 500000
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
+
+  task_3:  # Step 4/4: benchmark EAGLE3 speculative decoding (vLLM engine, TP=4) on the MT-Bench prompts
+    script: services/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export  # NOTE(review): task_2 writes /scratchspace/eagle3; presumably the training script exports here — confirm
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
new file mode 100644
index 00000000000..572d27ac502
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
@@ -0,0 +1,20 @@
+---
+# Standalone task_1: HF hidden state dump for MiniMax-M2.5
+# Uses partial data from cicd_1775624587 task_0
+
+job_name: MiniMax-M2.5_task1_hf_dump
+pipeline:
+  task_0:  # the only task in this file; it runs what the header calls "task_1" (hidden-state dump)
+    script: services/pipeline/eagle3/dump_offline_data_hf.sh
+    args:
+      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1775624587/data  # NOTE(review): user-specific absolute path from a prior run; not portable — replace for other environments
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: /hf-local/MiniMaxAI/MiniMax-M2.5  # base-model checkpoint path handed to the dump script via the environment
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
new file mode 100644
index 00000000000..89f866f3f42
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
@@ -0,0 +1,20 @@
+---
+# Standalone task_1: HF hidden state dump for Ministral-3-8B
+# Uses partial data from cicd_1779208014 task_0 (330 samples)
+
+job_name: Ministral-3-8B_task1_hf_dump
+pipeline:
+  task_0:  # the only task in this file; it runs what the header calls "task_1" (hidden-state dump)
+    script: services/pipeline/eagle3/dump_offline_data_hf.sh
+    args:
+      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779208014/data  # NOTE(review): user-specific absolute path from a prior run; not portable — replace for other environments
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16  # base-model checkpoint path handed to the dump script via the environment
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
new file mode 100644
index 00000000000..7871ae9eda0
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
@@ -0,0 +1,20 @@
+---
+# Standalone task_1: HF hidden state dump for Qwen3.5-35B-A3B
+# Uses partial data from cicd_1775625199 task_0
+
+job_name: Qwen3.5-35B-A3B_task1_hf_dump
+pipeline:
+  task_0:  # the only task in this file; it runs what the header calls "task_1" (hidden-state dump)
+    script: services/pipeline/eagle3/dump_offline_data_hf.sh
+    args:
+      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1775625199/data  # NOTE(review): user-specific absolute path from a prior run; not portable — replace for other environments
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: /hf-local/Qwen/Qwen3.5-35B-A3B  # base-model checkpoint path handed to the dump script via the environment
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
new file mode 100644
index 00000000000..c22fc2773d4
--- /dev/null
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
@@ -0,0 +1,20 @@
+---
+# Standalone task_1: HF hidden state dump for Step-3.5-Flash
+# Uses partial data from cicd_1775624696 task_0
+
+job_name: Step-3.5-Flash_task1_hf_dump
+pipeline:
+  task_0:  # the only task in this file; it runs what the header calls "task_1" (hidden-state dump)
+    script: services/pipeline/eagle3/dump_offline_data_hf.sh
+    args:
+      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1775624696/data  # NOTE(review): user-specific absolute path from a prior run; not portable — replace for other environments
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+    environment:
+      - HF_MODEL_CKPT: /hf-local/stepfun-ai/Step-3.5-Flash  # base-model checkpoint path handed to the dump script via the environment
+    slurm_config:
+      _factory_: "oci_hsg_slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10

From 4e211f76491e47a68c8fd80526d2af8aa6b83516 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Wed, 27 May 2026 11:55:13 -0700
Subject: [PATCH 18/22] Add trust_remote_code to Ministral-3-8B EAGLE3 training
 config

Ministral-3-8B (model_type=mistral3) needs trust_remote_code for
tokenizer loading during EAGLE3 training.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml b/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
index a803b57afa1..0aabf9dfe3d 100644
--- a/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/Mistral/Ministral-3-8B/hf_offline_eagle3.yaml
@@ -70,6 +70,7 @@ pipeline:
     args:
       - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
       - model.model_name_or_path=<<global_vars.hf_model>>
+      - model.trust_remote_code=true
       - data.offline_data_path=/scratchspace/offline_hidden_states
       - training.output_dir=/scratchspace/eagle3
       - training.training_seq_len=4096

From abe7cb14ffb1d2d8fe5d1414dd8d6e97db67c064 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 2 Jun 2026 11:30:40 -0700
Subject: [PATCH 19/22] Address review: use _LM_HEAD_PATHS/_EMBED_TOKENS_PATHS
 for Mistral support

- Add Mistral key names (tok_embeddings, output) to existing path lists
  instead of hardcoding aliases in a consolidated.safetensors fallback
- Extend _load_index to try consolidated.safetensors as a single-file
  fallback (Mistral native checkpoints use this instead of model.safetensors)
- Remove leading --- from YAML pipeline configs (pre-commit fix)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../pipeline/eagle3/offline.yaml              |  5 +-
 .../pipeline/eagle3/quick_fail_check.yaml     |  1 -
 .../eagle3/quick_fail_check_computelab.yaml   |  3 +-
 .../quick_fail_check_deepseek-v3.2.yaml       |  1 -
 .../eagle3/quick_fail_check_glm-5.yaml        |  1 -
 .../eagle3/quick_fail_check_gpt-oss-20b.yaml  |  1 -
 .../eagle3/quick_fail_check_kimi_k2.5.yaml    |  1 -
 .../quick_fail_check_kimi_k2.5_nvfp4.yaml     |  1 -
 .../eagle3/quick_fail_check_minimax-m2.5.yaml |  1 -
 .../quick_fail_check_ministral-3-14b.yaml     |  1 -
 ..._fail_check_ministral-3-8b-from-task1.yaml |  1 -
 ..._fail_check_ministral-3-8b-from-task2.yaml |  1 -
 ...ick_fail_check_ministral-3-8b-hf-dump.yaml |  1 -
 .../quick_fail_check_ministral-3-8b.yaml      |  1 -
 .../eagle3/quick_fail_check_qwen3.5-27b.yaml  |  1 -
 .../quick_fail_check_qwen3.5-35b-a3b.yaml     |  1 -
 .../eagle3/quick_fail_check_qwen3.5-9b.yaml   |  1 -
 .../quick_fail_check_step-3.5-flash.yaml      |  1 -
 .../pipeline/eagle3/task1_minimax-m2.5.yaml   |  1 -
 .../pipeline/eagle3/task1_ministral-3-8b.yaml |  1 -
 .../eagle3/task1_qwen3.5-35b-a3b.yaml         |  1 -
 .../pipeline/eagle3/task1_step-3.5-flash.yaml |  1 -
 .../speculative/plugins/modeling_fakebase.py  | 46 ++++++++-----------
 23 files changed, 22 insertions(+), 52 deletions(-)

diff --git a/examples/speculative_decoding/pipeline/eagle3/offline.yaml b/examples/speculative_decoding/pipeline/eagle3/offline.yaml
index 0d1c3ae4af9..9a6f0158184 100644
--- a/examples/speculative_decoding/pipeline/eagle3/offline.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/offline.yaml
@@ -1,4 +1,3 @@
----
 # This pipeline is a dry-run on 100 samples to quickly check the possible failure points
 # over the all 4-step.
 #
@@ -116,8 +115,8 @@ task_3:
     - --draft_length 3
     - --output_length 4096
     - --engine VLLM
-    - --tp_size 4 
-    - --ep_size 1 
+    - --tp_size 4
+    - --ep_size 1
   environment:
     - HF_MODEL_CKPT: <<global_vars.hf_model>>
     - HF_DRAFT_MODEL_CKPT: /scratchspace/export
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
index 1931f14ac7b..41ca8b52c3b 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail-check pipeline for Qwen3-8B.
 #
 # Dry-run on 100 samples to quickly check the possible failure points
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
index cccd1ca0ac1..1cb2ced32e7 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
@@ -1,4 +1,3 @@
----
 # This pipeline is a dry-run on 100 samples to quickly check the possible failure points
 # over the all 4-step.
 #
@@ -106,7 +105,7 @@ task_3:
     - --draft_length 3
     - --output_length 4096
     - --engine VLLM
-    - --tp_size 1 
+    - --tp_size 1
     - --ep_size 1
     - --runtime_params services/specdec_bench/tensorrt_llm_runtiime_params.yaml
     - --speculative_algorithm EAGLE3
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
index 0ba7408afdb..c50d71ab3ef 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for deepseek-ai/DeepSeek-V3.2
 #
 # DeepSeek-V3.2 is a 685B MoE (37B active, MLA attention).
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
index ba428e219f2..f04f3022ff3 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for zai-org/GLM-5
 #
 # GLM-5 is a 744B MoE (40B active, 256 experts, top-8). DeepSeek Sparse Attention.
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
index 22170b92cd7..061171f42ca 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for openai/gpt-oss-20b
 #
 # GPT-OSS-20B is a 20B dense model.
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
index 896babe8cac..1079cf7ac83 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for moonshotai/Kimi-K2.5
 #
 # Kimi-K2.5 is a 1T-parameter MoE VLM (32B active, 384 experts, 8 selected, MLA attention).
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
index 8b9154321c3..52b9eaa3b9f 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for nvidia/Kimi-K2.5-NVFP4
 #
 # nvidia/Kimi-K2.5-NVFP4 is the NVIDIA-quantized NVFP4 variant of moonshotai/Kimi-K2.5.
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
index 3557d06f1aa..7c8b42d7ec4 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for MiniMaxAI/MiniMax-M2.5
 #
 # MiniMax-M2.5 is a 230B MoE (10B active, 256 experts, top-8).
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
index 5b3b76c4c4f..5cdafdf47b1 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for mistralai/Ministral-3-14B-Instruct-2512-BF16
 #
 # Ministral-3-14B is a 14B dense model (model type: mistral3, multimodal but
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
index f180d716e4c..870e563d539 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
 # Starts from task_1 (hidden state dump), reusing data from cicd_1779208014.
 #
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
index 8a75cbcd10e..071b3e4bebb 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
 # Starts from training, reusing hidden states from cicd_1779312692.
 
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
index 43ab18b0b5b..05d8d9b7c45 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
 # Uses HF-based hidden state dump (more stable than speculators/vLLM internals).
 # Reuses task_0 data from cicd_1779208014.
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
index f25b63b2351..00f4bb1ffe3 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
 #
 # Ministral-3-8B is an 8B dense model (model type: mistral3, multimodal but
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
index f0c079fd452..79f6f1dec00 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-27B
 #
 # Qwen3.5-27B is a 27B dense VLM (text + vision). EAGLE3 uses only the text path.
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
index dce06207cfc..5806acaf420 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-35B-A3B
 #
 # Qwen3.5-35B-A3B is a 35B MoE VLM (3B active, 256 experts, top-8, shared expert).
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
index b1c42aa566b..c2d374b2208 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-9B
 #
 # Qwen3.5-9B is a 9B dense VLM (text + vision). EAGLE3 uses only the text path.
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
index 85ea02bbb13..8c2fa397821 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
@@ -1,4 +1,3 @@
----
 # EAGLE3 quick-fail pipeline for stepfun-ai/Step-3.5-Flash
 #
 # Step-3.5-Flash is a 197B MoE (11B active, 288 experts + 1 shared, top-8).
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
index 572d27ac502..7e33e3cf967 100644
--- a/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
@@ -1,4 +1,3 @@
----
 # Standalone task_1: HF hidden state dump for MiniMax-M2.5
 # Uses partial data from cicd_1775624587 task_0
 
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
index 89f866f3f42..6e0e9cd3486 100644
--- a/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
@@ -1,4 +1,3 @@
----
 # Standalone task_1: HF hidden state dump for Ministral-3-8B
 # Uses partial data from cicd_1779208014 task_0 (330 samples)
 
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
index 7871ae9eda0..7f056aa2e90 100644
--- a/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
@@ -1,4 +1,3 @@
----
 # Standalone task_1: HF hidden state dump for Qwen3.5-35B-A3B
 # Uses partial data from cicd_1775625199 task_0
 
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
index c22fc2773d4..f5eb78ceda4 100644
--- a/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
+++ b/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
@@ -1,4 +1,3 @@
----
 # Standalone task_1: HF hidden state dump for Step-3.5-Flash
 # Uses partial data from cicd_1775624696 task_0
 
diff --git a/modelopt/torch/speculative/plugins/modeling_fakebase.py b/modelopt/torch/speculative/plugins/modeling_fakebase.py
index be8d8a63104..40016844e12 100644
--- a/modelopt/torch/speculative/plugins/modeling_fakebase.py
+++ b/modelopt/torch/speculative/plugins/modeling_fakebase.py
@@ -40,8 +40,13 @@
     "backbone.embeddings",
     "language_model.backbone.embeddings",
     "model.language_model.embed_tokens",
+    "tok_embeddings",  # Mistral native checkpoints (consolidated.safetensors)
+]
+_LM_HEAD_PATHS = [
+    "lm_head",
+    "language_model.lm_head",
+    "output",  # Mistral native checkpoints (consolidated.safetensors)
 ]
-_LM_HEAD_PATHS = ["lm_head", "language_model.lm_head"]
 _BASE_MODEL_PATHS = [
     "language_model.model",
     "model.language_model",
@@ -51,7 +56,9 @@
 ]
 _VLM_CONFIG_ATTRS = ["text_config", "llm_config"]
 _SAFETENSORS_INDEX_FILENAME = "model.safetensors.index.json"
-_SAFETENSORS_SINGLE_FILENAME = "model.safetensors"
+# Single-file safetensors names to try, in order.  Mistral native checkpoints
+# use ``consolidated.safetensors`` instead of the HF-standard ``model.safetensors``.
+_SAFETENSORS_SINGLE_FILENAMES = ["model.safetensors", "consolidated.safetensors"]
 
 
 class FakeBaseConfig(PretrainedConfig):
@@ -182,11 +189,12 @@ def _try_fetch(name: str) -> str | None:
         if (index_path := _try_fetch(_SAFETENSORS_INDEX_FILENAME)) is not None:
             with open(index_path) as f:
                 return json.load(f).get("weight_map", {})
-        if (single_path := _try_fetch(_SAFETENSORS_SINGLE_FILENAME)) is not None:
-            with safe_open(single_path, framework="pt") as h:
-                return dict.fromkeys(h.keys(), _SAFETENSORS_SINGLE_FILENAME)
+        for single_name in _SAFETENSORS_SINGLE_FILENAMES:
+            if (single_path := _try_fetch(single_name)) is not None:
+                with safe_open(single_path, framework="pt") as h:
+                    return dict.fromkeys(h.keys(), single_name)
         raise FileNotFoundError(
-            f"No {_SAFETENSORS_INDEX_FILENAME} or {_SAFETENSORS_SINGLE_FILENAME} found at "
+            f"No {_SAFETENSORS_INDEX_FILENAME} or {_SAFETENSORS_SINGLE_FILENAMES} found at "
             f"{source!r}. FakeBaseModel only supports safetensors checkpoints; "
             "pytorch_model.bin is not supported."
         )
@@ -219,28 +227,12 @@ def _load_weights(self, source: str):
             source, [weight_map[lm_head_key], weight_map[embed_tokens_key]]
         )
 
-        def _read(path: str, key: str, role: str = "") -> torch.Tensor:
-            """Pull a single tensor; falls back to consolidated.safetensors for Mistral."""
-            try:
-                with safe_open(path, framework="pt", device="cpu") as h:
-                    return h.get_tensor(key)
-            except FileNotFoundError:
-                _aliases = {
-                    "embed_tokens": ["tok_embeddings.weight"],
-                    "lm_head": ["output.weight"],
-                }
-                consolidated = os.path.join(os.path.dirname(path), "consolidated.safetensors")
-                if os.path.isfile(consolidated):
-                    with safe_open(consolidated, framework="pt", device="cpu") as h:
-                        for alias in _aliases.get(role, []):
-                            if alias in h.keys():
-                                return h.get_tensor(alias)
-                raise
+        # Pull only the two tensors we need; avoids materializing the whole file.
+        def _read(path: str, key: str) -> torch.Tensor:
+            with safe_open(path, framework="pt", device="cpu") as h:
+                return h.get_tensor(key)
 
-        return (
-            _read(lm_head_path, lm_head_key, "lm_head"),
-            _read(embed_tokens_path, embed_tokens_key, "embed_tokens"),
-        )
+        return _read(lm_head_path, lm_head_key), _read(embed_tokens_path, embed_tokens_key)
 
     def forward(self, *args, **kwargs):
         """Not implemented: FakeBaseModel omits full model weights and cannot run inference."""

From 1a7bb829f39754acb37ea78229e4620005dfea6c Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 2 Jun 2026 12:04:16 -0700
Subject: [PATCH 20/22] Address code review feedback

- Shell scripts: quote variable expansions, use "$@" instead of ${@},
  remove pip install error suppression (2>/dev/null || true), add
  HF_MODEL_CKPT validation guard
- compute_hidden_states_vllm.py: guard tokenizer.chat_template against
  None before .replace() (matches HF sibling script)
- YAML configs: add --trust-remote-code to task_3 benchmark args for
  GLM-5, MiniMax-M2.5, Kimi-K2.5-NVFP4, GPT-OSS-20B
- GPT-OSS-20B: add TIKTOKEN_RS_CACHE_DIR to task_3 environment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../compute_hidden_states_vllm.py             |  3 ++-
 .../common/eagle3/dump_offline_data_hf.sh     | 23 ++++++++++-------
 .../common/eagle3/dump_offline_data_vllm.sh   | 25 +++++++++++--------
 .../examples/GLM/GLM-5/hf_offline_eagle3.yaml |  1 +
 .../MiniMax-M2.5/hf_offline_eagle3.yaml       |  1 +
 .../Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml    |  1 +
 .../OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml |  2 ++
 7 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
index 2e2660ae829..739d5702ade 100644
--- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
+++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
@@ -94,7 +94,8 @@ def keep_conversation(entry):
     tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "")
+    if tokenizer.chat_template is not None:
+        tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "")
 
     # Prepare prompts for vLLM
     prompts = []
diff --git a/tools/launcher/common/eagle3/dump_offline_data_hf.sh b/tools/launcher/common/eagle3/dump_offline_data_hf.sh
index 89877f913f2..08ce5bfb2ff 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_hf.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_hf.sh
@@ -37,25 +37,30 @@ source ${SCRIPT_DIR}/../service_utils.sh
 #   --input-data, --output-dir, --max-seq-len, etc.
 ###################################################################################################
 
-pip install datasets 2>/dev/null || true
+pip install datasets
 
-if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
+if [ -z "${HF_MODEL_CKPT:-}" ]; then  # required: forwarded as --model below
+    echo "ERROR: HF_MODEL_CKPT environment variable is not set" >&2
+    exit 1
+fi
+
+if [ -z "${SLURM_ARRAY_TASK_ID:-}" ]; then
     TASK_ID=0
 else
     echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}"
-    TASK_ID=${SLURM_ARRAY_TASK_ID}
+    TASK_ID="${SLURM_ARRAY_TASK_ID}"
 fi
 
-if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then
+if [ -z "${SLURM_ARRAY_TASK_COUNT:-}" ]; then
     TASK_COUNT=1
 else
     echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}"
-    TASK_COUNT=${SLURM_ARRAY_TASK_COUNT}
+    TASK_COUNT="${SLURM_ARRAY_TASK_COUNT}"
 fi
 
 python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py \
-    --model ${HF_MODEL_CKPT} \
-    --dp-rank ${TASK_ID} \
-    --dp-world-size ${TASK_COUNT} \
+    --model "${HF_MODEL_CKPT}" \
+    --dp-rank "${TASK_ID}" \
+    --dp-world-size "${TASK_COUNT}" \
     --trust_remote_code \
-    ${@}
+    "$@"
diff --git a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
index a8ae99b3a16..c27bab5dcbf 100644
--- a/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
+++ b/tools/launcher/common/eagle3/dump_offline_data_vllm.sh
@@ -31,8 +31,8 @@ source ${SCRIPT_DIR}/../service_utils.sh
 #   --input-data, --output-dir, --max-seq-len, etc.
 ###################################################################################################
 
-pip install "speculators<0.5.0" --no-deps 2>/dev/null || true
-pip install datasets 2>/dev/null || true
+pip install "speculators<0.5.0" --no-deps
+pip install datasets
 
 # vLLM API compatibility: speculators 0.4.0.1 uses Request(eos_token_id=...) which
 # was removed in newer vLLM. Patch to remove the unsupported kwarg.
@@ -128,23 +128,28 @@ for d in site.getsitepackages():
     break
 PYEOF
 
-if [ -z ${SLURM_ARRAY_TASK_ID} ]; then
+if [ -z "${HF_MODEL_CKPT:-}" ]; then  # required: forwarded as --model below
+    echo "ERROR: HF_MODEL_CKPT environment variable is not set" >&2
+    exit 1
+fi
+
+if [ -z "${SLURM_ARRAY_TASK_ID:-}" ]; then
     TASK_ID=0
 else
     echo "SLURM_ARRAY_TASK_ID ${SLURM_ARRAY_TASK_ID}"
-    TASK_ID=${SLURM_ARRAY_TASK_ID}
+    TASK_ID="${SLURM_ARRAY_TASK_ID}"
 fi
 
-if [ -z ${SLURM_ARRAY_TASK_COUNT} ]; then
+if [ -z "${SLURM_ARRAY_TASK_COUNT:-}" ]; then
     TASK_COUNT=1
 else
     echo "SLURM_ARRAY_TASK_COUNT ${SLURM_ARRAY_TASK_COUNT}"
-    TASK_COUNT=${SLURM_ARRAY_TASK_COUNT}
+    TASK_COUNT="${SLURM_ARRAY_TASK_COUNT}"
 fi
 
 python3 modules/Model-Optimizer/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py \
-    --model ${HF_MODEL_CKPT} \
-    --dp-rank ${TASK_ID} \
-    --dp-world-size ${TASK_COUNT} \
+    --model "${HF_MODEL_CKPT}" \
+    --dp-rank "${TASK_ID}" \
+    --dp-world-size "${TASK_COUNT}" \
     --trust_remote_code \
-    ${@}
+    "$@"
diff --git a/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml b/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml
index 9f6a78f439c..4a709c7d061 100644
--- a/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/GLM/GLM-5/hf_offline_eagle3.yaml
@@ -96,6 +96,7 @@ pipeline:
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
+      - --trust-remote-code
       - --tp_size 4
       - --ep_size 2
       - --speculative_algorithm EAGLE3
diff --git a/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml b/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
index 7868c28f80a..399807829c3 100644
--- a/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/MiniMax/MiniMax-M2.5/hf_offline_eagle3.yaml
@@ -94,6 +94,7 @@ pipeline:
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
+      - --trust-remote-code
       - --tp_size 4
       - --ep_size 1
       - --speculative_algorithm EAGLE3
diff --git a/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml b/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml
index 35513f998f2..16ae740c44c 100644
--- a/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/hf_offline_eagle3.yaml
@@ -104,6 +104,7 @@ pipeline:
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
+      - --trust-remote-code
       - --tp_size 4
       - --ep_size 1
       - --speculative_algorithm EAGLE3
diff --git a/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml b/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
index accdd71f76c..f5d99e37aa2 100644
--- a/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
+++ b/tools/launcher/examples/OpenAI/GPT-OSS-20B/hf_offline_eagle3.yaml
@@ -93,6 +93,7 @@ pipeline:
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
+      - --trust-remote-code
       - --tp_size 4
       - --ep_size 1
       - --speculative_algorithm EAGLE3
@@ -101,6 +102,7 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - TIKTOKEN_RS_CACHE_DIR: /hf-local/tiktoken_cache
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1

From fdbc2e12077170a2c32ec7f95001f59ed2cc7df0 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 2 Jun 2026 13:52:49 -0700
Subject: [PATCH 21/22] Move quick_fail_check YAMLs from examples/ to
 tools/launcher/examples/
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert EAGLE3 quick-fail pipeline configs to launcher format:
- services/ → common/ script paths
- oci_hsg_slurm_factory → slurm_factory
- offline_training.sh → train_eagle.sh with config-based args
- Add --trust-remote-code to task_3 for custom-code models
- Delete triage artifacts (partial pipelines, computelab config)

13 files converted, 22 originals deleted.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../pipeline/eagle3/offline.yaml              | 128 --------------
 .../eagle3/quick_fail_check_computelab.yaml   | 119 -------------
 .../eagle3/quick_fail_check_kimi_k2.5.yaml    | 148 ----------------
 .../quick_fail_check_kimi_k2.5_nvfp4.yaml     | 161 ------------------
 ..._fail_check_ministral-3-8b-from-task1.yaml |  75 --------
 ..._fail_check_ministral-3-8b-from-task2.yaml |  55 ------
 ...ick_fail_check_ministral-3-8b-hf-dump.yaml |  76 ---------
 .../pipeline/eagle3/task1_minimax-m2.5.yaml   |  19 ---
 .../pipeline/eagle3/task1_ministral-3-8b.yaml |  19 ---
 .../eagle3/task1_qwen3.5-35b-a3b.yaml         |  19 ---
 .../pipeline/eagle3/task1_step-3.5-flash.yaml |  19 ---
 .../DeepSeek-V3.2/eagle3_quick_check.yaml     |  56 +++---
 .../GLM/GLM-5/eagle3_quick_check.yaml         |  57 ++++---
 .../MiniMax-M2.5/eagle3_quick_check.yaml      |  57 ++++---
 .../Ministral-3-14B/eagle3_quick_check.yaml   |  58 ++++---
 .../Ministral-3-8B/eagle3_quick_check.yaml    |  58 ++++---
 .../Kimi-K2.5/eagle3_quick_check.yaml         | 135 +++++++++++++++
 .../Kimi-K2.5-NVFP4/eagle3_quick_check.yaml   | 149 ++++++++++++++++
 .../GPT-OSS-20B/eagle3_quick_check.yaml       |  63 ++++---
 .../Qwen/Qwen3-8B/eagle3_quick_check.yaml     |  64 +++----
 .../Qwen/Qwen3.5-27B/eagle3_quick_check.yaml  |  58 ++++---
 .../Qwen3.5-35B-A3B/eagle3_quick_check.yaml   |  58 ++++---
 .../Qwen/Qwen3.5-9B/eagle3_quick_check.yaml   |  58 ++++---
 .../Step-3.5-Flash/eagle3_quick_check.yaml    |  56 +++---
 24 files changed, 619 insertions(+), 1146 deletions(-)
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/offline.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
 delete mode 100644 examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml => tools/launcher/examples/DeepSeek/DeepSeek-V3.2/eagle3_quick_check.yaml (60%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml => tools/launcher/examples/GLM/GLM-5/eagle3_quick_check.yaml (61%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml => tools/launcher/examples/MiniMax/MiniMax-M2.5/eagle3_quick_check.yaml (59%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml => tools/launcher/examples/Mistral/Ministral-3-14B/eagle3_quick_check.yaml (58%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml => tools/launcher/examples/Mistral/Ministral-3-8B/eagle3_quick_check.yaml (59%)
 create mode 100644 tools/launcher/examples/MoonshotAI/Kimi-K2.5/eagle3_quick_check.yaml
 create mode 100644 tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/eagle3_quick_check.yaml
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml => tools/launcher/examples/OpenAI/GPT-OSS-20B/eagle3_quick_check.yaml (52%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml => tools/launcher/examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml (59%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml => tools/launcher/examples/Qwen/Qwen3.5-27B/eagle3_quick_check.yaml (54%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml => tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/eagle3_quick_check.yaml (57%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml => tools/launcher/examples/Qwen/Qwen3.5-9B/eagle3_quick_check.yaml (54%)
 rename examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml => tools/launcher/examples/StepFun/Step-3.5-Flash/eagle3_quick_check.yaml (60%)

diff --git a/examples/speculative_decoding/pipeline/eagle3/offline.yaml b/examples/speculative_decoding/pipeline/eagle3/offline.yaml
deleted file mode 100644
index 9a6f0158184..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/offline.yaml
+++ /dev/null
@@ -1,128 +0,0 @@
-# This pipeline is a dry-run on 100 samples to quickly check the possible failure points
-# over the all 4-step.
-#
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
-#
-# Note:
-#   All these tasks share the same workspace ${job_dir}/cicd/${experiment_id}. That is,
-#   /scratchspace is the best place if you want to pass the artifact around. You will
-#   see we use /scratchspace/... to specify input and output a lot.
-#
-#   You can comment out any task below to skip it. For example, If the data has been
-#   synthesized in ${job_dir}/cicd/${prev_experiment_id}/data, then you can skip
-#   task_0. However, you will need to change task_1's --input-data from /scratchspace/data
-#   to the absolute path above.
-
-allow_to_fail: false
-skip: false
-note:
-
-global_vars:
-  hf_model: /hf-local/Qwen/Qwen3-8B
-
-task_0:
-  script: services/tensorrt-llm/query.sh
-  # Args are split on "--": entries before "--" are passed to trtllm-serve,
-  # entries after "--" are passed to tools/query.py.
-  # --model is required and applies to both trtllm-serve and tools/query.py.
-  args:
-    - --model <<global_vars.hf_model>>  # required
-    - --tp_size 4
-    - --ep_size 4
-    - --max_num_tokens 32000
-    - --port 8000
-    - --host 0.0.0.0
-    - --trust_remote_code
-    - --                                # separator
-    - --data /hf-local/modelopt/Speculative-Decoding-Prompts-v1_prompts  # query.py args
-    - --save /scratchspace/data
-  environment:
-    - HF_LOCAL: /hf-local
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    array: "0-0"
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-
-task_1:
-  script: services/pipeline/eagle3/dump_offline_data.sh
-  args:
-    - --input-data /scratchspace/data
-    - --output-dir /scratchspace/offline_hidden_states
-    - --max-seq-len 8192
-    - --tp 4
-    - --moe-ep 4
-  environment:
-    - HF_MODEL_CKPT: <<global_vars.hf_model>>
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    array: "0-0"
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-task_2:
-  script: services/pipeline/eagle3/offline_training.sh
-  args:
-    - --offline-data /scratchspace/offline_hidden_states
-    - --data_path None
-    - --mode eagle3
-    - --num_epochs 1
-    - --lr 3e-4
-    - --save_steps 500000
-    - --output_dir /scratchspace/eagle3
-    - --train_bs 8
-    - --training_seq_len 4096
-    - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-    - --disable_tqdm True
-    - --ar_validate_steps 500000
-  environment:
-    - HF_MODEL_CKPT: <<global_vars.hf_model>>
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-#
-# This stage we run specdec bench and default we use VLLM backend. TensorRT-LLM can also be used
-# but we generally see some issues.
-#
-# To use TensorRT-LLM, make the following changes
-#
-# args:
-#   - --engine TRTLLM
-# environment:
-#   TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch 
-# slurm_config:
-#   ntasks_per_node: 4
-#   container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-#
-# To use SGLang
-#
-# args:
-#   - --engine SGLANG
-# slurm_config:
-#   ntasks_per_node: 1
-#   container: lmsysorg/sglang:v0.5.9
-#
-task_3:
-  script: services/specdec_bench/run.sh
-  args:
-    - --draft_length 3
-    - --output_length 4096
-    - --engine VLLM
-    - --tp_size 4
-    - --ep_size 1
-  environment:
-    - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    - HF_DRAFT_MODEL_CKPT: /scratchspace/export
-    - CONCURRENCY_LIST: "1 4"
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
deleted file mode 100644
index 1cb2ced32e7..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_computelab.yaml
+++ /dev/null
@@ -1,119 +0,0 @@
-# This pipeline is a dry-run on 100 samples to quickly check the possible failure points
-# over the all 4-step.
-#
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
-#
-# Note:
-#   All these tasks share the same workspace ${job_dir}/cicd/${experiment_id}. That is,
-#   /scratchspace is the best place if you want to pass the artifact around. You will
-#   see we use /scratchspace/... to specify input and output a lot.
-#
-#   You can comment out any task below to skip it. For example, If the data has been
-#   synthesized in ${job_dir}/cicd/${prev_experiment_id}/data, then you can skip
-#   task_0. However, you will need to change task_1's --input-data from /scratchspace/data
-#   to the absolute path above.
-
-allow_to_fail: false
-skip: false
-note:
-
-global_vars:
-  #hf_model: &hf_model /hf-local/Qwen/Qwen3-8B
-  hf_model: &hf_model /hf-local/LocoreMind/LocoOperator-4B
-
-task_0:
-  script: services/tensorrt-llm/query.sh
-  # Args are split on "--": entries before "--" are passed to trtllm-serve,
-  # entries after "--" are passed to tools/query.py.
-  # --model is required and applies to both trtllm-serve and tools/query.py.
-  args:
-    - --model <<global_vars.hf_model>>  # required
-    - --tp_size 1
-    - --ep_size 1
-    - --max_num_tokens 32000
-    - --port 8000
-    - --host 0.0.0.0
-    - --trust_remote_code
-    - --                                # separator
-    - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default  # query.py args
-    - --save /scratchspace/data
-  environment:
-    - HF_LOCAL: /hf-local
-  slurm_config:
-    _factory_: "computelab_slurm_factory"
-    nodes: 1
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-task_1:
-  script: services/pipeline/eagle3/dump_offline_data.sh
-  args:
-    - --input-data /scratchspace/data
-    - --output-dir /scratchspace/offline_hidden_states
-    - --max-seq-len 8192
-    - --tp 1
-    - --moe-ep 1
-  environment:
-    - HF_MODEL_CKPT: *hf_model
-  slurm_config:
-    _factory_: "computelab_slurm_factory"
-    nodes: 1
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-task_2:
-  script: services/pipeline/eagle3/offline_training.sh
-  args:
-    - --offline-data /scratchspace/offline_hidden_states
-    - --data_path None
-    - --mode eagle3
-    - --num_epochs 1
-    - --lr 3e-4
-    - --save_steps 500000
-    - --output_dir /scratchspace/eagle3
-    - --train_bs 1
-    - --training_seq_len 1024
-    - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-    - --disable_tqdm True
-    - --ar_validate_steps 500000
-  environment:
-    - HF_MODEL_CKPT: *hf_model
-  slurm_config:
-    _factory_: "computelab_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-#
-# This stage we run specdec bench and default we use VLLM backend. TensorRT-LLM can also be used
-# but we generally see some issues.
-#
-# To use TensorRT-LLM, make the following changes
-#
-# args:
-#   - --engine TRTLLM
-# environment:
-#   TRTLLM_LAUNCH_SCRIPT: trtllm-llmapi-launch 
-# slurm_config:
-#   container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-#   
-task_3:
-  script: services/specdec_bench/quick_check.sh
-  args:
-    - --draft_model_dir /scratchspace/export
-    - --draft_length 3
-    - --output_length 4096
-    - --engine VLLM
-    - --tp_size 1
-    - --ep_size 1
-    - --runtime_params services/specdec_bench/tensorrt_llm_runtiime_params.yaml
-    - --speculative_algorithm EAGLE3
-    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-    - --concurrency 1
-  environment:
-    - HF_MODEL_CKPT: *hf_model
-  slurm_config:
-    _factory_: "computelab_slurm_factory"
-    nodes: 1
-    container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
deleted file mode 100644
index 1079cf7ac83..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5.yaml
+++ /dev/null
@@ -1,148 +0,0 @@
-# EAGLE3 quick-fail pipeline for moonshotai/Kimi-K2.5
-#
-# Kimi-K2.5 is a 1T-parameter MoE VLM (32B active, 384 experts, 8 selected, MLA attention).
-# EAGLE3 uses only the text path — the vision encoder is not invoked.
-#
-# Hardware requirements:
-#   OCI-HSG uses GB200 nodes with 4 GPUs × 192 GB HBM3e = 768 GB per node.
-#   K2.5 BF16 weights are ~595 GB, so a single 4×GB200 node with TP=4 is sufficient.
-#   (For reference: H200 has 141 GB/GPU, so K2.5 would require 8× H200 instead.)
-#
-# References:
-#   https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-#   https://huggingface.co/moonshotai/Kimi-K2.5/blob/main/docs/deploy_guidance.md
-#
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
-#
-# Note:
-#   All tasks share /scratchspace. Comment out any task to skip it and update
-#   the next task's input path to point to a previous experiment's scratchspace.
-#
-# MoE-specific notes:
-#   - task_1: set --moe-ep 1 since TP=8 spans all 8 GPUs; increase --moe-ep and
-#     reduce --tp if you have more GPUs and want expert parallelism.
-#   - task_2: review modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-#     and consider increasing `intermediate_size` for MoE models (the draft head
-#     uses a dense layer by default, which may be undersized relative to K2.5's
-#     expert hidden dim of 2048).
-#   - task_2: --eagle_decoder_type may need adjustment if K2.5's MLA attention
-#     is not covered by the default llama decoder type.
-
-allow_to_fail: false
-skip: false
-note:
-
-global_vars:
-  hf_model: /hf-local/moonshotai/Kimi-K2.5
-
-task_0:
-  script: services/tensorrt-llm/query.sh
-  # Args are split on "--": entries before "--" are passed to trtllm-serve,
-  # entries after "--" are passed to tools/query.py.
-  # --model is required and applies to both trtllm-serve and tools/query.py.
-  #
-  # OCI-HSG GB200: 4 GPUs × 192 GB = 768 GB per node — fits K2.5 BF16 (~595 GB) with TP=4.
-  # --trust_remote_code is required for the Kimi tokenizer.
-  args:
-    - --model <<global_vars.hf_model>>  # required
-    - --tp_size 4
-    - --ep_size 1
-    - --max_num_tokens 32000
-    - --port 8000
-    - --host 0.0.0.0
-    - --trust_remote_code
-    - --                                # separator
-    - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default  # query.py args
-    - --save /scratchspace/data
-  environment:
-    - HF_LOCAL: /hf-local
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 4
-    gpus_per_node: 4
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-task_1:
-  script: services/pipeline/eagle3/dump_offline_data.sh
-  # TP=4 across all 4 GB200 GPUs; moe-ep=1 since TP already fills the node.
-  # Increase max-seq-len up to 131072 if your training data has long contexts.
-  args:
-    - --input-data /scratchspace/data
-    - --output-dir /scratchspace/offline_hidden_states
-    - --max-seq-len 8192
-    - --tp 4
-    - --moe-ep 1
-  environment:
-    - HF_MODEL_CKPT: <<global_vars.hf_model>>
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 4
-    gpus_per_node: 4
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-task_2:
-  script: services/pipeline/eagle3/offline_training.sh
-  # Draft head training. Runs on a single rank with Accelerate + FSDP.
-  # For K2.5 (MoE): check eagle_config.json and consider increasing
-  # intermediate_size since K2.5's per-expert hidden dim is 2048.
-  # Also verify --eagle_decoder_type is compatible with MLA attention.
-  args:
-    - --offline-data /scratchspace/offline_hidden_states
-    - --data_path None
-    - --mode eagle3
-    - --num_epochs 1
-    - --lr 3e-4
-    - --save_steps 500000
-    - --output_dir /scratchspace/eagle3
-    - --train_bs 8
-    - --training_seq_len 4096
-    - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-    - --disable_tqdm True
-    - --ar_validate_steps 500000
-  environment:
-    - HF_MODEL_CKPT: <<global_vars.hf_model>>
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    gpus_per_node: 4
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-#
-# Task 3: Speculative decoding benchmark using VLLM (recommended for K2.5).
-#
-# For K2.5 with VLLM, additional flags are recommended but are not yet
-# supported by services/specdec_bench/quick_check.sh:
-#   --mm-encoder-tp-mode data   (data-parallel vision encoder, improves throughput)
-#   --tool-call-parser kimi_k2  (required for tool-call responses)
-#   --reasoning-parser kimi_k2  (required for thinking-mode responses)
-# If you need these, extend quick_check.sh or use run.sh with a custom config.
-#
-task_3:
-  script: services/specdec_bench/quick_check.sh
-  args:
-    - --draft_model_dir /scratchspace/export
-    - --draft_length 3
-    - --output_length 4096
-    - --engine VLLM
-    - --tp_size 4
-    - --ep_size 1
-    - --speculative_algorithm EAGLE3
-    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-    - --concurrency 1
-  environment:
-    - HF_LOCAL: /hf-local
-    - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    - TP: 4
-    - EP: 1
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    gpus_per_node: 4
-    container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
deleted file mode 100644
index 52b9eaa3b9f..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_kimi_k2.5_nvfp4.yaml
+++ /dev/null
@@ -1,161 +0,0 @@
-# EAGLE3 quick-fail pipeline for nvidia/Kimi-K2.5-NVFP4
-#
-# nvidia/Kimi-K2.5-NVFP4 is the NVIDIA-quantized NVFP4 variant of moonshotai/Kimi-K2.5.
-# Same architecture: 1T-parameter MoE VLM (32B active, 384 experts, 8 selected, MLA attention).
-# Only the weights and activations of linear ops within MoE transformer blocks are quantized;
-# embeddings and other tensors remain in higher precision.
-# EAGLE3 uses only the text path — the vision encoder is not invoked.
-#
-# Hardware requirements:
-#   NVFP4 checkpoint is ~591 GB (vs ~595 GB BF16) — same single-node requirement.
-#   OCI-HSG uses GB200 (Blackwell) nodes with 4 GPUs × 192 GB HBM3e = 768 GB per node.
-#   NVFP4 inference requires Blackwell GPUs — OCI-HSG GB200 satisfies this.
-#   A single 4×GB200 node with TP=4 is sufficient.
-#
-# References:
-#   https://huggingface.co/nvidia/Kimi-K2.5-NVFP4
-#   https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-#
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
-#
-# Note:
-#   All tasks share /scratchspace. Comment out any task to skip it and update
-#   the next task's input path to point to a previous experiment's scratchspace.
-#
-# Why different models for task_0 vs tasks 1–2:
-#   task_0 (data synthesis) uses services/vllm/query.sh + NVFP4 directly, since
-#   vLLM v0.15.0 supports KimiK25ForConditionalGeneration with NVFP4 on Blackwell.
-#   tasks 1–2 (hidden-state extraction and training) use the BF16 moonshotai/Kimi-K2.5
-#   checkpoint via TRT-LLM, which does not support KimiK25ForConditionalGeneration
-#   in v1.3.0. The draft head trained on BF16 hidden states transfers directly to
-#   NVFP4 — NVFP4 is near-lossless, so hidden state distributions are equivalent.
-#
-# MoE-specific notes:
-#   - task_1: set --moe-ep 1 since TP=4 spans all 4 GPUs; increase --moe-ep and
-#     reduce --tp if you have more GPUs and want expert parallelism.
-#   - task_2: review modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-#     and consider increasing `intermediate_size` for MoE models (the draft head
-#     uses a dense layer by default, which may be undersized relative to K2.5's
-#     expert hidden dim of 2048).
-#   - task_2: --eagle_decoder_type may need adjustment if K2.5's MLA attention
-#     is not covered by the default llama decoder type.
-
-allow_to_fail: false
-skip: false
-note:
-
-global_vars:
-  hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
-
-task_0:
-  script: services/vllm/query.sh
-  # Args are split on "--": entries before "--" are passed to vllm serve,
-  # entries after "--" are passed to tools/query.py.
-  # --model is required and applies to both vllm serve and tools/query.py.
-  #
-  # Uses NVFP4 directly via vLLM v0.15.0, which supports KimiK25ForConditionalGeneration.
-  # vLLM is single-process (ntasks_per_node: 1); --tensor-parallel-size handles GPU distribution.
-  # --trust-remote-code is required for the Kimi tokenizer.
-  args:
-    - --model <<global_vars.hf_model>>  # required
-    - --tensor-parallel-size 4
-    - --max-num-seqs 32
-    - --trust-remote-code
-    - --                                # separator
-    - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default  # query.py args
-    - --save /scratchspace/data
-  environment:
-    - HF_LOCAL: /hf-local
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    gpus_per_node: 4
-    container: vllm/vllm-openai:v0.15.0
-
-task_1:
-  script: services/pipeline/eagle3/dump_offline_data.sh
-  # TP=4 across all 4 GB200 GPUs; moe-ep=1 since TP already fills the node.
-  # Uses BF16 moonshotai/Kimi-K2.5: same TRT-LLM compatibility reason as task_0.
-  # Increase max-seq-len up to 131072 if your training data has long contexts.
-  args:
-    - --input-data /scratchspace/data
-    - --output-dir /scratchspace/offline_hidden_states
-    - --max-seq-len 8192
-    - --tp 4
-    - --moe-ep 1
-  environment:
-    - HF_MODEL_CKPT: /hf-local/moonshotai/Kimi-K2.5
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 4
-    gpus_per_node: 4
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-task_2:
-  script: services/pipeline/eagle3/offline_training.sh
-  # Draft head training. Runs on a single rank with Accelerate + FSDP.
-  # HF_MODEL_CKPT points to BF16 for tokenizer/config loading; hidden states
-  # come from task_1 (also BF16). Draft head transfers to NVFP4 at benchmark time.
-  # For K2.5 (MoE): check eagle_config.json and consider increasing
-  # intermediate_size since K2.5's per-expert hidden dim is 2048.
-  # Also verify --eagle_decoder_type is compatible with MLA attention.
-  args:
-    - --offline-data /scratchspace/offline_hidden_states
-    - --data_path None
-    - --mode eagle3
-    - --num_epochs 1
-    - --lr 3e-4
-    - --save_steps 500000
-    - --output_dir /scratchspace/eagle3
-    - --train_bs 8
-    - --training_seq_len 4096
-    - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-    - --disable_tqdm True
-    - --ar_validate_steps 500000
-  environment:
-    - HF_MODEL_CKPT: /hf-local/moonshotai/Kimi-K2.5
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    gpus_per_node: 4
-    container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
-
-#
-# Task 3: Speculative decoding benchmark using VLLM with NVFP4.
-#
-# vllm/vllm-openai:v0.15.0 is required for NVFP4 support on Blackwell GPUs.
-# Additional flags required for full K2.5 functionality but not yet supported
-# by services/specdec_bench/quick_check.sh:
-#   --tool-call-parser kimi_k2   (required for tool-call responses)
-#   --reasoning-parser kimi_k2   (required for thinking-mode responses)
-# If you need these, extend quick_check.sh or use run.sh with a custom config.
-#
-task_3:
-  script: services/specdec_bench/quick_check.sh
-  args:
-    - --draft_model_dir /scratchspace/export
-    - --draft_length 3
-    - --output_length 4096
-    - --engine VLLM
-    - --tp_size 4
-    - --ep_size 1
-    - --speculative_algorithm EAGLE3
-    - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-    - --concurrency 1
-  environment:
-    - HF_LOCAL: /hf-local
-    - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    - TP: 4
-    - EP: 1
-  slurm_config:
-    _factory_: "oci_hsg_slurm_factory"
-    nodes: 1
-    ntasks_per_node: 1
-    gpus_per_node: 4
-    container: vllm/vllm-openai:v0.15.0
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
deleted file mode 100644
index 870e563d539..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task1.yaml
+++ /dev/null
@@ -1,75 +0,0 @@
-# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
-# Starts from task_1 (hidden state dump), reusing data from cicd_1779208014.
-#
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
-
-job_name: Ministral-3-8B_EAGLE3_quick_fail_check
-pipeline:
-  allow_to_fail: true
-  skip: false
-  note:
-
-  global_vars:
-    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
-
-  task_0:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
-    args:
-      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779208014/data
-      - --output-dir /scratchspace/offline_hidden_states
-      - --max-seq-len 8192
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      gpus_per_node: 4
-      container: vllm/vllm-openai:latest
-
-  task_1:
-    script: services/pipeline/eagle3/offline_training.sh
-    args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
-
-  task_2:
-    script: services/specdec_bench/quick_check.sh
-    args:
-      - --draft_model_dir /scratchspace/export
-      - --draft_length 3
-      - --output_length 4096
-      - --engine VLLM
-      - --tp_size 4
-      - --ep_size 1
-      - --speculative_algorithm EAGLE3
-      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
-    environment:
-      - HF_LOCAL: /hf-local
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
deleted file mode 100644
index 071b3e4bebb..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml
+++ /dev/null
@@ -1,55 +0,0 @@
-# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
-# Starts from training, reusing hidden states from cicd_1779312692.
-
-job_name: Ministral-3-8B_EAGLE3_quick_fail_check
-pipeline:
-  allow_to_fail: true
-  skip: false
-  note:
-
-  global_vars:
-    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
-
-  task_0:
-    script: services/pipeline/eagle3/offline_training.sh
-    args:
-      - --offline-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779312692/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-      - --trust_remote_code
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
-
-  task_1:
-    script: services/specdec_bench/quick_check.sh
-    args:
-      - --draft_model_dir /scratchspace/export
-      - --draft_length 3
-      - --output_length 4096
-      - --engine VLLM
-      - --tp_size 4
-      - --ep_size 1
-      - --speculative_algorithm EAGLE3
-      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
-    environment:
-      - HF_LOCAL: /hf-local
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml b/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
deleted file mode 100644
index 05d8d9b7c45..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-hf-dump.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
-# Uses HF-based hidden state dump (more stable than speculators/vLLM internals).
-# Reuses task_0 data from cicd_1779208014.
-#
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
-
-job_name: Ministral-3-8B_EAGLE3_quick_fail_check
-pipeline:
-  allow_to_fail: true
-  skip: false
-  note:
-
-  global_vars:
-    hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
-
-  task_0:
-    script: services/pipeline/eagle3/dump_offline_data_hf.sh
-    args:
-      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779208014/data
-      - --output-dir /scratchspace/offline_hidden_states
-      - --max-seq-len 8192
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      gpus_per_node: 4
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
-
-  task_1:
-    script: services/pipeline/eagle3/offline_training.sh
-    args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
-
-  task_2:
-    script: services/specdec_bench/quick_check.sh
-    args:
-      - --draft_model_dir /scratchspace/export
-      - --draft_length 3
-      - --output_length 4096
-      - --engine VLLM
-      - --tp_size 4
-      - --ep_size 1
-      - --speculative_algorithm EAGLE3
-      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
-      - --concurrency 1
-    environment:
-      - HF_LOCAL: /hf-local
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
deleted file mode 100644
index 7e33e3cf967..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/task1_minimax-m2.5.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Standalone task_1: HF hidden state dump for MiniMax-M2.5
-# Uses partial data from cicd_1775624587 task_0
-
-job_name: MiniMax-M2.5_task1_hf_dump
-pipeline:
-  task_0:
-    script: services/pipeline/eagle3/dump_offline_data_hf.sh
-    args:
-      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1775624587/data
-      - --output-dir /scratchspace/offline_hidden_states
-      - --max-seq-len 8192
-    environment:
-      - HF_MODEL_CKPT: /hf-local/MiniMaxAI/MiniMax-M2.5
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      gpus_per_node: 4
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
deleted file mode 100644
index 6e0e9cd3486..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/task1_ministral-3-8b.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Standalone task_1: HF hidden state dump for Ministral-3-8B
-# Uses partial data from cicd_1779208014 task_0 (330 samples)
-
-job_name: Ministral-3-8B_task1_hf_dump
-pipeline:
-  task_0:
-    script: services/pipeline/eagle3/dump_offline_data_hf.sh
-    args:
-      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1779208014/data
-      - --output-dir /scratchspace/offline_hidden_states
-      - --max-seq-len 8192
-    environment:
-      - HF_MODEL_CKPT: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      gpus_per_node: 4
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
deleted file mode 100644
index 7f056aa2e90..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/task1_qwen3.5-35b-a3b.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Standalone task_1: HF hidden state dump for Qwen3.5-35B-A3B
-# Uses partial data from cicd_1775625199 task_0
-
-job_name: Qwen3.5-35B-A3B_task1_hf_dump
-pipeline:
-  task_0:
-    script: services/pipeline/eagle3/dump_offline_data_hf.sh
-    args:
-      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1775625199/data
-      - --output-dir /scratchspace/offline_hidden_states
-      - --max-seq-len 8192
-    environment:
-      - HF_MODEL_CKPT: /hf-local/Qwen/Qwen3.5-35B-A3B
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      gpus_per_node: 4
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml b/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
deleted file mode 100644
index f5eb78ceda4..00000000000
--- a/examples/speculative_decoding/pipeline/eagle3/task1_step-3.5-flash.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Standalone task_1: HF hidden state dump for Step-3.5-Flash
-# Uses partial data from cicd_1775624696 task_0
-
-job_name: Step-3.5-Flash_task1_hf_dump
-pipeline:
-  task_0:
-    script: services/pipeline/eagle3/dump_offline_data_hf.sh
-    args:
-      - --input-data /lustre/fsw/portfolios/coreai/users/yeyu/experiments/cicd/cicd_1775624696/data
-      - --output-dir /scratchspace/offline_hidden_states
-      - --max-seq-len 8192
-    environment:
-      - HF_MODEL_CKPT: /hf-local/stepfun-ai/Step-3.5-Flash
-    slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
-      nodes: 1
-      ntasks_per_node: 1
-      gpus_per_node: 4
-      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml b/tools/launcher/examples/DeepSeek/DeepSeek-V3.2/eagle3_quick_check.yaml
similarity index 60%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
rename to tools/launcher/examples/DeepSeek/DeepSeek-V3.2/eagle3_quick_check.yaml
index c50d71ab3ef..57fb3d09393 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml
+++ b/tools/launcher/examples/DeepSeek/DeepSeek-V3.2/eagle3_quick_check.yaml
@@ -1,4 +1,4 @@
-# EAGLE3 quick-fail pipeline for deepseek-ai/DeepSeek-V3.2
+# EAGLE3 quick-fail pipeline for deepseek-ai/DeepSeek-V3.2.
 #
 # DeepSeek-V3.2 is a 685B MoE (37B active, MLA attention).
 # BF16 weights: ~1370 GB — requires 2 GB200 nodes (8 × 192 GB = 1536 GB).
@@ -12,10 +12,11 @@
 #   - TP=4 per node, EP across nodes if needed
 #   - Draft head intermediate_size may need tuning in eagle_config.json
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/DeepSeek/DeepSeek-V3.2/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/DeepSeek/DeepSeek-V3.2/eagle3_quick_check.yaml --yes
 
 job_name: DeepSeek-V3.2_EAGLE3_quick_fail_check
 pipeline:
@@ -26,8 +27,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/deepseek-ai/DeepSeek-V3.2
 
+  # Step 1: Data synthesis via vLLM server (2 nodes, TP=8)
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 8
@@ -39,14 +41,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 2
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states (2 nodes)
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -54,38 +57,37 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 2
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 4
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=4
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (2 nodes, TP=4, EP=2)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -102,7 +104,7 @@ pipeline:
       - TP: 4
       - EP: 2
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 2
       ntasks_per_node: 1
       gpus_per_node: 4
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml b/tools/launcher/examples/GLM/GLM-5/eagle3_quick_check.yaml
similarity index 61%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
rename to tools/launcher/examples/GLM/GLM-5/eagle3_quick_check.yaml
index f04f3022ff3..3b769117db8 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_glm-5.yaml
+++ b/tools/launcher/examples/GLM/GLM-5/eagle3_quick_check.yaml
@@ -1,4 +1,4 @@
-# EAGLE3 quick-fail pipeline for zai-org/GLM-5
+# EAGLE3 quick-fail pipeline for zai-org/GLM-5.
 #
 # GLM-5 is a 744B MoE (40B active, 256 experts, top-8). DeepSeek Sparse Attention.
 # BF16 weights: ~1488 GB — requires 2 GB200 nodes (8 × 192 GB = 1536 GB).
@@ -13,10 +13,11 @@
 #   - TP=4 per node, EP=2 across 2 nodes
 #   - Draft head intermediate_size may need tuning in eagle_config.json
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/GLM/GLM-5/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/GLM/GLM-5/eagle3_quick_check.yaml --yes
 
 job_name: GLM-5_EAGLE3_quick_fail_check
 pipeline:
@@ -27,8 +28,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/zai-org/GLM-5
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -39,14 +41,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -54,43 +57,43 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 4
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=4
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (2 nodes, TP=4, EP=2)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
+      - --trust-remote-code
       - --tp_size 4
       - --ep_size 2
       - --speculative_algorithm EAGLE3
@@ -102,7 +105,7 @@ pipeline:
       - TP: 4
       - EP: 2
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 2
       ntasks_per_node: 1
       gpus_per_node: 4
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml b/tools/launcher/examples/MiniMax/MiniMax-M2.5/eagle3_quick_check.yaml
similarity index 59%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
rename to tools/launcher/examples/MiniMax/MiniMax-M2.5/eagle3_quick_check.yaml
index 7c8b42d7ec4..24ab04c7c06 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml
+++ b/tools/launcher/examples/MiniMax/MiniMax-M2.5/eagle3_quick_check.yaml
@@ -1,4 +1,4 @@
-# EAGLE3 quick-fail pipeline for MiniMaxAI/MiniMax-M2.5
+# EAGLE3 quick-fail pipeline for MiniMaxAI/MiniMax-M2.5.
 #
 # MiniMax-M2.5 is a 230B MoE (10B active, 256 experts, top-8).
 # BF16 weights: ~460 GB — fits on a single GB200 node (4 × 192 GB = 768 GB).
@@ -10,10 +10,11 @@
 #   - task_2: check eagle_config.json and consider increasing intermediate_size
 #     since the draft head uses a dense layer by default.
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/MiniMax/MiniMax-M2.5/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/MiniMax/MiniMax-M2.5/eagle3_quick_check.yaml --yes
 
 job_name: MiniMax-M2.5_EAGLE3_quick_fail_check
 pipeline:
@@ -24,8 +25,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/MiniMaxAI/MiniMax-M2.5
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -36,14 +38,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -51,43 +54,43 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (vLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
+      - --trust-remote-code
       - --tp_size 4
       - --ep_size 1
       - --speculative_algorithm EAGLE3
@@ -97,7 +100,7 @@ pipeline:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml b/tools/launcher/examples/Mistral/Ministral-3-14B/eagle3_quick_check.yaml
similarity index 58%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
rename to tools/launcher/examples/Mistral/Ministral-3-14B/eagle3_quick_check.yaml
index 5cdafdf47b1..9392d2281fb 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml
+++ b/tools/launcher/examples/Mistral/Ministral-3-14B/eagle3_quick_check.yaml
@@ -1,4 +1,4 @@
-# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-14B-Instruct-2512-BF16
+# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-14B-Instruct-2512-BF16.
 #
 # Ministral-3-14B is a 14B dense model (model type: mistral3, multimodal but
 # EAGLE3 uses text-only path).
@@ -8,10 +8,11 @@
 #   - trust_remote_code may be needed for tokenizer
 #   - Model type is mistral3 (Mistral3ForConditionalGeneration)
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Mistral/Ministral-3-14B/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Mistral/Ministral-3-14B/eagle3_quick_check.yaml --yes
 
 job_name: Ministral-3-14B_EAGLE3_quick_fail_check
 pipeline:
@@ -22,8 +23,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/mistralai/Ministral-3-14B-Instruct-2512-BF16
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -34,14 +36,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -49,37 +52,37 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (vLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -94,7 +97,8 @@ pipeline:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml b/tools/launcher/examples/Mistral/Ministral-3-8B/eagle3_quick_check.yaml
similarity index 59%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
rename to tools/launcher/examples/Mistral/Ministral-3-8B/eagle3_quick_check.yaml
index 00f4bb1ffe3..30253bcf0d7 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b.yaml
+++ b/tools/launcher/examples/Mistral/Ministral-3-8B/eagle3_quick_check.yaml
@@ -1,4 +1,4 @@
-# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16
+# EAGLE3 quick-fail pipeline for mistralai/Ministral-3-8B-Instruct-2512-BF16.
 #
 # Ministral-3-8B is an 8B dense model (model type: mistral3, multimodal but
 # EAGLE3 uses text-only path).
@@ -8,10 +8,11 @@
 #   - trust_remote_code may be needed for tokenizer
 #   - Model type is mistral3 (Mistral3ForConditionalGeneration)
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Mistral/Ministral-3-8B/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Mistral/Ministral-3-8B/eagle3_quick_check.yaml --yes
 
 job_name: Ministral-3-8B_EAGLE3_quick_fail_check
 pipeline:
@@ -22,8 +23,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/mistralai/Ministral-3-8B-Instruct-2512-BF16
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -36,14 +38,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -51,37 +54,37 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (VLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -96,7 +99,8 @@ pipeline:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/MoonshotAI/Kimi-K2.5/eagle3_quick_check.yaml b/tools/launcher/examples/MoonshotAI/Kimi-K2.5/eagle3_quick_check.yaml
new file mode 100644
index 00000000000..63894e540bb
--- /dev/null
+++ b/tools/launcher/examples/MoonshotAI/Kimi-K2.5/eagle3_quick_check.yaml
@@ -0,0 +1,135 @@
+# EAGLE3 quick-fail pipeline for moonshotai/Kimi-K2.5.
+#
+# Kimi-K2.5 is a 1T-parameter MoE VLM (32B active, 384 experts, 8 selected, MLA attention).
+# EAGLE3 uses only the text path — the vision encoder is not invoked.
+# BF16 weights ~595 GB — fits on a single GB200 node (4 × 192 GB = 768 GB).
+#
+# Special requirements:
+#   - trust_remote_code required for the Kimi tokenizer
+#   - MLA attention — verify eagle_decoder_type compatibility in eagle_config.json
+#
+# MoE-specific notes:
+#   - task_1: --moe-ep 1 since TP=4 spans all 4 GPUs; increase --moe-ep and
+#     reduce --tp if you have more GPUs and want expert parallelism.
+#   - task_2: review eagle_config.json and consider increasing intermediate_size
+#     for MoE models (the draft head uses a dense layer by default, which may be
+#     undersized relative to K2.5's expert hidden dim of 2048).
+#   - task_2: --eagle_decoder_type may need adjustment if K2.5's MLA attention
+#     is not covered by the default llama decoder type.
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/MoonshotAI/Kimi-K2.5/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/MoonshotAI/Kimi-K2.5/eagle3_quick_check.yaml --yes
+
+job_name: Kimi-K2.5_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/moonshotai/Kimi-K2.5
+
+  # Step 1: Data synthesis via TRT-LLM server
+  # Args before "--" go to trtllm-serve; args after "--" go to tools/query.py.
+  # OCI-HSG GB200: 4 GPUs × 192 GB = 768 GB per node — fits K2.5 BF16 (~595 GB) with TP=4.
+  # --trust_remote_code is required for the Kimi tokenizer.
+  task_0:
+    script: common/tensorrt_llm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tp_size 4
+      - --ep_size 1
+      - --max_num_tokens 32000
+      - --port 8000
+      - --host 0.0.0.0
+      - --trust_remote_code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 2: Dump hidden states from target model (TP=4, moe-ep=1)
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 3: Train EAGLE3 draft head (offline, single task)
+  # For K2.5 (MoE): check eagle_config.json and consider increasing
+  # intermediate_size since K2.5's per-expert hidden dim is 2048.
+  # Also verify --eagle_decoder_type is compatible with MLA attention.
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 4: Benchmark speculative decoding (VLLM backend)
+  #
+  # For K2.5 with VLLM, additional flags are recommended but may require
+  # extending quick_check.sh or using run.sh with a custom config:
+  #   --mm-encoder-tp-mode data   (data-parallel vision encoder, improves throughput)
+  #   --tool-call-parser kimi_k2  (required for tool-call responses)
+  #   --reasoning-parser kimi_k2  (required for thinking-mode responses)
+  #
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --trust-remote-code
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - TP: 4
+      - EP: 1
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:latest
diff --git a/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/eagle3_quick_check.yaml b/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/eagle3_quick_check.yaml
new file mode 100644
index 00000000000..683794cf3ad
--- /dev/null
+++ b/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/eagle3_quick_check.yaml
@@ -0,0 +1,149 @@
+# EAGLE3 quick-fail pipeline for nvidia/Kimi-K2.5-NVFP4.
+#
+# nvidia/Kimi-K2.5-NVFP4 is the NVIDIA-quantized NVFP4 variant of moonshotai/Kimi-K2.5.
+# Same architecture: 1T-parameter MoE VLM (32B active, 384 experts, 8 selected, MLA attention).
+# Only the weights and activations of linear ops within MoE transformer blocks are quantized;
+# embeddings and other tensors remain in higher precision.
+# EAGLE3 uses only the text path — the vision encoder is not invoked.
+#
+# Hardware requirements:
+#   NVFP4 checkpoint is ~591 GB (vs ~595 GB BF16) — same single-node requirement.
+#   OCI-HSG uses GB200 (Blackwell) nodes with 4 GPUs × 192 GB HBM3e = 768 GB per node.
+#   NVFP4 inference requires Blackwell GPUs — OCI-HSG GB200 satisfies this.
+#   A single 4×GB200 node with TP=4 is sufficient.
+#
+# Why different models for task_0 vs tasks 1–2:
+#   task_0 (data synthesis) uses vLLM + NVFP4 directly, since vLLM v0.15.0 supports
+#   KimiK25ForConditionalGeneration with NVFP4 on Blackwell.
+#   tasks 1–2 (hidden-state extraction and training) use the BF16 moonshotai/Kimi-K2.5
+#   checkpoint via TRT-LLM, which does not support KimiK25ForConditionalGeneration
+#   in v1.3.0. The draft head trained on BF16 hidden states transfers directly to
+#   NVFP4 — NVFP4 is near-lossless, so hidden state distributions are equivalent.
+#
+# MoE-specific notes:
+#   - task_1: --moe-ep 1 since TP=4 spans all 4 GPUs; increase --moe-ep and
+#     reduce --tp if you have more GPUs and want expert parallelism.
+#   - task_2: review eagle_config.json and consider increasing intermediate_size
+#     for MoE models (the draft head uses a dense layer by default, which may be
+#     undersized relative to K2.5's expert hidden dim of 2048).
+#   - task_2: --eagle_decoder_type may need adjustment if K2.5's MLA attention
+#     is not covered by the default llama decoder type.
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/NVIDIA/Kimi-K2.5-NVFP4/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/NVIDIA/Kimi-K2.5-NVFP4/eagle3_quick_check.yaml --yes
+
+job_name: Kimi-K2.5-NVFP4_EAGLE3_quick_fail_check
+pipeline:
+  allow_to_fail: false
+  skip: false
+  note:
+
+  global_vars:
+    hf_model: /hf-local/nvidia/Kimi-K2.5-NVFP4
+    hf_model_bf16: /hf-local/moonshotai/Kimi-K2.5
+
+  # Step 1: Data synthesis via vLLM server using NVFP4 checkpoint
+  # Uses NVFP4 directly via vLLM v0.15.0, which supports KimiK25ForConditionalGeneration.
+  # vLLM is single-process (ntasks_per_node: 1); --tensor-parallel-size handles GPU distribution.
+  # --trust-remote-code is required for the Kimi tokenizer.
+  task_0:
+    script: common/vllm/query.sh
+    args:
+      - --model <<global_vars.hf_model>>
+      - --tensor-parallel-size 4
+      - --max-num-seqs 32
+      - --trust-remote-code
+      - --
+      - --data /hf-local/modelopt/Speculative-Decoding-Dataset-v2-default
+      - --save /scratchspace/data
+    environment:
+      - HF_LOCAL: /hf-local
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:v0.15.0
+
+  # Step 2: Dump hidden states using BF16 checkpoint (TP=4, moe-ep=1)
+  # Uses BF16 moonshotai/Kimi-K2.5 (not NVFP4) for TRT-LLM compatibility; see the file header.
+  task_1:
+    script: common/eagle3/dump_offline_data.sh
+    args:
+      - --input-data /scratchspace/data
+      - --output-dir /scratchspace/offline_hidden_states
+      - --max-seq-len 8192
+      - --tp 4
+      - --moe-ep 1
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.hf_model_bf16>>
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 4
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 3: Train EAGLE3 draft head on BF16 hidden states
+  # model.model_name_or_path points to the BF16 checkpoint for tokenizer/config
+  # loading; hidden states come from task_1 (also BF16).
+  # Draft head transfers to NVFP4 at benchmark time.
+  # For K2.5 (MoE): check eagle_config.json and consider increasing
+  # intermediate_size since K2.5's per-expert hidden dim is 2048.
+  # Also verify --eagle_decoder_type is compatible with MLA attention.
+  task_2:
+    script: common/eagle3/train_eagle.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model_bf16>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
+
+  # Step 4: Benchmark speculative decoding against NVFP4 target (VLLM backend)
+  #
+  # vllm/vllm-openai:v0.15.0 is required for NVFP4 support on Blackwell GPUs.
+  # Additional flags are needed for full K2.5 functionality, but they may require
+  # extending quick_check.sh or using run.sh with a custom config:
+  #   --tool-call-parser kimi_k2   (required for tool-call responses)
+  #   --reasoning-parser kimi_k2   (required for thinking-mode responses)
+  #
+  task_3:
+    script: common/specdec_bench/quick_check.sh
+    args:
+      - --draft_model_dir /scratchspace/export
+      - --draft_length 3
+      - --output_length 4096
+      - --engine VLLM
+      - --trust-remote-code
+      - --tp_size 4
+      - --ep_size 1
+      - --speculative_algorithm EAGLE3
+      - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
+      - --concurrency 1
+    environment:
+      - HF_LOCAL: /hf-local
+      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - TP: 4
+      - EP: 1
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 4
+      container: vllm/vllm-openai:v0.15.0
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml b/tools/launcher/examples/OpenAI/GPT-OSS-20B/eagle3_quick_check.yaml
similarity index 52%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
rename to tools/launcher/examples/OpenAI/GPT-OSS-20B/eagle3_quick_check.yaml
index 061171f42ca..f6870249be9 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml
+++ b/tools/launcher/examples/OpenAI/GPT-OSS-20B/eagle3_quick_check.yaml
@@ -1,12 +1,16 @@
-# EAGLE3 quick-fail pipeline for openai/gpt-oss-20b
+# EAGLE3 quick-fail pipeline for openai/gpt-oss-20b.
 #
 # GPT-OSS-20B is a 20B dense model.
 # BF16 weights: ~40 GB — fits easily on a single GB200 node (4 × 192 GB).
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# Special requirements:
+#   - TIKTOKEN_RS_CACHE_DIR required for the GPT tokenizer
+#
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/OpenAI/GPT-OSS-20B/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/OpenAI/GPT-OSS-20B/eagle3_quick_check.yaml --yes
 
 job_name: gpt-oss-20b_EAGLE3_quick_fail_check
 pipeline:
@@ -17,8 +21,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/openai/gpt-oss-20b
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -30,14 +35,15 @@ pipeline:
       - HF_LOCAL: /hf-local
       - TIKTOKEN_RS_CACHE_DIR: /hf-local/tiktoken_cache
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -45,42 +51,43 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (VLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
       - --output_length 4096
       - --engine VLLM
+      - --trust-remote-code
       - --tp_size 4
       - --ep_size 1
       - --speculative_algorithm EAGLE3
@@ -89,8 +96,10 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - TIKTOKEN_RS_CACHE_DIR: /hf-local/tiktoken_cache
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml b/tools/launcher/examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml
similarity index 59%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
rename to tools/launcher/examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml
index 41ca8b52c3b..dec6f2989f5 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml
@@ -3,23 +3,11 @@
 # Dry-run on 100 samples to quickly check the possible failure points
 # over the all 4-step.
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
-#
-# Note:
-#   All these tasks share the same workspace ${job_dir}/cicd/${experiment_id}. That is,
-#   /scratchspace is the best place if you want to pass the artifact around. You will
-#   see we use /scratchspace/... to specify input and output a lot.
-#
-#   You can comment out any task below to skip it. For example, If the data has been
-#   synthesized in ${job_dir}/cicd/${prev_experiment_id}/data, then you can skip
-#   task_0. However, you will need to change task_1's --input-data from /scratchspace/data
-#   to the absolute path above.
+# All tasks share /scratchspace to pass artifacts between steps.
 #
 # Usage:
-#   uv run slurm.py --yaml services/pipeline/eagle3/quick_fail_check.yaml --yes
+#   uv run launch.py --yaml examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3-8B/eagle3_quick_check.yaml --yes
 
 job_name: Qwen3-8B_EAGLE3_quick_fail_check
 pipeline:
@@ -33,7 +21,7 @@ pipeline:
   # Step 1: Data synthesis via TRT-LLM server
   # Args before "--" go to trtllm-serve; args after "--" go to tools/query.py.
   task_0:
-    script: services/tensorrt-llm/query.sh
+    script: common/tensorrt_llm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tp_size 4
@@ -48,13 +36,14 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
 
   # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data.sh
+    script: common/eagle3/dump_offline_data.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -64,32 +53,31 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
 
   # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc2
 
   # Step 4: Benchmark speculative decoding (VLLM backend)
@@ -113,7 +101,7 @@ pipeline:
   #   container: lmsysorg/sglang:v0.5.9
   #
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -121,14 +109,14 @@ pipeline:
       - --engine VLLM
       - --tp_size 4
       - --ep_size 1
-      - --runtime_params services/specdec_bench/tensorrt_llm_runtiime_params.yaml
       - --speculative_algorithm EAGLE3
       - --mtbench /hf-local/HuggingFaceH4/mt_bench_prompts/raw/question.jsonl
       - --concurrency 1
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml b/tools/launcher/examples/Qwen/Qwen3.5-27B/eagle3_quick_check.yaml
similarity index 54%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
rename to tools/launcher/examples/Qwen/Qwen3.5-27B/eagle3_quick_check.yaml
index 79f6f1dec00..b2745525d7b 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-27b.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3.5-27B/eagle3_quick_check.yaml
@@ -1,12 +1,13 @@
-# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-27B
+# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-27B.
 #
 # Qwen3.5-27B is a 27B dense VLM (text + vision). EAGLE3 uses only the text path.
 # BF16 weights: ~54 GB — fits on a single GB200 node (4 × 192 GB).
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3.5-27B/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-27B/eagle3_quick_check.yaml --yes
 
 job_name: Qwen3.5-27B_EAGLE3_quick_fail_check
 pipeline:
@@ -17,8 +18,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/Qwen/Qwen3.5-27B
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -28,14 +30,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -43,37 +46,37 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 4
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=4
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (VLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -88,7 +91,8 @@ pipeline:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml b/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/eagle3_quick_check.yaml
similarity index 57%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
rename to tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/eagle3_quick_check.yaml
index 5806acaf420..fb3d00b0cae 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/eagle3_quick_check.yaml
@@ -1,4 +1,4 @@
-# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-35B-A3B
+# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-35B-A3B.
 #
 # Qwen3.5-35B-A3B is a 35B MoE VLM (3B active, 256 experts, top-8, shared expert).
 # EAGLE3 uses only the text path.
@@ -8,10 +8,11 @@
 #   - task_2: check eagle_config.json and consider increasing intermediate_size
 #     since the draft head uses a dense layer by default.
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3.5-35B-A3B/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-35B-A3B/eagle3_quick_check.yaml --yes
 
 job_name: Qwen3.5-35B-A3B_EAGLE3_quick_fail_check
 pipeline:
@@ -22,8 +23,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/Qwen/Qwen3.5-35B-A3B
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -34,14 +36,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -49,37 +52,37 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (vLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -94,7 +97,8 @@ pipeline:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml b/tools/launcher/examples/Qwen/Qwen3.5-9B/eagle3_quick_check.yaml
similarity index 54%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
rename to tools/launcher/examples/Qwen/Qwen3.5-9B/eagle3_quick_check.yaml
index c2d374b2208..7d77fdf299b 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-9b.yaml
+++ b/tools/launcher/examples/Qwen/Qwen3.5-9B/eagle3_quick_check.yaml
@@ -1,12 +1,13 @@
-# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-9B
+# EAGLE3 quick-fail pipeline for Qwen/Qwen3.5-9B.
 #
 # Qwen3.5-9B is a 9B dense VLM (text + vision). EAGLE3 uses only the text path.
 # BF16 weights: ~18 GB — fits easily on a single GB200 node (4 × 192 GB).
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/Qwen/Qwen3.5-9B/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/Qwen/Qwen3.5-9B/eagle3_quick_check.yaml --yes
 
 job_name: Qwen3.5-9B_EAGLE3_quick_fail_check
 pipeline:
@@ -17,8 +18,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/Qwen/Qwen3.5-9B
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -28,14 +30,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -43,37 +46,37 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (vLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -88,7 +91,8 @@ pipeline:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
+      gpus_per_node: 4
       container: vllm/vllm-openai:latest
diff --git a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml b/tools/launcher/examples/StepFun/Step-3.5-Flash/eagle3_quick_check.yaml
similarity index 60%
rename from examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
rename to tools/launcher/examples/StepFun/Step-3.5-Flash/eagle3_quick_check.yaml
index 8c2fa397821..c98b75621d0 100644
--- a/examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml
+++ b/tools/launcher/examples/StepFun/Step-3.5-Flash/eagle3_quick_check.yaml
@@ -1,4 +1,4 @@
-# EAGLE3 quick-fail pipeline for stepfun-ai/Step-3.5-Flash
+# EAGLE3 quick-fail pipeline for stepfun-ai/Step-3.5-Flash.
 #
 # Step-3.5-Flash is a 197B MoE (11B active, 288 experts + 1 shared, top-8).
 # Has built-in MTP (multi-token prediction) head and sliding window attention.
@@ -12,10 +12,11 @@
 # MoE-specific notes:
 #   - Draft head intermediate_size may need tuning in eagle_config.json
 #
-# Container mount:
-#   /lustre:/lustre
-#   ${job_dir}/cicd/${experiment_id}:/scratchspace
-#   ${local_huggingface_hub}:/hf-local
+# All tasks share /scratchspace to pass artifacts between steps.
+#
+# Usage:
+#   uv run launch.py --yaml examples/StepFun/Step-3.5-Flash/eagle3_quick_check.yaml --yes
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/StepFun/Step-3.5-Flash/eagle3_quick_check.yaml --yes
 
 job_name: Step-3.5-Flash_EAGLE3_quick_fail_check
 pipeline:
@@ -26,8 +27,9 @@ pipeline:
   global_vars:
     hf_model: /hf-local/stepfun-ai/Step-3.5-Flash
 
+  # Step 1: Data synthesis via vLLM server
   task_0:
-    script: services/vllm/query.sh
+    script: common/vllm/query.sh
     args:
       - --model <<global_vars.hf_model>>
       - --tensor-parallel-size 4
@@ -38,14 +40,15 @@ pipeline:
     environment:
       - HF_LOCAL: /hf-local
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 2: Dump hidden states from target model
   task_1:
-    script: services/pipeline/eagle3/dump_offline_data_vllm.sh
+    script: common/eagle3/dump_offline_data_vllm.sh
     args:
       - --input-data /scratchspace/data
       - --output-dir /scratchspace/offline_hidden_states
@@ -53,38 +56,37 @@ pipeline:
     environment:
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: vllm/vllm-openai:latest
 
+  # Step 3: Train EAGLE3 draft head (offline, single task)
   task_2:
-    script: services/pipeline/eagle3/offline_training.sh
+    script: common/eagle3/train_eagle.sh
     args:
-      - --offline-data /scratchspace/offline_hidden_states
-      - --data_path None
-      - --mode eagle3
-      - --num_epochs 1
-      - --lr 3e-4
-      - --save_steps 500000
-      - --output_dir /scratchspace/eagle3
-      - --train_bs 8
-      - --training_seq_len 4096
-      - --eagle_config modules/Model-Optimizer/examples/speculative_decoding/eagle_config.json
-      - --disable_tqdm True
-      - --ar_validate_steps 500000
-    environment:
-      - HF_MODEL_CKPT: <<global_vars.hf_model>>
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/eagle3.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - data.offline_data_path=/scratchspace/offline_hidden_states
+      - training.output_dir=/scratchspace/eagle3
+      - training.num_epochs=1
+      - training.lr=3e-4
+      - training.save_steps=500000
+      - training.train_bs=8
+      - training.training_seq_len=4096
+      - training.disable_tqdm=true
+      - training.ar_validate_steps=500000
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4
       container: nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc10
 
+  # Step 4: Benchmark speculative decoding (vLLM backend)
   task_3:
-    script: services/specdec_bench/quick_check.sh
+    script: common/specdec_bench/quick_check.sh
     args:
       - --draft_model_dir /scratchspace/export
       - --draft_length 3
@@ -99,7 +101,7 @@ pipeline:
       - HF_LOCAL: /hf-local
       - HF_MODEL_CKPT: <<global_vars.hf_model>>
     slurm_config:
-      _factory_: "oci_hsg_slurm_factory"
+      _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
       gpus_per_node: 4

From 8ea6b0acc17506fa4ad323e8b09d83e4ea69878f Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 2 Jun 2026 14:03:09 -0700
Subject: [PATCH 22/22] Fix pre-commit auto-formatting (license headers,
 markdown blanks)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .claude/skills/eagle3-review-logs/SKILL.md        |  3 +++
 .claude/skills/eagle3-triage/SKILL.md             |  2 ++
 .../pipeline/eagle3/dump_offline_data.sh          | 15 +++++++++++++++
 .../pipeline/eagle3/dump_offline_data_hf.sh       | 15 +++++++++++++++
 .../pipeline/eagle3/dump_offline_data_vllm.sh     | 15 +++++++++++++++
 .../eagle3/eagle3_new_model_triage_guide.md       |  4 ++++
 .../pipeline/eagle3/eagle3_triage_chart.md        |  9 ++++++++-
 .../pipeline/eagle3/offline_training.sh           | 15 +++++++++++++++
 8 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/.claude/skills/eagle3-review-logs/SKILL.md b/.claude/skills/eagle3-review-logs/SKILL.md
index e9e519c5a5d..18027a69096 100644
--- a/.claude/skills/eagle3-review-logs/SKILL.md
+++ b/.claude/skills/eagle3-review-logs/SKILL.md
@@ -54,6 +54,7 @@ For each task log, check:
 Output a structured markdown report:
 
 ### Summary
+
 - Overall status: PASSED / FAILED / MIXED / PARTIAL
 - Task breakdown: e.g., task_0 TIMEOUT, task_1 FAIL, task_2 skipped, task_3 skipped
 
@@ -68,6 +69,7 @@ For each task (0–3):
 - Suggested fix: actionable step
 
 ### Warnings
+
 Non-fatal issues worth noting (near-OOM, tokenizer warnings, slow throughput).
 
 ## Step 4 — Suggest next steps
@@ -75,6 +77,7 @@ Non-fatal issues worth noting (near-OOM, tokenizer warnings, slow throughput).
 Based on results:
 
 - If a task failed due to a known issue, suggest the fix and how to re-run from that task:
+
   ```bash
   uv run launch.py --yaml examples/<Org>/<Model>/hf_offline_eagle3.yaml \
       pipeline.task_0.skip=true \
diff --git a/.claude/skills/eagle3-triage/SKILL.md b/.claude/skills/eagle3-triage/SKILL.md
index 7009b4d0523..ed2422e6f63 100644
--- a/.claude/skills/eagle3-triage/SKILL.md
+++ b/.claude/skills/eagle3-triage/SKILL.md
@@ -146,6 +146,7 @@ After diagnosis, provide:
 3. **How to re-run** — skip earlier successful steps by pointing to existing scratchspace artifacts
 
 To skip task_0 and task_1 and re-run from task_2:
+
 ```bash
 uv run launch.py --yaml examples/<Org>/<Model>/hf_offline_eagle3.yaml \
     pipeline.task_0.skip=true \
@@ -154,6 +155,7 @@ uv run launch.py --yaml examples/<Org>/<Model>/hf_offline_eagle3.yaml \
 ```
 
 To run only task_1 standalone (using existing task_0 data):
+
 ```bash
 uv run launch.py --yaml examples/<Org>/<Model>/hf_offline_eagle3.yaml \
     pipeline.task_0.skip=true \
diff --git a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh
index 2e55958edc8..c469833051e 100644
--- a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh
+++ b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data.sh
@@ -1,5 +1,20 @@
 #!/bin/bash
 
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 
 source ${SCRIPT_DIR}/../service_utils.sh
diff --git a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh
index 4172c61fd46..cd14ce07b7e 100755
--- a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh
+++ b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_hf.sh
@@ -1,5 +1,20 @@
 #!/bin/bash
 
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 
 source ${SCRIPT_DIR}/../../service_utils.sh
diff --git a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh
index bc0659fbf2e..aea508d6e57 100644
--- a/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh
+++ b/examples/speculative_decoding/pipeline/eagle3/dump_offline_data_vllm.sh
@@ -1,5 +1,20 @@
 #!/bin/bash
 
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 
 source ${SCRIPT_DIR}/../../service_utils.sh
diff --git a/examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md b/examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md
index 7d8ea0206b1..efaeba831fb 100644
--- a/examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md
+++ b/examples/speculative_decoding/pipeline/eagle3/eagle3_new_model_triage_guide.md
@@ -41,6 +41,7 @@ Experiment ID is printed as `cicd_<timestamp>`.
 ## Step 3: Check experiment output
 
 Experiment directory:
+
 ```
 experiments/cicd/cicd_<id>/
 ```
@@ -50,6 +51,7 @@ Each task has a directory `<JobName>_<N>/` containing:
 - `code/` — snapshot of the code at submission time
 
 Check logs:
+
 ```bash
 tail -100 experiments/cicd/cicd_<id>/<JobName>_<N>/sbatch_*.out
 ```
@@ -86,6 +88,7 @@ Common issues:
 ## Step 5: Applying fixes
 
 ### Repo fixes (for merged modelopt)
+
 Edit files in `/home/yeyu/Documents/TensorRT-Model-Optimizer/modelopt/torch/speculative/`.
 The key files:
 - `utils.py` — `load_vlm_or_llm()` for model loading
@@ -95,6 +98,7 @@ The key files:
 - `../export/plugins/hf_spec_export.py` — export logic
 
 ### Container patches (for pipeline)
+
 The TRT-LLM container has a pre-installed modelopt that can't be easily upgraded (CUDA build issues).
 Instead, runtime patches are applied in `offline_training.sh` using Python heredocs that find-and-replace
 exact code patterns in the installed library files. This is the same pattern used for speculators patches.
diff --git a/examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md b/examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md
index 450146d9cce..2b08bd5ecc9 100644
--- a/examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md
+++ b/examples/speculative_decoding/pipeline/eagle3/eagle3_triage_chart.md
@@ -90,6 +90,7 @@ error, root cause, and resolution.
 ### Per-Model Test Results
 
 #### Model: Ministral-3-8B-Instruct-2512-BF16
+
 - **Date tested:** 2026-05-26
 - **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-8b-from-task2.yaml`
 - **Experiments:** `cicd_1779312692` (dump), `cicd_1779829129` (train+bench), `cicd_1779901409` (retry w/ fixes)
@@ -117,6 +118,7 @@ error, root cause, and resolution.
 ---
 
 #### Model: gpt-oss-20b
+
 - **Date tested:** 2026-04-15
 - **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_gpt-oss-20b.yaml`
 - **Experiment:** `cicd_1776272530`
@@ -127,6 +129,7 @@ error, root cause, and resolution.
 - **Blocker:** Tokenizer loading. Needs special tokenizer setup or newer vLLM with OpenAI model support.
 
 #### Model: Qwen3.5-35B-A3B
+
 - **Date tested:** 2026-04-15
 - **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_qwen3.5-35b-a3b.yaml`
 - **Experiment:** `cicd_1776272531`
@@ -137,6 +140,7 @@ error, root cause, and resolution.
 - **Blocker:** Data synthesis too slow. Needs longer wall time or reduced dataset size. Server itself works.
 
 #### Model: Step-3.5-Flash
+
 - **Date tested:** 2026-04-15
 - **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_step-3.5-flash.yaml`
 - **Experiment:** `cicd_1776272532`
@@ -147,6 +151,7 @@ error, root cause, and resolution.
 - **Blocker:** Data synthesis hit time limit. Needs investigation of whether server started successfully.
 
 #### Model: MiniMax-M2.5
+
 - **Date tested:** 2026-04-15
 - **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_minimax-m2.5.yaml`
 - **Experiment:** `cicd_1776272524`
@@ -157,6 +162,7 @@ error, root cause, and resolution.
 - **Blocker:** Time limit on data synth + `trust_remote_code` needed for benchmark.
 
 #### Model: Ministral-3-14B
+
 - **Date tested:** 2026-04-15
 - **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_ministral-3-14b.yaml`
 - **Experiment:** `cicd_1776272522`
@@ -167,6 +173,7 @@ error, root cause, and resolution.
 - **Blocker:** vLLM engine fails to initialize. Same `mistral3` model type issue as 8B variant. Needs newer vLLM + transformers.
 
 #### Model: DeepSeek-V3.2
+
 - **Date tested:** 2026-04-15
 - **Config:** `examples/speculative_decoding/pipeline/eagle3/quick_fail_check_deepseek-v3.2.yaml`
 - **Experiment:** `cicd_1776275945`
@@ -178,7 +185,7 @@ error, root cause, and resolution.
 
 ---
 
-_Use the following template for additional models:_
+*Use the following template for additional models:*
 
 ```markdown
 #### Model: <name>
diff --git a/examples/speculative_decoding/pipeline/eagle3/offline_training.sh b/examples/speculative_decoding/pipeline/eagle3/offline_training.sh
index 8f1543a5b00..a52fb1c07de 100644
--- a/examples/speculative_decoding/pipeline/eagle3/offline_training.sh
+++ b/examples/speculative_decoding/pipeline/eagle3/offline_training.sh
@@ -1,5 +1,20 @@
 #!/bin/bash
 
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
 source ${SCRIPT_DIR}/../../service_utils.sh 2>/dev/null || true