From 0831d6556a3683e644e5497e0cdf0c1d81c950de Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 16 Jun 2026 11:00:52 -0700
Subject: [PATCH 1/5] specdec: per-conversation thinking-mode mix in data
 synthesis (for MiniMax-M3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds a --thinking-modes cycle to server_generate.py so synthetic conversations are
generated across a mix of thinking modes — e.g. MiniMax-M3's enabled/disabled/adaptive,
passed via chat_template_kwargs — so a DFlash/EAGLE draft trained on the data generalizes
across modes. Conversation i uses modes[i % len(modes)] for an even split; the mode is
recorded on each output record that carries a conversation. Empty (the default) sends no
thinking_mode, so behavior is unchanged for models without it.

distributed_generate/worker.sh: pass THINKING_MODES through to server_generate.py, and add
VLLM_SERVE_EXTRA_ARGS / SGLANG_SERVE_EXTRA_ARGS passthroughs for model-specific serve flags
(M3 needs --block-size 128 for MSA sparse attention and --language-model-only for text-only
synthesis; KV cache stays bf16 — M3's MSA fused kernel rejects fp8 KV).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../distributed_generate/worker.sh            | 14 +++++++++--
 .../scripts/server_generate.py                | 23 +++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/examples/speculative_decoding/distributed_generate/worker.sh b/examples/speculative_decoding/distributed_generate/worker.sh
index 97bf14c014a..01f01bb746b 100644
--- a/examples/speculative_decoding/distributed_generate/worker.sh
+++ b/examples/speculative_decoding/distributed_generate/worker.sh
@@ -20,10 +20,15 @@ BACKEND="$2"
 JOBS_PER_NODE="$3"
 SYSTEM_PROMPT="$4"
 
+# Optional model-specific serve flags via env, appended to the serve command. E.g. for
+# MiniMax-M3: VLLM_SERVE_EXTRA_ARGS="--block-size 128 --language-model-only" (--block-size
+# 128 is mandatory for M3's MSA sparse attention; --language-model-only skips the vision
+# encoder for text-only synthesis; KV cache stays bf16 — M3's MSA fused kernel rejects
+# fp8 KV).
 if [ "$BACKEND" == "vllm" ]; then
-    vllm serve /model/  --tensor-parallel-size 8 --served-model-name model --port 8000 --host 0.0.0.0 --trust-remote-code &
+    vllm serve /model/  --tensor-parallel-size 8 --served-model-name model --port 8000 --host 0.0.0.0 --trust-remote-code ${VLLM_SERVE_EXTRA_ARGS:-} &
 else
-    python3 -m sglang.launch_server --model-path /model --served-model-name model --tp 8 --port 8000 --host 0.0.0.0 --trust-remote-code &
+    python3 -m sglang.launch_server --model-path /model --served-model-name model --tp 8 --port 8000 --host 0.0.0.0 --trust-remote-code ${SGLANG_SERVE_EXTRA_ARGS:-} &
 fi
 # Wait for server to start up by polling the health endpoint
 echo "Waiting for server to start..."
@@ -59,6 +64,11 @@ if [ "$mpi_rank" -eq 0 ]; then
         if [ -n "$SYSTEM_PROMPT" ]; then
             cmd+=" --system_prompt $SYSTEM_PROMPT"
         fi
+        # Optional: cycle thinking modes for a mixed dataset (e.g. MiniMax-M3
+        # THINKING_MODES="enabled,disabled,adaptive").
+        if [ -n "${THINKING_MODES:-}" ]; then
+            cmd+=" --thinking-modes $THINKING_MODES"
+        fi
         echo "Running command: $cmd"
         eval $cmd
     done
diff --git a/examples/speculative_decoding/scripts/server_generate.py b/examples/speculative_decoding/scripts/server_generate.py
index 0fb71a0a0a1..46a87d70373 100644
--- a/examples/speculative_decoding/scripts/server_generate.py
+++ b/examples/speculative_decoding/scripts/server_generate.py
@@ -54,8 +54,20 @@
     "--log_empty_conversations", action="store_true", help="Log empty conversations"
 )
 parser.add_argument("--system_prompt", nargs="+", type=str, default="", help="System prompt")
+parser.add_argument(
+    "--thinking-modes",
+    type=str,
+    default="",
+    help="Comma-separated thinking modes to cycle through per conversation, passed to the "
+    "server via chat_template_kwargs (e.g. 'enabled,disabled,adaptive' for MiniMax-M3). "
+    "Conversation i uses modes[i %% len(modes)], giving an even mix across the dataset. "
+    "Empty (default) sends no thinking_mode, preserving behavior for models without it.",
+)
 args = parser.parse_args()
 
+# Parse the thinking-mode cycle; empty -> no thinking_mode injected.
+THINKING_MODES = [m.strip() for m in args.thinking_modes.split(",") if m.strip()]
+
 
 if args.data_path.endswith("jsonl"):
     with open(args.data_path) as f:
@@ -73,6 +85,14 @@ def generate_data(messages, idx, system_prompt):
     try:
         model_name = args.model
 
+        # Cycle thinking modes per conversation for an even mix across the dataset (e.g.
+        # MiniMax-M3 enabled/disabled/adaptive). Passed via chat_template_kwargs; empty
+        # list -> not sent.
+        thinking_mode = THINKING_MODES[idx % len(THINKING_MODES)] if THINKING_MODES else None
+        extra_body = (
+            {"chat_template_kwargs": {"thinking_mode": thinking_mode}} if thinking_mode else {}
+        )
+
         if args.chat:
             output_messages = []
 
@@ -105,6 +125,7 @@ def generate_data(messages, idx, system_prompt):
                         messages=output_messages,
                         max_tokens=args.max_tokens,
                         temperature=args.temperature,
+                        extra_body=extra_body,
                     )
                     if response.choices[0].finish_reason == "length":
                         break
@@ -124,6 +145,8 @@ def generate_data(messages, idx, system_prompt):
                 to_write = {"conversation_id": idx}
             else:
                 to_write = {"conversation_id": idx, "conversations": output_messages}
+                if thinking_mode:
+                    to_write["thinking_mode"] = thinking_mode
             with open(args.output_path, "a") as f:
                 # write in share gpt format
                 f.write(json.dumps(to_write) + "\n")

From cdc03a6ebb8d6882e34b8dd2333767eb2eb4afdb Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Tue, 16 Jun 2026 11:51:01 -0700
Subject: [PATCH 2/5] specdec(server_generate): accept OAI 'messages' input +
 OAI-native output

While wiring MiniMax-M3 synthesis (prompt set Speculative-Decoding-Dataset-v2 is OAI-format):
- Accept both 'conversations' (ShareGPT) and 'messages' (OAI) prompt datasets on input
  (previously a 'messages'-only sample raised KeyError: 'conversations').
- Add --output-format {oai,sharegpt} (default oai): emit the OpenAI standard
  {'messages': [{role, content}, ...]} instead of the legacy {'conversations': [...]}.
  Pass --output-format sharegpt for the old key.

Validated end-to-end on MiniMax-M3-MXFP8 (single-node TP8 H100): the 3-way thinking-mode
mix renders correctly (disabled -> direct answer; adaptive -> <mm:think> reasoning captured
in content, since no reasoning parser is used), and OAI in/out flows into the vLLM
hidden-state dump.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../scripts/server_generate.py                | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/examples/speculative_decoding/scripts/server_generate.py b/examples/speculative_decoding/scripts/server_generate.py
index 46a87d70373..6490055edc7 100644
--- a/examples/speculative_decoding/scripts/server_generate.py
+++ b/examples/speculative_decoding/scripts/server_generate.py
@@ -63,7 +63,17 @@
     "Conversation i uses modes[i %% len(modes)], giving an even mix across the dataset. "
     "Empty (default) sends no thinking_mode, preserving behavior for models without it.",
 )
+parser.add_argument(
+    "--output-format",
+    type=str,
+    default="oai",
+    choices=["oai", "sharegpt"],
+    help="Output chat format: 'oai' writes the OpenAI standard ({'messages': [{role, "
+    "content}, ...]}); 'sharegpt' writes the legacy {'conversations': [...]} key. Both "
+    "use role/content message dicts.",
+)
 args = parser.parse_args()
+MESSAGES_KEY = "messages" if args.output_format == "oai" else "conversations"
 
 # Parse the thinking-mode cycle; empty -> no thinking_mode injected.
 THINKING_MODES = [m.strip() for m in args.thinking_modes.split(",") if m.strip()]
@@ -144,7 +154,7 @@ def generate_data(messages, idx, system_prompt):
                     return
                 to_write = {"conversation_id": idx}
             else:
-                to_write = {"conversation_id": idx, "conversations": output_messages}
+                to_write = {"conversation_id": idx, MESSAGES_KEY: output_messages}
                 if thinking_mode:
                     to_write["thinking_mode"] = thinking_mode
             with open(args.output_path, "a") as f:
@@ -210,7 +220,17 @@ def generate_data(messages, idx, system_prompt):
     for idx, sample in enumerate(data):
         if idx in finished_ids:
             continue
-        future = executor.submit(generate_data, sample["conversations"], idx, system_prompt)
+        # Accept both ShareGPT ("conversations") and OAI-chat ("messages") prompt datasets
+        # (e.g. Speculative-Decoding-Dataset-v2 uses "messages"). generate_data already
+        # handles the from/value and role/content message shapes.
+        sample_messages = sample.get("conversations")
+        if sample_messages is None:
+            sample_messages = sample.get("messages")
+        if sample_messages is None:
+            raise KeyError(
+                f"sample {idx} has neither 'conversations' nor 'messages'; keys: {list(sample)}"
+            )
+        future = executor.submit(generate_data, sample_messages, idx, system_prompt)
         futures.append(future)
 
     for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):

From a02bf8f92b6d9102ebff1e8bee0e3fe6fa686699 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Mon, 22 Jun 2026 09:44:50 -0700
Subject: [PATCH 3/5] specdec(dump): enable vLLM hidden-state dump for
 MiniMax-M3 (OMNIML-4747)

Validated extract_hidden_states on MiniMax-M3-MXFP8 (single-node TP8 H100). Required M3
enablement in compute_hidden_states_vllm.py:
- --block-size (M3's MSA sparse attention mandates 128; default None elsewhere).
- --enforce-eager: M3's MSA Triton kernel (_gqa_sparse_fwd_kernel) JIT-recompiles per
  input shape; under cudagraph capture a recompile blows the executor RPC timeout and
  hangs the engine (sample_tokens timeout). Eager mode + a long VLLM_RPC_TIMEOUT fixes it.
- --language-model-only: skip the vision encoder for text-only dumps (M3 is VL).
- Read num_hidden_layers from text_config/llm_config for wrapped VL configs
  (MiniMaxM3VLConfig nests it; previously raised 'no num_hidden_layers attribute').

Output verified: per-conv .pt with input_ids / hidden_states (T,6144) / aux_hidden_states /
loss_mask (length-matched) / conversation_id.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../compute_hidden_states_vllm.py             | 37 ++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
index 77441f8f858..462de6afa65 100644
--- a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
+++ b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_vllm.py
@@ -110,6 +110,25 @@ def parse_args() -> argparse.Namespace:
         "--trust_remote_code", action="store_true", help="Trust remote code for HF models."
     )
     parser.add_argument("--tp", type=int, default=None, help="Tensor parallel size.")
+    parser.add_argument(
+        "--block-size",
+        type=int,
+        default=None,
+        help="KV cache block size. Some models require a specific value — e.g. MiniMax-M3's "
+        "MSA sparse attention mandates 128. Default (None) lets vLLM choose.",
+    )
+    parser.add_argument(
+        "--language-model-only",
+        action="store_true",
+        help="Skip the vision encoder for text-only dumps (multimodal models, e.g. MiniMax-M3).",
+    )
+    parser.add_argument(
+        "--enforce-eager",
+        action="store_true",
+        help="Disable CUDA graph / torch.compile. Needed for MiniMax-M3: its MSA sparse "
+        "kernel JIT-recompiles per shape and a recompile can exceed the executor RPC "
+        "timeout under cudagraph capture, hanging the engine.",
+    )
     parser.add_argument(
         "--debug-max-num-conversations", type=int, default=None, help="Limit conversations."
     )
@@ -168,7 +187,12 @@ def keep_conversation(entry):
     # Resolve the aux-layer indices and append the final-layer output. vLLM saves the
     # final (un-normed) hidden state when ``num_hidden_layers`` is passed as a layer id.
     config = AutoConfig.from_pretrained(args.model, trust_remote_code=args.trust_remote_code)
-    num_hidden_layers = getattr(config, "num_hidden_layers", None)
+    # Vision-language / wrapped configs (e.g. MiniMax-M3's MiniMaxM3VLConfig) nest the
+    # text model's layer count under text_config / llm_config rather than at the top level.
+    text_config = getattr(config, "text_config", None) or getattr(config, "llm_config", None)
+    num_hidden_layers = getattr(config, "num_hidden_layers", None) or getattr(
+        text_config, "num_hidden_layers", None
+    )
     if num_hidden_layers is None:
         raise ValueError(f"model config has no 'num_hidden_layers' attribute: {config}")
     aux_layer_ids = _resolve_aux_layers_standalone(
@@ -244,12 +268,23 @@ def keep_conversation(entry):
     storage_path.mkdir(parents=True, exist_ok=True)
     atexit.register(shutil.rmtree, storage_path, ignore_errors=True)
 
+    # Model-specific extras (e.g. MiniMax-M3 mandates block_size=128 for MSA sparse
+    # attention; --language-model-only skips the vision encoder for text-only dumps).
+    extra_llm_kwargs = {}
+    if args.block_size is not None:
+        extra_llm_kwargs["block_size"] = args.block_size
+    if args.language_model_only:
+        extra_llm_kwargs["language_model_only"] = True
+    if args.enforce_eager:
+        extra_llm_kwargs["enforce_eager"] = True
+
     llm = LLM(
         model=args.model,
         tensor_parallel_size=tp,
         max_model_len=args.max_seq_len,
         trust_remote_code=args.trust_remote_code,
         enable_chunked_prefill=False,  # required by extract_hidden_states
+        **extra_llm_kwargs,
         # With prefix caching on, vLLM serves shared prefixes from cache in block-sized
         # chunks and the hidden-state connector only emits the freshly-computed suffix, so
         # the dumped hidden_states come out short by N*block_size vs the full input_ids /

From 8961d0358ebe51dce8394b577482150682baa18c Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Mon, 22 Jun 2026 11:04:31 -0700
Subject: [PATCH 4/5] specdec(recipe): add MiniMax-M3 DFlash offline recipe +
 train template (OMNIML-4747)

2-step offline DFlash recipe for MiniMax-M3 (427B VL-MoE), mirroring MiniMax-M2.7-DFlash:
- hf_offline_dflash.yaml: dump (vLLM extract_hidden_states, MXFP8 single-node TP8) + train
  (FakeBaseModel on bf16). M3-specific: --block-size 128 (MSA), --language-model-only,
  --enforce-eager + VLLM_RPC_TIMEOUT=1800000 (avoid MSA Triton-kernel JIT RPC-timeout hang),
  seq-len 8192 end-to-end, mask token 200061 (200054 is a real special token in M3),
  OVERRIDE_TRANSFORMERS 4.52.4, export-YaRN original_max_position 8192 / factor 24
  (tunable; 128 for full 1M).
- chat_template_train.jinja: M3 chat template with {% generation %} wrapping the assistant
  turn (think + content + tool_calls) for answer_only_loss; header + eos sit outside the
  span, matching the M2.7 convention. Thinking-mode handling preserved verbatim. Validated:
  generation spans cover exactly the assistant outputs across multi-turn + no-think turns.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../chat_template_train.jinja                 | 256 ++++++++++++++++++
 .../MiniMax-M3-DFlash/hf_offline_dflash.yaml  | 125 +++++++++
 2 files changed, 381 insertions(+)
 create mode 100644 tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/chat_template_train.jinja
 create mode 100644 tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml

diff --git a/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/chat_template_train.jinja b/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/chat_template_train.jinja
new file mode 100644
index 00000000000..8d2e37cae3e
--- /dev/null
+++ b/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/chat_template_train.jinja
@@ -0,0 +1,256 @@
+{# MiniMax-M3 chat template with {% generation %} tags for answer_only_loss training.
+   Adapted from https://huggingface.co/MiniMaxAI/MiniMax-M3/blob/main/chat_template.jinja
+   with {% generation %} / {% endgeneration %} wrapping the assistant turn's output
+   (think + content + tool_calls), matching the MiniMax-M2.7-DFlash convention: the
+   ']~b]ai\n' header and the trailing eos sit OUTSIDE the generation span, so the loss
+   mask covers only what the model produces. Thinking-mode handling is preserved verbatim
+   so dumps reflect the same enabled/disabled/adaptive mix used during synthesis.
+#}
+{# ---------- special token variables ---------- #}
+{%- set ns_token               = ']<]minimax[>['                  -%}
+{%- set bod_token              = ']~!b['                          -%}
+{%- set bos_token              = ']~b]'                           -%}
+{%- set eos_token              = '[e~['                           -%}
+{%- set toolcall_begin_token   = ns_token ~ '<tool_call>'         -%}
+{%- set toolcall_end_token     = ns_token ~ '</tool_call>'        -%}
+{%- set think_begin_token      = '<mm:think>'                     -%}
+{%- set think_end_token        = '</mm:think>'                    -%}
+{%- set image_token            = ']<]image[>['                    -%}
+{%- set video_token            = ']<]video[>['                    -%}
+{#- Thinking mode: "enabled" / "disabled" / "adaptive" / not defined -#}
+{#- Recursive XML renderer for tool_call arguments ======================== -#}
+{#- None values are intentionally skipped in mapping iteration so that
+    `<key>null</key>` (which would round-trip to the literal string "null")
+    never appears in the rendered tool_call. The convention is: omit the
+    field entirely. The top-level `_args` loop applies the same rule.
+    The `val is none` branch below is a safety net only — upstream cleaning
+    (drop_none_in_tool_arguments) should ensure no None ever reaches here. -#}
+{%- macro to_xml(val, ns) -%}
+{%- if val is mapping -%}
+{%- for k, v in val.items() if v is not none -%}
+{{ ns }}<{{ k }}>{{ to_xml(v, ns) }}{{ ns }}</{{ k }}>
+{%- endfor -%}
+{%- elif val is iterable and val is not string -%}
+{%- for item in val -%}
+{{ ns }}<item>{{ to_xml(item, ns) }}{{ ns }}</item>
+{%- endfor -%}
+{%- elif val is none -%}
+{#- Should be unreachable when upstream cleaning is applied. -#}
+{%- elif val is boolean -%}
+{{ val | tojson }}
+{%- else -%}
+{{ val }}
+{%- endif -%}
+{%- endmacro -%}
+{#- Tool Rendering Functions ============================================== -#}
+{%- macro render_tool_namespace(namespace_name, tool_list) -%}
+{%- for tool in tool_list -%}
+<tool>{{ tool.function | tojson(ensure_ascii=False) }}</tool>
+{% endfor -%}
+{%- endmacro -%}
+{%- macro visible_text(content) -%}
+    {%- if content is string -%}
+        {{ content }}
+    {%- elif content is iterable and content is not mapping -%}
+        {%- for item in content -%}
+            {%- if item is mapping and item.type == 'text' -%}
+                {{- item.text }}
+            {%- elif item is mapping and item.type == 'image' -%}
+                {{- image_token }}
+            {%- elif item is mapping and item.type == 'video' -%}
+                {{- video_token}}
+            {%- elif item is string -%}
+                {{- item }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- elif content is none -%}
+        {{- '' }}
+    {%- else -%}
+        {{- content }}
+    {%- endif -%}
+{%- endmacro -%}
+{#- System Message Construction ============================================ -#}
+{%- macro build_system_message(system_message) -%}
+    {%- if system_message and system_message.content -%}
+        {{- visible_text(system_message.content) }}
+    {%- else -%}
+        {{- 'Your model version is MiniMax-M3, developed by MiniMax. Knowledge cutoff: January 2026. Founded in early 2022, MiniMax is a global AI foundation model company committed to advancing the frontiers of AI towards AGI.' }}
+    {%- endif -%}
+
+    {#- Thinking mode instructions -#}
+    {{- '\n\n<thinking_instructions>\n' }}
+    {{- 'You have a thinking capability that allows you to reason step by step before responding. When thinking is enabled, wrap your reasoning in ' ~ think_begin_token ~ think_end_token ~ ' tags before your response. When thinking is disabled, begin your response directly after the ' ~ think_end_token ~ ' prefix. When thinking is adaptive, decide on your own whether to think for the current turn.\n' }}
+    {%- if thinking_mode is defined -%}
+        {%- if thinking_mode == "enabled" -%}
+            {{- 'Current thinking mode: enabled. You MUST think step by step before every response, including after receiving function/tool results.\n' }}
+        {%- elif thinking_mode == "disabled" -%}
+            {{- 'Current thinking mode: disabled. Do not output any thinking process.\n' }}
+        {%- elif thinking_mode == "adaptive" -%}
+            {{- 'Current thinking mode: adaptive. You are encouraged to think for complex decision-making, multi-step reasoning, or when analyzing function/tool results.\n' }}
+        {%- endif -%}
+    {%- else -%}
+        {{- 'Current thinking mode: adaptive. You are encouraged to think for complex decision-making, multi-step reasoning, or when analyzing function/tool results.\n' }}
+    {%- endif -%}
+    {{- '</thinking_instructions>' }}
+{%- endmacro -%}
+{%- macro build_developer_message(developer_message) -%}
+    {%- if developer_message and developer_message.content -%}
+        {{- visible_text(developer_message.content) }}
+    {%- else -%}
+        {%- if model_identity is not defined -%}
+            {%- set model_identity = "You are a helpful assistant." -%}
+        {%- endif -%}
+        {{- model_identity }}
+    {%- endif -%}
+{%- endmacro -%}
+{#- Main Template Logic ================================================= -#}
+{#- Role mapping: root -> system sp (high priority), system/developer -> developer sp (low priority) -#}
+{%- set system_message = none -%}
+{%- set developer_message = none -%}
+{%- set conversation_messages = messages -%}
+{%- if messages and messages[0].role == "root" -%}
+    {%- set system_message = messages[0] -%}
+    {%- set conversation_messages = messages[1:] -%}
+    {%- if conversation_messages and conversation_messages[0].role in ["system", "developer"] -%}
+        {%- set developer_message = conversation_messages[0] -%}
+        {%- set conversation_messages = conversation_messages[1:] -%}
+    {%- endif -%}
+{%- elif messages and messages[0].role in ["system", "developer"] -%}
+    {%- set developer_message = messages[0] -%}
+    {%- set conversation_messages = messages[1:] -%}
+{%- endif -%}
+{#- Render system sp (higher priority, root role only) -#}
+{{- bod_token ~ bos_token ~ 'system' ~ '\n' }}
+{{- build_system_message(system_message) }}
+{{- eos_token ~ '\n' }}
+
+{#- Render developer sp (lower priority: system/developer role + tools) -#}
+{{- bos_token ~ 'developer' ~ '\n' }}
+{{- build_developer_message(developer_message) }}
+{%- if tools -%}
+    {{- '\n\n' ~ '# Tools' ~ '\n' ~ 'You may call one or more tools to assist with the user query.\nHere are the tools available in JSONSchema format:' ~ '\n' }}
+    {{- '\n' ~ '<tools>' ~ '\n' }}
+    {{- render_tool_namespace("functions", tools) }}
+    {{- '</tools>' ~ '\n\n' }}
+    {{- 'To call tools, wrap all invocations in a single ' ~ toolcall_begin_token ~ toolcall_end_token ~ ' block. Parameter values containing nested objects or arrays are recursively expanded into XML elements. Example:\n' }}
+    {{- '\n' ~ toolcall_begin_token ~ '\n' }}
+    {{- ns_token + '<invoke name="tool-name-1">' }}
+    {{- ns_token + '<param-1>value-1' + ns_token + '</param-1>' }}
+    {{- ns_token + '<param-2>' }}
+    {{- ns_token + '<item>' }}
+    {{- ns_token + '<key-a>val-a' + ns_token + '</key-a>' }}
+    {{- ns_token + '<key-b>val-b' + ns_token + '</key-b>' }}
+    {{- ns_token + '</item>' }}
+    {{- ns_token + '</param-2>' }}
+    {{- ns_token + '</invoke>\n' }}
+    {{- ns_token + '<invoke name="tool-name-2">' }}
+    {{- ns_token + '<param-1>value-1' + ns_token + '</param-1>' }}
+    {{- ns_token + '</invoke>\n' }}
+    {{- toolcall_end_token }}
+{%- endif -%}
+{{- eos_token ~ '\n' }}
+
+{#- Render messages -#}
+{%- set last_tool_call = namespace(name=none) -%}
+{%- for message in conversation_messages -%}
+    {%- if message.role == 'assistant' -%}
+        {{- bos_token ~ 'ai' ~ '\n' }}
+        {%- generation -%}
+        {%- set reasoning_content = '' %}
+        {%- set content = visible_text(message.content) %}
+        {%- if message.reasoning_content is string %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if think_end_token in content %}
+                {%- set reasoning_content = content.split(think_end_token)[0].strip('\n').split(think_begin_token)[-1].strip('\n') %}
+                {%- set content = content.split(think_end_token)[-1].strip('\n') %}
+            {%- endif %}
+        {%- endif %}
+
+        {%- if reasoning_content -%}
+            {#- Render thinking for every assistant turn (all-turn visible) -#}
+            {{- think_begin_token ~ reasoning_content ~ think_end_token }}
+        {%- else -%}
+            {#- No thinking rendered → prefix with think_end_token -#}
+            {{- think_end_token }}
+        {%- endif -%}
+
+        {%- if content -%}
+            {{- content }}
+        {%- endif -%}
+        {%- if message.tool_calls -%}
+            {{- toolcall_begin_token ~ '\n' }}
+
+            {%- for tool_call in message.tool_calls -%}
+                {%- if tool_call.function -%}
+                    {%- set tool_call = tool_call.function -%}
+                {%- endif -%}
+{{- ns_token + '<invoke name="' + tool_call.name + '">' }}
+{%- set _args = tool_call.arguments -%}
+{%- for k, v in _args.items() if v is not none %}
+{{- ns_token + '<' + k + '>' -}}
+{{- to_xml(v, ns_token) -}}
+{{- ns_token + '</' + k + '>' }}
+{%- endfor -%}
+{{- ns_token + '</invoke>' ~ '\n' }}
+            {%- endfor -%}
+
+            {{- toolcall_end_token }}
+            {%- if message.tool_calls[-1].function -%}
+                {%- set last_tool_call.name = message.tool_calls[-1].function.name -%}
+            {%- else -%}
+                {%- set last_tool_call.name = message.tool_calls[-1].name -%}
+            {%- endif -%}
+        {%- else -%}
+            {%- set last_tool_call.name = none -%}
+        {%- endif -%}
+        {%- endgeneration -%}
+        {{- eos_token ~ '\n' }}
+
+    {%- elif message.role == 'tool' -%}
+        {%- if last_tool_call.name is none -%}
+            {{- raise_exception("Message has tool role, but there was no previous assistant message with a tool call!") }}
+        {%- endif -%}
+        {%- if loop.first or (conversation_messages[loop.index0 - 1].role != 'tool') -%}
+            {{- bos_token ~ 'tool' }}
+        {%- endif -%}
+        {{- '\n<response>' }}
+        {%- if message.content is string -%}
+            {{- message.content }}
+        {%- else -%}
+            {%- for tr in message.content -%}
+                {%- if tr is mapping and tr.type is defined and tr.type == 'image' -%}
+                    {{- image_token }}
+                {%- elif tr is mapping and tr.type is defined and tr.type == 'video' -%}
+                    {{- video_token }}
+                {%- else -%}
+                    {{- tr.output if tr.output is defined else (tr.text if tr.type == 'text' and tr.text is defined else tr) }}
+                {%- endif -%}
+            {%- endfor -%}
+        {%- endif -%}
+        {{- '</response>' }}
+        {%- if loop.last or (conversation_messages[loop.index0 + 1].role != 'tool') -%}
+            {{- eos_token ~ '\n' -}}
+        {%- endif -%}
+
+    {%- elif message.role == 'user' -%}
+        {{- bos_token ~ 'user' ~ '\n' }}
+        {{- visible_text(message.content) }}
+        {{- eos_token ~ '\n' }}
+    {%- endif -%}
+{%- endfor -%}
+
+{#- Generation prompt -#}
+{%- if add_generation_prompt -%}
+{{- bos_token ~ 'ai' ~ '\n' }}
+{%- if thinking_mode is defined and thinking_mode == "disabled" -%}
+    {{- think_end_token }}
+{%- elif thinking_mode is defined and thinking_mode == "adaptive" -%}
+    {#- adaptive: no prefix, let model decide -#}
+{%- elif thinking_mode is defined and thinking_mode == "enabled" -%}
+    {#- enabled: open the think block with think_begin_token -#}
+    {{- think_begin_token }}
+{%- else -%}
+    {#- not defined or unrecognized: no prefix, same as adaptive -#}
+{%- endif -%}
+{%- endif -%}
diff --git a/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml b/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml
new file mode 100644
index 00000000000..a0e64e7fd91
--- /dev/null
+++ b/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml
@@ -0,0 +1,125 @@
+# DFlash offline speculative decoding training for MiniMax-M3 (427B VL-MoE, 26B active).
+#
+# Offline is the chosen path for M3 — online FSDP2 training streams the 427B base forward
+# at every step and is too slow at scale. The 2-step pipeline (mirrors
+# MiniMax-M2.7-DFlash/hf_offline_dflash.yaml) is:
+#   task_0: Dump base-model hidden states once via vLLM extract_hidden_states.
+#   task_1: Train the DFlash draft on the dump (FakeBaseModel — loads only lm_head +
+#           embed_tokens, not the full 427B base).
+#
+# M3-specific notes (differ from M2.7), all validated 2026-06-22:
+#   * Dump serves MiniMax-M3-MXFP8 (NVIDIA-published quant) single-node TP8 on H100. M3
+#     is not in stable vLLM yet -> image vllm/vllm-openai:minimax-m3.
+#   * --block-size 128 is MANDATORY for M3's MSA sparse attention.
+#   * --language-model-only skips the vision encoder (text-only synth/dump).
+#   * --enforce-eager + VLLM_RPC_TIMEOUT=1800000 are REQUIRED: M3's MSA Triton kernel
+#     (_gqa_sparse_fwd_kernel) JIT-recompiles per input shape; under cudagraph capture a
+#     recompile blows the executor RPC timeout (sample_tokens timeout -> EngineDead hang).
+#     Eager mode + a long RPC timeout avoids the hang. KV cache stays bf16 (M3's MSA fused
+#     kernel rejects fp8 KV).
+#   * Training FakeBaseModel reads lm_head + embed_tokens from the bf16 M3 (real weights;
+#     these tensors are not what MXFP8 quantizes, so dump@MXFP8 / train@bf16 logits stay
+#     consistent). Per Ye Yu: adhere to published bf16/MXFP8 ckpts, do not self-quantize.
+#   * Sequence length 8192 (not M2.7's 4096) end-to-end: synth, dump, training — captures
+#     full <mm:think> reasoning across the enabled/disabled/adaptive mode mix.
+#
+# Reference: "DFlash: Block Diffusion for Flash Speculative Decoding" (arXiv:2602.06036)
+#
+# Usage:
+#   uv run slurm.py --yaml modules/Model-Optimizer/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml --yes
+
+job_name: MiniMax-M3-DFlash_offline
+pipeline:
+  global_vars:
+    # bf16 base — used by training's FakeBaseModel (lm_head + embed_tokens) and tokenizer.
+    hf_model: /hf-local/MiniMaxAI/MiniMax-M3
+    # NVIDIA-published MXFP8 quant — used only to serve the dump single-node TP8 on H100.
+    dump_model: /hf-local/MiniMaxAI/MiniMax-M3-MXFP8
+
+  # Step 1: Dump base-model hidden states via vLLM extract_hidden_states (TP=8, MXFP8).
+  task_0:
+    script: common/eagle3/dump_offline_data_vllm.sh
+    args:
+      # Synthetic data from the M3 synth campaign (default.jsonl, 3-way thinking-mode mix),
+      # cleaned + uploaded. Update the suffix once the cleaned set is published.
+      - --input-data /hf-local/modelopt/MiniMax-M3-synthetic-data
+      - --output-dir /scratchspace/dflash_minimax_m3_hidden_states
+      # Must match the draft model's num_hidden_layers (recipe default: 5).
+      - --aux-layers dflash
+      - --answer-only-loss
+      - --chat-template examples/MiniMax/MiniMax-M3-DFlash/chat_template_train.jinja
+      - --max-seq-len 8192
+      - --tp 8
+      # M3 MSA requirements (see header).
+      - --block-size 128
+      - --language-model-only
+      - --enforce-eager
+    environment:
+      - HF_MODEL_CKPT: <<global_vars.dump_model>>
+      - TRUST_REMOTE_CODE: "1"
+      # Survive MSA Triton-kernel JIT recompiles without an executor RPC timeout.
+      - VLLM_RPC_TIMEOUT: "1800000"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 8
+      container: vllm/vllm-openai:minimax-m3
+
+  # Step 2: Train DFlash offline on the dumped hidden states. FakeBaseModel avoids loading
+  # the full 427B — only lm_head + embed_tokens are read from the bf16 checkpoint.
+  task_1:
+    script: common/specdec/dflash_online_training.sh
+    args:
+      - --config modules/Model-Optimizer/modelopt_recipes/general/speculative_decoding/dflash.yaml
+      - model.model_name_or_path=<<global_vars.hf_model>>
+      - model.trust_remote_code=true
+      - model.use_fake_base_for_offline=true
+      - data.mode=offline
+      - data.offline_data_path=/scratchspace/dflash_minimax_m3_hidden_states
+      - data.chat_template=examples/MiniMax/MiniMax-M3-DFlash/chat_template_train.jinja
+      - training.output_dir=/scratchspace/dflash_minimax_m3_offline
+      - training.num_train_epochs=10
+      # bs=1 @ 8192 keeps the activation footprint of M2.7's bs=2 @ 4096 (bs*seqlen equal).
+      - training.per_device_train_batch_size=1
+      - training.learning_rate=1.2e-3
+      - training.warmup_steps=100
+      - training.training_seq_len=8192
+      - training.logging_steps=100
+      - training.save_steps=400
+      - training.disable_tqdm=true
+      - training.dp_shard_size=1
+      - training.answer_only_loss=true
+      - training.ddp_timeout=3600
+      - training.bf16=false
+      - dflash.dflash_self_logit_distillation=true
+      - dflash.dflash_block_size=8
+      - dflash.dflash_num_anchors=512
+      - dflash.dflash_loss_decay_factor=4.0
+      - dflash.dflash_architecture_config.num_hidden_layers=5
+      # Mask token id: in M3, 200054 is a real special token, so the first unused reserved
+      # embedding row is 200061 (M2.7 used 200054).
+      - dflash.dflash_mask_token_id=200061
+      # YaRN rope_scaling injected at EXPORT time only (config.json field; draft weights
+      # unchanged) -> tunable per export. original_max_position_embeddings = training_seq_len
+      # (8192). Default factor 24 -> 8192*24 = 196608 served context (matches M2.7's target).
+      # For M3's full 1M context use factor 128 (8192*128 = 1048576).
+      - dflash.dflash_export_rope_scaling.type=yarn
+      - dflash.dflash_export_rope_scaling.factor=24.0
+      - dflash.dflash_export_rope_scaling.original_max_position_embeddings=8192
+      - dflash.dflash_export_rope_scaling.beta_fast=1.0
+      - dflash.dflash_export_rope_scaling.beta_slow=1.0
+      - dflash.dflash_export_rope_scaling.mscale=1.0
+      - dflash.dflash_export_rope_scaling.mscale_all_dim=1.0
+    environment:
+      - NUM_NODES: "8"
+      # Offline training uses a lightweight FakeBaseModel, so plain DDP suffices (no
+      # ACCELERATE_CONFIG / FSDP2 patches). OVERRIDE_TRANSFORMERS pins 4.52.4 for the
+      # MiniMax-M3 config load.
+      - OVERRIDE_TRANSFORMERS: "4.52.4"
+      - MIXED_PRECISION: "no"
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 8
+      ntasks_per_node: 1
+      gpus_per_node: 8

From 0aa2880fb7cc806650303dec1069a30437e9d830 Mon Sep 17 00:00:00 2001
From: Ye Yu <yeyu@nvidia.com>
Date: Mon, 22 Jun 2026 11:07:25 -0700
Subject: [PATCH 5/5] specdec(recipe): default MiniMax-M3 DFlash export YaRN to
 full 1M (factor 128)

original_max_position_embeddings=8192 (training seq-len) x factor 128 = 1048576 = M3's
full 1M context. Export-time tunable (factor 24 -> 196608 for the M2.7-equivalent target).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Ye Yu <yeyu@nvidia.com>
---
 .../MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml        | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml b/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml
index a0e64e7fd91..bc848231cc2 100644
--- a/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml
+++ b/tools/launcher/examples/MiniMax/MiniMax-M3-DFlash/hf_offline_dflash.yaml
@@ -102,10 +102,10 @@ pipeline:
       - dflash.dflash_mask_token_id=200061
       # YaRN rope_scaling injected at EXPORT time only (config.json field; draft weights
       # unchanged) -> tunable per export. original_max_position_embeddings = training_seq_len
-      # (8192). Default factor 24 -> 8192*24 = 196608 served context (matches M2.7's target).
-      # For M3's full 1M context use factor 128 (8192*128 = 1048576).
+      # (8192). factor 128 -> 8192*128 = 1048576 = M3's full 1M context. (Use factor 24 ->
+      # 196608 to match M2.7's served target instead.)
       - dflash.dflash_export_rope_scaling.type=yarn
-      - dflash.dflash_export_rope_scaling.factor=24.0
+      - dflash.dflash_export_rope_scaling.factor=128.0
       - dflash.dflash_export_rope_scaling.original_max_position_embeddings=8192
       - dflash.dflash_export_rope_scaling.beta_fast=1.0
       - dflash.dflash_export_rope_scaling.beta_slow=1.0