diff --git a/src/dlm/base_models/registry.py b/src/dlm/base_models/registry.py index c3e66bb8..1e15ced4 100644 --- a/src/dlm/base_models/registry.py +++ b/src/dlm/base_models/registry.py @@ -337,6 +337,17 @@ size_gb_fp16=0.27, context_length=8_192, recommended_seq_len=1024, + capability_warning=( + "SmolLM2-135M is below dlm's empirical training floor. Audit 13 " + "follow-up findings 02 + 05 measured this base actively " + "degrading general-chat capability under every LoRA recipe " + "tested (PROSE-only, INSTRUCTION-only, mixed). Adapters " + "memorize trained content but fail to generalize and bleed " + "domain-specific tokens into unrelated queries. Suitable for " + "style-transfer demos and pipeline smoke tests; for any " + "specialty-knowledge task use a base ≥ 1B params (e.g. " + "smollm2-1.7b, qwen2.5-coder-1.5b, llama-3.2-1b)." + ), ), BaseModelSpec( key="smollm2-360m", diff --git a/src/dlm/base_models/schema.py b/src/dlm/base_models/schema.py index 8a3b852e..9e5e2794 100644 --- a/src/dlm/base_models/schema.py +++ b/src/dlm/base_models/schema.py @@ -149,6 +149,14 @@ class BaseModelSpec(BaseModel): provenance_url: str | None = None provenance_match_text: str | None = None + # Optional curated warning surfaced at `dlm train` time when this + # base is selected. Populate when the base has a known limitation + # that's not derivable from `params` / `architecture` alone — e.g. + # SmolLM2-135M's measured architectural floor (audit 13 follow-up + # findings 02 + 05: actively degrades base capability under any + # LoRA recipe). Empty string is treated as "no warning". + capability_warning: str | None = Field(default=None, min_length=1) + # Modality + multi-modal preprocessing (schema v10 + v11, plus the # additive `text-moe` discriminator). # Text-family bases leave `modality in {"text", "text-moe"}` diff --git a/src/dlm/cli/commands.py b/src/dlm/cli/commands.py index 6bac022e..0ee8f0a1 100644 --- a/src/dlm/cli/commands.py +++ b/src/dlm/cli/commands.py @@ -853,6 +853,13 @@ def train_cmd( "Acceptance will be persisted in the store manifest." ) raise typer.Exit(code=1) from exc + # `getattr` so test fixtures stubbing `spec` as a `SimpleNamespace` + # without this field still pass; real registry entries always have it. + capability_warning = getattr(spec, "capability_warning", None) + if capability_warning: + console.print( + f"[yellow]warning:[/yellow] base [bold]{spec.key}[/bold]: {capability_warning}" + ) # Detect the DDP world_size set by `accelerate launch` # (WORLD_SIZE env var) and thread it into the doctor so the plan's # effective_batch_size reflects the rank count. Single-process @@ -3575,6 +3582,20 @@ def cache_show_cmd( cache = TokenizedCache.open(store.tokenized_cache_dir) last = _queries.latest_tokenization(store.root) + # The tokenized cache only fires for runs whose frontmatter declares + # `training.sources` (directive-sourced rows are where the tokenize + # cost dominates; in-body sections go through TRL's tokenizer). + # Surface this so an empty cache on an in-body-only doc doesn't + # look like a bug. 
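+ # Illustrative shape only: a doc opts into the cache with a frontmatter
+ # block like the `training.sources` entries exercised in the serializer
+ # tests (the keys shown here are examples and schema-version dependent):
+ #
+ #   training:
+ #     sources:
+ #       - path: ./notes/
+ #         max_bytes_per_file: 4096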
+ has_sources = parsed.frontmatter.training.sources is not None + cache_enabled = parsed.frontmatter.training.cache.enabled + if not has_sources: + cache_status: str | None = "not used (doc has no `training.sources` directive)" + elif not cache_enabled: + cache_status = "disabled (training.cache.enabled = false)" + else: + cache_status = None + payload: dict[str, object] = { "dlm_id": parsed.frontmatter.dlm_id, "cache_path": str(store.tokenized_cache_dir), @@ -3582,6 +3603,7 @@ def cache_show_cmd( "bytes": cache.total_bytes, "last_run_hit_rate": last.hit_rate if last else None, "last_run_id": last.run_id if last else None, + "cache_status": cache_status, } if json_out: _sys.stdout.write(_json.dumps(payload, indent=2) + "\n") @@ -3589,6 +3611,8 @@ def cache_show_cmd( out_console.print(f"[bold]Cache for {parsed.frontmatter.dlm_id}[/bold]") out_console.print(f" path: {store.tokenized_cache_dir}") + if cache_status is not None: + out_console.print(f" status: [yellow]{cache_status}[/yellow]") out_console.print(f" entries: {cache.entry_count}") out_console.print(f" size: {_human_size(cache.total_bytes)}") if last is not None: @@ -3596,7 +3620,7 @@ def cache_show_cmd( f" last-run hit rate: {last.hit_rate:.1%} " f"({last.cache_hits}/{last.cache_hits + last.cache_misses})" ) - else: + elif cache_status is None: out_console.print(" last-run hit rate: [dim]no tokenization runs yet[/dim]") diff --git a/src/dlm/doc/migrate.py b/src/dlm/doc/migrate.py index 5bef5801..d9530980 100644 --- a/src/dlm/doc/migrate.py +++ b/src/dlm/doc/migrate.py @@ -97,7 +97,17 @@ def migrate_file( # Validate post-migration dict against the current schema so a bad # migrator can't silently smear garbage into the document. fm = DlmFrontmatter.model_validate(migrated) - new_text = _rejoin(fm, body_text) + # Preserve the user's *originally explicit* fields across migration + # by collecting their dotted paths from the post-migration dict and + # passing them to the serializer as force-emit overrides. Without + # this, a v1 doc with `lora_r: 8` (matching the current schema + # default) would silently lose the explicit pin and inherit any + # future default change. The contract `CLAUDE.md` calls "additive + # identity" is honored at *intent* level, not just behavior level. + from dlm.doc.serializer import collect_dict_field_paths + + force_emit = collect_dict_field_paths(migrated) + new_text = _rejoin(fm, body_text, force_emit_paths=force_emit) if dry_run: return MigrationResult( @@ -152,7 +162,12 @@ def _split_for_migrate(text: str, *, path: Path) -> tuple[str, str]: ) -def _rejoin(fm: DlmFrontmatter, body_text: str) -> str: +def _rejoin( + fm: DlmFrontmatter, + body_text: str, + *, + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> str: """Re-assemble a `.dlm` file from a migrated frontmatter + raw body. Preserves the body verbatim (migration never touches section content); @@ -165,7 +180,7 @@ def _rejoin(fm: DlmFrontmatter, body_text: str) -> str: # section serialization by handing an empty sections tuple and # concatenating the raw body manually. 
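+ # Illustrative sketch of the override set handed to `serialize` below:
+ # a v1 doc that spelled out `training.lora_r: 8` yields something like
+ # frozenset({("training",), ("training", "lora_r")}), so the explicit
+ # pin is re-emitted even though it matches the current schema default.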
empty = ParsedDlm(frontmatter=fm, sections=_empty_sections()) - header = serialize(empty) # always ends with "\n" + header = serialize(empty, force_emit_paths=force_emit_paths) # always ends with "\n" # Normalize leading/trailing whitespace on the body to match the # canonical layout: exactly one blank line between `---\n` closer diff --git a/src/dlm/doc/serializer.py b/src/dlm/doc/serializer.py index b011e3dc..33b001f9 100644 --- a/src/dlm/doc/serializer.py +++ b/src/dlm/doc/serializer.py @@ -34,12 +34,23 @@ ) -def serialize(parsed: ParsedDlm) -> str: +def serialize( + parsed: ParsedDlm, + *, + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> str: """Produce canonical `.dlm` text for `parsed`. Always ends with `\\n`. + + `force_emit_paths` is consulted by `_emit_nested_mapping` — a field + whose dotted path appears in the set is emitted even when its value + matches the schema default. Used by the migrate pipeline to + preserve user-explicit fields across schema-default drift (so a + user who pinned `lora_r: 8` doesn't silently inherit a future + `lora_r: 16` default after migration). """ - parts: list[str] = [_serialize_frontmatter(parsed.frontmatter), "\n"] + parts: list[str] = [_serialize_frontmatter(parsed.frontmatter, force_emit_paths), "\n"] for i, section in enumerate(parsed.sections): if i > 0: parts.append("\n") @@ -50,10 +61,41 @@ def serialize(parsed: ParsedDlm) -> str: return rendered +def collect_dict_field_paths(d: object, prefix: tuple[str, ...] = ()) -> frozenset[tuple[str, ...]]: + """Walk a parsed-YAML dict and return every nested leaf-or-mapping path. + + Used by the migrate pipeline: the set of paths present in the + user's original frontmatter (after migration runs) is the set of + fields the serializer must emit even when they match defaults. + Mappings *and* leaves are both included so intermediate blocks + survive re-emission. + """ + paths: set[tuple[str, ...]] = set() + if isinstance(d, dict): + for k, v in d.items(): + if not isinstance(k, str): + continue + here = (*prefix, k) + paths.add(here) + if isinstance(v, dict): + paths.update(collect_dict_field_paths(v, here)) + elif isinstance(v, list): + # List of mappings (e.g. training.sources) — each item + # contributes paths under the same parent key, since + # we serialize positional list entries together. + for item in v: + if isinstance(item, dict): + paths.update(collect_dict_field_paths(item, here)) + return frozenset(paths) + + # --- frontmatter -------------------------------------------------------------- -def _serialize_frontmatter(fm: DlmFrontmatter) -> str: +def _serialize_frontmatter( + fm: DlmFrontmatter, + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> str: lines: list[str] = ["---"] for key in _FRONTMATTER_ORDER: value = getattr(fm, key, None) @@ -63,7 +105,9 @@ def _serialize_frontmatter(fm: DlmFrontmatter) -> str: lines.extend(_emit_block_scalar(key, value)) continue if isinstance(value, TrainingConfig | ExportConfig): - nested = _emit_nested_mapping(value, indent=2) + nested = _emit_nested_mapping( + value, indent=2, path=(key,), force_emit_paths=force_emit_paths + ) if not nested: # All-default nested block — skip the header too so we # don't emit an empty `training:` line. @@ -76,7 +120,13 @@ def _serialize_frontmatter(fm: DlmFrontmatter) -> str: return "\n".join(lines) + "\n" -def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: +def _emit_nested_mapping( + model: BaseModel, + *, + indent: int, + path: tuple[str, ...] 
= (), + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> list[str]: """Emit a nested training/export/dpo block. Suppress fields that equal their schema default so @@ -87,6 +137,11 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: Nested `BaseModel` values (e.g. `TrainingConfig.preference`) recurse with deeper indent; all-default sub-blocks are skipped. + + `force_emit_paths` overrides the default-suppression rule for any + field whose dotted path appears in the set. Used by the migrate + pipeline to preserve user-explicit fields across schema-default + drift. """ pad = " " * indent lines: list[str] = [] @@ -99,16 +154,28 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: for field_name, field_info in model.__class__.model_fields.items(): value = getattr(model, field_name) - if field_info.default is not PydanticUndefined and value == field_info.default: + field_path = (*path, field_name) + forced = force_emit_paths is not None and field_path in force_emit_paths + if ( + not forced + and field_info.default is not PydanticUndefined + and value == field_info.default + ): continue if ( - field_info.default is PydanticUndefined + not forced + and field_info.default is PydanticUndefined and field_info.default_factory is not None and value == field_info.default_factory() # type: ignore[call-arg] ): continue if isinstance(value, BaseModel): - nested = _emit_nested_mapping(value, indent=indent + 2) + nested = _emit_nested_mapping( + value, + indent=indent + 2, + path=field_path, + force_emit_paths=force_emit_paths, + ) if not nested: continue lines.append(f"{pad}{field_name}:") @@ -125,7 +192,12 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: lines.append(f"{pad}{field_name}:") for k, v in value.items(): lines.append(f"{pad} {k}:") - nested = _emit_nested_mapping(v, indent=indent + 4) + nested = _emit_nested_mapping( + v, + indent=indent + 4, + path=(*field_path, k), + force_emit_paths=force_emit_paths, + ) if nested: lines.extend(nested) else: @@ -144,7 +216,12 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: # fields indent aligned. lines.append(f"{pad}{field_name}:") for item in value: - nested = _emit_nested_mapping(item, indent=indent + 4) + nested = _emit_nested_mapping( + item, + indent=indent + 4, + path=field_path, + force_emit_paths=force_emit_paths, + ) if not nested: lines.append(f"{pad} - {{}}") continue diff --git a/src/dlm/export/ollama/modelfile_shared.py b/src/dlm/export/ollama/modelfile_shared.py index e0dcc704..bdaf6cfb 100644 --- a/src/dlm/export/ollama/modelfile_shared.py +++ b/src/dlm/export/ollama/modelfile_shared.py @@ -129,8 +129,14 @@ def build_param_lines( if num_ctx is not None: lines.append(f"PARAMETER num_ctx {num_ctx}") if draft_model is not None: - lines.append(f"# Speculative decoding: `ollama pull {draft_model}` first.") - lines.append(f"PARAMETER draft_model {draft_model}") + # `draft_model` is not a valid Modelfile PARAMETER directive + # (Ollama rejects `ollama create` with "unknown parameter + # 'draft_model'"). It's a runtime option exposed via the + # `OLLAMA_DRAFT_MODEL` env var or the API's `options.draft_model` + # field. Document the suggested pairing as a comment so users + # can wire it up without forcing-fail their `ollama create`. 
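+ # Illustrative output, assuming draft_model="qwen2.5:0.5b" (the tag
+ # used in the Modelfile tests): the first comment below renders as
+ #   # Speculative-decoding draft: `ollama pull qwen2.5:0.5b`
+ # and the second pairs it with the OLLAMA_DRAFT_MODEL env var at
+ # run time.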
+ lines.append(f"# Speculative-decoding draft: `ollama pull {draft_model}`") + lines.append(f"# then run with `OLLAMA_DRAFT_MODEL={draft_model} ollama run `") return lines diff --git a/src/dlm/export/preflight.py b/src/dlm/export/preflight.py index 39163729..7e4836cc 100644 --- a/src/dlm/export/preflight.py +++ b/src/dlm/export/preflight.py @@ -98,8 +98,11 @@ def check_tokenizer_vocab(adapter_dir: Path) -> int: detail=f"cannot parse {cfg_path}: {exc}", ) from exc - # `vocab_size` key isn't always present in tokenizer_config.json; - # fall back to the companion tokenizer.json which always carries it. + # `vocab_size` key isn't always present in tokenizer_config.json + # (Qwen2.5+, Llama-3.x omit it); fall back to summing the BPE base + # plus the explicit `added_tokens` array in tokenizer.json. This + # matches `len(transformers.AutoTokenizer.from_pretrained(...))` — + # the count the model actually addresses at inference time. vocab_size = cfg.get("vocab_size") if not isinstance(vocab_size, int): tokenizer_json = adapter_dir / "tokenizer.json" @@ -113,8 +116,9 @@ def check_tokenizer_vocab(adapter_dir: Path) -> int: ) from exc model = t.get("model") or {} vocab = model.get("vocab") + added = t.get("added_tokens") or [] if isinstance(vocab, dict): - vocab_size = len(vocab) + vocab_size = len(vocab) + (len(added) if isinstance(added, list) else 0) if not isinstance(vocab_size, int) or vocab_size <= 0: raise PreflightError( probe="tokenizer_vocab", @@ -133,6 +137,10 @@ def check_chat_template(adapter_dir: Path, *, required: bool = True) -> None: `--no-template` on the CLI sets `required=False`; the default requires one because the Modelfile emitter hardcodes `TEMPLATE "..."` which needs source text. + + Modern HF tokenizers (Qwen2.5+, Llama-3.x) write the template to + a sibling `chat_template.jinja` file rather than inlining it in + `tokenizer_config.json`. Check both locations. """ if not required: return @@ -150,15 +158,30 @@ def check_chat_template(adapter_dir: Path, *, required: bool = True) -> None: detail=f"cannot parse {cfg_path}: {exc}", ) from exc template = cfg.get("chat_template") - if not template or not str(template).strip(): - raise PreflightError( - probe="chat_template", - detail=( - "tokenizer has no chat_template. Pass --no-template to skip " - "this check (Modelfile emission will fall back to the base " - "model's default), or attach a template via frontmatter." - ), - ) + if template and str(template).strip(): + return + + sibling_path = adapter_dir / "chat_template.jinja" + if sibling_path.exists(): + try: + sibling_template = sibling_path.read_text(encoding="utf-8") + except OSError as exc: + raise PreflightError( + probe="chat_template", + detail=f"cannot read {sibling_path}: {exc}", + ) from exc + if sibling_template.strip(): + return + + raise PreflightError( + probe="chat_template", + detail=( + "tokenizer has no chat_template (checked tokenizer_config.json " + "and chat_template.jinja). Pass --no-template to skip " + "this check (Modelfile emission will fall back to the base " + "model's default), or attach a template via frontmatter." 
+ ), + ) def check_pretokenizer_fingerprint(spec: BaseModelSpec) -> None: diff --git a/src/dlm/export/runner.py b/src/dlm/export/runner.py index 408b233d..55ed52df 100644 --- a/src/dlm/export/runner.py +++ b/src/dlm/export/runner.py @@ -90,24 +90,29 @@ def _default_check_base_vocab(adapter_path: Path, base_gguf_path: Path) -> None: `assert_gguf_vocab_matches` is tokenizer-backed; we use the file-based path here so `run_export` doesn't take a `transformers` - import hit on every call. A mismatch means the base conversion - materialized a different tokenizer than the one the adapter was - trained against — exactly the silent-failure surface this check is - meant to catch. + import hit on every call. The model's embedding matrix may have + reserved/special-token slots beyond the tokenizer's addressable + count (Qwen2.5: 151643 BPE + 22 added = 151665 addressable, but + `vocab_size = 151936` worth of embedding rows for future-token + reservation). GGUF carries the embedding-aligned count, so it's + valid for `gguf_vocab >= adapter_vocab`. The unsafe direction is + `adapter_vocab > gguf_vocab` — that means the tokenizer can emit + indices beyond what the model has embeddings for. """ from dlm.export.preflight import check_tokenizer_vocab from dlm.export.tokenizer_sync import read_gguf_vocab_size adapter_vocab = check_tokenizer_vocab(adapter_path) gguf_vocab = read_gguf_vocab_size(base_gguf_path) - if adapter_vocab != gguf_vocab: + if adapter_vocab > gguf_vocab: from dlm.export.errors import PreflightError raise PreflightError( probe="gguf_vocab", detail=( - f"adapter tokenizer vocab ({adapter_vocab}) does not match " - f"base GGUF vocab ({gguf_vocab}) at {base_gguf_path.name}. " + f"adapter tokenizer addressable vocab ({adapter_vocab}) exceeds " + f"base GGUF vocab ({gguf_vocab}) at {base_gguf_path.name} — " + "the tokenizer can emit indices the model has no embeddings for. " "Re-run base conversion against the adapter-dir tokenizer." ), ) diff --git a/src/dlm/hardware/render.py b/src/dlm/hardware/render.py index e8bfcaf9..5bcacdc1 100644 --- a/src/dlm/hardware/render.py +++ b/src/dlm/hardware/render.py @@ -25,7 +25,7 @@ def render_text(result: DoctorResult) -> str: lines.append(f"FlashAttention: {_bool(caps.has_flash_attention)}") lines.append(f"xFormers: {_bool(caps.has_xformers)}") lines.append(f"Triton: {_bool(caps.has_triton)}") - lines.append(f"MLX inference: {_bool(caps.has_mlx)}") + lines.append(f"MLX inference: {_mlx_line(caps)}") lines.append(f"accelerate: {caps.accelerate_version or 'not installed'}") lines.append(f"CPU cores: {caps.cpu_cores}") lines.append(f"RAM: {caps.ram_gb:.1f} GB") @@ -82,6 +82,20 @@ def _bool(v: bool) -> str: return "yes" if v else "no" +def _mlx_line(caps: Capabilities) -> str: + """Doctor's MLX line — `yes` plus a one-line scope hint, or `no`. + + MLX is prompt-only (training stays on PyTorch MPS). On Apple Silicon + it's the default backend for `dlm prompt`; off-platform it's not + installed. The scope hint is here so users don't expect MLX to + accelerate training and don't think `--backend mlx` is a separate + "mode" with its own training story. 
+ """ + if not caps.has_mlx: + return "no" + return "yes (prompt-only; default backend for `dlm prompt` on Apple Silicon)" + + def _telemetry_line(posture: dict[str, str]) -> str: hf = posture.get("HF_HUB_DISABLE_TELEMETRY", "") dnt = posture.get("DO_NOT_TRACK", "") diff --git a/src/dlm/inference/backends/mlx_backend.py b/src/dlm/inference/backends/mlx_backend.py index d2730323..26e63478 100644 --- a/src/dlm/inference/backends/mlx_backend.py +++ b/src/dlm/inference/backends/mlx_backend.py @@ -35,7 +35,10 @@ from dlm.inference.backends.base import InferenceBackend from dlm.inference.errors import AdapterNotFoundError -from dlm.inference.mlx_adapter import MlxConversionError +from dlm.inference.mlx_adapter import ( + MlxConversionError, + assert_mlx_adapter_applied, +) if TYPE_CHECKING: from dlm.base_models import BaseModelSpec @@ -192,6 +195,14 @@ def load( # pragma: no cover - heavy path adapter_path=str(staged), ) + # mlx-lm's `load_adapters` runs `linear_to_lora_layers` + + # `model.load_weights(strict=False)`, both of which fail + # silently on key/shape mismatches. Verify at least one LoRA + # parameter actually attached — else `MlxConversionError` so + # the user sees the failure instead of base-model output. + staged_cfg = json.loads((staged / _ADAPTER_CONFIG_FILENAME).read_text(encoding="utf-8")) + assert_mlx_adapter_applied(self._model, expected_keys=staged_cfg["lora_parameters"]["keys"]) + def generate(self, prompt: str, **gen_kwargs: Any) -> str: # pragma: no cover - heavy path if self._model is None or self._tokenizer is None: raise RuntimeError("MlxBackend.generate called before load()") @@ -206,6 +217,13 @@ def generate(self, prompt: str, **gen_kwargs: Any) -> str: # pragma: no cover - top_p = float(gen_kwargs.get("top_p") or 0.0) top_k = int(gen_kwargs.get("top_k") or 0) + # Apply the tokenizer's chat template so INSTRUCTION-trained + # adapters see the same input shape they trained on. The PyTorch + # backend renders via `format_chat_prompt`; without the same + # render here, MLX feeds raw user text to the model and the + # adapter silently fails to fire on chat-shaped queries. + rendered = _apply_chat_template(self._tokenizer, prompt) + # mlx-lm's `generate` / `generate_step` no longer accept `temp` # directly — sampling params are passed via a sampler produced # by `make_sampler(temp=..., top_p=..., top_k=...)`. Unknown @@ -215,7 +233,7 @@ def generate(self, prompt: str, **gen_kwargs: Any) -> str: # pragma: no cover - out = mlx_generate( self._model, self._tokenizer, - prompt=prompt, + prompt=rendered, max_tokens=max_new_tokens, sampler=sampler, verbose=False, @@ -228,3 +246,22 @@ def unload(self) -> None: if self._workdir is not None: self._workdir.cleanup() self._workdir = None + + +def _apply_chat_template(tokenizer: Any, prompt: str) -> str: + """Render `prompt` through the tokenizer's chat template, or pass through. + + Mirrors `dlm.inference.generate.format_chat_prompt` for the MLX path. + mlx-lm's `TokenizerWrapper` proxies `apply_chat_template` to the + underlying HF tokenizer; bases without a chat template (raw causal + LMs) get the prompt unchanged. 
+ """ + if getattr(tokenizer, "chat_template", None): + rendered = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + tokenize=False, + add_generation_prompt=True, + ) + if isinstance(rendered, str): + return rendered + return prompt diff --git a/src/dlm/inference/mlx_adapter.py b/src/dlm/inference/mlx_adapter.py index 7bc627ac..9b27eba3 100644 --- a/src/dlm/inference/mlx_adapter.py +++ b/src/dlm/inference/mlx_adapter.py @@ -44,6 +44,39 @@ _LORA_AB = re.compile(r"\.lora_([AB])\.weight$") """Matches the trailing `.lora_A.weight` / `.lora_B.weight` suffix.""" +_ATTN_TARGETS: frozenset[str] = frozenset( + {"q_proj", "k_proj", "v_proj", "o_proj", "qkv_proj", "wqkv"} +) +"""Bare PEFT `target_modules` names that live under `self_attn.` on +decoder-only transformers (Qwen2/Llama/Mistral/Phi/SmolLM).""" + +_MLP_TARGETS: frozenset[str] = frozenset({"gate_proj", "up_proj", "down_proj", "fc1", "fc2"}) +"""Bare PEFT `target_modules` names that live under `mlp.` on the same +family of architectures.""" + + +def _qualify_target_module(name: str) -> str: + """Map a PEFT bare `target_modules` entry to its in-block FQN. + + mlx-lm's `linear_to_lora_layers` matches `named_modules()` keys + *within* each transformer block via exact equality (`if k in keys`). + PEFT records `target_modules` as bare module names (`q_proj`), + while the FQN within an MLX-LM transformer block is fully qualified + (`self_attn.q_proj`). Without this rewrite the keys never match and + `linear_to_lora_layers` silently leaves the model un-wrapped — the + user-visible failure is "trained model behaves identically to base." + + Already-qualified names (containing a `.`) pass through untouched + so callers can pre-qualify if needed. + """ + if "." in name: + return name + if name in _ATTN_TARGETS: + return f"self_attn.{name}" + if name in _MLP_TARGETS: + return f"mlp.{name}" + return name + class MlxConversionError(RuntimeError): """Raised when a PEFT adapter cannot be converted to the MLX layout.""" @@ -121,7 +154,23 @@ def peft_safetensors_to_mlx_safetensors( # pragma: no cover - I/O + torch deps tensors = load_file(str(src)) mapping = map_all_keys(list(tensors.keys())) - mlx_tensors = {mlx_key: tensors[peft_key] for peft_key, mlx_key in mapping.items()} + # PEFT stores LoRA weights with shapes that don't match what + # mlx-lm's `LoRALinear` expects: + # + # PEFT lora_A : [r, in_features] MLX lora_a : [in_features, r] + # PEFT lora_B : [out_features, r] MLX lora_b : [r, out_features] + # + # Both tensors need a transpose. Loading without the transpose + # makes mlx-lm's `model.load_weights(strict=False)` silently skip + # the mismatched shapes and the adapter has no effect — the + # textbook "trained model behaves like base" failure mode. + mlx_tensors = {} + for peft_key, mlx_key in mapping.items(): + t = tensors[peft_key] + # `mlx_key` ends in `.lora_a` or `.lora_b` (lowercase, no `.weight`). 
+ if mlx_key.endswith((".lora_a", ".lora_b")): + t = t.t().contiguous() + mlx_tensors[mlx_key] = t mlx_safetensors_path.parent.mkdir(parents=True, exist_ok=True) save_file(mlx_tensors, str(mlx_safetensors_path)) @@ -174,6 +223,8 @@ def build_mlx_adapter_config( "cannot stage mlx adapter without a valid layer count" ) + qualified_keys = [_qualify_target_module(t) for t in target_modules] + return { "fine_tune_type": "dora" if use_dora else "lora", "num_layers": int(base_num_hidden_layers), @@ -181,6 +232,54 @@ def build_mlx_adapter_config( "rank": rank, "scale": lora_alpha / rank if rank else float(lora_alpha), "dropout": lora_dropout, - "keys": list(target_modules), + "keys": qualified_keys, }, } + + +def assert_mlx_adapter_applied(model: Any, *, expected_keys: list[str]) -> None: + """Verify mlx-lm's `load_adapters` actually wrapped the targeted layers. + + `mlx_lm.load(..., adapter_path=...)` calls `linear_to_lora_layers` + followed by `model.load_weights(strict=False)`. Both steps fail + silently if their inputs don't match the loaded model: + + - `linear_to_lora_layers` is a no-op when `keys` don't match any + module's FQN inside the transformer blocks + - `load_weights(strict=False)` skips any tensor key that doesn't + match a model parameter + + Either failure produces a model that runs as if no adapter were + loaded. Catching this here turns the "trained model behaves like + base" footgun into an explicit refusal so the user knows to use + `--backend pytorch` (or the fix needs an architecture-aware + keys translator). + + `expected_keys` are the in-block FQNs from the staged + `adapter_config.json` (e.g. `["self_attn.q_proj", ...]`). We confirm + that at least one matching module ended up as a LoRA-wrapped layer. + """ + try: + import mlx.utils as mlx_utils # type: ignore[import-not-found, unused-ignore] + except ImportError as exc: # pragma: no cover - mlx not importable + raise MlxConversionError(f"mlx not importable for verification: {exc}") from exc + + try: + flat: Any = mlx_utils.tree_flatten(model.trainable_parameters()) + except Exception as exc: # pragma: no cover - defensive + raise MlxConversionError( + f"could not enumerate model trainable_parameters for verification: {exc}" + ) from exc + + lora_param_count = sum(1 for k, _ in flat if k.endswith(".lora_a") or k.endswith(".lora_b")) + if lora_param_count == 0: + raise MlxConversionError( + "mlx-lm loaded the adapter without applying it — zero " + "`lora_a` / `lora_b` parameters present after load. This " + "usually means the keys " + f"{expected_keys!r} don't match the model's `named_modules()` " + "FQNs (e.g. the base architecture uses a different submodule " + "layout than `self_attn.*` / `mlp.*`). The trained adapter " + "would behave identically to the base model. Use " + "`--backend pytorch` as a workaround." + ) diff --git a/tests/unit/base_models/test_registry.py b/tests/unit/base_models/test_registry.py index 62049233..c8ff7c23 100644 --- a/tests/unit/base_models/test_registry.py +++ b/tests/unit/base_models/test_registry.py @@ -30,6 +30,21 @@ def test_known_keys_stable_ordering(self) -> None: assert list(keys) == list(BASE_MODELS.keys()) +class TestCapabilityWarnings: + def test_smol_135m_carries_warning(self) -> None: + """SmolLM2-135M is below the empirically-measured training floor.""" + spec = BASE_MODELS["smollm2-135m"] + assert spec.capability_warning is not None + # The audit findings 02 + 05 are the load-bearing evidence; + # the wording should mention the floor in some form. 
+ assert "floor" in spec.capability_warning.lower() + + def test_above_floor_bases_have_no_warning(self) -> None: + """1B+ launch bases ship without the architectural-floor warning.""" + for key in ("smollm2-1.7b", "qwen2.5-coder-1.5b", "llama-3.2-1b"): + assert BASE_MODELS[key].capability_warning is None, key + + class TestRevisionPinning: """Audit-02 F11: every registry entry has a real 40-char commit SHA.""" diff --git a/tests/unit/cli/test_misc_commands_coverage.py b/tests/unit/cli/test_misc_commands_coverage.py index 741f0d26..6e089c8b 100644 --- a/tests/unit/cli/test_misc_commands_coverage.py +++ b/tests/unit/cli/test_misc_commands_coverage.py @@ -552,7 +552,11 @@ def save_manifest(self) -> None: ) assert show_text.exit_code == 0, show_text.output assert "Cache for" in show_text.output - assert "no tokenization runs yet" in show_text.output + # Minimal doc has no `training.sources` directive, so the cache + # is gated off by design — surface that instead of leaving the + # user to guess why their cache is empty. + assert "not used" in show_text.output + assert "training.sources" in show_text.output prune_bad = runner.invoke( app, diff --git a/tests/unit/doc/test_migrate.py b/tests/unit/doc/test_migrate.py index d8b3fc69..46923867 100644 --- a/tests/unit/doc/test_migrate.py +++ b/tests/unit/doc/test_migrate.py @@ -138,6 +138,46 @@ def _pre_current(raw: dict[str, object]) -> dict[str, object]: assert result.backup_path is None assert not (tmp_path / "mydoc.dlm.bak").exists() + def test_user_explicit_default_value_survives_migration( + self, tmp_path: Path, bumped_current: int + ) -> None: + """Honor the 'additive identity' contract at intent level, not just behavior. + + A user who pinned `lora_r: 8` should still see `lora_r: 8` in + the migrated doc, even though 8 happens to match the current + schema default. Without this, a future default change would + silently override the user's pin. 
+ """ + current = bumped_current - 1 + + @register(from_version=current) + def _pre_current(raw: dict[str, object]) -> dict[str, object]: + return dict(raw) + + doc = tmp_path / "mydoc.dlm" + doc.write_text( + f"""--- +dlm_id: {_VALID_ULID} +base_model: smollm2-135m +dlm_version: {current} +training: + lora_r: 8 + lora_alpha: 16 + num_epochs: 3 +--- + +body +""", + encoding="utf-8", + ) + + result = migrate_file(doc, no_backup=True) + assert result.wrote is True + text = doc.read_text(encoding="utf-8") + assert "lora_r: 8" in text + assert "lora_alpha: 16" in text + assert "num_epochs: 3" in text + def test_dry_run_reports_without_writing(self, tmp_path: Path, bumped_current: int) -> None: current = bumped_current - 1 diff --git a/tests/unit/doc/test_serializer_edges.py b/tests/unit/doc/test_serializer_edges.py index f3688687..9e9dd501 100644 --- a/tests/unit/doc/test_serializer_edges.py +++ b/tests/unit/doc/test_serializer_edges.py @@ -28,6 +28,7 @@ _needs_quoting, _scalar, _serialize_section, + collect_dict_field_paths, serialize, ) @@ -168,7 +169,9 @@ def test_output_always_ends_with_single_newline(self, monkeypatch: pytest.Monkey frontmatter=fm, sections=(Section(SectionType.PROSE, "content"),), ) - monkeypatch.setattr("dlm.doc.serializer._serialize_frontmatter", lambda _fm: "---\n---") + monkeypatch.setattr( + "dlm.doc.serializer._serialize_frontmatter", lambda _fm, _force=None: "---\n---" + ) out = serialize(parsed) assert out.endswith("\n") assert not out.endswith("\n\n") @@ -181,11 +184,55 @@ def test_serializer_adds_newline_when_section_render_omits_it( frontmatter=fm, sections=(Section(SectionType.PROSE, "content"),), ) - monkeypatch.setattr("dlm.doc.serializer._serialize_frontmatter", lambda _fm: "---\n---") + monkeypatch.setattr( + "dlm.doc.serializer._serialize_frontmatter", lambda _fm, _force=None: "---\n---" + ) monkeypatch.setattr("dlm.doc.serializer._serialize_section", lambda _section: "body") assert serialize(parsed).endswith("\n") +class TestCollectDictFieldPaths: + """Migrate path uses this to remember which fields the user explicitly set.""" + + def test_flat_mapping(self) -> None: + paths = collect_dict_field_paths({"a": 1, "b": 2}) + assert paths == frozenset({("a",), ("b",)}) + + def test_nested_mapping_records_intermediate_and_leaf(self) -> None: + paths = collect_dict_field_paths({"training": {"lora_r": 8, "lora_alpha": 16}}) + assert paths == frozenset( + {("training",), ("training", "lora_r"), ("training", "lora_alpha")} + ) + + def test_non_string_keys_ignored(self) -> None: + # Pydantic-validated dicts only use string keys; the defensive + # branch keeps a hostile YAML round-trip from leaking a tuple key. + paths = collect_dict_field_paths({"a": 1, 7: "ignored"}) + assert paths == frozenset({("a",)}) + + def test_list_of_mappings_recurses_under_parent_path(self) -> None: + # `training.sources` shape: each list entry is a mapping that + # contributes paths under ("training", "sources"), so a v1 source + # block with explicit `path:`/`max_bytes_per_file:` survives migration. 
+ paths = collect_dict_field_paths( + { + "training": { + "sources": [ + {"path": "/foo", "max_bytes_per_file": 4096}, + {"path": "/bar"}, + ] + } + } + ) + assert ("training", "sources") in paths + assert ("training", "sources", "path") in paths + assert ("training", "sources", "max_bytes_per_file") in paths + + def test_non_dict_input_yields_empty(self) -> None: + assert collect_dict_field_paths("not a dict") == frozenset() + assert collect_dict_field_paths(None) == frozenset() + + class TestFrontmatterExplicitTargetModulesList: """Ensures the list branch in the nested-mapping emitter is exercised.""" diff --git a/tests/unit/export/ollama/test_modelfile.py b/tests/unit/export/ollama/test_modelfile.py index 3466ea15..14a0dcb2 100644 --- a/tests/unit/export/ollama/test_modelfile.py +++ b/tests/unit/export/ollama/test_modelfile.py @@ -110,15 +110,21 @@ def test_override_temperature(self, tmp_path: Path) -> None: assert "PARAMETER temperature 0.2" in text def test_draft_model_omitted_by_default(self, tmp_path: Path) -> None: - """Sprint 12.5: no draft set → no `PARAMETER draft_model`.""" + """No draft set → no draft directive at all.""" text = render_modelfile(_ctx(tmp_path)) - assert "PARAMETER draft_model" not in text + assert "draft_model" not in text + + def test_draft_model_documented_when_set(self, tmp_path: Path) -> None: + """When a draft is registered we emit a comment, not a PARAMETER. - def test_draft_model_emitted_when_set(self, tmp_path: Path) -> None: - """Sprint 12.5: `PARAMETER draft_model ` appears + pull reminder.""" + `draft_model` is a runtime/API option in Ollama, not a Modelfile + PARAMETER — emitting `PARAMETER draft_model ` made + `ollama create` fail with "unknown parameter 'draft_model'". + """ text = render_modelfile(_ctx(tmp_path, draft_model_ollama_name="qwen2.5:0.5b")) - assert "PARAMETER draft_model qwen2.5:0.5b" in text + assert "PARAMETER draft_model" not in text assert "ollama pull qwen2.5:0.5b" in text + assert "OLLAMA_DRAFT_MODEL=qwen2.5:0.5b" in text def test_override_top_p(self, tmp_path: Path) -> None: """Audit-04 Q5: frontmatter export.default_top_p overrides the dialect default.""" diff --git a/tests/unit/export/test_preflight.py b/tests/unit/export/test_preflight.py index d736b293..74ece763 100644 --- a/tests/unit/export/test_preflight.py +++ b/tests/unit/export/test_preflight.py @@ -101,6 +101,19 @@ def test_malformed_tokenizer_json_raises(self, tmp_path: Path) -> None: with pytest.raises(PreflightError, match="cannot parse"): check_tokenizer_vocab(tmp_path) + def test_fallback_sums_added_tokens(self, tmp_path: Path) -> None: + """Qwen2.5+ tokenizer: BPE vocab + added_tokens give addressable count.""" + (tmp_path / "tokenizer_config.json").write_text(json.dumps({})) + (tmp_path / "tokenizer.json").write_text( + json.dumps( + { + "model": {"vocab": {str(i): i for i in range(5000)}}, + "added_tokens": [{"id": i, "content": f"<|t{i}|>"} for i in range(22)], + } + ) + ) + assert check_tokenizer_vocab(tmp_path) == 5022 + class TestChatTemplate: def test_present_ok(self, tmp_path: Path) -> None: @@ -130,6 +143,36 @@ def test_malformed_config_raises(self, tmp_path: Path) -> None: with pytest.raises(PreflightError, match="cannot parse"): check_chat_template(tmp_path, required=True) + def test_sibling_jinja_satisfies_when_config_lacks_template(self, tmp_path: Path) -> None: + """Qwen2.5+/Llama-3.x write the template to chat_template.jinja, not the config.""" + _write_tokenizer_config(tmp_path, chat_template="") + (tmp_path / 
"chat_template.jinja").write_text("{{messages}}") + check_chat_template(tmp_path) + + def test_empty_sibling_jinja_falls_through_to_error(self, tmp_path: Path) -> None: + _write_tokenizer_config(tmp_path, chat_template="") + (tmp_path / "chat_template.jinja").write_text(" \n") + with pytest.raises(PreflightError, match="chat_template"): + check_chat_template(tmp_path) + + def test_unreadable_sibling_jinja_raises( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """OSError while reading the .jinja sibling surfaces as a typed PreflightError.""" + _write_tokenizer_config(tmp_path, chat_template="") + (tmp_path / "chat_template.jinja").write_text("{{messages}}") + + original_read_text = Path.read_text + + def _boom(self: Path, *args: object, **kwargs: object) -> str: + if self.name == "chat_template.jinja": + raise OSError("simulated read failure") + return original_read_text(self, *args, **kwargs) # type: ignore[arg-type] + + monkeypatch.setattr(Path, "read_text", _boom) + with pytest.raises(PreflightError, match="cannot read"): + check_chat_template(tmp_path) + class TestQloraFlag: def test_missing_file_returns_false(self, tmp_path: Path) -> None: diff --git a/tests/unit/export/test_runner.py b/tests/unit/export/test_runner.py index e22d0236..25463e9c 100644 --- a/tests/unit/export/test_runner.py +++ b/tests/unit/export/test_runner.py @@ -386,24 +386,43 @@ def _fake_merge_path(**kwargs: object) -> None: class TestDefaultVocabCheck: """Default path loads the adapter tokenizer-vocab and compares against the base GGUF.""" - def test_mismatch_raises(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + def test_adapter_exceeds_gguf_raises( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Tokenizer addressing more indices than the model has embeddings is unsafe.""" from dlm.export.errors import PreflightError from dlm.export.runner import _default_check_base_vocab adapter = tmp_path / "adapter" adapter.mkdir() (adapter / "tokenizer_config.json").write_text( - json.dumps({"vocab_size": 32000, "chat_template": "{{m}}"}) + json.dumps({"vocab_size": 32001, "chat_template": "{{m}}"}) ) base = tmp_path / "base.gguf" base.write_bytes(b"ignored") - # Force a specific GGUF vocab through the reader seam. 
- monkeypatch.setattr("dlm.export.tokenizer_sync.read_gguf_vocab_size", lambda _p: 32001) + monkeypatch.setattr("dlm.export.tokenizer_sync.read_gguf_vocab_size", lambda _p: 32000) with pytest.raises(PreflightError, match="gguf_vocab"): _default_check_base_vocab(adapter, base) + def test_gguf_reserves_extra_slots_ok( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """GGUF carrying more rows than the tokenizer addresses is the Qwen2.5+ norm.""" + from dlm.export.runner import _default_check_base_vocab + + adapter = tmp_path / "adapter" + adapter.mkdir() + (adapter / "tokenizer_config.json").write_text( + json.dumps({"vocab_size": 32000, "chat_template": "{{m}}"}) + ) + base = tmp_path / "base.gguf" + base.write_bytes(b"ignored") + + monkeypatch.setattr("dlm.export.tokenizer_sync.read_gguf_vocab_size", lambda _p: 32293) + _default_check_base_vocab(adapter, base) + def test_match_returns_none(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: from dlm.export.runner import _default_check_base_vocab @@ -647,19 +666,18 @@ class TestDraftWiring: """Sprint 12.5: runner routes draft override/disable through to the Modelfile.""" def test_override_tag_written_to_modelfile(self, tmp_path: Path) -> None: - """--draft custom:tag propagates to PARAMETER draft_model.""" + """--draft custom:tag propagates to the Modelfile as a documented hint.""" cached_base, store, vendor = _setup_store(tmp_path) plan = ExportPlan(quant="Q4_K_M", ollama_name="mydoc") recorder = _SubprocessRecorder(store.export_quant_dir(plan.quant)) - # skip_ollama=True stops before ollama_create but *after* the - # Modelfile is rendered? Actually no — the Modelfile is only - # rendered when skip_ollama=False. Switch to a mock ollama_create. def _ollama_create(*, name: str, modelfile_path: Path, cwd: Path) -> str: _ = name, cwd assert modelfile_path.is_file() text = modelfile_path.read_text() - assert "PARAMETER draft_model custom:tag" in text + assert "PARAMETER draft_model" not in text + assert "ollama pull custom:tag" in text + assert "OLLAMA_DRAFT_MODEL=custom:tag" in text return "created" run_export( diff --git a/tests/unit/hardware/test_doctor.py b/tests/unit/hardware/test_doctor.py index 3bfb2734..6cce2a1e 100644 --- a/tests/unit/hardware/test_doctor.py +++ b/tests/unit/hardware/test_doctor.py @@ -180,6 +180,49 @@ def test_rocm_render_surfaces_arch_suffix_and_qlora_summary(self) -> None: assert "adapter: qlora (4-bit NF4, compute bf16)" in text +class TestMlxRenderLine: + """Cover both branches of the MLX-availability annotation in render.""" + + def _caps(self, *, has_mlx: bool) -> Capabilities: + return Capabilities( + backend=Backend.CPU, + device_name="generic CPU", + sm=None, + rocm_arch=None, + vram_gb=None, + unified_memory_gb=None, + cpu_cores=8, + ram_gb=16.0, + supports_bf16=False, + supports_fp16=False, + has_flash_attention=False, + has_xformers=False, + has_bitsandbytes=False, + has_triton=False, + has_mlx=has_mlx, + torch_version="2.11.0", + accelerate_version=None, + cuda_version=None, + rocm_version=None, + platform="Linux 6.8", + determinism_class="best_effort", + telemetry_posture={}, + ) + + def test_mlx_available_shows_scope_hint(self) -> None: + text = render_text( + DoctorResult(capabilities=self._caps(has_mlx=True), plan=None, plan_error=None) + ) + assert "MLX inference: yes (prompt-only" in text + assert "Apple Silicon" in text + + def test_mlx_unavailable_shows_no(self) -> None: + text = render_text( + DoctorResult(capabilities=self._caps(has_mlx=False), plan=None, 
plan_error=None) + ) + assert "MLX inference: no" in text + + class TestCliDoctor: def test_cli_human_output_works(self) -> None: runner = CliRunner() diff --git a/tests/unit/inference/test_mlx_adapter_conversion.py b/tests/unit/inference/test_mlx_adapter_conversion.py index 1fd6c2b2..c4448d70 100644 --- a/tests/unit/inference/test_mlx_adapter_conversion.py +++ b/tests/unit/inference/test_mlx_adapter_conversion.py @@ -2,10 +2,13 @@ from __future__ import annotations +from typing import Any + import pytest from dlm.inference.mlx_adapter import ( MlxConversionError, + assert_mlx_adapter_applied, map_all_keys, map_peft_key_to_mlx, ) @@ -91,3 +94,184 @@ def test_non_positive_layer_count_rejected(self) -> None: }, 0, ) + + def test_attn_target_modules_get_self_attn_prefix(self) -> None: + """mlx-lm matches `named_modules()` keys *inside* a transformer + block via exact equality. PEFT's bare `q_proj` doesn't match + the `self_attn.q_proj` FQN, so without the rewrite mlx-lm + silently leaves the model un-wrapped — the textbook "trained + model behaves like base" failure mode.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 16, + "lora_alpha": 32, + "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"], + }, + base_num_hidden_layers=28, + ) + assert cfg["lora_parameters"]["keys"] == [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.o_proj", + ] + + def test_mlp_target_modules_get_mlp_prefix(self) -> None: + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["gate_proj", "up_proj", "down_proj"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == [ + "mlp.gate_proj", + "mlp.up_proj", + "mlp.down_proj", + ] + + def test_already_qualified_keys_pass_through(self) -> None: + """Callers that pre-qualify (e.g. for non-decoder architectures) + should not see their dotted keys re-rewritten.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["self_attn.q_proj", "encoder.fc1"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == ["self_attn.q_proj", "encoder.fc1"] + + def test_unknown_target_module_passes_through_unqualified(self) -> None: + """Names that aren't in the attn/mlp tables stay bare. Caller + supervision is the user's responsibility — we don't guess.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["unknown_proj"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == ["unknown_proj"] + + +class TestPeftSafetensorsToMlxTransposes: + """PEFT and MLX-LM use different storage layouts for LoRA tensors: + + PEFT lora_A : [r, in_features] MLX lora_a : [in_features, r] + PEFT lora_B : [out_features, r] MLX lora_b : [r, out_features] + + Without transposing, mlx-lm's `model.load_weights(strict=False)` + silently skips the mismatched shapes and the adapter has no effect. 
+ """ + + def test_lora_a_and_b_get_transposed(self, tmp_path: object) -> None: + from pathlib import Path as _Path + + import torch + from safetensors.torch import load_file, save_file + + from dlm.inference.mlx_adapter import peft_safetensors_to_mlx_safetensors + + tmp_path = _Path(str(tmp_path)) + peft_dir = tmp_path / "peft" + peft_dir.mkdir() + # PEFT shapes: lora_A=[r=4, in=8], lora_B=[out=16, r=4] + peft_tensors = { + "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight": torch.arange( + 32, dtype=torch.float32 + ).reshape(4, 8), + "base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight": torch.arange( + 64, dtype=torch.float32 + ).reshape(16, 4), + } + save_file(peft_tensors, str(peft_dir / "adapter_model.safetensors")) + + mlx_path = tmp_path / "out" / "adapters.safetensors" + peft_safetensors_to_mlx_safetensors(peft_dir, mlx_path) + + mlx_tensors = load_file(str(mlx_path)) + a = mlx_tensors["model.layers.0.self_attn.q_proj.lora_a"] + b = mlx_tensors["model.layers.0.self_attn.q_proj.lora_b"] + # Transposed shapes + assert tuple(a.shape) == (8, 4) + assert tuple(b.shape) == (4, 16) + # Values match a transpose, not just a reshape. + assert torch.equal(a, peft_tensors[next(iter(peft_tensors))].t()) + + +class TestAssertMlxAdapterApplied: + """Fail-loud post-load guard. mlx-lm silently leaves a model + un-wrapped when keys don't match; this check turns that footgun + into an explicit `MlxConversionError` so users see the failure + rather than getting silent base-model output.""" + + def _fake_model_with_params(self, names: list[str]) -> Any: + """Build a stand-in for an mlx model that exposes + `trainable_parameters()` returning a flat dict of fake tensors. + We don't go through `mlx.utils.tree_flatten`'s real + implementation here — assert_mlx_adapter_applied uses it + directly, so we assert via the import-mock approach below.""" + + class _FakeArr: + shape = (1,) + + class _FakeModel: + def trainable_parameters(self) -> dict[str, Any]: + return {n: _FakeArr() for n in names} + + return _FakeModel() + + def test_passes_when_lora_params_present(self, monkeypatch: pytest.MonkeyPatch) -> None: + # Stub mlx.utils.tree_flatten so the test doesn't require + # mlx-lm's real flatten semantics — we only need it to walk + # the dict-shaped trainable_parameters() output. + import sys + import types as _types + + fake_mlx = _types.ModuleType("mlx") + fake_mlx_utils = _types.ModuleType("mlx.utils") + + def _tree_flatten(d: dict[str, Any]) -> list[tuple[str, Any]]: + return list(d.items()) + + fake_mlx_utils.tree_flatten = _tree_flatten # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, "mlx", fake_mlx) + monkeypatch.setitem(sys.modules, "mlx.utils", fake_mlx_utils) + + model = self._fake_model_with_params( + [ + "model.layers.0.self_attn.q_proj.lora_a", + "model.layers.0.self_attn.q_proj.lora_b", + ] + ) + # Should not raise. + assert_mlx_adapter_applied(model, expected_keys=["self_attn.q_proj"]) + + def test_raises_when_no_lora_params(self, monkeypatch: pytest.MonkeyPatch) -> None: + import sys + import types as _types + + fake_mlx = _types.ModuleType("mlx") + fake_mlx_utils = _types.ModuleType("mlx.utils") + fake_mlx_utils.tree_flatten = lambda d: list(d.items()) # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, "mlx", fake_mlx) + monkeypatch.setitem(sys.modules, "mlx.utils", fake_mlx_utils) + + # Only base parameters; no lora_a/lora_b. 
+ model = self._fake_model_with_params( + [ + "model.embed_tokens.weight", + "model.layers.0.self_attn.q_proj.weight", + ] + ) + with pytest.raises(MlxConversionError, match="zero `lora_a`"): + assert_mlx_adapter_applied(model, expected_keys=["self_attn.q_proj"]) diff --git a/tests/unit/inference/test_mlx_backend.py b/tests/unit/inference/test_mlx_backend.py index cc407378..7d6d7f5a 100644 --- a/tests/unit/inference/test_mlx_backend.py +++ b/tests/unit/inference/test_mlx_backend.py @@ -12,6 +12,7 @@ from dlm.base_models import BASE_MODELS from dlm.inference.backends.mlx_backend import ( MlxBackend, + _apply_chat_template, _resolve_base_num_hidden_layers, stage_mlx_adapter_dir, ) @@ -94,6 +95,14 @@ def test_load_generate_and_unload_happy_path( adapter_dir = tmp_path / "adapter" adapter_dir.mkdir() staged_dir = tmp_path / "staged" + staged_dir.mkdir() + # Real `stage_mlx_adapter_dir` writes this file; the post-load + # assertion guard reads it to know which FQNs to verify, so the + # test stub must mirror the on-disk shape. + (staged_dir / "adapter_config.json").write_text( + '{"lora_parameters": {"keys": ["self_attn.q_proj"]}}', + encoding="utf-8", + ) backend = MlxBackend(SimpleNamespace()) monkeypatch.setattr( @@ -104,6 +113,14 @@ def test_load_generate_and_unload_happy_path( "dlm.inference.backends.mlx_backend.stage_mlx_adapter_dir", lambda peft_adapter_dir, dst_dir, *, base_hf_id: staged_dir, ) + # Stub the post-load assertion: real `mlx_lm.load` produces a + # model with LoRA-wrapped layers, but here we hand back a + # placeholder string. Bypassing the assertion keeps the rest + # of the stubbed happy-path test intact. + monkeypatch.setattr( + "dlm.inference.backends.mlx_backend.assert_mlx_adapter_applied", + lambda model, *, expected_keys: None, + ) fake_mlx = ModuleType("mlx_lm") fake_mlx.load = lambda hf_id, adapter_path: ("model", "tokenizer") @@ -129,6 +146,46 @@ def test_load_generate_and_unload_happy_path( assert backend._tokenizer is None +class TestApplyChatTemplate: + """Mirrors PyTorch backend's `format_chat_prompt` for the MLX path.""" + + def test_renders_via_tokenizer_when_template_present(self) -> None: + captured: dict[str, object] = {} + + def _apply_chat_template_method( + messages: list[dict[str, str]], + *, + tokenize: bool, + add_generation_prompt: bool, + ) -> str: + captured["messages"] = messages + captured["tokenize"] = tokenize + captured["add_generation_prompt"] = add_generation_prompt + return "hello" + + tokenizer = SimpleNamespace( + chat_template="{{messages}}", + apply_chat_template=_apply_chat_template_method, + ) + out = _apply_chat_template(tokenizer, "hello") + assert out == "hello" + assert captured["messages"] == [{"role": "user", "content": "hello"}] + assert captured["tokenize"] is False + assert captured["add_generation_prompt"] is True + + def test_passes_through_when_no_chat_template(self) -> None: + tokenizer = SimpleNamespace(chat_template=None) + assert _apply_chat_template(tokenizer, "raw prompt") == "raw prompt" + + def test_passes_through_when_render_returns_non_string(self) -> None: + """Defensive: HF wrappers occasionally return token lists; fall back to raw.""" + tokenizer = SimpleNamespace( + chat_template="{{messages}}", + apply_chat_template=lambda *_a, **_k: [1, 2, 3], + ) + assert _apply_chat_template(tokenizer, "raw prompt") == "raw prompt" + + class TestStageMlxAdapterDir: def test_unreadable_adapter_config_raises_conversion_error(self, tmp_path: Path) -> None: adapter_dir = tmp_path / "adapter" diff --git 
a/tests/unit/inference/test_mlx_stage_adapter_dir.py b/tests/unit/inference/test_mlx_stage_adapter_dir.py index 794def8a..c5185c38 100644 --- a/tests/unit/inference/test_mlx_stage_adapter_dir.py +++ b/tests/unit/inference/test_mlx_stage_adapter_dir.py @@ -79,7 +79,10 @@ def test_writes_safetensors_and_translated_config( assert lp["rank"] == 8 assert lp["scale"] == pytest.approx(16 / 8) assert lp["dropout"] == pytest.approx(0.05) - assert lp["keys"] == ["q_proj", "v_proj"] + # Bare PEFT target_modules get qualified with the in-block FQN + # so mlx-lm's `linear_to_lora_layers` can match them. See + # `_qualify_target_module` in dlm.inference.mlx_adapter. + assert lp["keys"] == ["self_attn.q_proj", "self_attn.v_proj"] def test_tensor_keys_match_mlx_layout(self, tmp_path: Path, stub_num_layers: None) -> None: src = tmp_path / "peft"