diff --git a/src/dlm/base_models/registry.py b/src/dlm/base_models/registry.py index c3e66bb8..1e15ced4 100644 --- a/src/dlm/base_models/registry.py +++ b/src/dlm/base_models/registry.py @@ -337,6 +337,17 @@ size_gb_fp16=0.27, context_length=8_192, recommended_seq_len=1024, + capability_warning=( + "SmolLM2-135M is below dlm's empirical training floor. Audit 13 " + "follow-up findings 02 + 05 measured this base actively " + "degrading general-chat capability under every LoRA recipe " + "tested (PROSE-only, INSTRUCTION-only, mixed). Adapters " + "memorize trained content but fail to generalize and bleed " + "domain-specific tokens into unrelated queries. Suitable for " + "style-transfer demos and pipeline smoke tests; for any " + "specialty-knowledge task use a base ≥ 1B params (e.g. " + "smollm2-1.7b, qwen2.5-coder-1.5b, llama-3.2-1b)." + ), ), BaseModelSpec( key="smollm2-360m", diff --git a/src/dlm/base_models/schema.py b/src/dlm/base_models/schema.py index 8a3b852e..9e5e2794 100644 --- a/src/dlm/base_models/schema.py +++ b/src/dlm/base_models/schema.py @@ -149,6 +149,14 @@ class BaseModelSpec(BaseModel): provenance_url: str | None = None provenance_match_text: str | None = None + # Optional curated warning surfaced at `dlm train` time when this + # base is selected. Populate when the base has a known limitation + # that's not derivable from `params` / `architecture` alone — e.g. + # SmolLM2-135M's measured architectural floor (audit 13 follow-up + # findings 02 + 05: actively degrades base capability under any + # LoRA recipe). Empty string is treated as "no warning". + capability_warning: str | None = Field(default=None, min_length=1) + # Modality + multi-modal preprocessing (schema v10 + v11, plus the # additive `text-moe` discriminator). # Text-family bases leave `modality in {"text", "text-moe"}` diff --git a/src/dlm/cli/commands.py b/src/dlm/cli/commands.py index 6bac022e..0ee8f0a1 100644 --- a/src/dlm/cli/commands.py +++ b/src/dlm/cli/commands.py @@ -853,6 +853,13 @@ def train_cmd( "Acceptance will be persisted in the store manifest." ) raise typer.Exit(code=1) from exc + # `getattr` so test fixtures stubbing `spec` as a `SimpleNamespace` + # without this field still pass; real registry entries always have it. + capability_warning = getattr(spec, "capability_warning", None) + if capability_warning: + console.print( + f"[yellow]warning:[/yellow] base [bold]{spec.key}[/bold]: {capability_warning}" + ) # Detect the DDP world_size set by `accelerate launch` # (WORLD_SIZE env var) and thread it into the doctor so the plan's # effective_batch_size reflects the rank count. Single-process @@ -3575,6 +3582,20 @@ def cache_show_cmd( cache = TokenizedCache.open(store.tokenized_cache_dir) last = _queries.latest_tokenization(store.root) + # The tokenized cache only fires for runs whose frontmatter declares + # `training.sources` (directive-sourced rows are where the tokenize + # cost dominates; in-body sections go through TRL's tokenizer). + # Surface this so an empty cache on an in-body-only doc doesn't + # look like a bug. 
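+ # Illustrative shape only: a doc opts into the cache with a frontmatter
+ # block like the `training.sources` entries exercised in the serializer
+ # tests (the keys shown here are examples and schema-version dependent):
+ #
+ #   training:
+ #     sources:
+ #       - path: ./notes/
+ #         max_bytes_per_file: 4096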
+ has_sources = parsed.frontmatter.training.sources is not None + cache_enabled = parsed.frontmatter.training.cache.enabled + if not has_sources: + cache_status: str | None = "not used (doc has no `training.sources` directive)" + elif not cache_enabled: + cache_status = "disabled (training.cache.enabled = false)" + else: + cache_status = None + payload: dict[str, object] = { "dlm_id": parsed.frontmatter.dlm_id, "cache_path": str(store.tokenized_cache_dir), @@ -3582,6 +3603,7 @@ def cache_show_cmd( "bytes": cache.total_bytes, "last_run_hit_rate": last.hit_rate if last else None, "last_run_id": last.run_id if last else None, + "cache_status": cache_status, } if json_out: _sys.stdout.write(_json.dumps(payload, indent=2) + "\n") @@ -3589,6 +3611,8 @@ def cache_show_cmd( out_console.print(f"[bold]Cache for {parsed.frontmatter.dlm_id}[/bold]") out_console.print(f" path: {store.tokenized_cache_dir}") + if cache_status is not None: + out_console.print(f" status: [yellow]{cache_status}[/yellow]") out_console.print(f" entries: {cache.entry_count}") out_console.print(f" size: {_human_size(cache.total_bytes)}") if last is not None: @@ -3596,7 +3620,7 @@ def cache_show_cmd( f" last-run hit rate: {last.hit_rate:.1%} " f"({last.cache_hits}/{last.cache_hits + last.cache_misses})" ) - else: + elif cache_status is None: out_console.print(" last-run hit rate: [dim]no tokenization runs yet[/dim]") diff --git a/src/dlm/doc/migrate.py b/src/dlm/doc/migrate.py index 5bef5801..d9530980 100644 --- a/src/dlm/doc/migrate.py +++ b/src/dlm/doc/migrate.py @@ -97,7 +97,17 @@ def migrate_file( # Validate post-migration dict against the current schema so a bad # migrator can't silently smear garbage into the document. fm = DlmFrontmatter.model_validate(migrated) - new_text = _rejoin(fm, body_text) + # Preserve the user's *originally explicit* fields across migration + # by collecting their dotted paths from the post-migration dict and + # passing them to the serializer as force-emit overrides. Without + # this, a v1 doc with `lora_r: 8` (matching the current schema + # default) would silently lose the explicit pin and inherit any + # future default change. The contract `CLAUDE.md` calls "additive + # identity" is honored at *intent* level, not just behavior level. + from dlm.doc.serializer import collect_dict_field_paths + + force_emit = collect_dict_field_paths(migrated) + new_text = _rejoin(fm, body_text, force_emit_paths=force_emit) if dry_run: return MigrationResult( @@ -152,7 +162,12 @@ def _split_for_migrate(text: str, *, path: Path) -> tuple[str, str]: ) -def _rejoin(fm: DlmFrontmatter, body_text: str) -> str: +def _rejoin( + fm: DlmFrontmatter, + body_text: str, + *, + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> str: """Re-assemble a `.dlm` file from a migrated frontmatter + raw body. Preserves the body verbatim (migration never touches section content); @@ -165,7 +180,7 @@ def _rejoin(fm: DlmFrontmatter, body_text: str) -> str: # section serialization by handing an empty sections tuple and # concatenating the raw body manually. 
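+ # Illustrative sketch of the override set handed to `serialize` below:
+ # a v1 doc that spelled out `training.lora_r: 8` yields something like
+ # frozenset({("training",), ("training", "lora_r")}), so the explicit
+ # pin is re-emitted even though it matches the current schema default.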
empty = ParsedDlm(frontmatter=fm, sections=_empty_sections()) - header = serialize(empty) # always ends with "\n" + header = serialize(empty, force_emit_paths=force_emit_paths) # always ends with "\n" # Normalize leading/trailing whitespace on the body to match the # canonical layout: exactly one blank line between `---\n` closer diff --git a/src/dlm/doc/serializer.py b/src/dlm/doc/serializer.py index b011e3dc..33b001f9 100644 --- a/src/dlm/doc/serializer.py +++ b/src/dlm/doc/serializer.py @@ -34,12 +34,23 @@ ) -def serialize(parsed: ParsedDlm) -> str: +def serialize( + parsed: ParsedDlm, + *, + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> str: """Produce canonical `.dlm` text for `parsed`. Always ends with `\\n`. + + `force_emit_paths` is consulted by `_emit_nested_mapping` — a field + whose dotted path appears in the set is emitted even when its value + matches the schema default. Used by the migrate pipeline to + preserve user-explicit fields across schema-default drift (so a + user who pinned `lora_r: 8` doesn't silently inherit a future + `lora_r: 16` default after migration). """ - parts: list[str] = [_serialize_frontmatter(parsed.frontmatter), "\n"] + parts: list[str] = [_serialize_frontmatter(parsed.frontmatter, force_emit_paths), "\n"] for i, section in enumerate(parsed.sections): if i > 0: parts.append("\n") @@ -50,10 +61,41 @@ def serialize(parsed: ParsedDlm) -> str: return rendered +def collect_dict_field_paths(d: object, prefix: tuple[str, ...] = ()) -> frozenset[tuple[str, ...]]: + """Walk a parsed-YAML dict and return every nested leaf-or-mapping path. + + Used by the migrate pipeline: the set of paths present in the + user's original frontmatter (after migration runs) is the set of + fields the serializer must emit even when they match defaults. + Mappings *and* leaves are both included so intermediate blocks + survive re-emission. + """ + paths: set[tuple[str, ...]] = set() + if isinstance(d, dict): + for k, v in d.items(): + if not isinstance(k, str): + continue + here = (*prefix, k) + paths.add(here) + if isinstance(v, dict): + paths.update(collect_dict_field_paths(v, here)) + elif isinstance(v, list): + # List of mappings (e.g. training.sources) — each item + # contributes paths under the same parent key, since + # we serialize positional list entries together. + for item in v: + if isinstance(item, dict): + paths.update(collect_dict_field_paths(item, here)) + return frozenset(paths) + + # --- frontmatter -------------------------------------------------------------- -def _serialize_frontmatter(fm: DlmFrontmatter) -> str: +def _serialize_frontmatter( + fm: DlmFrontmatter, + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> str: lines: list[str] = ["---"] for key in _FRONTMATTER_ORDER: value = getattr(fm, key, None) @@ -63,7 +105,9 @@ def _serialize_frontmatter(fm: DlmFrontmatter) -> str: lines.extend(_emit_block_scalar(key, value)) continue if isinstance(value, TrainingConfig | ExportConfig): - nested = _emit_nested_mapping(value, indent=2) + nested = _emit_nested_mapping( + value, indent=2, path=(key,), force_emit_paths=force_emit_paths + ) if not nested: # All-default nested block — skip the header too so we # don't emit an empty `training:` line. @@ -76,7 +120,13 @@ def _serialize_frontmatter(fm: DlmFrontmatter) -> str: return "\n".join(lines) + "\n" -def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: +def _emit_nested_mapping( + model: BaseModel, + *, + indent: int, + path: tuple[str, ...] 
= (), + force_emit_paths: frozenset[tuple[str, ...]] | None = None, +) -> list[str]: """Emit a nested training/export/dpo block. Suppress fields that equal their schema default so @@ -87,6 +137,11 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: Nested `BaseModel` values (e.g. `TrainingConfig.preference`) recurse with deeper indent; all-default sub-blocks are skipped. + + `force_emit_paths` overrides the default-suppression rule for any + field whose dotted path appears in the set. Used by the migrate + pipeline to preserve user-explicit fields across schema-default + drift. """ pad = " " * indent lines: list[str] = [] @@ -99,16 +154,28 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: for field_name, field_info in model.__class__.model_fields.items(): value = getattr(model, field_name) - if field_info.default is not PydanticUndefined and value == field_info.default: + field_path = (*path, field_name) + forced = force_emit_paths is not None and field_path in force_emit_paths + if ( + not forced + and field_info.default is not PydanticUndefined + and value == field_info.default + ): continue if ( - field_info.default is PydanticUndefined + not forced + and field_info.default is PydanticUndefined and field_info.default_factory is not None and value == field_info.default_factory() # type: ignore[call-arg] ): continue if isinstance(value, BaseModel): - nested = _emit_nested_mapping(value, indent=indent + 2) + nested = _emit_nested_mapping( + value, + indent=indent + 2, + path=field_path, + force_emit_paths=force_emit_paths, + ) if not nested: continue lines.append(f"{pad}{field_name}:") @@ -125,7 +192,12 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: lines.append(f"{pad}{field_name}:") for k, v in value.items(): lines.append(f"{pad} {k}:") - nested = _emit_nested_mapping(v, indent=indent + 4) + nested = _emit_nested_mapping( + v, + indent=indent + 4, + path=(*field_path, k), + force_emit_paths=force_emit_paths, + ) if nested: lines.extend(nested) else: @@ -144,7 +216,12 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]: # fields indent aligned. lines.append(f"{pad}{field_name}:") for item in value: - nested = _emit_nested_mapping(item, indent=indent + 4) + nested = _emit_nested_mapping( + item, + indent=indent + 4, + path=field_path, + force_emit_paths=force_emit_paths, + ) if not nested: lines.append(f"{pad} - {{}}") continue diff --git a/src/dlm/export/ollama/modelfile_shared.py b/src/dlm/export/ollama/modelfile_shared.py index e0dcc704..bdaf6cfb 100644 --- a/src/dlm/export/ollama/modelfile_shared.py +++ b/src/dlm/export/ollama/modelfile_shared.py @@ -129,8 +129,14 @@ def build_param_lines( if num_ctx is not None: lines.append(f"PARAMETER num_ctx {num_ctx}") if draft_model is not None: - lines.append(f"# Speculative decoding: `ollama pull {draft_model}` first.") - lines.append(f"PARAMETER draft_model {draft_model}") + # `draft_model` is not a valid Modelfile PARAMETER directive + # (Ollama rejects `ollama create` with "unknown parameter + # 'draft_model'"). It's a runtime option exposed via the + # `OLLAMA_DRAFT_MODEL` env var or the API's `options.draft_model` + # field. Document the suggested pairing as a comment so users + # can wire it up without forcing-fail their `ollama create`. 
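+ # Illustrative output, assuming draft_model="qwen2.5:0.5b" (the tag
+ # used in the Modelfile tests): the first comment below renders as
+ #   # Speculative-decoding draft: `ollama pull qwen2.5:0.5b`
+ # and the second pairs it with the OLLAMA_DRAFT_MODEL env var at
+ # run time.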
+ lines.append(f"# Speculative-decoding draft: `ollama pull {draft_model}`") + lines.append(f"# then run with `OLLAMA_DRAFT_MODEL={draft_model} ollama run `") return lines diff --git a/src/dlm/export/preflight.py b/src/dlm/export/preflight.py index 39163729..7e4836cc 100644 --- a/src/dlm/export/preflight.py +++ b/src/dlm/export/preflight.py @@ -98,8 +98,11 @@ def check_tokenizer_vocab(adapter_dir: Path) -> int: detail=f"cannot parse {cfg_path}: {exc}", ) from exc - # `vocab_size` key isn't always present in tokenizer_config.json; - # fall back to the companion tokenizer.json which always carries it. + # `vocab_size` key isn't always present in tokenizer_config.json + # (Qwen2.5+, Llama-3.x omit it); fall back to summing the BPE base + # plus the explicit `added_tokens` array in tokenizer.json. This + # matches `len(transformers.AutoTokenizer.from_pretrained(...))` — + # the count the model actually addresses at inference time. vocab_size = cfg.get("vocab_size") if not isinstance(vocab_size, int): tokenizer_json = adapter_dir / "tokenizer.json" @@ -113,8 +116,9 @@ def check_tokenizer_vocab(adapter_dir: Path) -> int: ) from exc model = t.get("model") or {} vocab = model.get("vocab") + added = t.get("added_tokens") or [] if isinstance(vocab, dict): - vocab_size = len(vocab) + vocab_size = len(vocab) + (len(added) if isinstance(added, list) else 0) if not isinstance(vocab_size, int) or vocab_size <= 0: raise PreflightError( probe="tokenizer_vocab", @@ -133,6 +137,10 @@ def check_chat_template(adapter_dir: Path, *, required: bool = True) -> None: `--no-template` on the CLI sets `required=False`; the default requires one because the Modelfile emitter hardcodes `TEMPLATE "..."` which needs source text. + + Modern HF tokenizers (Qwen2.5+, Llama-3.x) write the template to + a sibling `chat_template.jinja` file rather than inlining it in + `tokenizer_config.json`. Check both locations. """ if not required: return @@ -150,15 +158,30 @@ def check_chat_template(adapter_dir: Path, *, required: bool = True) -> None: detail=f"cannot parse {cfg_path}: {exc}", ) from exc template = cfg.get("chat_template") - if not template or not str(template).strip(): - raise PreflightError( - probe="chat_template", - detail=( - "tokenizer has no chat_template. Pass --no-template to skip " - "this check (Modelfile emission will fall back to the base " - "model's default), or attach a template via frontmatter." - ), - ) + if template and str(template).strip(): + return + + sibling_path = adapter_dir / "chat_template.jinja" + if sibling_path.exists(): + try: + sibling_template = sibling_path.read_text(encoding="utf-8") + except OSError as exc: + raise PreflightError( + probe="chat_template", + detail=f"cannot read {sibling_path}: {exc}", + ) from exc + if sibling_template.strip(): + return + + raise PreflightError( + probe="chat_template", + detail=( + "tokenizer has no chat_template (checked tokenizer_config.json " + "and chat_template.jinja). Pass --no-template to skip " + "this check (Modelfile emission will fall back to the base " + "model's default), or attach a template via frontmatter." 
+ ), + ) def check_pretokenizer_fingerprint(spec: BaseModelSpec) -> None: diff --git a/src/dlm/export/runner.py b/src/dlm/export/runner.py index 408b233d..55ed52df 100644 --- a/src/dlm/export/runner.py +++ b/src/dlm/export/runner.py @@ -90,24 +90,29 @@ def _default_check_base_vocab(adapter_path: Path, base_gguf_path: Path) -> None: `assert_gguf_vocab_matches` is tokenizer-backed; we use the file-based path here so `run_export` doesn't take a `transformers` - import hit on every call. A mismatch means the base conversion - materialized a different tokenizer than the one the adapter was - trained against — exactly the silent-failure surface this check is - meant to catch. + import hit on every call. The model's embedding matrix may have + reserved/special-token slots beyond the tokenizer's addressable + count (Qwen2.5: 151643 BPE + 22 added = 151665 addressable, but + `vocab_size = 151936` worth of embedding rows for future-token + reservation). GGUF carries the embedding-aligned count, so it's + valid for `gguf_vocab >= adapter_vocab`. The unsafe direction is + `adapter_vocab > gguf_vocab` — that means the tokenizer can emit + indices beyond what the model has embeddings for. """ from dlm.export.preflight import check_tokenizer_vocab from dlm.export.tokenizer_sync import read_gguf_vocab_size adapter_vocab = check_tokenizer_vocab(adapter_path) gguf_vocab = read_gguf_vocab_size(base_gguf_path) - if adapter_vocab != gguf_vocab: + if adapter_vocab > gguf_vocab: from dlm.export.errors import PreflightError raise PreflightError( probe="gguf_vocab", detail=( - f"adapter tokenizer vocab ({adapter_vocab}) does not match " - f"base GGUF vocab ({gguf_vocab}) at {base_gguf_path.name}. " + f"adapter tokenizer addressable vocab ({adapter_vocab}) exceeds " + f"base GGUF vocab ({gguf_vocab}) at {base_gguf_path.name} — " + "the tokenizer can emit indices the model has no embeddings for. " "Re-run base conversion against the adapter-dir tokenizer." ), ) diff --git a/src/dlm/hardware/render.py b/src/dlm/hardware/render.py index e8bfcaf9..5bcacdc1 100644 --- a/src/dlm/hardware/render.py +++ b/src/dlm/hardware/render.py @@ -25,7 +25,7 @@ def render_text(result: DoctorResult) -> str: lines.append(f"FlashAttention: {_bool(caps.has_flash_attention)}") lines.append(f"xFormers: {_bool(caps.has_xformers)}") lines.append(f"Triton: {_bool(caps.has_triton)}") - lines.append(f"MLX inference: {_bool(caps.has_mlx)}") + lines.append(f"MLX inference: {_mlx_line(caps)}") lines.append(f"accelerate: {caps.accelerate_version or 'not installed'}") lines.append(f"CPU cores: {caps.cpu_cores}") lines.append(f"RAM: {caps.ram_gb:.1f} GB") @@ -82,6 +82,20 @@ def _bool(v: bool) -> str: return "yes" if v else "no" +def _mlx_line(caps: Capabilities) -> str: + """Doctor's MLX line — `yes` plus a one-line scope hint, or `no`. + + MLX is prompt-only (training stays on PyTorch MPS). On Apple Silicon + it's the default backend for `dlm prompt`; off-platform it's not + installed. The scope hint is here so users don't expect MLX to + accelerate training and don't think `--backend mlx` is a separate + "mode" with its own training story. 
+ """ + if not caps.has_mlx: + return "no" + return "yes (prompt-only; default backend for `dlm prompt` on Apple Silicon)" + + def _telemetry_line(posture: dict[str, str]) -> str: hf = posture.get("HF_HUB_DISABLE_TELEMETRY", "") dnt = posture.get("DO_NOT_TRACK", "") diff --git a/src/dlm/inference/backends/mlx_backend.py b/src/dlm/inference/backends/mlx_backend.py index d2730323..26e63478 100644 --- a/src/dlm/inference/backends/mlx_backend.py +++ b/src/dlm/inference/backends/mlx_backend.py @@ -35,7 +35,10 @@ from dlm.inference.backends.base import InferenceBackend from dlm.inference.errors import AdapterNotFoundError -from dlm.inference.mlx_adapter import MlxConversionError +from dlm.inference.mlx_adapter import ( + MlxConversionError, + assert_mlx_adapter_applied, +) if TYPE_CHECKING: from dlm.base_models import BaseModelSpec @@ -192,6 +195,14 @@ def load( # pragma: no cover - heavy path adapter_path=str(staged), ) + # mlx-lm's `load_adapters` runs `linear_to_lora_layers` + + # `model.load_weights(strict=False)`, both of which fail + # silently on key/shape mismatches. Verify at least one LoRA + # parameter actually attached — else `MlxConversionError` so + # the user sees the failure instead of base-model output. + staged_cfg = json.loads((staged / _ADAPTER_CONFIG_FILENAME).read_text(encoding="utf-8")) + assert_mlx_adapter_applied(self._model, expected_keys=staged_cfg["lora_parameters"]["keys"]) + def generate(self, prompt: str, **gen_kwargs: Any) -> str: # pragma: no cover - heavy path if self._model is None or self._tokenizer is None: raise RuntimeError("MlxBackend.generate called before load()") @@ -206,6 +217,13 @@ def generate(self, prompt: str, **gen_kwargs: Any) -> str: # pragma: no cover - top_p = float(gen_kwargs.get("top_p") or 0.0) top_k = int(gen_kwargs.get("top_k") or 0) + # Apply the tokenizer's chat template so INSTRUCTION-trained + # adapters see the same input shape they trained on. The PyTorch + # backend renders via `format_chat_prompt`; without the same + # render here, MLX feeds raw user text to the model and the + # adapter silently fails to fire on chat-shaped queries. + rendered = _apply_chat_template(self._tokenizer, prompt) + # mlx-lm's `generate` / `generate_step` no longer accept `temp` # directly — sampling params are passed via a sampler produced # by `make_sampler(temp=..., top_p=..., top_k=...)`. Unknown @@ -215,7 +233,7 @@ def generate(self, prompt: str, **gen_kwargs: Any) -> str: # pragma: no cover - out = mlx_generate( self._model, self._tokenizer, - prompt=prompt, + prompt=rendered, max_tokens=max_new_tokens, sampler=sampler, verbose=False, @@ -228,3 +246,22 @@ def unload(self) -> None: if self._workdir is not None: self._workdir.cleanup() self._workdir = None + + +def _apply_chat_template(tokenizer: Any, prompt: str) -> str: + """Render `prompt` through the tokenizer's chat template, or pass through. + + Mirrors `dlm.inference.generate.format_chat_prompt` for the MLX path. + mlx-lm's `TokenizerWrapper` proxies `apply_chat_template` to the + underlying HF tokenizer; bases without a chat template (raw causal + LMs) get the prompt unchanged. 
+ """ + if getattr(tokenizer, "chat_template", None): + rendered = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + tokenize=False, + add_generation_prompt=True, + ) + if isinstance(rendered, str): + return rendered + return prompt diff --git a/src/dlm/inference/mlx_adapter.py b/src/dlm/inference/mlx_adapter.py index 7bc627ac..9b27eba3 100644 --- a/src/dlm/inference/mlx_adapter.py +++ b/src/dlm/inference/mlx_adapter.py @@ -44,6 +44,39 @@ _LORA_AB = re.compile(r"\.lora_([AB])\.weight$") """Matches the trailing `.lora_A.weight` / `.lora_B.weight` suffix.""" +_ATTN_TARGETS: frozenset[str] = frozenset( + {"q_proj", "k_proj", "v_proj", "o_proj", "qkv_proj", "wqkv"} +) +"""Bare PEFT `target_modules` names that live under `self_attn.` on +decoder-only transformers (Qwen2/Llama/Mistral/Phi/SmolLM).""" + +_MLP_TARGETS: frozenset[str] = frozenset({"gate_proj", "up_proj", "down_proj", "fc1", "fc2"}) +"""Bare PEFT `target_modules` names that live under `mlp.` on the same +family of architectures.""" + + +def _qualify_target_module(name: str) -> str: + """Map a PEFT bare `target_modules` entry to its in-block FQN. + + mlx-lm's `linear_to_lora_layers` matches `named_modules()` keys + *within* each transformer block via exact equality (`if k in keys`). + PEFT records `target_modules` as bare module names (`q_proj`), + while the FQN within an MLX-LM transformer block is fully qualified + (`self_attn.q_proj`). Without this rewrite the keys never match and + `linear_to_lora_layers` silently leaves the model un-wrapped — the + user-visible failure is "trained model behaves identically to base." + + Already-qualified names (containing a `.`) pass through untouched + so callers can pre-qualify if needed. + """ + if "." in name: + return name + if name in _ATTN_TARGETS: + return f"self_attn.{name}" + if name in _MLP_TARGETS: + return f"mlp.{name}" + return name + class MlxConversionError(RuntimeError): """Raised when a PEFT adapter cannot be converted to the MLX layout.""" @@ -121,7 +154,23 @@ def peft_safetensors_to_mlx_safetensors( # pragma: no cover - I/O + torch deps tensors = load_file(str(src)) mapping = map_all_keys(list(tensors.keys())) - mlx_tensors = {mlx_key: tensors[peft_key] for peft_key, mlx_key in mapping.items()} + # PEFT stores LoRA weights with shapes that don't match what + # mlx-lm's `LoRALinear` expects: + # + # PEFT lora_A : [r, in_features] MLX lora_a : [in_features, r] + # PEFT lora_B : [out_features, r] MLX lora_b : [r, out_features] + # + # Both tensors need a transpose. Loading without the transpose + # makes mlx-lm's `model.load_weights(strict=False)` silently skip + # the mismatched shapes and the adapter has no effect — the + # textbook "trained model behaves like base" failure mode. + mlx_tensors = {} + for peft_key, mlx_key in mapping.items(): + t = tensors[peft_key] + # `mlx_key` ends in `.lora_a` or `.lora_b` (lowercase, no `.weight`). 
+ if mlx_key.endswith((".lora_a", ".lora_b")): + t = t.t().contiguous() + mlx_tensors[mlx_key] = t mlx_safetensors_path.parent.mkdir(parents=True, exist_ok=True) save_file(mlx_tensors, str(mlx_safetensors_path)) @@ -174,6 +223,8 @@ def build_mlx_adapter_config( "cannot stage mlx adapter without a valid layer count" ) + qualified_keys = [_qualify_target_module(t) for t in target_modules] + return { "fine_tune_type": "dora" if use_dora else "lora", "num_layers": int(base_num_hidden_layers), @@ -181,6 +232,54 @@ def build_mlx_adapter_config( "rank": rank, "scale": lora_alpha / rank if rank else float(lora_alpha), "dropout": lora_dropout, - "keys": list(target_modules), + "keys": qualified_keys, }, } + + +def assert_mlx_adapter_applied(model: Any, *, expected_keys: list[str]) -> None: + """Verify mlx-lm's `load_adapters` actually wrapped the targeted layers. + + `mlx_lm.load(..., adapter_path=...)` calls `linear_to_lora_layers` + followed by `model.load_weights(strict=False)`. Both steps fail + silently if their inputs don't match the loaded model: + + - `linear_to_lora_layers` is a no-op when `keys` don't match any + module's FQN inside the transformer blocks + - `load_weights(strict=False)` skips any tensor key that doesn't + match a model parameter + + Either failure produces a model that runs as if no adapter were + loaded. Catching this here turns the "trained model behaves like + base" footgun into an explicit refusal so the user knows to use + `--backend pytorch` (or the fix needs an architecture-aware + keys translator). + + `expected_keys` are the in-block FQNs from the staged + `adapter_config.json` (e.g. `["self_attn.q_proj", ...]`). We confirm + that at least one matching module ended up as a LoRA-wrapped layer. + """ + try: + import mlx.utils as mlx_utils # type: ignore[import-not-found, unused-ignore] + except ImportError as exc: # pragma: no cover - mlx not importable + raise MlxConversionError(f"mlx not importable for verification: {exc}") from exc + + try: + flat: Any = mlx_utils.tree_flatten(model.trainable_parameters()) + except Exception as exc: # pragma: no cover - defensive + raise MlxConversionError( + f"could not enumerate model trainable_parameters for verification: {exc}" + ) from exc + + lora_param_count = sum(1 for k, _ in flat if k.endswith(".lora_a") or k.endswith(".lora_b")) + if lora_param_count == 0: + raise MlxConversionError( + "mlx-lm loaded the adapter without applying it — zero " + "`lora_a` / `lora_b` parameters present after load. This " + "usually means the keys " + f"{expected_keys!r} don't match the model's `named_modules()` " + "FQNs (e.g. the base architecture uses a different submodule " + "layout than `self_attn.*` / `mlp.*`). The trained adapter " + "would behave identically to the base model. Use " + "`--backend pytorch` as a workaround." + ) diff --git a/tests/unit/base_models/test_registry.py b/tests/unit/base_models/test_registry.py index 62049233..c8ff7c23 100644 --- a/tests/unit/base_models/test_registry.py +++ b/tests/unit/base_models/test_registry.py @@ -30,6 +30,21 @@ def test_known_keys_stable_ordering(self) -> None: assert list(keys) == list(BASE_MODELS.keys()) +class TestCapabilityWarnings: + def test_smol_135m_carries_warning(self) -> None: + """SmolLM2-135M is below the empirically-measured training floor.""" + spec = BASE_MODELS["smollm2-135m"] + assert spec.capability_warning is not None + # The audit findings 02 + 05 are the load-bearing evidence; + # the wording should mention the floor in some form. 
+ assert "floor" in spec.capability_warning.lower() + + def test_above_floor_bases_have_no_warning(self) -> None: + """1B+ launch bases ship without the architectural-floor warning.""" + for key in ("smollm2-1.7b", "qwen2.5-coder-1.5b", "llama-3.2-1b"): + assert BASE_MODELS[key].capability_warning is None, key + + class TestRevisionPinning: """Audit-02 F11: every registry entry has a real 40-char commit SHA.""" diff --git a/tests/unit/cli/test_misc_commands_coverage.py b/tests/unit/cli/test_misc_commands_coverage.py index 741f0d26..6e089c8b 100644 --- a/tests/unit/cli/test_misc_commands_coverage.py +++ b/tests/unit/cli/test_misc_commands_coverage.py @@ -552,7 +552,11 @@ def save_manifest(self) -> None: ) assert show_text.exit_code == 0, show_text.output assert "Cache for" in show_text.output - assert "no tokenization runs yet" in show_text.output + # Minimal doc has no `training.sources` directive, so the cache + # is gated off by design — surface that instead of leaving the + # user to guess why their cache is empty. + assert "not used" in show_text.output + assert "training.sources" in show_text.output prune_bad = runner.invoke( app, diff --git a/tests/unit/doc/test_migrate.py b/tests/unit/doc/test_migrate.py index d8b3fc69..46923867 100644 --- a/tests/unit/doc/test_migrate.py +++ b/tests/unit/doc/test_migrate.py @@ -138,6 +138,46 @@ def _pre_current(raw: dict[str, object]) -> dict[str, object]: assert result.backup_path is None assert not (tmp_path / "mydoc.dlm.bak").exists() + def test_user_explicit_default_value_survives_migration( + self, tmp_path: Path, bumped_current: int + ) -> None: + """Honor the 'additive identity' contract at intent level, not just behavior. + + A user who pinned `lora_r: 8` should still see `lora_r: 8` in + the migrated doc, even though 8 happens to match the current + schema default. Without this, a future default change would + silently override the user's pin. 
+ """ + current = bumped_current - 1 + + @register(from_version=current) + def _pre_current(raw: dict[str, object]) -> dict[str, object]: + return dict(raw) + + doc = tmp_path / "mydoc.dlm" + doc.write_text( + f"""--- +dlm_id: {_VALID_ULID} +base_model: smollm2-135m +dlm_version: {current} +training: + lora_r: 8 + lora_alpha: 16 + num_epochs: 3 +--- + +body +""", + encoding="utf-8", + ) + + result = migrate_file(doc, no_backup=True) + assert result.wrote is True + text = doc.read_text(encoding="utf-8") + assert "lora_r: 8" in text + assert "lora_alpha: 16" in text + assert "num_epochs: 3" in text + def test_dry_run_reports_without_writing(self, tmp_path: Path, bumped_current: int) -> None: current = bumped_current - 1 diff --git a/tests/unit/doc/test_serializer_edges.py b/tests/unit/doc/test_serializer_edges.py index f3688687..9e9dd501 100644 --- a/tests/unit/doc/test_serializer_edges.py +++ b/tests/unit/doc/test_serializer_edges.py @@ -28,6 +28,7 @@ _needs_quoting, _scalar, _serialize_section, + collect_dict_field_paths, serialize, ) @@ -168,7 +169,9 @@ def test_output_always_ends_with_single_newline(self, monkeypatch: pytest.Monkey frontmatter=fm, sections=(Section(SectionType.PROSE, "content"),), ) - monkeypatch.setattr("dlm.doc.serializer._serialize_frontmatter", lambda _fm: "---\n---") + monkeypatch.setattr( + "dlm.doc.serializer._serialize_frontmatter", lambda _fm, _force=None: "---\n---" + ) out = serialize(parsed) assert out.endswith("\n") assert not out.endswith("\n\n") @@ -181,11 +184,55 @@ def test_serializer_adds_newline_when_section_render_omits_it( frontmatter=fm, sections=(Section(SectionType.PROSE, "content"),), ) - monkeypatch.setattr("dlm.doc.serializer._serialize_frontmatter", lambda _fm: "---\n---") + monkeypatch.setattr( + "dlm.doc.serializer._serialize_frontmatter", lambda _fm, _force=None: "---\n---" + ) monkeypatch.setattr("dlm.doc.serializer._serialize_section", lambda _section: "body") assert serialize(parsed).endswith("\n") +class TestCollectDictFieldPaths: + """Migrate path uses this to remember which fields the user explicitly set.""" + + def test_flat_mapping(self) -> None: + paths = collect_dict_field_paths({"a": 1, "b": 2}) + assert paths == frozenset({("a",), ("b",)}) + + def test_nested_mapping_records_intermediate_and_leaf(self) -> None: + paths = collect_dict_field_paths({"training": {"lora_r": 8, "lora_alpha": 16}}) + assert paths == frozenset( + {("training",), ("training", "lora_r"), ("training", "lora_alpha")} + ) + + def test_non_string_keys_ignored(self) -> None: + # Pydantic-validated dicts only use string keys; the defensive + # branch keeps a hostile YAML round-trip from leaking a tuple key. + paths = collect_dict_field_paths({"a": 1, 7: "ignored"}) + assert paths == frozenset({("a",)}) + + def test_list_of_mappings_recurses_under_parent_path(self) -> None: + # `training.sources` shape: each list entry is a mapping that + # contributes paths under ("training", "sources"), so a v1 source + # block with explicit `path:`/`max_bytes_per_file:` survives migration. 
+ paths = collect_dict_field_paths( + { + "training": { + "sources": [ + {"path": "/foo", "max_bytes_per_file": 4096}, + {"path": "/bar"}, + ] + } + } + ) + assert ("training", "sources") in paths + assert ("training", "sources", "path") in paths + assert ("training", "sources", "max_bytes_per_file") in paths + + def test_non_dict_input_yields_empty(self) -> None: + assert collect_dict_field_paths("not a dict") == frozenset() + assert collect_dict_field_paths(None) == frozenset() + + class TestFrontmatterExplicitTargetModulesList: """Ensures the list branch in the nested-mapping emitter is exercised.""" diff --git a/tests/unit/export/ollama/test_modelfile.py b/tests/unit/export/ollama/test_modelfile.py index 3466ea15..14a0dcb2 100644 --- a/tests/unit/export/ollama/test_modelfile.py +++ b/tests/unit/export/ollama/test_modelfile.py @@ -110,15 +110,21 @@ def test_override_temperature(self, tmp_path: Path) -> None: assert "PARAMETER temperature 0.2" in text def test_draft_model_omitted_by_default(self, tmp_path: Path) -> None: - """Sprint 12.5: no draft set → no `PARAMETER draft_model`.""" + """No draft set → no draft directive at all.""" text = render_modelfile(_ctx(tmp_path)) - assert "PARAMETER draft_model" not in text + assert "draft_model" not in text + + def test_draft_model_documented_when_set(self, tmp_path: Path) -> None: + """When a draft is registered we emit a comment, not a PARAMETER. - def test_draft_model_emitted_when_set(self, tmp_path: Path) -> None: - """Sprint 12.5: `PARAMETER draft_model ` appears + pull reminder.""" + `draft_model` is a runtime/API option in Ollama, not a Modelfile + PARAMETER — emitting `PARAMETER draft_model ` made + `ollama create` fail with "unknown parameter 'draft_model'". + """ text = render_modelfile(_ctx(tmp_path, draft_model_ollama_name="qwen2.5:0.5b")) - assert "PARAMETER draft_model qwen2.5:0.5b" in text + assert "PARAMETER draft_model" not in text assert "ollama pull qwen2.5:0.5b" in text + assert "OLLAMA_DRAFT_MODEL=qwen2.5:0.5b" in text def test_override_top_p(self, tmp_path: Path) -> None: """Audit-04 Q5: frontmatter export.default_top_p overrides the dialect default.""" diff --git a/tests/unit/export/test_preflight.py b/tests/unit/export/test_preflight.py index d736b293..74ece763 100644 --- a/tests/unit/export/test_preflight.py +++ b/tests/unit/export/test_preflight.py @@ -101,6 +101,19 @@ def test_malformed_tokenizer_json_raises(self, tmp_path: Path) -> None: with pytest.raises(PreflightError, match="cannot parse"): check_tokenizer_vocab(tmp_path) + def test_fallback_sums_added_tokens(self, tmp_path: Path) -> None: + """Qwen2.5+ tokenizer: BPE vocab + added_tokens give addressable count.""" + (tmp_path / "tokenizer_config.json").write_text(json.dumps({})) + (tmp_path / "tokenizer.json").write_text( + json.dumps( + { + "model": {"vocab": {str(i): i for i in range(5000)}}, + "added_tokens": [{"id": i, "content": f"<|t{i}|>"} for i in range(22)], + } + ) + ) + assert check_tokenizer_vocab(tmp_path) == 5022 + class TestChatTemplate: def test_present_ok(self, tmp_path: Path) -> None: @@ -130,6 +143,36 @@ def test_malformed_config_raises(self, tmp_path: Path) -> None: with pytest.raises(PreflightError, match="cannot parse"): check_chat_template(tmp_path, required=True) + def test_sibling_jinja_satisfies_when_config_lacks_template(self, tmp_path: Path) -> None: + """Qwen2.5+/Llama-3.x write the template to chat_template.jinja, not the config.""" + _write_tokenizer_config(tmp_path, chat_template="") + (tmp_path / 
"chat_template.jinja").write_text("{{messages}}") + check_chat_template(tmp_path) + + def test_empty_sibling_jinja_falls_through_to_error(self, tmp_path: Path) -> None: + _write_tokenizer_config(tmp_path, chat_template="") + (tmp_path / "chat_template.jinja").write_text(" \n") + with pytest.raises(PreflightError, match="chat_template"): + check_chat_template(tmp_path) + + def test_unreadable_sibling_jinja_raises( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """OSError while reading the .jinja sibling surfaces as a typed PreflightError.""" + _write_tokenizer_config(tmp_path, chat_template="") + (tmp_path / "chat_template.jinja").write_text("{{messages}}") + + original_read_text = Path.read_text + + def _boom(self: Path, *args: object, **kwargs: object) -> str: + if self.name == "chat_template.jinja": + raise OSError("simulated read failure") + return original_read_text(self, *args, **kwargs) # type: ignore[arg-type] + + monkeypatch.setattr(Path, "read_text", _boom) + with pytest.raises(PreflightError, match="cannot read"): + check_chat_template(tmp_path) + class TestQloraFlag: def test_missing_file_returns_false(self, tmp_path: Path) -> None: diff --git a/tests/unit/export/test_runner.py b/tests/unit/export/test_runner.py index e22d0236..25463e9c 100644 --- a/tests/unit/export/test_runner.py +++ b/tests/unit/export/test_runner.py @@ -386,24 +386,43 @@ def _fake_merge_path(**kwargs: object) -> None: class TestDefaultVocabCheck: """Default path loads the adapter tokenizer-vocab and compares against the base GGUF.""" - def test_mismatch_raises(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: + def test_adapter_exceeds_gguf_raises( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Tokenizer addressing more indices than the model has embeddings is unsafe.""" from dlm.export.errors import PreflightError from dlm.export.runner import _default_check_base_vocab adapter = tmp_path / "adapter" adapter.mkdir() (adapter / "tokenizer_config.json").write_text( - json.dumps({"vocab_size": 32000, "chat_template": "{{m}}"}) + json.dumps({"vocab_size": 32001, "chat_template": "{{m}}"}) ) base = tmp_path / "base.gguf" base.write_bytes(b"ignored") - # Force a specific GGUF vocab through the reader seam. 
- monkeypatch.setattr("dlm.export.tokenizer_sync.read_gguf_vocab_size", lambda _p: 32001) + monkeypatch.setattr("dlm.export.tokenizer_sync.read_gguf_vocab_size", lambda _p: 32000) with pytest.raises(PreflightError, match="gguf_vocab"): _default_check_base_vocab(adapter, base) + def test_gguf_reserves_extra_slots_ok( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """GGUF carrying more rows than the tokenizer addresses is the Qwen2.5+ norm.""" + from dlm.export.runner import _default_check_base_vocab + + adapter = tmp_path / "adapter" + adapter.mkdir() + (adapter / "tokenizer_config.json").write_text( + json.dumps({"vocab_size": 32000, "chat_template": "{{m}}"}) + ) + base = tmp_path / "base.gguf" + base.write_bytes(b"ignored") + + monkeypatch.setattr("dlm.export.tokenizer_sync.read_gguf_vocab_size", lambda _p: 32293) + _default_check_base_vocab(adapter, base) + def test_match_returns_none(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: from dlm.export.runner import _default_check_base_vocab @@ -647,19 +666,18 @@ class TestDraftWiring: """Sprint 12.5: runner routes draft override/disable through to the Modelfile.""" def test_override_tag_written_to_modelfile(self, tmp_path: Path) -> None: - """--draft custom:tag propagates to PARAMETER draft_model.""" + """--draft custom:tag propagates to the Modelfile as a documented hint.""" cached_base, store, vendor = _setup_store(tmp_path) plan = ExportPlan(quant="Q4_K_M", ollama_name="mydoc") recorder = _SubprocessRecorder(store.export_quant_dir(plan.quant)) - # skip_ollama=True stops before ollama_create but *after* the - # Modelfile is rendered? Actually no — the Modelfile is only - # rendered when skip_ollama=False. Switch to a mock ollama_create. def _ollama_create(*, name: str, modelfile_path: Path, cwd: Path) -> str: _ = name, cwd assert modelfile_path.is_file() text = modelfile_path.read_text() - assert "PARAMETER draft_model custom:tag" in text + assert "PARAMETER draft_model" not in text + assert "ollama pull custom:tag" in text + assert "OLLAMA_DRAFT_MODEL=custom:tag" in text return "created" run_export( diff --git a/tests/unit/hardware/test_doctor.py b/tests/unit/hardware/test_doctor.py index 3bfb2734..6cce2a1e 100644 --- a/tests/unit/hardware/test_doctor.py +++ b/tests/unit/hardware/test_doctor.py @@ -180,6 +180,49 @@ def test_rocm_render_surfaces_arch_suffix_and_qlora_summary(self) -> None: assert "adapter: qlora (4-bit NF4, compute bf16)" in text +class TestMlxRenderLine: + """Cover both branches of the MLX-availability annotation in render.""" + + def _caps(self, *, has_mlx: bool) -> Capabilities: + return Capabilities( + backend=Backend.CPU, + device_name="generic CPU", + sm=None, + rocm_arch=None, + vram_gb=None, + unified_memory_gb=None, + cpu_cores=8, + ram_gb=16.0, + supports_bf16=False, + supports_fp16=False, + has_flash_attention=False, + has_xformers=False, + has_bitsandbytes=False, + has_triton=False, + has_mlx=has_mlx, + torch_version="2.11.0", + accelerate_version=None, + cuda_version=None, + rocm_version=None, + platform="Linux 6.8", + determinism_class="best_effort", + telemetry_posture={}, + ) + + def test_mlx_available_shows_scope_hint(self) -> None: + text = render_text( + DoctorResult(capabilities=self._caps(has_mlx=True), plan=None, plan_error=None) + ) + assert "MLX inference: yes (prompt-only" in text + assert "Apple Silicon" in text + + def test_mlx_unavailable_shows_no(self) -> None: + text = render_text( + DoctorResult(capabilities=self._caps(has_mlx=False), plan=None, 
plan_error=None) + ) + assert "MLX inference: no" in text + + class TestCliDoctor: def test_cli_human_output_works(self) -> None: runner = CliRunner() diff --git a/tests/unit/inference/test_mlx_adapter_conversion.py b/tests/unit/inference/test_mlx_adapter_conversion.py index 1fd6c2b2..c4448d70 100644 --- a/tests/unit/inference/test_mlx_adapter_conversion.py +++ b/tests/unit/inference/test_mlx_adapter_conversion.py @@ -2,10 +2,13 @@ from __future__ import annotations +from typing import Any + import pytest from dlm.inference.mlx_adapter import ( MlxConversionError, + assert_mlx_adapter_applied, map_all_keys, map_peft_key_to_mlx, ) @@ -91,3 +94,184 @@ def test_non_positive_layer_count_rejected(self) -> None: }, 0, ) + + def test_attn_target_modules_get_self_attn_prefix(self) -> None: + """mlx-lm matches `named_modules()` keys *inside* a transformer + block via exact equality. PEFT's bare `q_proj` doesn't match + the `self_attn.q_proj` FQN, so without the rewrite mlx-lm + silently leaves the model un-wrapped — the textbook "trained + model behaves like base" failure mode.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 16, + "lora_alpha": 32, + "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"], + }, + base_num_hidden_layers=28, + ) + assert cfg["lora_parameters"]["keys"] == [ + "self_attn.q_proj", + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.o_proj", + ] + + def test_mlp_target_modules_get_mlp_prefix(self) -> None: + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["gate_proj", "up_proj", "down_proj"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == [ + "mlp.gate_proj", + "mlp.up_proj", + "mlp.down_proj", + ] + + def test_already_qualified_keys_pass_through(self) -> None: + """Callers that pre-qualify (e.g. for non-decoder architectures) + should not see their dotted keys re-rewritten.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["self_attn.q_proj", "encoder.fc1"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == ["self_attn.q_proj", "encoder.fc1"] + + def test_unknown_target_module_passes_through_unqualified(self) -> None: + """Names that aren't in the attn/mlp tables stay bare. Caller + supervision is the user's responsibility — we don't guess.""" + from dlm.inference.mlx_adapter import build_mlx_adapter_config + + cfg = build_mlx_adapter_config( + { + "r": 8, + "target_modules": ["unknown_proj"], + }, + base_num_hidden_layers=12, + ) + assert cfg["lora_parameters"]["keys"] == ["unknown_proj"] + + +class TestPeftSafetensorsToMlxTransposes: + """PEFT and MLX-LM use different storage layouts for LoRA tensors: + + PEFT lora_A : [r, in_features] MLX lora_a : [in_features, r] + PEFT lora_B : [out_features, r] MLX lora_b : [r, out_features] + + Without transposing, mlx-lm's `model.load_weights(strict=False)` + silently skips the mismatched shapes and the adapter has no effect. 
+ """ + + def test_lora_a_and_b_get_transposed(self, tmp_path: object) -> None: + from pathlib import Path as _Path + + import torch + from safetensors.torch import load_file, save_file + + from dlm.inference.mlx_adapter import peft_safetensors_to_mlx_safetensors + + tmp_path = _Path(str(tmp_path)) + peft_dir = tmp_path / "peft" + peft_dir.mkdir() + # PEFT shapes: lora_A=[r=4, in=8], lora_B=[out=16, r=4] + peft_tensors = { + "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight": torch.arange( + 32, dtype=torch.float32 + ).reshape(4, 8), + "base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight": torch.arange( + 64, dtype=torch.float32 + ).reshape(16, 4), + } + save_file(peft_tensors, str(peft_dir / "adapter_model.safetensors")) + + mlx_path = tmp_path / "out" / "adapters.safetensors" + peft_safetensors_to_mlx_safetensors(peft_dir, mlx_path) + + mlx_tensors = load_file(str(mlx_path)) + a = mlx_tensors["model.layers.0.self_attn.q_proj.lora_a"] + b = mlx_tensors["model.layers.0.self_attn.q_proj.lora_b"] + # Transposed shapes + assert tuple(a.shape) == (8, 4) + assert tuple(b.shape) == (4, 16) + # Values match a transpose, not just a reshape. + assert torch.equal(a, peft_tensors[next(iter(peft_tensors))].t()) + + +class TestAssertMlxAdapterApplied: + """Fail-loud post-load guard. mlx-lm silently leaves a model + un-wrapped when keys don't match; this check turns that footgun + into an explicit `MlxConversionError` so users see the failure + rather than getting silent base-model output.""" + + def _fake_model_with_params(self, names: list[str]) -> Any: + """Build a stand-in for an mlx model that exposes + `trainable_parameters()` returning a flat dict of fake tensors. + We don't go through `mlx.utils.tree_flatten`'s real + implementation here — assert_mlx_adapter_applied uses it + directly, so we assert via the import-mock approach below.""" + + class _FakeArr: + shape = (1,) + + class _FakeModel: + def trainable_parameters(self) -> dict[str, Any]: + return {n: _FakeArr() for n in names} + + return _FakeModel() + + def test_passes_when_lora_params_present(self, monkeypatch: pytest.MonkeyPatch) -> None: + # Stub mlx.utils.tree_flatten so the test doesn't require + # mlx-lm's real flatten semantics — we only need it to walk + # the dict-shaped trainable_parameters() output. + import sys + import types as _types + + fake_mlx = _types.ModuleType("mlx") + fake_mlx_utils = _types.ModuleType("mlx.utils") + + def _tree_flatten(d: dict[str, Any]) -> list[tuple[str, Any]]: + return list(d.items()) + + fake_mlx_utils.tree_flatten = _tree_flatten # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, "mlx", fake_mlx) + monkeypatch.setitem(sys.modules, "mlx.utils", fake_mlx_utils) + + model = self._fake_model_with_params( + [ + "model.layers.0.self_attn.q_proj.lora_a", + "model.layers.0.self_attn.q_proj.lora_b", + ] + ) + # Should not raise. + assert_mlx_adapter_applied(model, expected_keys=["self_attn.q_proj"]) + + def test_raises_when_no_lora_params(self, monkeypatch: pytest.MonkeyPatch) -> None: + import sys + import types as _types + + fake_mlx = _types.ModuleType("mlx") + fake_mlx_utils = _types.ModuleType("mlx.utils") + fake_mlx_utils.tree_flatten = lambda d: list(d.items()) # type: ignore[attr-defined] + monkeypatch.setitem(sys.modules, "mlx", fake_mlx) + monkeypatch.setitem(sys.modules, "mlx.utils", fake_mlx_utils) + + # Only base parameters; no lora_a/lora_b. 
+ model = self._fake_model_with_params( + [ + "model.embed_tokens.weight", + "model.layers.0.self_attn.q_proj.weight", + ] + ) + with pytest.raises(MlxConversionError, match="zero `lora_a`"): + assert_mlx_adapter_applied(model, expected_keys=["self_attn.q_proj"]) diff --git a/tests/unit/inference/test_mlx_backend.py b/tests/unit/inference/test_mlx_backend.py index cc407378..7d6d7f5a 100644 --- a/tests/unit/inference/test_mlx_backend.py +++ b/tests/unit/inference/test_mlx_backend.py @@ -12,6 +12,7 @@ from dlm.base_models import BASE_MODELS from dlm.inference.backends.mlx_backend import ( MlxBackend, + _apply_chat_template, _resolve_base_num_hidden_layers, stage_mlx_adapter_dir, ) @@ -94,6 +95,14 @@ def test_load_generate_and_unload_happy_path( adapter_dir = tmp_path / "adapter" adapter_dir.mkdir() staged_dir = tmp_path / "staged" + staged_dir.mkdir() + # Real `stage_mlx_adapter_dir` writes this file; the post-load + # assertion guard reads it to know which FQNs to verify, so the + # test stub must mirror the on-disk shape. + (staged_dir / "adapter_config.json").write_text( + '{"lora_parameters": {"keys": ["self_attn.q_proj"]}}', + encoding="utf-8", + ) backend = MlxBackend(SimpleNamespace()) monkeypatch.setattr( @@ -104,6 +113,14 @@ def test_load_generate_and_unload_happy_path( "dlm.inference.backends.mlx_backend.stage_mlx_adapter_dir", lambda peft_adapter_dir, dst_dir, *, base_hf_id: staged_dir, ) + # Stub the post-load assertion: real `mlx_lm.load` produces a + # model with LoRA-wrapped layers, but here we hand back a + # placeholder string. Bypassing the assertion keeps the rest + # of the stubbed happy-path test intact. + monkeypatch.setattr( + "dlm.inference.backends.mlx_backend.assert_mlx_adapter_applied", + lambda model, *, expected_keys: None, + ) fake_mlx = ModuleType("mlx_lm") fake_mlx.load = lambda hf_id, adapter_path: ("model", "tokenizer") @@ -129,6 +146,46 @@ def test_load_generate_and_unload_happy_path( assert backend._tokenizer is None +class TestApplyChatTemplate: + """Mirrors PyTorch backend's `format_chat_prompt` for the MLX path.""" + + def test_renders_via_tokenizer_when_template_present(self) -> None: + captured: dict[str, object] = {} + + def _apply_chat_template_method( + messages: list[dict[str, str]], + *, + tokenize: bool, + add_generation_prompt: bool, + ) -> str: + captured["messages"] = messages + captured["tokenize"] = tokenize + captured["add_generation_prompt"] = add_generation_prompt + return "hello" + + tokenizer = SimpleNamespace( + chat_template="{{messages}}", + apply_chat_template=_apply_chat_template_method, + ) + out = _apply_chat_template(tokenizer, "hello") + assert out == "hello" + assert captured["messages"] == [{"role": "user", "content": "hello"}] + assert captured["tokenize"] is False + assert captured["add_generation_prompt"] is True + + def test_passes_through_when_no_chat_template(self) -> None: + tokenizer = SimpleNamespace(chat_template=None) + assert _apply_chat_template(tokenizer, "raw prompt") == "raw prompt" + + def test_passes_through_when_render_returns_non_string(self) -> None: + """Defensive: HF wrappers occasionally return token lists; fall back to raw.""" + tokenizer = SimpleNamespace( + chat_template="{{messages}}", + apply_chat_template=lambda *_a, **_k: [1, 2, 3], + ) + assert _apply_chat_template(tokenizer, "raw prompt") == "raw prompt" + + class TestStageMlxAdapterDir: def test_unreadable_adapter_config_raises_conversion_error(self, tmp_path: Path) -> None: adapter_dir = tmp_path / "adapter" diff --git 
a/tests/unit/inference/test_mlx_stage_adapter_dir.py b/tests/unit/inference/test_mlx_stage_adapter_dir.py index 794def8a..c5185c38 100644 --- a/tests/unit/inference/test_mlx_stage_adapter_dir.py +++ b/tests/unit/inference/test_mlx_stage_adapter_dir.py @@ -79,7 +79,10 @@ def test_writes_safetensors_and_translated_config( assert lp["rank"] == 8 assert lp["scale"] == pytest.approx(16 / 8) assert lp["dropout"] == pytest.approx(0.05) - assert lp["keys"] == ["q_proj", "v_proj"] + # Bare PEFT target_modules get qualified with the in-block FQN + # so mlx-lm's `linear_to_lora_layers` can match them. See + # `_qualify_target_module` in dlm.inference.mlx_adapter. + assert lp["keys"] == ["self_attn.q_proj", "self_attn.v_proj"] def test_tensor_keys_match_mlx_layout(self, tmp_path: Path, stub_num_layers: None) -> None: src = tmp_path / "peft"