11 changes: 11 additions & 0 deletions src/dlm/base_models/registry.py
@@ -337,6 +337,17 @@
size_gb_fp16=0.27,
context_length=8_192,
recommended_seq_len=1024,
capability_warning=(
"SmolLM2-135M is below dlm's empirical training floor. Audit 13 "
"follow-up findings 02 + 05 measured this base actively "
"degrading general-chat capability under every LoRA recipe "
"tested (PROSE-only, INSTRUCTION-only, mixed). Adapters "
"memorize trained content but fail to generalize and bleed "
"domain-specific tokens into unrelated queries. Suitable for "
"style-transfer demos and pipeline smoke tests; for any "
"specialty-knowledge task use a base ≥ 1B params (e.g. "
"smollm2-1.7b, qwen2.5-coder-1.5b, llama-3.2-1b)."
),
),
BaseModelSpec(
key="smollm2-360m",
8 changes: 8 additions & 0 deletions src/dlm/base_models/schema.py
@@ -149,6 +149,14 @@ class BaseModelSpec(BaseModel):
provenance_url: str | None = None
provenance_match_text: str | None = None

# Optional curated warning surfaced at `dlm train` time when this
# base is selected. Populate when the base has a known limitation
# that's not derivable from `params` / `architecture` alone — e.g.
# SmolLM2-135M's measured architectural floor (audit 13 follow-up
# findings 02 + 05: actively degrades base capability under any
# LoRA recipe). `None` means "no warning"; the empty string is
# rejected by `min_length=1` rather than treated as one.
capability_warning: str | None = Field(default=None, min_length=1)

# Modality + multi-modal preprocessing (schema v10 + v11, plus the
# additive `text-moe` discriminator).
# Text-family bases leave `modality in {"text", "text-moe"}`
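A quick sanity sketch of how the new field validates under pydantic v2 (`Spec` is a stand-in class, not the real `BaseModelSpec`):

```python
from pydantic import BaseModel, Field, ValidationError

class Spec(BaseModel):  # stand-in for BaseModelSpec
    capability_warning: str | None = Field(default=None, min_length=1)

Spec()                                  # ok: None means "no warning"
Spec(capability_warning="below floor")  # ok: printed at `dlm train` time
try:
    Spec(capability_warning="")         # min_length=1 rejects the empty string
except ValidationError:
    pass
```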
26 changes: 25 additions & 1 deletion src/dlm/cli/commands.py
@@ -853,6 +853,13 @@ def train_cmd(
"Acceptance will be persisted in the store manifest."
)
raise typer.Exit(code=1) from exc
# `getattr` so test fixtures stubbing `spec` as a `SimpleNamespace`
# without this field still pass; real registry entries always have it.
capability_warning = getattr(spec, "capability_warning", None)
if capability_warning:
console.print(
f"[yellow]warning:[/yellow] base [bold]{spec.key}[/bold]: {capability_warning}"
)
# Detect the DDP world_size set by `accelerate launch`
# (WORLD_SIZE env var) and thread it into the doctor so the plan's
# effective_batch_size reflects the rank count. Single-process
@@ -3575,28 +3582,45 @@ def cache_show_cmd(
cache = TokenizedCache.open(store.tokenized_cache_dir)
last = _queries.latest_tokenization(store.root)

# The tokenized cache is only populated for runs whose frontmatter
# declares `training.sources` (directive-sourced rows are where the
# tokenization cost dominates; in-body sections go through TRL's
# tokenizer). Surface this so an empty cache on an in-body-only doc
# doesn't look like a bug.
has_sources = parsed.frontmatter.training.sources is not None
cache_enabled = parsed.frontmatter.training.cache.enabled
if not has_sources:
cache_status: str | None = "not used (doc has no `training.sources` directive)"
elif not cache_enabled:
cache_status = "disabled (training.cache.enabled = false)"
else:
cache_status = None

payload: dict[str, object] = {
"dlm_id": parsed.frontmatter.dlm_id,
"cache_path": str(store.tokenized_cache_dir),
"entry_count": cache.entry_count,
"bytes": cache.total_bytes,
"last_run_hit_rate": last.hit_rate if last else None,
"last_run_id": last.run_id if last else None,
"cache_status": cache_status,
}
if json_out:
_sys.stdout.write(_json.dumps(payload, indent=2) + "\n")
return

out_console.print(f"[bold]Cache for {parsed.frontmatter.dlm_id}[/bold]")
out_console.print(f" path: {store.tokenized_cache_dir}")
if cache_status is not None:
out_console.print(f" status: [yellow]{cache_status}[/yellow]")
out_console.print(f" entries: {cache.entry_count}")
out_console.print(f" size: {_human_size(cache.total_bytes)}")
if last is not None:
out_console.print(
f" last-run hit rate: {last.hit_rate:.1%} "
f"({last.cache_hits}/{last.cache_hits + last.cache_misses})"
)
else:
elif cache_status is None:
out_console.print(" last-run hit rate: [dim]no tokenization runs yet[/dim]")


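For reference, a hypothetical `--json` payload for an in-body-only doc (all values illustrative) showing where the new `cache_status` key lands:

```python
payload = {
    "dlm_id": "doc-0000",  # hypothetical id
    "cache_path": "/store/doc-0000/tokenized",
    "entry_count": 0,
    "bytes": 0,
    "last_run_hit_rate": None,
    "last_run_id": None,
    "cache_status": "not used (doc has no `training.sources` directive)",
}
```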
21 changes: 18 additions & 3 deletions src/dlm/doc/migrate.py
@@ -97,7 +97,17 @@ def migrate_file(
# Validate post-migration dict against the current schema so a bad
# migrator can't silently smear garbage into the document.
fm = DlmFrontmatter.model_validate(migrated)
new_text = _rejoin(fm, body_text)
# Preserve the user's *originally explicit* fields across migration
# by collecting their dotted paths from the post-migration dict and
# passing them to the serializer as force-emit overrides. Without
# this, a v1 doc with `lora_r: 8` (matching the current schema
# default) would silently lose the explicit pin and inherit any
# future default change. The contract that `CLAUDE.md` calls
# "additive identity" is thus honored at the *intent* level, not
# just the behavior level.
from dlm.doc.serializer import collect_dict_field_paths

force_emit = collect_dict_field_paths(migrated)
new_text = _rejoin(fm, body_text, force_emit_paths=force_emit)

if dry_run:
return MigrationResult(
@@ -152,7 +162,12 @@ def _split_for_migrate(text: str, *, path: Path) -> tuple[str, str]:
)


def _rejoin(fm: DlmFrontmatter, body_text: str) -> str:
def _rejoin(
fm: DlmFrontmatter,
body_text: str,
*,
force_emit_paths: frozenset[tuple[str, ...]] | None = None,
) -> str:
"""Re-assemble a `.dlm` file from a migrated frontmatter + raw body.

Preserves the body verbatim (migration never touches section content);
@@ -165,7 +180,7 @@ def _rejoin(fm: DlmFrontmatter, body_text: str) -> str:
# section serialization by handing an empty sections tuple and
# concatenating the raw body manually.
empty = ParsedDlm(frontmatter=fm, sections=_empty_sections())
header = serialize(empty) # always ends with "\n"
header = serialize(empty, force_emit_paths=force_emit_paths) # always ends with "\n"

# Normalize leading/trailing whitespace on the body to match the
# canonical layout: exactly one blank line between `---\n` closer
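A minimal sketch of the force-emit flow, assuming a hypothetical v1 doc whose explicit `lora_r: 8` happens to equal the current schema default:

```python
from dlm.doc.serializer import collect_dict_field_paths

migrated = {"dlm_id": "doc-0000", "training": {"lora_r": 8}}  # hypothetical
force_emit = collect_dict_field_paths(migrated)
assert ("training", "lora_r") in force_emit

# _rejoin threads this set into serialize(), so the re-emitted
# frontmatter keeps `lora_r: 8` even though it matches the default;
# a future default bump to 16 can no longer silently change the doc.
```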
97 changes: 87 additions & 10 deletions src/dlm/doc/serializer.py
@@ -34,12 +34,23 @@
)


def serialize(parsed: ParsedDlm) -> str:
def serialize(
parsed: ParsedDlm,
*,
force_emit_paths: frozenset[tuple[str, ...]] | None = None,
) -> str:
"""Produce canonical `.dlm` text for `parsed`.

Always ends with `\\n`.

`force_emit_paths` is consulted by `_emit_nested_mapping` — a field
whose dotted path appears in the set is emitted even when its value
matches the schema default. Used by the migrate pipeline to
preserve user-explicit fields across schema-default drift (so a
user who pinned `lora_r: 8` doesn't silently inherit a future
`lora_r: 16` default after migration).
"""
parts: list[str] = [_serialize_frontmatter(parsed.frontmatter), "\n"]
parts: list[str] = [_serialize_frontmatter(parsed.frontmatter, force_emit_paths), "\n"]
for i, section in enumerate(parsed.sections):
if i > 0:
parts.append("\n")
@@ -50,10 +61,41 @@ def serialize(parsed: ParsedDlm) -> str:
return rendered


def collect_dict_field_paths(d: object, prefix: tuple[str, ...] = ()) -> frozenset[tuple[str, ...]]:
"""Walk a parsed-YAML dict and return every nested leaf-or-mapping path.

Used by the migrate pipeline: the set of paths present in the
user's original frontmatter (after migration runs) is the set of
fields the serializer must emit even when they match defaults.
Mappings *and* leaves are both included so intermediate blocks
survive re-emission.
"""
paths: set[tuple[str, ...]] = set()
if isinstance(d, dict):
for k, v in d.items():
if not isinstance(k, str):
continue
here = (*prefix, k)
paths.add(here)
if isinstance(v, dict):
paths.update(collect_dict_field_paths(v, here))
elif isinstance(v, list):
# List of mappings (e.g. training.sources) — each item
# contributes paths under the same parent key, since
# we serialize positional list entries together.
for item in v:
if isinstance(item, dict):
paths.update(collect_dict_field_paths(item, here))
return frozenset(paths)
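A worked example (frontmatter content hypothetical): mappings and leaves both contribute paths, and list items contribute under the parent key:

```python
d = {"training": {"lora_r": 8, "sources": [{"path": "notes.md"}]}}
assert collect_dict_field_paths(d) == frozenset({
    ("training",),
    ("training", "lora_r"),
    ("training", "sources"),
    ("training", "sources", "path"),
})
```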


# --- frontmatter --------------------------------------------------------------


def _serialize_frontmatter(fm: DlmFrontmatter) -> str:
def _serialize_frontmatter(
fm: DlmFrontmatter,
force_emit_paths: frozenset[tuple[str, ...]] | None = None,
) -> str:
lines: list[str] = ["---"]
for key in _FRONTMATTER_ORDER:
value = getattr(fm, key, None)
@@ -63,7 +105,9 @@ def _serialize_frontmatter(fm: DlmFrontmatter) -> str:
lines.extend(_emit_block_scalar(key, value))
continue
if isinstance(value, TrainingConfig | ExportConfig):
nested = _emit_nested_mapping(value, indent=2)
nested = _emit_nested_mapping(
value, indent=2, path=(key,), force_emit_paths=force_emit_paths
)
if not nested:
# All-default nested block — skip the header too so we
# don't emit an empty `training:` line.
@@ -76,7 +120,13 @@ def _serialize_frontmatter(fm: DlmFrontmatter) -> str:
return "\n".join(lines) + "\n"


def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]:
def _emit_nested_mapping(
model: BaseModel,
*,
indent: int,
path: tuple[str, ...] = (),
force_emit_paths: frozenset[tuple[str, ...]] | None = None,
) -> list[str]:
"""Emit a nested training/export/dpo block.

Suppress fields that equal their schema default so
@@ -87,6 +137,11 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]:

Nested `BaseModel` values (e.g. `TrainingConfig.preference`)
recurse with deeper indent; all-default sub-blocks are skipped.

`force_emit_paths` overrides the default-suppression rule for any
field whose dotted path appears in the set. Used by the migrate
pipeline to preserve user-explicit fields across schema-default
drift.
"""
pad = " " * indent
lines: list[str] = []
@@ -99,16 +154,28 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]:

for field_name, field_info in model.__class__.model_fields.items():
value = getattr(model, field_name)
if field_info.default is not PydanticUndefined and value == field_info.default:
field_path = (*path, field_name)
forced = force_emit_paths is not None and field_path in force_emit_paths
if (
not forced
and field_info.default is not PydanticUndefined
and value == field_info.default
):
continue
if (
field_info.default is PydanticUndefined
not forced
and field_info.default is PydanticUndefined
and field_info.default_factory is not None
and value == field_info.default_factory() # type: ignore[call-arg]
):
continue
if isinstance(value, BaseModel):
nested = _emit_nested_mapping(value, indent=indent + 2)
nested = _emit_nested_mapping(
value,
indent=indent + 2,
path=field_path,
force_emit_paths=force_emit_paths,
)
if not nested:
continue
lines.append(f"{pad}{field_name}:")
@@ -125,7 +192,12 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]:
lines.append(f"{pad}{field_name}:")
for k, v in value.items():
lines.append(f"{pad} {k}:")
nested = _emit_nested_mapping(v, indent=indent + 4)
nested = _emit_nested_mapping(
v,
indent=indent + 4,
path=(*field_path, k),
force_emit_paths=force_emit_paths,
)
if nested:
lines.extend(nested)
else:
@@ -144,7 +216,12 @@ def _emit_nested_mapping(model: BaseModel, *, indent: int) -> list[str]:
# fields indent aligned.
lines.append(f"{pad}{field_name}:")
for item in value:
nested = _emit_nested_mapping(item, indent=indent + 4)
nested = _emit_nested_mapping(
item,
indent=indent + 4,
path=field_path,
force_emit_paths=force_emit_paths,
)
if not nested:
lines.append(f"{pad} - {{}}")
continue
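Behavior sketch for the override rule (model and field names hypothetical; assumes the scalar-emission branch not shown in this hunk emits `f"{pad}{field_name}: {value}"`):

```python
from pydantic import BaseModel

class Train(BaseModel):  # hypothetical nested block
    lora_r: int = 8

_emit_nested_mapping(Train(), indent=2)  # -> [] (all-default, suppressed)
_emit_nested_mapping(
    Train(),
    indent=2,
    path=("training",),
    force_emit_paths=frozenset({("training", "lora_r")}),
)  # -> ["  lora_r: 8"]
```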
10 changes: 8 additions & 2 deletions src/dlm/export/ollama/modelfile_shared.py
@@ -129,8 +129,14 @@ def build_param_lines(
if num_ctx is not None:
lines.append(f"PARAMETER num_ctx {num_ctx}")
if draft_model is not None:
lines.append(f"# Speculative decoding: `ollama pull {draft_model}` first.")
lines.append(f"PARAMETER draft_model {draft_model}")
# `draft_model` is not a valid Modelfile PARAMETER directive
# (Ollama rejects `ollama create` with "unknown parameter
# 'draft_model'"). It's a runtime option exposed via the
# `OLLAMA_DRAFT_MODEL` env var or the API's `options.draft_model`
# field. Document the suggested pairing as a comment so users
# can wire it up without breaking their `ollama create`.
lines.append(f"# Speculative-decoding draft: `ollama pull {draft_model}`")
lines.append(f"# then run with `OLLAMA_DRAFT_MODEL={draft_model} ollama run <this-model>`")
return lines


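With a hypothetical draft-model name, the tail appended by `build_param_lines` is now comment-only, with no `PARAMETER` directive for `ollama create` to reject:

```python
draft_model = "qwen2.5:0.5b"  # hypothetical pairing
appended = [
    f"# Speculative-decoding draft: `ollama pull {draft_model}`",
    f"# then run with `OLLAMA_DRAFT_MODEL={draft_model} ollama run <this-model>`",
]
```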
47 changes: 35 additions & 12 deletions src/dlm/export/preflight.py
@@ -98,8 +98,11 @@ def check_tokenizer_vocab(adapter_dir: Path) -> int:
detail=f"cannot parse {cfg_path}: {exc}",
) from exc

# `vocab_size` key isn't always present in tokenizer_config.json;
# fall back to the companion tokenizer.json which always carries it.
# `vocab_size` key isn't always present in tokenizer_config.json
# (Qwen2.5+, Llama-3.x omit it); fall back to summing the BPE base
# plus the explicit `added_tokens` array in tokenizer.json. This
# matches `len(transformers.AutoTokenizer.from_pretrained(...))` —
# the count the model actually addresses at inference time.
vocab_size = cfg.get("vocab_size")
if not isinstance(vocab_size, int):
tokenizer_json = adapter_dir / "tokenizer.json"
@@ -113,8 +116,9 @@
) from exc
model = t.get("model") or {}
vocab = model.get("vocab")
added = t.get("added_tokens") or []
if isinstance(vocab, dict):
vocab_size = len(vocab)
vocab_size = len(vocab) + (len(added) if isinstance(added, list) else 0)
if not isinstance(vocab_size, int) or vocab_size <= 0:
raise PreflightError(
probe="tokenizer_vocab",
@@ -133,6 +137,10 @@ def check_chat_template(adapter_dir: Path, *, required: bool = True) -> None:
`--no-template` on the CLI sets `required=False`; the default
requires one because the Modelfile emitter hardcodes
`TEMPLATE "..."` which needs source text.

Modern HF tokenizers (Qwen2.5+, Llama-3.x) write the template to
a sibling `chat_template.jinja` file rather than inlining it in
`tokenizer_config.json`. Check both locations.
"""
if not required:
return
@@ -150,15 +158,30 @@
detail=f"cannot parse {cfg_path}: {exc}",
) from exc
template = cfg.get("chat_template")
if not template or not str(template).strip():
raise PreflightError(
probe="chat_template",
detail=(
"tokenizer has no chat_template. Pass --no-template to skip "
"this check (Modelfile emission will fall back to the base "
"model's default), or attach a template via frontmatter."
),
)
if template and str(template).strip():
return

sibling_path = adapter_dir / "chat_template.jinja"
if sibling_path.exists():
try:
sibling_template = sibling_path.read_text(encoding="utf-8")
except OSError as exc:
raise PreflightError(
probe="chat_template",
detail=f"cannot read {sibling_path}: {exc}",
) from exc
if sibling_template.strip():
return

raise PreflightError(
probe="chat_template",
detail=(
"tokenizer has no chat_template (checked tokenizer_config.json "
"and chat_template.jinja). Pass --no-template to skip "
"this check (Modelfile emission will fall back to the base "
"model's default), or attach a template via frontmatter."
),
)


def check_pretokenizer_fingerprint(spec: BaseModelSpec) -> None:
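A hypothetical adapter layout that now passes the probe without `--no-template`:

```python
# adapter/
#   tokenizer_config.json   <- no "chat_template" key (Llama-3.x style)
#   chat_template.jinja     <- non-empty template found here instead
from pathlib import Path

adapter_dir = Path("adapter")     # illustrative path
check_chat_template(adapter_dir)  # returns None instead of raising
```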