diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f5565b..ca6e528 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -47,6 +47,15 @@ jobs: # tier-1 + tier-2 + tier-3 repo. run: make check-docs-prose + - name: docs frontmatter gate (docs-discoverability Phase 1) + # Phase 1 P1.3 + P1.3b: validate prose-doc frontmatter under + # docs/ against profile/docs.schema.json. --warn-only per spec + # §10 — surfaces issues without failing CI while the org + # backfills frontmatter across the corpus. Drops --warn-only at + # Phase 2. The tier-1+2 repos get this same step (P1.5) in + # their own sessions. + run: make check-docs + - name: Validate catalog (schema-strict) # Real jsonschema validation against tools.schema.json + # task_index.schema.json. Replaces the prior parse-only gate. diff --git a/Makefile b/Makefile index 0ccf430..d38de7a 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness check-links check-licenses check-schema-compat +.PHONY: catalog validate-catalog check-catalog check-repo-meta phase0-smoke check-docs-prose recipes-check handshake check-freshness check-links check-licenses check-schema-compat check-docs # Phase-1 Track B's generator. Fetches each TIER_1+TIER_2+TIER_3 repo's # dist/repo.meta.json, validates it, translates it into a `tools.` @@ -133,3 +133,13 @@ check-licenses: # true→false) landed without a schema_compat bump. check-schema-compat: python3 profile/build/check-schema-compat.py + +# docs-discoverability Phase 1 (P1.3): validate prose-doc frontmatter +# under docs/ against profile/docs.schema.json. Phase 1 invocation is +# --warn-only per docs-discoverability-spec.md §10 — surfaces issues +# without failing CI while the org backfills frontmatter across the +# corpus. Phase 2 will drop --warn-only; Phase 3 adds the cross-repo +# walker. 
This target is the .github repo's self-check; the tier-1+2 +# repos get their own `make check-docs` in P1.4 sessions. +check-docs: + python3 profile/build/validate-docs.py --docs-root docs --warn-only diff --git a/docs/docs-discoverability/phases-tracker.md b/docs/docs-discoverability/phases-tracker.md index a1d4b14..22364e7 100644 --- a/docs/docs-discoverability/phases-tracker.md +++ b/docs/docs-discoverability/phases-tracker.md @@ -1,7 +1,7 @@ --- created: 2026-05-11 -last_modified: 2026-05-11 -revisions: 2 +last_modified: 2026-05-12 +revisions: 3 doc_type: [STATUS] lifecycle: active owner: rmrich5 @@ -29,7 +29,7 @@ one of the legal values. | Phase | Theme | Target | Status | Reference | |---|---|---|---|---| | 0 | Vocabulary, frontmatter, indexes, acceptance | 2026-05-11 | done | [§10 P0](docs-discoverability-spec.md#phase-0--done-2026-05-11) | -| 1 | Schema + warn-only CI | 2026-Q2 weeks 1–4 | not-started | [§10 P1](docs-discoverability-spec.md#phase-1--schema--warn-only-ci-target-2026-q2-weeks-14) | +| 1 | Schema + warn-only CI | 2026-Q2 weeks 1–4 | in-progress | [§10 P1](docs-discoverability-spec.md#phase-1--schema--warn-only-ci-target-2026-q2-weeks-14) | | 2 | Block on new docs; first remediations | 2026-Q2 weeks 5–8 | not-started | [§10 P2](docs-discoverability-spec.md#phase-2--block-on-new-docs-target-2026-q2-weeks-58) | | 3 | Block everywhere; org catalog | 2026-Q3 | not-started | [§10 P3](docs-discoverability-spec.md#phase-3--block-everywhere-org-catalog-target-2026-q3) | | 4 | Cross-repo + freshness | 2026-Q4 | not-started | [§10 P4](docs-discoverability-spec.md#phase-4--cross-repo-and-freshness-target-2026-q4) | @@ -57,16 +57,18 @@ Target: **2026-Q2 weeks 1–4**. Owner default: rmrich5 unless reassigned. 
| ID | Item | Status | Target | Blocked-by | Notes | |---|---|---|---|---|---| -| P1.1 | Land `profile/docs.schema.json` | not-started | Q2 wk 1 | — | Sibling of repo.meta.schema.json | -| P1.2 | Extend `profile/repo.meta.schema.json` with `docs.generated_paths` array | not-started | Q2 wk 1 | — | Enables §5.1 exclusion rule | -| P1.3 | Land `profile/build/validate-docs.py` + tests | not-started | Q2 wk 2 | P1.1, P1.2 | Copy pattern from validate-catalog.py | +| P1.1 | Land `profile/docs.schema.json` | done | 2026-05-12 | — | Sibling of repo.meta.schema.json; two variants (prose vs generated) gated on the `generated` field via if/then | +| P1.2 | Extend `profile/repo.meta.schema.json` with `docs.generated_paths` array | done | 2026-05-12 | — | Enables §5.1 exclusion rule; bidirectional check (path↔marker) implemented in P1.3 | +| P1.3 | Land `profile/build/validate-docs.py` + tests | done | 2026-05-12 | P1.1, P1.2 | TDD'd against 13 cases; covers spec §9 checks 0–4. Checks 5–8 (git-derived field drift, README existence, orphan/dangling refs) are Phase-1 follow-up | +| P1.3b | `.github` repo's own `make check-docs` target (self-check) | done | 2026-05-12 | P1.3 | Bonus: the meta-repo dogfoods the validator. Warn-only against `.github/docs/`; surfaces 21 issues (`.github` was outside Phase 0's 7-repo backfill) | | P1.4 | Add `make check-docs` target to each of the 7 repos | not-started | Q2 wk 2 | P1.3 | One PR per repo | | P1.5 | Wire CI step into per-repo workflows (warn-only) | not-started | Q2 wk 3 | P1.4 | Don't block PRs yet | -| P1.6 | Backfill `lifecycle: active` across all 108 docs | not-started | Q2 wk 3 | P1.1 | Default value; one-time pass | +| P1.6 | Backfill `lifecycle: active` across all 108 docs | not-started | Q2 wk 3 | P1.1 | Default value; one-time pass. 
Plus ~21 docs in `.github` itself | | P1.7 | Backfill `owner:` across all 108 docs | not-started | Q2 wk 3 | P1.1 | From `git shortlog -sn` per repo | | P1.8 | Add `generated: true` to m-stdlib `docs/modules/std*.md` | not-started | Q2 wk 4 | P1.2 | Done by `make manifest` regeneration | | P1.9 | Declare m-stdlib's `docs/modules/` in `docs.generated_paths` | not-started | Q2 wk 4 | P1.2 | One-line edit to repo.meta.json | | P1.10 | Run weekly cron once with warn-only CI; review noise | not-started | end of Q2 wk 4 | P1.5 | Decision gate before Phase 2 | +| P1.11 | Implement spec §9 checks 5–8 (git-derived drift; README existence; orphan/dangling refs) | not-started | Q2 wk 3 | P1.3 | Validator-side follow-up; lands as a second PR to `.github` | --- @@ -77,15 +79,15 @@ Each check lands in the indicated phase. | # | Check | Phase | Status | Implementation note | |---|---|---|---|---| -| 0 | Generated-doc gate (skip if `generated: true`) | 1 | not-started | Hard short-circuit; first check evaluated | -| 1 | Frontmatter present | 1 | not-started | YAML block at top | -| 2 | Required keys present | 1 | not-started | created, last_modified, revisions, doc_type, lifecycle | -| 3 | `doc_type` values valid | 1 | not-started | From the 23-vocab in docs.schema.json | -| 4 | `lifecycle` value valid | 1 | not-started | One of 5 states | -| 5 | created/last_modified/revisions match git | 1 | not-started | Auto-fixable; tooling regenerates | -| 6 | `docs/README.md` exists | 1 | not-started | Per-repo gate | -| 7 | No orphans (every `.md` in index) | 1 | not-started | Bidirectional check | -| 8 | No dangling refs in `README.md` | 1 | not-started | Local link resolution | +| 0 | Generated-doc gate (skip if `generated: true`) | 1 | done | `docs.schema.json` if/then short-circuit + path↔marker bidirectional check in validate-docs.py | +| 1 | Frontmatter present | 1 | done | `parse_frontmatter` returns None → "missing YAML frontmatter block" error | +| 2 | Required keys present | 1 | 
done | Schema `required: [created, last_modified, revisions, doc_type, lifecycle]` | +| 3 | `doc_type` values valid | 1 | done | Schema enum (23 entries) | +| 4 | `lifecycle` value valid | 1 | done | Schema enum (5 states) | +| 5 | created/last_modified/revisions match git | 1 | not-started | Auto-fixable; tooling regenerates. Tracked as P1.11 | +| 6 | `docs/README.md` exists | 1 | not-started | Per-repo gate. Tracked as P1.11 | +| 7 | No orphans (every `.md` in index) | 1 | not-started | Bidirectional check. Tracked as P1.11 | +| 8 | No dangling refs in `README.md` | 1 | not-started | Local link resolution. Tracked as P1.11 | | 9 | Filename matches doc_type | 2 | not-started | Per spec §7 table | | 10 | Filename content-derived (no `and`, kebab-case) | 2 | not-started | | | 11 | Required H2 sections per doc_type | 2 | not-started | Per spec §8 table | diff --git a/profile/build/test_validate_docs.py b/profile/build/test_validate_docs.py new file mode 100644 index 0000000..fce40be --- /dev/null +++ b/profile/build/test_validate_docs.py @@ -0,0 +1,301 @@ +"""Tests for validate-docs.py — Phase 1 Track A (P1.3). + +Mirrors the test_validate_catalog.py pattern: load the hyphen-named +script via importlib, exercise the validator's typed API on synthetic +docs/ trees under tmp_path, assert the issues list shape. + +The validator's surface is ``walk(docs_root, repo_meta_path=None, +schema_path=None) -> list[Issue]`` where each ``Issue`` is a +namedtuple-ish ``(severity, path, message)``. Empty list = clean. 
+""" + +from __future__ import annotations + +import importlib.util +import json +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[2] +PROFILE = REPO_ROOT / "profile" +BUILD = PROFILE / "build" + +_validator_path = BUILD / "validate-docs.py" +_spec = importlib.util.spec_from_file_location("_validate_docs", _validator_path) +_validate_docs = importlib.util.module_from_spec(_spec) +assert _spec and _spec.loader +_spec.loader.exec_module(_validate_docs) + + +# ---------------------------------------------------------------- helpers --- + + +def _write_doc(path: Path, frontmatter: dict | None, body: str = "# Body\n") -> None: + """Write a markdown file with optional YAML frontmatter. + + Uses minimal YAML emission (str/int/list scalars + flow-style lists) + matching what the production corpus emits. Avoids the PyYAML + dependency in tests. + """ + path.parent.mkdir(parents=True, exist_ok=True) + parts: list[str] = [] + if frontmatter is not None: + parts.append("---\n") + for key, value in frontmatter.items(): + parts.append(f"{key}: {_yaml_scalar(value)}\n") + parts.append("---\n") + parts.append(body) + path.write_text("".join(parts), encoding="utf-8") + + +def _yaml_scalar(value) -> str: + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, int): + return str(value) + if isinstance(value, list): + return "[" + ", ".join(_yaml_scalar(v) for v in value) + "]" + return str(value) + + +def _write_repo_meta(path: Path, docs_block: dict | None = None) -> None: + """Write a minimal repo.meta.json. 
# ---------------------------------------------------------------- happy ---


def test_clean_prose_doc(tmp_path):
    """Two fully populated prose docs (a guide and a README) yield no issues."""
    docs_root = tmp_path / "docs"
    guide_frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 3,
        "doc_type": ["GUIDE"],
        "lifecycle": "active",
    }
    readme_frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 1,
        "doc_type": ["REFERENCE"],
        "lifecycle": "active",
    }
    _write_doc(docs_root / "good-guide.md", guide_frontmatter)
    _write_doc(docs_root / "README.md", readme_frontmatter)

    assert _validate_docs.walk(docs_root) == []


def test_clean_generated_doc(tmp_path):
    """A doc with `generated: true` declared in docs.generated_paths
    passes with just `generated` + `last_modified`."""
    docs_root = tmp_path / "docs"
    readme_frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 1,
        "doc_type": ["REFERENCE"],
        "lifecycle": "active",
    }
    generated_frontmatter = {"generated": True, "last_modified": "2026-05-10"}
    _write_doc(docs_root / "README.md", readme_frontmatter)
    _write_doc(docs_root / "modules" / "stdjson.md", generated_frontmatter)

    meta_file = tmp_path / "repo.meta.json"
    _write_repo_meta(meta_file, docs_block={"generated_paths": ["docs/modules/*.md"]})

    found = _validate_docs.walk(docs_root, repo_meta_path=meta_file)
    assert found == []


# ---------------------------------------------------------------- failures ---


def test_missing_frontmatter(tmp_path):
    """A file with no frontmatter block yields exactly one error naming it."""
    docs_root = tmp_path / "docs"
    _write_doc(docs_root / "bare.md", frontmatter=None)

    found = _validate_docs.walk(docs_root)

    assert len(found) == 1
    problem = found[0]
    assert problem.severity == "error"
    assert "frontmatter" in problem.message.lower()
    assert problem.path.name == "bare.md"
def test_missing_required_key(tmp_path):
    """Leaving out `doc_type` yields a single error that names the key."""
    docs_root = tmp_path / "docs"
    frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 3,
        "lifecycle": "active",
        # doc_type intentionally absent
    }
    _write_doc(docs_root / "no-doctype.md", frontmatter)

    found = _validate_docs.walk(docs_root)

    assert len(found) == 1
    problem = found[0]
    assert problem.severity == "error"
    assert "doc_type" in problem.message


def test_invalid_doc_type_enum(tmp_path):
    """A `doc_type` entry outside the vocabulary is reported by value."""
    docs_root = tmp_path / "docs"
    frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 3,
        "doc_type": ["FAKE-TYPE"],
        "lifecycle": "active",
    }
    _write_doc(docs_root / "weird.md", frontmatter)

    found = _validate_docs.walk(docs_root)

    assert len(found) == 1
    problem = found[0]
    assert problem.severity == "error"
    assert "FAKE-TYPE" in problem.message


def test_invalid_lifecycle_enum(tmp_path):
    """A `lifecycle` value outside the allowed states is reported by value."""
    docs_root = tmp_path / "docs"
    frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 3,
        "doc_type": ["GUIDE"],
        "lifecycle": "bogus",
    }
    _write_doc(docs_root / "bad-lifecycle.md", frontmatter)

    found = _validate_docs.walk(docs_root)

    assert len(found) == 1
    problem = found[0]
    assert problem.severity == "error"
    assert "bogus" in problem.message


def test_generated_marker_without_repo_meta_declaration(tmp_path):
    """A file with `generated: true` whose path isn't declared in
    docs.generated_paths is suspicious (typo / accidental hiding)."""
    docs_root = tmp_path / "docs"
    _write_doc(
        docs_root / "hand-written.md",
        {"generated": True, "last_modified": "2026-05-10"},
    )
    meta_file = tmp_path / "repo.meta.json"
    _write_repo_meta(meta_file, docs_block={"generated_paths": []})

    found = _validate_docs.walk(docs_root, repo_meta_path=meta_file)

    assert len(found) == 1
    problem = found[0]
    assert problem.severity == "error"
    assert "generated" in problem.message.lower()
    assert "docs.generated_paths" in problem.message


def test_path_declared_generated_but_marker_missing(tmp_path):
    """A file whose path matches docs.generated_paths but lacks
    `generated: true` is suspicious (stale glob / hand-edit on a
    generated doc)."""
    docs_root = tmp_path / "docs"
    frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 1,
        "doc_type": ["REFERENCE"],
        "lifecycle": "active",
    }
    _write_doc(docs_root / "modules" / "stdjson.md", frontmatter)
    meta_file = tmp_path / "repo.meta.json"
    _write_repo_meta(meta_file, docs_block={"generated_paths": ["docs/modules/*.md"]})

    found = _validate_docs.walk(docs_root, repo_meta_path=meta_file)

    assert len(found) == 1
    assert "generated" in found[0].message.lower()


def test_recipes_subdir_skipped(tmp_path):
    """Files under docs/recipes/ follow recipe.schema.json, not
    docs.schema.json — validate-docs.py leaves them to recipes-check.
    README.md inside recipes/ is still treated as a normal doc."""
    recipes_dir = tmp_path / "docs" / "recipes"
    recipe_frontmatter = {
        "id": "recipe:some-recipe",
        "title": "Some recipe",
        "intent": "Test fixture",
        "verified_on": "2026-05-12",
        "ci_verifiable": True,
    }
    index_frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 1,
        "doc_type": ["REFERENCE"],
        "lifecycle": "active",
    }
    _write_doc(recipes_dir / "some-recipe.md", recipe_frontmatter)
    _write_doc(recipes_dir / "README.md", index_frontmatter)

    found = _validate_docs.walk(tmp_path / "docs")

    # The recipe is skipped; the README is validated and clean.
    assert found == []
# ---------------------------------------------------------------- CLI ---


def test_main_clean_exits_zero(tmp_path):
    """A docs tree holding one valid prose doc makes the CLI return 0."""
    docs_root = tmp_path / "docs"
    frontmatter = {
        "created": "2026-01-01",
        "last_modified": "2026-05-01",
        "revisions": 1,
        "doc_type": ["GUIDE"],
        "lifecycle": "active",
    }
    _write_doc(docs_root / "good.md", frontmatter)

    argv = ["--docs-root", str(docs_root)]
    exit_code = _validate_docs.main(argv)

    assert exit_code == 0


def test_main_error_exits_nonzero(tmp_path):
    """Without --warn-only, a doc lacking frontmatter makes the CLI return 1."""
    docs_root = tmp_path / "docs"
    _write_doc(docs_root / "bare.md", frontmatter=None)

    argv = ["--docs-root", str(docs_root)]
    exit_code = _validate_docs.main(argv)

    assert exit_code == 1


def test_main_warn_only_exits_zero_on_error(tmp_path):
    """--warn-only converts errors to warnings; rc=0 even with issues.
    Phase 1 default per docs-discoverability-spec.md §10 Phase 1."""
    docs_root = tmp_path / "docs"
    _write_doc(docs_root / "bare.md", frontmatter=None)

    argv = ["--docs-root", str(docs_root), "--warn-only"]
    exit_code = _validate_docs.main(argv)

    assert exit_code == 0


def test_main_missing_docs_root_exits_nonzero(tmp_path):
    """Pointing --docs-root at a directory that does not exist returns 2."""
    missing_root = tmp_path / "nope"

    exit_code = _validate_docs.main(["--docs-root", str(missing_root)])

    assert exit_code == 2
+ +Sibling of validate-catalog.py / validate-repo-meta.py / check-freshness.py. + +Phase 1 default invocation (per spec §10): + + validate-docs.py --warn-only --docs-root docs/ --repo-meta dist/repo.meta.json + +Exit codes: + + 0 — no issues, or only issues seen under --warn-only + 1 — at least one error (and --warn-only not set) + 2 — docs-root or schema missing / unreadable +""" + +from __future__ import annotations + +import argparse +import fnmatch +import json +import re +import sys +from pathlib import Path +from typing import NamedTuple + +REPO_ROOT = Path(__file__).resolve().parents[2] +PROFILE = REPO_ROOT / "profile" +DEFAULT_DOCS_ROOT = REPO_ROOT / "docs" +DEFAULT_SCHEMA = PROFILE / "docs.schema.json" + +# Subtrees with their own schema / their own validator. Hardcoded for +# Phase 1 — when other "out-of-scope" subtrees emerge (e.g. an MCP +# manifest under docs/mcp/), promote this to a config field in +# repo.meta.json:docs. +SKIP_SUBDIRS = ("recipes",) + + +class Issue(NamedTuple): + severity: str # "error" | "warn" + path: Path + message: str + + +# ---------------------------------------------------------- frontmatter --- + +_FRONTMATTER_RE = re.compile(r"\A---\s*\n(.*?\n)---\s*\n", re.DOTALL) +# A YAML scalar key: alnum + underscore + hyphen, followed by `:` and a +# value. Greedy through the rest of the line. +_KV_RE = re.compile(r"^([A-Za-z_][A-Za-z0-9_-]*)\s*:\s*(.*?)\s*$") + + +def _strip_quotes(s: str) -> str: + if len(s) >= 2 and s[0] == s[-1] and s[0] in ('"', "'"): + return s[1:-1] + return s + + +def _parse_value(raw: str): + """Parse a single YAML scalar or flow-style list. Minimal — handles + the patterns our corpus uses (string / int / bool / [A, B, C]). 
+ """ + raw = raw.strip() + if raw == "": + return "" + if raw.startswith("[") and raw.endswith("]"): + inner = raw[1:-1].strip() + if inner == "": + return [] + return [_parse_value(item) for item in _split_flow_list(inner)] + if raw in ("true", "True"): + return True + if raw in ("false", "False"): + return False + # Integer (no leading +, no underscores) + if re.fullmatch(r"-?[0-9]+", raw): + return int(raw) + return _strip_quotes(raw) + + +def _split_flow_list(inner: str) -> list[str]: + """Split `A, B, "C, D"` on commas not inside quotes. Quoted strings + with embedded commas are kept whole.""" + parts: list[str] = [] + buf: list[str] = [] + quote: str | None = None + for ch in inner: + if quote: + buf.append(ch) + if ch == quote: + quote = None + elif ch in ('"', "'"): + quote = ch + buf.append(ch) + elif ch == ",": + parts.append("".join(buf).strip()) + buf = [] + else: + buf.append(ch) + if buf: + parts.append("".join(buf).strip()) + return parts + + +def parse_frontmatter(text: str) -> dict | None: + """Return the parsed frontmatter dict, or None if no block is present. + + Returns an empty dict if the block exists but contains no parseable + key-value pairs (distinct from None — the file *has* a delimited + block, just an empty one). + """ + match = _FRONTMATTER_RE.match(text) + if not match: + return None + block = match.group(1) + data: dict = {} + for raw_line in block.splitlines(): + line = raw_line.rstrip() + if line == "" or line.lstrip().startswith("#"): + continue + kv = _KV_RE.match(line) + if not kv: + # Block-style continuation lines (e.g. nested list items + # starting with ` -`) are silently skipped in this minimal + # parser. The corpus's prose-doc frontmatter is flow-style + # only; if a doc starts using block-style we'll see the + # required-key check fire and surface the gap. 
+ continue + key, raw_value = kv.group(1), kv.group(2) + data[key] = _parse_value(raw_value) + return data + + +# ---------------------------------------------------------- core walk --- + + +def _load_json(path: Path) -> dict: + return json.loads(Path(path).read_text(encoding="utf-8")) + + +def _generated_globs(repo_meta_path: Path | None) -> list[str]: + if repo_meta_path is None: + return [] + try: + meta = _load_json(repo_meta_path) + except (OSError, json.JSONDecodeError): + return [] + docs_block = meta.get("docs") or {} + return list(docs_block.get("generated_paths") or []) + + +def _path_matches_globs(rel: str, globs: list[str]) -> bool: + return any(fnmatch.fnmatch(rel, g) for g in globs) + + +def _validate_one( + path: Path, + rel_path: str, + text: str, + schema: dict, + generated_globs: list[str], +) -> list[Issue]: + issues: list[Issue] = [] + frontmatter = parse_frontmatter(text) + is_generated_by_path = _path_matches_globs(rel_path, generated_globs) + is_generated_by_marker = ( + isinstance(frontmatter, dict) and frontmatter.get("generated") is True + ) + + if frontmatter is None: + issues.append(Issue("error", path, "missing YAML frontmatter block")) + # If the path was declared generated, also flag that the marker + # is absent — same check #6 below, surfaced even on a bare file. + if is_generated_by_path: + issues.append( + Issue( + "error", + path, + "path matches docs.generated_paths but `generated: true` is missing", + ) + ) + return issues + + # Generated-doc bidirectional declaration check (§5.1). + if is_generated_by_marker and not is_generated_by_path: + issues.append( + Issue( + "error", + path, + "frontmatter has `generated: true` but path is not declared in docs.generated_paths", + ) + ) + if is_generated_by_path and not is_generated_by_marker: + issues.append( + Issue( + "error", + path, + "path matches docs.generated_paths but frontmatter lacks `generated: true`", + ) + ) + + # Defer to the JSON-Schema validator for everything else. 
The schema + # itself handles the two variants (prose vs generated) via if/then. + try: + from jsonschema import Draft202012Validator + except ImportError as exc: + issues.append( + Issue( + "error", + path, + f"jsonschema is required (pip install jsonschema): {exc}", + ) + ) + return issues + + validator = Draft202012Validator(schema) + for err in sorted(validator.iter_errors(frontmatter), key=lambda e: list(e.absolute_path)): + loc = "/".join(str(p) for p in err.absolute_path) or "" + issues.append(Issue("error", path, f"at {loc}: {err.message}")) + + return issues + + +def walk( + docs_root: Path, + repo_meta_path: Path | None = None, + schema_path: Path | None = None, +) -> list[Issue]: + """Walk ``docs_root`` and validate every ``.md`` file outside the + skipped subdirs. Returns a list of Issue tuples; empty = clean. + + Raises FileNotFoundError if docs_root or schema doesn't exist — the + CLI catches and exits 2. The tests catch it via main()'s rc=2. + """ + docs_root = Path(docs_root) + if not docs_root.is_dir(): + raise FileNotFoundError(f"docs root not found: {docs_root}") + schema_path = Path(schema_path) if schema_path else DEFAULT_SCHEMA + schema = _load_json(schema_path) + generated_globs = _generated_globs(repo_meta_path) + + # Repo root is always the parent of docs_root — that's the layout + # contract (every repo: /docs/). The repo_meta path is used + # only to read docs.generated_paths; it doesn't define the root. + repo_root = docs_root.resolve().parent + + issues: list[Issue] = [] + for path in sorted(docs_root.rglob("*.md")): + rel = path.resolve().relative_to(repo_root) + rel_str = rel.as_posix() + parts = rel.parts + # Skip subtrees under docs/ that have their own schema. parts[0] + # is "docs" (or whatever the docs dir is called); parts[1] is + # the first subdir. + if len(parts) >= 2 and parts[1] in SKIP_SUBDIRS: + # Special-case: a recipes/README.md is a normal index doc, + # not a recipe — it should still be validated. 
+ if parts[-1] != "README.md": + continue + text = path.read_text(encoding="utf-8") + issues.extend(_validate_one(path, rel_str, text, schema, generated_globs)) + return issues + + +# ---------------------------------------------------------- CLI --- + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0] if __doc__ else "") + parser.add_argument( + "--docs-root", + type=Path, + default=DEFAULT_DOCS_ROOT, + help=f"Directory to walk (default: {DEFAULT_DOCS_ROOT})", + ) + parser.add_argument( + "--repo-meta", + type=Path, + default=None, + help="Path to dist/repo.meta.json (for docs.generated_paths). Optional.", + ) + parser.add_argument( + "--schema", + type=Path, + default=DEFAULT_SCHEMA, + help=f"Path to docs.schema.json (default: {DEFAULT_SCHEMA})", + ) + parser.add_argument( + "--warn-only", + action="store_true", + help="Print issues as warnings; exit 0 even with errors. Phase 1 default per spec §10.", + ) + args = parser.parse_args(argv) + + try: + issues = walk(args.docs_root, repo_meta_path=args.repo_meta, schema_path=args.schema) + except FileNotFoundError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 2 + + if not issues: + print(f"validate-docs: {args.docs_root} clean") + return 0 + + label = "WARN" if args.warn_only else "ERROR" + for issue in issues: + print(f"{label}: {issue.path}: {issue.message}", file=sys.stderr) + print( + f"validate-docs: {len(issues)} issue(s) in {args.docs_root}", + file=sys.stderr, + ) + return 0 if args.warn_only else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/profile/docs.schema.json b/profile/docs.schema.json new file mode 100644 index 0000000..4374e1b --- /dev/null +++ b/profile/docs.schema.json @@ -0,0 +1,169 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/m-dev-tools/.github/main/profile/docs.schema.json", + "title": "m-dev-tools doc frontmatter", + 
"description": "Validation contract for the YAML frontmatter block at the top of every prose document under any repo's docs/ tree. Sibling of recipe.schema.json (for docs/recipes/*.md) and repo.meta.schema.json (for dist/repo.meta.json). Two variants gated on the `generated` field: prose docs (require doc_type+lifecycle+the git-derived trio) and generator-emitted docs (only `generated: true` + drift markers — see docs-discoverability-spec.md §5.1).", + "type": "object", + "if": { + "properties": { + "generated": { "const": true } + }, + "required": ["generated"] + }, + "then": { + "$ref": "#/$defs/generatedDoc" + }, + "else": { + "$ref": "#/$defs/proseDoc" + }, + "$defs": { + "proseDoc": { + "type": "object", + "required": [ + "created", + "last_modified", + "revisions", + "doc_type", + "lifecycle" + ], + "additionalProperties": true, + "properties": { + "created": { + "$ref": "#/$defs/isoDate", + "description": "ISO date of the file's first git commit. Tooling-populated from `git log --follow`; never hand-edited." + }, + "last_modified": { + "$ref": "#/$defs/isoDate", + "description": "ISO date of the file's latest git commit. Tooling-populated; never hand-edited." + }, + "revisions": { + "type": "integer", + "minimum": 1, + "description": "Number of commits touching this file. Tooling-populated; never hand-edited." + }, + "doc_type": { + "type": "array", + "minItems": 1, + "uniqueItems": true, + "items": { "$ref": "#/$defs/docType" }, + "description": "One or more entries from the 23-type vocabulary. Combinations are allowed and meaningful (see spec §4 refactor heuristics)." + }, + "lifecycle": { + "$ref": "#/$defs/lifecycle", + "description": "Lifecycle state. Default for the org-wide backfill: `active`." + }, + "title": { + "type": "string", + "minLength": 1, + "description": "Human-readable title. Optional but strongly encouraged for catalog rendering." 
+ }, + "owner": { + "type": "string", + "pattern": "^[A-Za-z0-9][A-Za-z0-9-]*$", + "description": "GitHub handle of the accountable owner. Defaults to the repo's top contributor when omitted." + }, + "connections": { + "type": "array", + "maxItems": 3, + "uniqueItems": true, + "items": { "$ref": "#/$defs/connection" }, + "description": "0–3 connection types from the 6-vocab. Optional in Phase 1." + }, + "replaces": { + "type": "array", + "items": { "type": "string", "minLength": 1 }, + "description": "Paths to other docs this one supersedes. If A says `replaces: [B]`, B must say `superseded_by: A` (Phase 3 bidirectional check)." + }, + "supersedes": { + "type": "array", + "items": { "type": "string", "minLength": 1 }, + "description": "Alias of `replaces`. Pick one per doc." + }, + "superseded_by": { + "type": "string", + "minLength": 1, + "description": "Path to the doc that retired this one." + }, + "related": { + "type": "array", + "items": { "type": "string", "minLength": 1 }, + "description": "Paths to related docs. Not a supersession claim." + }, + "freeze_after": { + "$ref": "#/$defs/isoDate", + "description": "Date after which this doc's lifecycle should flip to `frozen` (e.g., release plans at GA)." + }, + "review_after": { + "$ref": "#/$defs/isoDate", + "description": "Date after which this doc surfaces in the freshness report. Phase 3 cron honors this." + }, + "generated": { + "const": false, + "description": "Explicit `false` is accepted but redundant — absence implies prose." + } + } + }, + "generatedDoc": { + "type": "object", + "required": ["generated", "last_modified"], + "additionalProperties": true, + "properties": { + "generated": { + "const": true, + "description": "Marks this doc as generator-emitted. See docs-discoverability-spec.md §5.1: docs-QA CI skips all prose checks for these and asserts only path-declaration + drift-vs-generator." 
+ }, + "last_modified": { + "$ref": "#/$defs/isoDate", + "description": "Date of the generator run that wrote this file. Drift gate (Phase 1 check #5) compares against the source manifest's `verified_on`." + } + } + }, + "docType": { + "type": "string", + "enum": [ + "HISTORY", + "ARCHITECTURE", + "DESIGN", + "ADR", + "SPEC", + "REFERENCE", + "GUIDE", + "TUTORIAL", + "ROADMAP", + "PLAN", + "RESEARCH", + "SURVEY", + "GAP-ANALYSIS", + "STATUS", + "EXPLAINER", + "NOTES", + "WORKED-EXAMPLE", + "SETUP", + "INTEGRATION", + "PROPOSAL", + "BUILD-LOG", + "CHANGELOG", + "POSTMORTEM" + ] + }, + "lifecycle": { + "type": "string", + "enum": ["draft", "active", "frozen", "superseded", "deprecated"] + }, + "connection": { + "type": "string", + "enum": [ + "history", + "function", + "design", + "architecture", + "planning", + "implementation" + ] + }, + "isoDate": { + "type": "string", + "pattern": "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + } + } +} diff --git a/profile/repo.meta.schema.json b/profile/repo.meta.schema.json index f259971..073f425 100644 --- a/profile/repo.meta.schema.json +++ b/profile/repo.meta.schema.json @@ -91,6 +91,19 @@ "enum": ["active", "archived", "deprecated"], "description": "Repo lifecycle status. Defaults to 'active' when omitted." }, + "docs": { + "type": "object", + "additionalProperties": false, + "description": "Repo-level documentation metadata consumed by validate-docs.py (docs-discoverability spec §5.1). Optional — repos with only hand-written prose docs may omit this block entirely.", + "properties": { + "generated_paths": { + "type": "array", + "items": { "type": "string", "minLength": 1 }, + "uniqueItems": true, + "description": "Glob patterns (relative to repo root, e.g. 'docs/modules/std*.md') for documentation files produced by a generator. Files matching these globs are exempt from the prose schema; validate-docs.py asserts only `generated: true` + `last_modified` drift against the source manifest. See docs-discoverability-spec.md §5.1." 
+ } + } + }, "notes": { "type": "string", "description": "Free-text notes for human readers. Not consumed by the catalog generator."