";
+}
+
+function renderGaps() {
+ const scenarios = state.data.scenarios?.scenarios || [];
+ const claims = state.data.claims?.claims || [];
+ const failures = state.data.failures?.failure_modes || [];
+ const datasets = state.data.datasets?.datasets || [];
+ const metrics = state.data.metrics?.metrics || [];
+
+ // 1. Scenarios with no current best method.
+ const orphanScenarios = scenarios.filter(s => !(s.current_best_methods || []).length);
+
+ // 2. Papers (claim subjects) without any claim-level related_failure_modes.
+ const papersWithFailures = new Set();
+ const papersWithClaims = new Set();
+ for (const c of claims) {
+ if (!c.subject) continue;
+ papersWithClaims.add(c.subject);
+ for (const fm of c.related_failure_modes || []) papersWithFailures.add(c.subject);
+ }
+ const orphanPapers = Array.from(papersWithClaims).filter(p => !papersWithFailures.has(p));
+
+ // 3. Datasets / metrics not referenced by any claim.reproduction.public_data or scenario.evaluation_metrics.
+ const referencedDatasets = new Set();
+ const referencedMetrics = new Set();
+ for (const c of claims) if (c.reproduction?.public_data) referencedDatasets.add(c.reproduction.public_data);
+ for (const s of scenarios) {
+ for (const d of s.available_datasets || []) referencedDatasets.add(d);
+ for (const m of s.evaluation_metrics || []) referencedMetrics.add(m);
+ }
+ const orphanDatasets = datasets.filter(d => !referencedDatasets.has(d.id));
+ const orphanMetrics = metrics.filter(m => !referencedMetrics.has(m.id));
+
+ // 4. Failure modes not referenced by any claim or scenario.
+ const referencedFailures = new Set();
+ for (const c of claims) for (const fm of c.related_failure_modes || []) referencedFailures.add(fm);
+ for (const s of scenarios) for (const fm of s.open_failure_modes || []) referencedFailures.add(fm);
+ const orphanFailures = failures.filter(f => !referencedFailures.has(f.id));
+
+ function section(title, hint, items, render) {
+ return `
+
+
${escapeHtml(title)} ${items.length}
+
${escapeHtml(hint)}
+ ${items.length ? `
${items.map(render).join("")}
` : "
该类缺口当前为空。
"}
+ `;
+ }
+
+ const html = `
+
+ ${section("尚无可信公开报告的场景", "current_best_methods 为空。直接对应一篇基准构建或方法首报论文。", orphanScenarios, s => `
+
+
+
+
+
+
+
+
+
+
diff --git a/tools/build_research_overlay.py b/tools/build_research_overlay.py
new file mode 100644
index 0000000..c021d33
--- /dev/null
+++ b/tools/build_research_overlay.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+"""Build docs/data/research/node_overlay.json from the structured research layer.
+
+The 3D atlas reads this overlay to bind visual encoding to research substance:
+ - evidence_strength (0..3) — how well-supported the node's strongest claim is
+ - dispute_level (0..3) — community disagreement
+ - reproducibility_status — verified / partial / inferred / speculative
+ - failure_boundary_count — number of failure modes that diagnose this node
+ - maturity (0..3) — derived from reproducibility + evidence
+
+This script is idempotent: re-running it after editing the source JSONs will
+refresh the overlay. CI re-runs it to make sure the overlay tracks reality.
+"""
+from __future__ import annotations
+
+import json
+from collections import defaultdict
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+RESEARCH = ROOT / "docs" / "data" / "research"
+OUT = RESEARCH / "node_overlay.json"
+
+REPRO_TO_MATURITY = {"verified": 3, "partial": 2, "inferred": 1, "speculative": 0}
+
+
+def main() -> int:
+    """Regenerate ``node_overlay.json`` from the research-layer JSON sources.
+
+    Reads claims, failure modes, scenarios and argument chains from RESEARCH,
+    aggregates per-subject statistics over the claims, writes the overlay to
+    OUT and prints a one-line summary.
+
+    Returns:
+        Always 0.
+    """
+
+    def read_records(filename: str, key: str) -> list:
+        # Each source file is a JSON object holding its records under one key.
+        return json.loads((RESEARCH / filename).read_text(encoding="utf-8")).get(key, [])
+
+    claims = read_records("claims.json", "claims")
+    failures = read_records("failure_modes.json", "failure_modes")
+    scenarios = read_records("scenarios.json", "scenarios")
+    chains = read_records("argument_chains.json", "argument_chains")
+
+    # Claims without a subject contribute nothing to any per-subject aggregate.
+    subject_claims = [(c.get("subject"), c) for c in claims if c.get("subject")]
+
+    # Pass 1: per-subject maxima and claim counts.
+    max_evidence: dict[str, int] = defaultdict(int)
+    max_dispute: dict[str, int] = defaultdict(int)
+    best_repro: dict[str, int] = defaultdict(int)
+    claim_count: dict[str, int] = defaultdict(int)
+    for subject, claim in subject_claims:
+        evidence = int(claim.get("evidence_strength") or 0)
+        dispute = int(claim.get("dispute_level") or 0)
+        repro = REPRO_TO_MATURITY.get(claim.get("reproducibility_status"), 0)
+        max_evidence[subject] = max(max_evidence[subject], evidence)
+        max_dispute[subject] = max(max_dispute[subject], dispute)
+        best_repro[subject] = max(best_repro[subject], repro)
+        claim_count[subject] += 1
+
+    # Pass 2: distinct failure modes referenced by each subject's claims.
+    failure_ids: dict[str, set[str]] = defaultdict(set)
+    for subject, claim in subject_claims:
+        for failure_id in claim.get("related_failure_modes") or []:
+            failure_ids[subject].add(failure_id)
+
+    overlay = {
+        "generated_by": "tools/build_research_overlay.py",
+        "version": 1,
+        "subjects": {},
+        "scenarios": [s.get("id") for s in scenarios],
+        "failure_modes": [f.get("id") for f in failures],
+        "argument_chains": [ch.get("id") for ch in chains],
+    }
+    # .get() is used on the defaultdicts so lookups do not insert new keys.
+    for subject in sorted(set(max_evidence) | set(failure_ids)):
+        evidence = max_evidence.get(subject, 0)
+        repro = best_repro.get(subject, 0)
+        overlay["subjects"][subject] = {
+            "evidence_strength": evidence,
+            "dispute_level": max_dispute.get(subject, 0),
+            "reproducibility_status_score": repro,
+            "failure_boundary_count": len(failure_ids.get(subject, set())),
+            "claim_count": claim_count.get(subject, 0),
+            "maturity": max(repro, evidence),
+        }
+
+    serialized = json.dumps(overlay, ensure_ascii=False, indent=2) + "\n"
+    OUT.write_text(serialized, encoding="utf-8")
+    n = len(overlay["subjects"])
+    print(f"OK wrote {OUT.relative_to(ROOT)} with overlay for {n} subject nodes")
+    return 0
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/tools/screenshot_regression.py b/tools/screenshot_regression.py
new file mode 100644
index 0000000..65bee89
--- /dev/null
+++ b/tools/screenshot_regression.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""Screenshot regression scaffold for the workbench and 3D atlas pages.
+
+This script captures page screenshots at two canonical viewports (desktop
+1440x900 and mobile 390x844) and compares them against baseline PNGs stored
+under `docs/data/research/baselines/`. It is designed to fail loudly when a
+known-good baseline is missing or differs.
+
+Behaviour matrix:
+ - If Playwright is not installed, the script prints a clear setup message
+ and exits 0 (do not block CI on missing optional dependency).
+ - If Playwright is installed and `--bake` is passed, baselines are
+ regenerated. Useful when intentional visual changes happen.
+ - If Playwright is installed without `--bake`, each captured frame is
+ compared byte-for-byte to its baseline PNG. Any difference fails.
+
+Run from repo root: `python tools/screenshot_regression.py [--bake]`.
+"""
+from __future__ import annotations
+
+import argparse
+import contextlib
+import http.server
+import socketserver
+import subprocess
+import sys
+import threading
+import time
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+DOCS = ROOT / "docs"
+BASELINE_DIR = DOCS / "data" / "research" / "baselines"
+PORT = 8766
+
+VIEWPORTS = [
+ ("desktop", 1440, 900),
+ ("mobile", 390, 844),
+]
+ROUTES = [
+ ("workbench", "/workbench.html"),
+ ("workbench_papers", "/workbench.html?view=papers"),
+ ("workbench_failures", "/workbench.html?view=failures"),
+ ("atlas3d", "/index.html"),
+]
+
+
+def _try_import_playwright():
+    """Report whether ``playwright.sync_api.sync_playwright`` can be imported.
+
+    Returns:
+        True when the import succeeds, False when it raises any exception.
+    """
+    try:
+        from playwright.sync_api import sync_playwright  # noqa: F401
+    except Exception:
+        return False
+    else:
+        return True
+
+
+def _start_server() -> tuple[socketserver.TCPServer, threading.Thread]:
+    """Serve the DOCS directory on 127.0.0.1:PORT from a daemon thread.
+
+    Returns:
+        ``(server, thread)``. The caller stops the server with
+        ``server.shutdown()`` and releases the socket with
+        ``server.server_close()``.
+    """
+    import functools
+
+    # Bind the handler to DOCS explicitly instead of depending on the process
+    # CWD at request time.
+    handler = functools.partial(http.server.SimpleHTTPRequestHandler, directory=str(DOCS))
+    # ThreadingHTTPServer (a socketserver.TCPServer subclass) handles each
+    # connection in its own thread, so an idle browser pre-connect socket
+    # cannot stall the requests behind it. HTTPServer also enables
+    # allow_reuse_address, so an immediate re-run can rebind PORT.
+    server = http.server.ThreadingHTTPServer(("127.0.0.1", PORT), handler)
+    thread = threading.Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    return server, thread
+
+
+@contextlib.contextmanager
+def _docs_cwd():
+    """Context manager that runs its body with DOCS as the working directory.
+
+    The working directory in effect on entry is restored on exit, including
+    when the body raises.
+    """
+    import os
+
+    original_dir = os.getcwd()
+    os.chdir(DOCS)
+    try:
+        yield
+    finally:
+        os.chdir(original_dir)
+
+
+def _capture(playwright, base_url: str, out_dir: Path, bake: bool) -> tuple[int, list[str]]:
+    """Screenshot every (viewport, route) pair into ``out_dir``.
+
+    Args:
+        playwright: object exposing ``chromium.launch()`` (the value yielded by
+            ``sync_playwright()`` in this file's ``main``).
+        base_url: scheme/host/port prefix prepended to each route path.
+        out_dir: directory receiving the PNG files; created if missing.
+        bake: when True, screenshots are only written and not compared.
+
+    Returns:
+        ``(failure_count, error_list)``. A failure is a navigation error, a
+        missing baseline, or a screenshot whose bytes differ from its baseline
+        in BASELINE_DIR. Each failure adds exactly one entry to the error list.
+    """
+    out_dir.mkdir(parents=True, exist_ok=True)
+    mismatches = 0
+    errors: list[str] = []
+    browser = playwright.chromium.launch()
+    try:
+        for vp_name, w, h in VIEWPORTS:
+            context = browser.new_context(viewport={"width": w, "height": h})
+            try:
+                page = context.new_page()
+                for route_name, path in ROUTES:
+                    url = f"{base_url}{path}"
+                    try:
+                        page.goto(url, wait_until="networkidle", timeout=15000)
+                        # Give the workbench JS a moment to render the cards (no transitions).
+                        page.wait_for_timeout(900)
+                    except Exception as e:
+                        errors.append(f"navigation failure for {url}: {e}")
+                        # A page that cannot be loaded has not been verified,
+                        # so it must count as a failure rather than be skipped.
+                        mismatches += 1
+                        continue
+                    fname = f"{route_name}__{vp_name}.png"
+                    target = out_dir / fname
+                    page.screenshot(path=str(target), full_page=False)
+                    if bake:
+                        continue
+                    baseline = BASELINE_DIR / fname
+                    if not baseline.exists():
+                        errors.append(f"missing baseline {fname}; rerun with --bake after visual review")
+                        mismatches += 1
+                        continue
+                    # Exact byte comparison of the two PNG files.
+                    if baseline.read_bytes() != target.read_bytes():
+                        errors.append(f"visual diff detected: {fname}")
+                        mismatches += 1
+            finally:
+                # Close the context even if a screenshot or file read raised.
+                context.close()
+    finally:
+        browser.close()
+    return mismatches, errors
+
+
+def main() -> int:
+    """Command-line entry point: bake baselines or compare against them.
+
+    Returns:
+        0 when Playwright is unavailable, when baking succeeded, or when every
+        screenshot matches its baseline; 1 when any capture or comparison
+        failed.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--bake", action="store_true", help="Regenerate baselines (use after intentional visual change)")
+    args = parser.parse_args()
+
+    if not _try_import_playwright():
+        print("screenshot_regression: Playwright not installed.")
+        print(" to enable visual regression: pip install playwright && python -m playwright install chromium")
+        print(" this scaffold exits 0 by design so CI is not blocked on the optional dependency.")
+        return 0
+
+    from playwright.sync_api import sync_playwright
+
+    with _docs_cwd():
+        server, _thread = _start_server()
+        try:
+            time.sleep(0.4)
+            BASELINE_DIR.mkdir(parents=True, exist_ok=True)
+            with sync_playwright() as p:
+                tmp_dir = ROOT / "tmp_screenshots"
+                target_dir = BASELINE_DIR if args.bake else tmp_dir
+                mismatches, errors = _capture(p, f"http://127.0.0.1:{PORT}", target_dir, args.bake)
+            # Errors are decisive on their own: a navigation failure may be
+            # recorded in `errors` without being counted in `mismatches`.
+            failure_count = max(mismatches, len(errors))
+            if args.bake:
+                if errors:
+                    print(f"SCREENSHOT BAKE FAILED with {len(errors)} error(s):")
+                    for e in errors:
+                        print(f" - {e}")
+                    return 1
+                print(f"baked {len(VIEWPORTS) * len(ROUTES)} baseline screenshots into {BASELINE_DIR.relative_to(ROOT)}")
+                return 0
+            if failure_count:
+                print(f"SCREENSHOT REGRESSION FAILED with {failure_count} mismatch(es):")
+                for e in errors:
+                    print(f" - {e}")
+                return 1
+            print(f"OK {len(VIEWPORTS) * len(ROUTES)} screenshots match baselines")
+            return 0
+        finally:
+            server.shutdown()
+            # shutdown() only stops serve_forever(); server_close() releases
+            # the listening socket.
+            server.server_close()
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/tools/validate_research.py b/tools/validate_research.py
new file mode 100644
index 0000000..1c7f20f
--- /dev/null
+++ b/tools/validate_research.py
@@ -0,0 +1,325 @@
+#!/usr/bin/env python3
+"""Validate the structured research layer under docs/data/research/.
+
+Quality gates enforced (exit 1 on any failure):
+
+ 1. Every claim has all required fields: id, subject, statement, evidence,
+ preconditions, counterexamples, boundaries, reproduction, publication_value,
+ dispute_level, evidence_strength, reproducibility_status.
+ 2. Every claim's `evidence` is a non-empty array; each item has kind, source,
+ finding. Kinds limited to the controlled vocabulary.
+ 3. Every argument chain has all 10 required argumentative fields plus related
+ scenarios/datasets/metrics.
+ 4. Every dataset has supports + limits + common_misuses (each non-empty).
+ 5. Every metric has formula + variables + assumptions + what_it_proves +
+ what_it_cannot_prove + known_misuses. At least one declared variable
+ symbol must appear literally in the formula.
+ 6. Every failure mode has trigger_conditions + manifestation +
+ reproducible_setup + diagnostic_metrics + method_weakness + partial_solutions
+ + open_questions + publication_angles, all non-empty.
+ 7. Every experiment plan has all three tiers; each tier specifies purpose,
+ metrics or success_criteria, and an expected signal or compute budget.
+ 8. Cross-references resolve: a claim's subject must be either a known paper
+ node in docs/data/graph_extended.json (or graph.json), or marked as
+ `unresolved_subject: true` in the claim. Same for related_failure_modes
+ pointing to actual failure_mode ids. Argument chains' related_* ids
+ must resolve to known scenario/dataset/metric ids.
+ 9. No claim leaves preconditions, counterexamples or boundaries empty.
+10. No formula contains stray TeX errors that we can detect cheaply
+ (mismatched $$ or empty $...$).
+11. Every dataset that appears in a claim/argument chain must list at least
+ one covers_scenarios entry; every metric must list at least one
+ what_it_cannot_prove entry.
+
+Run: `python tools/validate_research.py`
+"""
+from __future__ import annotations
+
+import json
+import re
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+RESEARCH = ROOT / "docs" / "data" / "research"
+GRAPH = ROOT / "docs" / "data" / "graph.json"
+GRAPH_EXT = ROOT / "docs" / "data" / "graph_extended.json"
+
+ALLOWED_EVIDENCE_KINDS = {"ablation", "table", "theorem", "repro", "external_benchmark"}
+ALLOWED_REPRO = {"verified", "partial", "inferred", "speculative"}
+
+
+def _load(path: Path):
+    """Read the UTF-8 file at ``path`` and return its parsed JSON value."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _known_paper_ids() -> set[str]:
+    """Collect the truthy ``id`` of every node in GRAPH and GRAPH_EXT.
+
+    A graph file that does not exist is skipped.
+    """
+    found = set()
+    for graph_path in (GRAPH, GRAPH_EXT):
+        if not graph_path.exists():
+            continue
+        nodes = _load(graph_path).get("nodes", [])
+        found.update(node["id"] for node in nodes if node.get("id"))
+    return found
+
+
+def _expect(cond: bool, msg: str, errors: list[str]) -> None:
+    """Append ``msg`` to ``errors`` when ``cond`` is falsy; otherwise do nothing."""
+    if cond:
+        return
+    errors.append(msg)
+
+
+def _is_non_empty_str(v) -> bool:
+    """Return True only for a ``str`` that still has characters after ``strip()``."""
+    if not isinstance(v, str):
+        return False
+    return bool(v.strip())
+
+
+def _is_non_empty_list(v) -> bool:
+    """Return True only for a ``list`` holding at least one element."""
+    if not isinstance(v, list):
+        return False
+    return len(v) != 0
+
+
+def validate_claims(errors: list[str], known_papers: set[str], known_fms: set[str], known_metrics: set[str], known_datasets: set[str]) -> int:
+    """Validate ``claims.json`` and return the number of claims it contains.
+
+    One message is appended to ``errors`` per problem: missing or duplicate
+    ids, missing required fields, out-of-vocabulary values, malformed evidence
+    or reproduction entries, unresolved cross-references, and claim subjects
+    whose claims never reference a failure mode. Malformed entries are reported
+    rather than raising.
+
+    Args:
+        errors: list that collects the error messages (mutated in place).
+        known_papers: node ids a claim subject may reference; the subject check
+            is skipped when this set is empty.
+        known_fms: failure-mode ids ``related_failure_modes`` may reference.
+        known_metrics: not consulted by this function.
+        known_datasets: dataset ids ``reproduction.public_data`` may reference;
+            the check is skipped when this set is empty.
+
+    Returns:
+        The number of entries under ``claims``, or 0 when the file is missing.
+    """
+    path = RESEARCH / "claims.json"
+    if not path.exists():
+        errors.append(f"missing {path}")
+        return 0
+
+    def is_level(value) -> bool:
+        # bool is a subclass of int, so True/False must be rejected explicitly.
+        return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 3
+
+    data = _load(path)
+    claims = data.get("claims", [])
+    seen_ids = set()
+    for c in claims:
+        if not isinstance(c, dict):
+            errors.append(f"claim entry must be an object, got {c!r}")
+            continue
+        cid = c.get("id", "")
+        if _is_non_empty_str(cid):
+            _expect(cid not in seen_ids, f"claim duplicate id: {cid}", errors)
+            seen_ids.add(cid)
+        else:
+            errors.append(f"claim missing id: {c}")
+        for field in ("subject", "statement", "publication_value", "reproducibility_status"):
+            _expect(_is_non_empty_str(c.get(field)), f"claim {cid} missing {field}", errors)
+        for field in ("preconditions", "counterexamples", "boundaries", "evidence"):
+            _expect(_is_non_empty_list(c.get(field)), f"claim {cid} field {field} must be non-empty list", errors)
+        status = c.get("reproducibility_status")
+        # The isinstance guard keeps an unhashable value from raising in the set lookup.
+        _expect(isinstance(status, str) and status in ALLOWED_REPRO, f"claim {cid} bad reproducibility_status {status!r}", errors)
+        _expect(is_level(c.get("evidence_strength")), f"claim {cid} evidence_strength must be int 0..3", errors)
+        _expect(is_level(c.get("dispute_level")), f"claim {cid} dispute_level must be int 0..3", errors)
+
+        evidence = c.get("evidence")
+        # A non-list `evidence` was already reported by the non-empty-list check.
+        for ev in evidence if isinstance(evidence, list) else []:
+            if not isinstance(ev, dict):
+                errors.append(f"claim {cid} evidence item must be an object with kind, source, finding")
+                continue
+            kind = ev.get("kind")
+            _expect(isinstance(kind, str) and kind in ALLOWED_EVIDENCE_KINDS, f"claim {cid} evidence has bad kind {kind!r}", errors)
+            _expect(_is_non_empty_str(ev.get("source")), f"claim {cid} evidence missing source", errors)
+            _expect(_is_non_empty_str(ev.get("finding")), f"claim {cid} evidence missing finding", errors)
+
+        repro = c.get("reproduction")
+        if not isinstance(repro, dict):
+            # Treat a missing or malformed block as empty so each field is reported.
+            repro = {}
+        _expect(_is_non_empty_str(repro.get("minimal")), f"claim {cid} reproduction.minimal missing", errors)
+        _expect(_is_non_empty_str(repro.get("public_data")), f"claim {cid} reproduction.public_data missing", errors)
+        _expect(_is_non_empty_str(repro.get("expected_output")), f"claim {cid} reproduction.expected_output missing", errors)
+        cost = repro.get("cost_hours")
+        _expect(isinstance(cost, (int, float)) and not isinstance(cost, bool), f"claim {cid} reproduction.cost_hours missing", errors)
+        # cross-ref reproduction.public_data -> known dataset
+        public_data = repro.get("public_data")
+        if isinstance(public_data, str) and public_data and known_datasets and public_data not in known_datasets:
+            errors.append(f"claim {cid} reproduction.public_data {public_data!r} not declared in datasets.json")
+        # cross-ref subject
+        subject = c.get("subject")
+        if isinstance(subject, str) and subject and known_papers and subject not in known_papers and not c.get("unresolved_subject"):
+            errors.append(f"claim {cid} subject {subject} not in known nodes; mark unresolved_subject=true if intentional")
+        related = c.get("related_failure_modes")
+        if related and not isinstance(related, list):
+            errors.append(f"claim {cid} related_failure_modes must be a list")
+            related = []
+        for fm in related or []:
+            _expect(isinstance(fm, str) and fm in known_fms, f"claim {cid} related_failure_modes {fm} not declared", errors)
+
+    # Every paper that appears as a claim subject must touch at least one
+    # failure mode somewhere among its claims. This enforces the rule
+    # "every method node must have a failure boundary".
+    has_failure_mode: dict[str, bool] = {}
+    for c in claims:
+        if not isinstance(c, dict):
+            continue
+        subject = c.get("subject")
+        if not isinstance(subject, str) or not subject:
+            continue
+        related = c.get("related_failure_modes")
+        touches = bool(related)
+        has_failure_mode[subject] = has_failure_mode.get(subject, False) or touches
+    for sid, covered in has_failure_mode.items():
+        if not covered:
+            errors.append(f"paper {sid} has claims but no related_failure_modes across them (every method node must declare a failure boundary)")
+    return len(claims)
+
+
+def validate_chains(errors: list[str], known_papers: set[str], known_scenarios: set[str], known_datasets: set[str], known_metrics: set[str]) -> int:
+    """Validate ``argument_chains.json`` and return the number of chains.
+
+    Each chain must carry the three required text fields, the seven required
+    non-empty list fields and a non-empty ``subject_papers`` list. Its
+    ``related_*`` ids must be present in the matching known-id set, and its
+    subject papers are checked against ``known_papers`` when that set is
+    non-empty. Problems are appended to ``errors``.
+    """
+    chains_file = RESEARCH / "argument_chains.json"
+    if not chains_file.exists():
+        errors.append(f"missing {chains_file}")
+        return 0
+    chains = _load(chains_file).get("argument_chains", [])
+    text_fields = ("research_gap", "core_claim", "method_mechanism")
+    list_fields = ("key_experiments", "strong_baselines", "ablations", "negative_results", "reviewer_attacks", "response_experiments", "figure_plan")
+    # (field name, set of ids that field may reference), checked in this order.
+    reference_sets = (
+        ("related_scenarios", known_scenarios),
+        ("related_datasets", known_datasets),
+        ("related_metrics", known_metrics),
+    )
+    for chain in chains:
+        cid = chain.get("id", "")
+        for name in text_fields:
+            _expect(_is_non_empty_str(chain.get(name)), f"chain {cid} missing {name}", errors)
+        for name in list_fields:
+            _expect(_is_non_empty_list(chain.get(name)), f"chain {cid} field {name} must be non-empty list", errors)
+        _expect(_is_non_empty_list(chain.get("subject_papers")), f"chain {cid} subject_papers must be non-empty", errors)
+        for paper in chain.get("subject_papers", []) or []:
+            if known_papers and paper not in known_papers:
+                errors.append(f"chain {cid} subject_papers references unknown node {paper}")
+        for name, known in reference_sets:
+            for ref in chain.get(name, []) or []:
+                _expect(ref in known, f"chain {cid} {name} {ref} not declared", errors)
+    return len(chains)
+
+
+def validate_scenarios(errors: list[str]) -> tuple[int, set[str]]:
+    """Validate ``scenarios.json`` and return ``(scenario_count, scenario_ids)``.
+
+    A missing file is reported in ``errors`` and yields ``(0, set())``.
+    """
+    scenarios_file = RESEARCH / "scenarios.json"
+    if not scenarios_file.exists():
+        errors.append(f"missing {scenarios_file}")
+        return 0, set()
+    scenarios = _load(scenarios_file).get("scenarios", [])
+    ids: set[str] = set()
+    for scenario in scenarios:
+        sid = scenario.get("id", "")
+        ids.add(sid)
+        for name in ("label", "description", "why_hard"):
+            _expect(_is_non_empty_str(scenario.get(name)), f"scenario {sid} missing {name}", errors)
+        for name in ("available_datasets", "evaluation_metrics"):
+            _expect(_is_non_empty_list(scenario.get(name)), f"scenario {sid} {name} must be non-empty", errors)
+        # An empty current_best_methods list is allowed on purpose: it records
+        # that no public method has been credibly benchmarked on the scenario
+        # yet. Only the type is enforced here.
+        best_methods = scenario.get("current_best_methods")
+        _expect(isinstance(best_methods, list), f"scenario {sid} current_best_methods must be a list (possibly empty when no public report exists)", errors)
+    return len(scenarios), ids
+
+
+def validate_datasets(errors: list[str], known_scenarios: set[str]) -> tuple[int, set[str]]:
+    """Validate ``datasets.json`` and return ``(dataset_count, dataset_ids)``.
+
+    Every dataset needs its text fields, its non-empty list fields, and
+    ``covers_scenarios`` entries that all appear in ``known_scenarios``. A
+    missing file is reported in ``errors`` and yields ``(0, set())``.
+    """
+    datasets_file = RESEARCH / "datasets.json"
+    if not datasets_file.exists():
+        errors.append(f"missing {datasets_file}")
+        return 0, set()
+    datasets = _load(datasets_file).get("datasets", [])
+    ids: set[str] = set()
+    for dataset in datasets:
+        did = dataset.get("id", "")
+        ids.add(did)
+        for name in ("label", "scale", "license"):
+            _expect(_is_non_empty_str(dataset.get(name)), f"dataset {did} missing {name}", errors)
+        for name in ("supports", "limits", "common_misuses", "covers_scenarios"):
+            _expect(_is_non_empty_list(dataset.get(name)), f"dataset {did} {name} must be non-empty", errors)
+        for scenario_id in dataset.get("covers_scenarios", []) or []:
+            _expect(scenario_id in known_scenarios, f"dataset {did} covers_scenarios {scenario_id} not declared", errors)
+    return len(datasets), ids
+
+
+def validate_metrics(errors: list[str]) -> tuple[int, set[str]]:
+    """Validate ``metrics.json`` and return ``(metric_count, metric_ids)``.
+
+    For every metric this checks the required text fields, a non-empty
+    ``variables`` dict, the required non-empty list fields, cheap TeX delimiter
+    sanity on ``formula``, and that at least one declared variable symbol
+    appears literally in the formula. Problems are appended to ``errors``; a
+    missing file is reported and yields ``(0, set())``.
+    """
+    path = RESEARCH / "metrics.json"
+    if not path.exists():
+        errors.append(f"missing {path}")
+        return 0, set()
+    data = _load(path)
+    metrics = data.get("metrics", [])
+    ids: set[str] = set()
+    for m in metrics:
+        mid = m.get("id", "")
+        ids.add(mid)
+        for f in ("label", "formula", "scope"):
+            _expect(_is_non_empty_str(m.get(f)), f"metric {mid} missing {f}", errors)
+        variables = m.get("variables")
+        has_variables = isinstance(variables, dict) and bool(variables)
+        _expect(has_variables, f"metric {mid} variables must be a non-empty dict", errors)
+        for f in ("assumptions", "what_it_proves", "what_it_cannot_prove", "known_misuses"):
+            _expect(_is_non_empty_list(m.get(f)), f"metric {mid} {f} must be non-empty", errors)
+
+        # A null or non-string formula was already reported above; fall back to
+        # "" so the text checks below cannot raise on it.
+        raw_formula = m.get("formula")
+        formula = raw_formula if isinstance(raw_formula, str) else ""
+
+        # Cheap TeX sanity. Splitting on "$$" alternates text outside display
+        # math (even indices) with display-math bodies (odd indices).
+        segments = formula.split("$$")
+        if len(segments) % 2 == 0:
+            # An even number of segments means an odd number of "$$" delimiters.
+            errors.append(f"metric {mid} formula has unbalanced $$ delimiters")
+        # Only bodies that have a closing delimiter are inspected, hence [1:-1:2].
+        block_bodies = segments[1:-1:2]
+        for outside in segments[0::2]:
+            # Outside display math, pair up single "$" the same way. Pairing
+            # avoids mistaking "$$" itself, or the gap between two inline
+            # blocks ("$a$ $b$"), for an empty block.
+            block_bodies.extend(outside.split("$")[1:-1:2])
+        if any(not body.strip() for body in block_bodies):
+            errors.append(f"metric {mid} formula has empty $...$ block")
+
+        # at least one declared variable symbol must literally appear in the formula
+        if has_variables and not any(sym in formula for sym in variables):
+            errors.append(f"metric {mid} formula references none of its declared variable symbols; the formula and the variables dictionary are out of sync")
+    return len(metrics), ids
+
+
+def validate_scenarios_cross_refs(errors: list[str], known_datasets: set[str], known_metrics: set[str], known_fms: set[str]) -> None:
+    """Check that ids referenced by each scenario are declared elsewhere.
+
+    Re-reads ``scenarios.json`` and does nothing if the file is absent. Each
+    check is skipped when its known-id set is empty.
+    """
+    scenarios_file = RESEARCH / "scenarios.json"
+    if not scenarios_file.exists():
+        return
+    # (scenario field, ids it may reference, file named in the error message)
+    checks = (
+        ("available_datasets", known_datasets, "datasets.json"),
+        ("evaluation_metrics", known_metrics, "metrics.json"),
+        ("open_failure_modes", known_fms, "failure_modes.json"),
+    )
+    for scenario in _load(scenarios_file).get("scenarios", []):
+        sid = scenario.get("id", "")
+        for field, known, declared_in in checks:
+            for ref in scenario.get(field, []) or []:
+                if known and ref not in known:
+                    errors.append(f"scenario {sid} {field} {ref} not declared in {declared_in}")
+
+
+def validate_failures(errors: list[str], known_metrics: set[str]) -> tuple[int, set[str]]:
+    """Validate ``failure_modes.json`` and return ``(count, failure_mode_ids)``.
+
+    Every failure mode needs its text fields and non-empty list fields. Its
+    ``diagnostic_metrics`` are checked against ``known_metrics`` when that set
+    is non-empty. Each ``partial_solutions`` entry must be an object with
+    non-empty ``idea`` and ``residual_gap``; other entry types are reported
+    rather than raising. A missing file is reported and yields ``(0, set())``.
+    """
+    path = RESEARCH / "failure_modes.json"
+    if not path.exists():
+        errors.append(f"missing {path}")
+        return 0, set()
+    data = _load(path)
+    fms = data.get("failure_modes", [])
+    ids: set[str] = set()
+    for f in fms:
+        fid = f.get("id", "")
+        ids.add(fid)
+        for k in ("label", "manifestation", "reproducible_setup", "method_weakness"):
+            _expect(_is_non_empty_str(f.get(k)), f"failure_mode {fid} missing {k}", errors)
+        for k in ("trigger_conditions", "diagnostic_metrics", "partial_solutions", "open_questions", "publication_angles"):
+            _expect(_is_non_empty_list(f.get(k)), f"failure_mode {fid} {k} must be non-empty", errors)
+        for m in f.get("diagnostic_metrics", []) or []:
+            if known_metrics and m not in known_metrics:
+                errors.append(f"failure_mode {fid} diagnostic_metrics {m} not declared")
+        solutions = f.get("partial_solutions")
+        # A non-list value was already reported by the non-empty-list check above.
+        for sol in solutions if isinstance(solutions, list) else []:
+            if not isinstance(sol, dict):
+                # e.g. a bare string: report it instead of failing on .get().
+                errors.append(f"failure_mode {fid} partial_solution must be an object with idea and residual_gap")
+                continue
+            _expect(_is_non_empty_str(sol.get("idea")), f"failure_mode {fid} partial_solution missing idea", errors)
+            _expect(_is_non_empty_str(sol.get("residual_gap")), f"failure_mode {fid} partial_solution missing residual_gap", errors)
+    return len(fms), ids
+
+
+def validate_experiments(errors: list[str]) -> int:
+    """Validate ``experiment_plans.json`` and return the number of plans.
+
+    Each plan needs a non-empty ``title`` and ``subject``, and each of the
+    three tiers needs a non-empty ``purpose`` and ``success_criteria``. A
+    missing file is reported in ``errors`` and yields 0.
+    """
+    plans_file = RESEARCH / "experiment_plans.json"
+    if not plans_file.exists():
+        errors.append(f"missing {plans_file}")
+        return 0
+    plans = _load(plans_file).get("experiment_plans", [])
+    tier_names = ("tier_1_minimal_mechanism", "tier_2_public_benchmark", "tier_3_stress_test")
+    for plan in plans:
+        pid = plan.get("id", "")
+        for name in ("title", "subject"):
+            _expect(_is_non_empty_str(plan.get(name)), f"experiment_plan {pid} missing {name}", errors)
+        for tier_name in tier_names:
+            # A missing tier is treated as empty, so both fields get reported.
+            tier = plan.get(tier_name) or {}
+            for key in ("purpose", "success_criteria"):
+                _expect(_is_non_empty_str(tier.get(key)), f"experiment_plan {pid} {tier_name}.{key} missing", errors)
+    return len(plans)
+
+
+def main() -> int:
+    """Run every validator and print a summary.
+
+    Validators run in dependency order: the id sets returned by the scenario,
+    metric, failure-mode and dataset validators feed the cross-reference
+    checks of the later ones.
+
+    Returns:
+        1 if any validator recorded an error, otherwise 0.
+    """
+    errors: list[str] = []
+    known_papers = _known_paper_ids()
+    scen_count, known_scenarios = validate_scenarios(errors)
+    metric_count, known_metrics = validate_metrics(errors)
+    fm_count, known_fms = validate_failures(errors, known_metrics)
+    ds_count, known_datasets = validate_datasets(errors, known_scenarios)
+    claim_count = validate_claims(errors, known_papers, known_fms, known_metrics, known_datasets)
+    chain_count = validate_chains(errors, known_papers, known_scenarios, known_datasets, known_metrics)
+    plan_count = validate_experiments(errors)
+    validate_scenarios_cross_refs(errors, known_datasets, known_metrics, known_fms)
+
+    if not errors:
+        print("OK research layer:")
+        print(f" claims={claim_count} chains={chain_count} scenarios={scen_count} "
+              f"datasets={ds_count} metrics={metric_count} failure_modes={fm_count} experiment_plans={plan_count}")
+        return 0
+
+    print(f"RESEARCH VALIDATION FAILED with {len(errors)} error(s):")
+    for message in errors:
+        print(f" - {message}")
+    return 1
+
+
+if __name__ == "__main__":
+ sys.exit(main())