diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 1ec8126..2d5c368 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -23,6 +23,18 @@ jobs: with: { python-version: '3.11' } - name: Validate graph.json run: python tools/validate_graph.py + - name: Lint extended cards (cross-link integrity, LaTeX, stub bodies) + run: python tools/lint_extended_cards.py + continue-on-error: true # the linter surfaces pre-existing broken card links; tracked for cleanup and kept non-blocking until the backlog is cleared + - name: Validate structured research layer + run: python tools/validate_research.py + - name: Rebuild research node overlay (for the 3D atlas visual encoding) + run: python tools/build_research_overlay.py + - name: Confirm research node overlay is in sync with sources + run: | + git diff --exit-code docs/data/research/node_overlay.json || (echo "node_overlay.json is out of sync; run python tools/build_research_overlay.py and commit"; exit 1) + - name: Run screenshot regression scaffold (no-op when Playwright is absent) + run: python tools/screenshot_regression.py - name: Check external deep links run: python tools/check_links.py --max-failures 5 continue-on-error: true diff --git a/README.md b/README.md index 77a31d6..9f05430 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Autonomous-Driving Learning Atlas -> 自动驾驶学习地图 — 一个以**交互知识图谱**为核心、面向博士级研究者的、中英双语的机器学习 / 强化学习 / 自动驾驶**入门-进阶-前沿**学习地图。 +> 自动驾驶研究地图 — 一个面向博士级与产业研究者的、围绕**可证伪主张、论文论证链、场景与数据集、失败模式与三层实验**组织的论文产出系统。中英双语,知识图谱与论文产出工作台并列,视觉只服务于研究结构。 [![Pages](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/pages.yml/badge.svg)](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/pages.yml) [![Validate](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/validate.yml/badge.svg)](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/validate.yml) @@ -7,7 +7,12 @@ [![License: 
MIT (code)](https://img.shields.io/badge/code-MIT-blue.svg)](LICENSE) [![License: CC BY 4.0 (prose)](https://img.shields.io/badge/prose-CC%20BY%204.0-lightgrey.svg)](LICENSE-CC) -🌐 **Live atlas**: +🌐 **三维知识星图**: +🛠 **论文产出工作台**: + +> 工作台围绕六类结构化研究节点组织:**可证伪主张** · **论文论证链** · **场景** · **数据集 / 指标**(含能与不能证明的边界)· **失败模式**(含触发条件、诊断指标、已有半解、可投稿切入点)· **三层实验计划**(最小机制 / 公开基准 / 压力测试)。当前覆盖 12 条主张 · 6 条论证链 · 11 个场景 · 6 个数据集 · 6 个指标 · 17 个失败模式 · 6 份实验计划。所有节点都通过 `tools/validate_research.py` 做结构完整性校验,且每轮迭代的修订都附带独立审查代理的交叉审查报告(参见 `docs/data/research/cross_review_*.md`)。 + +> 覆盖方向包括:端到端规划(UniAD / PlanT / VADv2)、视觉语言动作模型(DriveVLM / Agent-Driver / DiLu / CF-VLA)、强化学习骨干(PPO / DQN / DAgger)、离线强化学习(CQL 风格保守惩罚)、世界模型(Dreamer 风格隐空间想象)、安全约束(拉格朗日 / 显式约束层)、闭环评测协议审计与 Bitter Lesson 的可证伪化叙述。

@@ -91,7 +96,9 @@ Autonomous-Driving-Learning-Atlas/ ├── README.md / AGENTS.md / LICENSE / LICENSE-CC / CITATION.cff ├── docs/ # GitHub-Pages 根目录(交互站点) -│ ├── index.html · atlas3d.css +│ ├── index.html · atlas3d.css # 三维知识星图(视觉编码绑定研究维度) +│ ├── workbench.html · workbench.css · js/workbench.js +│ │ # 论文产出工作台:主张 / 论证链 / 场景 / 失败模式 / 实验计划 / 选择篮 │ ├── js/ # atlas-main · atlas-render · atlas-physics · │ │ # atlas-cards (含 Mermaid 渲染 + 动态洞察) │ ├── vendor/ # KaTeX + auto-render · Mermaid · DOMPurify · marked · Three.js @@ -100,6 +107,7 @@ Autonomous-Driving-Learning-Atlas/ │ ├── graph_extended.json # 489 节点 / 1440 边 (含 paradigm/insight/validation/move/problem) │ ├── layout_positions.json # 由 tools/precompute_layout.py 预烤的稳定 3D 位置 │ ├── generated/ # 多维度生成轴(decision / foundation / methodology / perception / wave-E stubs) +│ ├── research/ # 结构化研究层(claims / chains / scenarios / datasets / metrics / failure_modes / experiment_plans + schema + node_overlay) │ └── cards/ │ ├── *.md # spine + Tier-S 原始论文卡 (40 张) │ └── extended/ # paradigm / insight / validation / move / problem / paper stub (200+ 张) @@ -116,6 +124,8 @@ Autonomous-Driving-Learning-Atlas/ │ └── lab_dreamer_cartpole_pixels/ # CartPole 像素 RSSM + latent imagination ├── tools/ │ ├── validate_graph.py · check_links.py · lint_extended_cards.py +│ ├── validate_research.py # 结构化研究层的质量门禁 +│ ├── build_research_overlay.py # 由 research/*.json 生成 node_overlay.json │ ├── audit_card_meta_language.py # 扫描卡片里"元语言泄漏"短语 │ ├── merge_graph.py # seed + generated/*.json → graph_extended.json │ ├── repair_extended_graph.py # 重建 paradigm-validation-paper 与 problem 反向引用 diff --git a/docs/atlas3d.css b/docs/atlas3d.css index 53f7b30..7e86443 100644 --- a/docs/atlas3d.css +++ b/docs/atlas3d.css @@ -101,6 +101,10 @@ canvas#atlasCanvas { } .iconbtn:hover { background: rgba(108,177,255,0.22); border-color: rgba(108,177,255,0.55); } .iconbtn.active { background: rgba(255,170,85,0.22); border-color: rgba(255,170,85,0.55); color: var(--accent-warm); 
} +.iconbtn.iconbtn-primary { background: rgba(167,243,208,0.22); border-color: rgba(167,243,208,0.55); color: #a7f3d0; font-weight: 600; } +.iconbtn.iconbtn-primary:hover { background: rgba(167,243,208,0.32); border-color: rgba(167,243,208,0.75); color: #d6f5e6; } +.iconbtn.iconbtn-subtle { opacity: 0.6; } +.iconbtn.iconbtn-subtle:hover { opacity: 1; } /* ---------- side panels ---------- */ .side-panel { @@ -171,6 +175,12 @@ canvas#atlasCanvas { .legend .swatch { width: 16px; height: 3px; border-radius: 2px; flex-shrink: 0; } .legend .swatch.dashed { background-image: linear-gradient(90deg, currentColor 50%, transparent 50%); background-size: 6px 100%; } +.research-legend .legend-dot { width: 14px; height: 14px; border-radius: 50%; flex-shrink: 0; box-shadow: 0 0 8px currentColor; } +.research-legend .lg-evidence-3 { background: #a7f3d0; color: #a7f3d0; width: 16px; height: 16px; } +.research-legend .lg-dispute { background: #94a3b8; color: #94a3b8; opacity: 0.7; } +.research-legend .lg-fb { background: #fcd34d; color: #fcd34d; } +.research-legend .lg-default { background: #475569; color: #475569; box-shadow: none; } + .time-row { display: flex; align-items: center; justify-content: space-between; margin-top: 6px; font-size: 12px; color: var(--ink-dim); } input#yearSlider { width: 100%; accent-color: var(--accent); } diff --git a/docs/data/cards/paper_fujimoto2019_bcq.md b/docs/data/cards/paper_fujimoto2019_bcq.md new file mode 100644 index 0000000..df091a8 --- /dev/null +++ b/docs/data/cards/paper_fujimoto2019_bcq.md @@ -0,0 +1,56 @@ +--- +id: paper:fujimoto2019_bcq +title: "BCQ — Off-Policy Deep Reinforcement Learning without Exploration" +title_zh: "BCQ:无探索条件下的离线深度强化学习" +kind: paper +tier: A +authors: [Fujimoto, S., Meger, D., Precup, D.] 
+venue: "ICML 2019" +year: 2019 +topic: deep_rl +phase: core +prereqs: [paper:mnih2015_dqn] +extends: [] +parallel: [] +contested_by: [] +labs: [] +deep_links: + - {label: "PDF p.1 摘要", url: "https://arxiv.org/pdf/1812.02900#page=1"} + - {label: "PDF p.3 §4 BCQ 算法", url: "https://arxiv.org/pdf/1812.02900#page=3"} + - {label: "官方实现 (sfujim/BCQ)", url: "https://github.com/sfujim/BCQ"} +bibtex: | + @inproceedings{fujimoto2019off, + title = {Off-Policy Deep Reinforcement Learning without Exploration}, + author = {Fujimoto, Scott and Meger, David and Precup, Doina}, + booktitle = {International Conference on Machine Learning}, + year = {2019} + } +--- + +## TL;DR +BCQ 用条件 VAE 学习行为策略的支持集,并把候选动作限制在该支持集附近,从而避免离线 Q 学习对分布外动作的过估计。它在 CQL 之前提出,是最早把"行为约束"显式写进离线 RL 算法的工作。 + +## 数学锚点 +策略输出: +$$\pi(s) = \arg\max_{a_i + \xi_\phi(s, a_i, \Phi)} Q_\theta\big(s,\, a_i + \xi_\phi(s, a_i, \Phi)\big),\qquad \{a_i \sim G_\omega(s)\}_{i=1}^{n}$$ +其中 $G_\omega$ 是用条件 VAE 拟合行为策略 $\hat\pi_\beta$ 得到的生成模型,给定状态 $s$ 后采样 $n$ 个候选动作;$\xi_\phi$ 是输出幅度限制在 $[-\Phi, \Phi]$ 内的扰动网络。BCQ 只在这个扰动后的候选集合上做 Q 评估与最大化。 + +## 架构与方法 +- **行为策略密度估计**:训练一个条件 VAE,给定状态后能采样接近数据集动作分布的候选。 +- **扰动网络**:在 VAE 采样基础上加入一个小幅扰动网络,允许策略在支持集邻域内继续改进。 +- **双 Q 网络**:与 TD3 类似的两个 Q 网络取最小值以缓解过估计。 + +## 工程要点 +- 训练流程比 CQL 复杂(VAE + 双 Q + 扰动网络),实现成本更高。 +- 对支持集的依赖让它在覆盖良好的数据集上表现稳定,但覆盖不足时策略改进受限。 +- 在 D4RL 部分子集上仍是有竞争力的基线,与 CQL 形成 "约束动作 vs 惩罚 Q" 的两条主路线对比。 + +## 已知失败边界 +- 数据集覆盖空洞时支持集过窄,策略无法做出有效改进。 +- VAE 自身的密度估计偏差会传导到策略选择。 + +## Bitter-Lesson 视角 +BCQ 在 2019 年率先承认"离线 RL 需要明确处理分布外动作",但它选择了"用一个学到的密度估计来限制策略"的路径,仍然引入了相当多的人工组件(VAE + 扰动网络 + 双 Q)。CQL 之后的发展显示,把同一约束以单一标量超参 $\alpha$ 表达就足够,BCQ 的多模块设计在大规模数据下被更简洁的方案取代——这与 Bitter Lesson 关于"少先验更耐扩展"的判断一致。 + +## 关联节点 +- → [`paper:kumar2020_cql`](paper_kumar2020_cql.md):CQL 把 BCQ 的约束动作思想推进为更简洁的 Q 下界估计。 diff --git a/docs/data/cards/paper_hafner2020_dreamer.md b/docs/data/cards/paper_hafner2020_dreamer.md new file mode 100644 index 0000000..3385a4f --- /dev/null +++ b/docs/data/cards/paper_hafner2020_dreamer.md @@ -0,0 +1,59 @@ +--- +id: 
paper:hafner2020_dreamer +title: "Dreamer — Dream to Control: Learning Behaviors by Latent Imagination" +title_zh: "Dreamer:通过隐空间想象学习控制行为" +kind: paper +tier: S +authors: [Hafner, D., Lillicrap, T., Ba, J., Norouzi, M.] +venue: "ICLR 2020" +year: 2020 +topic: deep_rl +phase: frontier +prereqs: [paper:world_models] +extends: [paper:world_models] +parallel: [] +contested_by: [] +labs: [lab_dreamer_cartpole_pixels] +deep_links: + - {label: "PDF p.1 摘要", url: "https://arxiv.org/pdf/1912.01603#page=1"} + - {label: "PDF p.3 §3 RSSM 与 actor-critic 想象", url: "https://arxiv.org/pdf/1912.01603#page=3"} + - {label: "PDF p.6 §5 DM Control 与 Atari 结果", url: "https://arxiv.org/pdf/1912.01603#page=6"} + - {label: "官方实现 (danijar/dreamerv1)", url: "https://github.com/danijar/dreamer"} +bibtex: | + @inproceedings{hafner2020dream, + title = {Dream to Control: Learning Behaviors by Latent Imagination}, + author = {Hafner, Danijar and Lillicrap, Timothy and Ba, Jimmy and Norouzi, Mohammad}, + booktitle = {International Conference on Learning Representations}, + year = {2020} + } +--- + +## TL;DR +Dreamer 在 Recurrent State Space Model 学到的隐空间里做想象式 rollout,并用 actor-critic 同时训练价值与策略,使得 DM Control 与 Atari 系列任务在真实样本预算受限时显著超过无世界模型的 SAC 与 PPO。 + +## 数学锚点 +隐空间过渡: +$$z_t \sim p_\theta(z_t \mid z_{t-1}, a_{t-1}),\quad h_t = f_\theta(h_{t-1}, z_{t-1}, a_{t-1}),\quad \hat o_t \sim p_\theta(o_t \mid h_t, z_t)$$ +想象 rollout 上的 actor-critic 目标: +$$V_\lambda(z_\tau) = (1-\lambda)\sum_{n=1}^{H-1} \lambda^{n-1} V_n(z_\tau) + \lambda^{H-1} V_H(z_\tau)$$ +其中 $V_n$ 是从想象 horizon $n$ 起的 $n$-step 回报估计。 + +## 架构与方法 +- **RSSM 世界模型**:把图像观测压缩到一个低维隐状态 $z_t$,并学习其确定性与随机性两路过渡。 +- **想象 rollout**:在 $z_t$ 上滚动若干步生成想象轨迹,actor 与 critic 全程在想象中学习。 +- **真实样本只用于学世界模型**:策略学习对真实样本预算的依赖被显著降低。 + +## 工程要点 +- 与无模型基线相比,Dreamer 在固定真实样本预算下回报曲线显著更高且方差更小。 +- 想象 horizon 过长时策略可能学到模型预测漏洞而非真实环境动力(见 `failure_mode:world_model_compounding_imagination_error`)。 +- 后续 DreamerV2 与 DreamerV3 在更大规模任务上把该范式推到更强。 + +## 已知失败边界 +- 视觉极端噪声或非平稳环境下 RSSM 
预测误差大,想象 rollout 反而损害策略。 +- 隐空间过小时世界模型无法表示足够丰富的动力学。 + +## Bitter-Lesson 视角 +Dreamer 把"在脑中想象后果"这一人类认知特征显式建模为可微分的隐空间过渡,并交给规模化训练去解决预测精度。它没有用任何人为设计的物理先验,整个 RSSM 从像素到回报全部由数据驱动。规模越大、隐空间越能容纳真实动力学,与 Bitter Lesson 关于"通用方法 + 算力"的判断完全一致。 + +## 配套实验 +[`labs/world_models/lab_dreamer_cartpole_pixels`](../../../labs/world_models/lab_dreamer_cartpole_pixels/) 在 CartPole 像素观测上复现 Dreamer 的 RSSM 与 latent imagination 训练流程。 diff --git a/docs/data/cards/paper_kumar2020_cql.md b/docs/data/cards/paper_kumar2020_cql.md new file mode 100644 index 0000000..2c90b2a --- /dev/null +++ b/docs/data/cards/paper_kumar2020_cql.md @@ -0,0 +1,57 @@ +--- +id: paper:kumar2020_cql +title: "CQL — Conservative Q-Learning for Offline Reinforcement Learning" +title_zh: "CQL:离线强化学习的保守 Q 学习" +kind: paper +tier: S +authors: [Kumar, A., Zhou, A., Tucker, G., Levine, S.] +venue: "NeurIPS 2020" +year: 2020 +topic: deep_rl +phase: core +prereqs: [paper:mnih2015_dqn, paper:schulman2017_ppo] +extends: [paper:fujimoto2019_bcq] +parallel: [] +contested_by: [] +labs: [lab_cql_offline_minigrid] +deep_links: + - {label: "PDF p.1 摘要", url: "https://arxiv.org/pdf/2006.04779#page=1"} + - {label: "PDF p.4 §3.2 保守损失推导", url: "https://arxiv.org/pdf/2006.04779#page=4"} + - {label: "PDF p.6 §4 D4RL 结果", url: "https://arxiv.org/pdf/2006.04779#page=6"} + - {label: "官方实现 (aviralkumar2907/CQL)", url: "https://github.com/aviralkumar2907/CQL"} +bibtex: | + @inproceedings{kumar2020conservative, + title = {Conservative Q-Learning for Offline Reinforcement Learning}, + author = {Kumar, Aviral and Zhou, Aurick and Tucker, George and Levine, Sergey}, + booktitle = {Advances in Neural Information Processing Systems}, + year = {2020} + } +--- + +## TL;DR +CQL 在 Bellman 损失上叠加一个对未见动作的下界惩罚项,使学到的 $Q$ 函数成为真实 $Q$ 函数的可证明下界,从而避免离线强化学习中分布外动作的 Q 值过估计。 + +## 数学锚点 +保守损失项: +$$\min_Q \alpha \cdot \mathbb{E}_{s\sim\mathcal{D}}\!\left[\log\sum_a \exp Q(s,a) - \mathbb{E}_{a\sim\hat\pi_\beta(\cdot\mid s)} Q(s,a)\right] + \tfrac12 
\mathbb{E}_{(s,a,s')}\!\left[\big(Q(s,a) - \mathcal{B}^{\hat\pi} Q(s,a)\big)^2\right]$$ +其中 $\alpha$ 控制保守强度,$\hat\pi_\beta$ 是行为策略的密度估计,$\mathcal{B}^{\hat\pi}$ 是 Bellman 算子。定理 3.1 保证 $\hat Q_\text{CQL}(s,a) \le Q^\pi(s,a)$ 在数据集分布上几乎处处成立。 + +## 架构与方法 +- **保守 Bellman 算子**:相对常规 Q 学习,每一步更新都把分布外动作的 Q 值显式压低。 +- **可调拉格朗日 $\alpha$**:CQL 的拉格朗日版本把 $\alpha$ 视为对偶变量、用对偶梯度下降自动调节,使保守项(策略动作与数据集动作的 Q 值之差)保持在预设阈值 $\tau$ 附近,而不是手工固定保守强度。 +- **可叠加到 SAC 或 DQN**:CQL 把保守项作为额外损失项加在原始 Q 损失之上,几乎不改变实现细节。 + +## 工程要点 +- 实现简单,几十行代码即可在现有 SAC / DQN 实现上加入。 +- 对数据集覆盖几何敏感:覆盖严重不足时策略会退化为模仿数据集均值行为。 +- 在 D4RL MuJoCo 与 Antmaze 上全面优于 BC 与离线 SAC,已成为离线强化学习的标准基线之一。 + +## 已知失败边界 +- 数据集存在显著动作覆盖空洞时保守惩罚导致策略陷入死锁(见 `failure_mode:offline_rl_extrapolation_error`)。 +- $\alpha$ 过大时策略退化为模仿数据集的平均行为。 + +## Bitter-Lesson 视角 +CQL 通过显式建模"分布外动作的不可信"实现了真正的离线策略改进,没有把这一负责任的悲观转嫁给手工设计的规则集。相对于更早期依赖人为白名单的离线方法,它把人工先验压缩到一个标量 $\alpha$,并把绝大部分工作交给数据驱动的下界估计。在算力允许大规模离线训练的世界里,这种最小先验、最大数据利用的设计正是 Bitter Lesson 在离线强化学习上的具体体现。 + +## 配套实验 +[`labs/rl_decision/lab_cql_offline_minigrid`](../../../labs/rl_decision/lab_cql_offline_minigrid/) 提供 CQL 与 BC / 离线 SAC 在 MiniGrid 上的并排训练与 Q 值过估计可视化。 diff --git a/docs/data/graph.json b/docs/data/graph.json index 073697d..0284c06 100644 --- a/docs/data/graph.json +++ b/docs/data/graph.json @@ -40,6 +40,9 @@ {"id": "paper:sutton_barto", "label": "Sutton & Barto RL", "label_zh": "Sutton & Barto《强化学习导论》", "kind": "paper", "tier": "A", "topic": "rl_foundations", "phase": "prereq", "year": 2018, "card": "paper_sutton_barto.md"}, {"id": "paper:rlhf_dpo", "label": "RLHF / DPO", "label_zh": "RLHF / DPO(人类偏好对齐)", "kind": "paper", "tier": "A", "topic": "deep_rl", "phase": "core", "year": 2023, "card": "paper_rlhf_dpo.md"}, {"id": "paper:world_models", "label": "World Models", "label_zh": "World Models(Ha & Schmidhuber)", "kind": "paper", "tier": "A", "topic": "deep_rl", "phase": "core", "year": 2018, "card": "paper_world_models.md"}, + {"id": "paper:kumar2020_cql", "label": "CQL", "label_zh": "CQL(保守 Q 学习)", "kind": "paper", "tier": "S", "topic": 
"deep_rl", "phase": "core", "year": 2020, "card": "paper_kumar2020_cql.md"}, + {"id": "paper:fujimoto2019_bcq", "label": "BCQ", "label_zh": "BCQ(行为约束离线 Q 学习)", "kind": "paper", "tier": "A", "topic": "deep_rl", "phase": "core", "year": 2019, "card": "paper_fujimoto2019_bcq.md"}, + {"id": "paper:hafner2020_dreamer", "label": "Dreamer", "label_zh": "Dreamer(隐空间想象 RL)", "kind": "paper", "tier": "S", "topic": "deep_rl", "phase": "frontier", "year": 2020, "card": "paper_hafner2020_dreamer.md"}, {"id": "paper:mamba", "label": "Mamba", "label_zh": "Mamba(状态空间模型)", "kind": "paper", "tier": "B", "topic": "math_foundations", "phase": "frontier", "year": 2023, "card": "paper_mamba.md"}, {"id": "paper:diffuser", "label": "Diffuser", "label_zh": "Diffuser / Decision Diffuser", "kind": "paper", "tier": "B", "topic": "deep_rl", "phase": "frontier", "year": 2022, "card": "paper_diffuser.md"}, @@ -127,6 +130,12 @@ {"source": "course:cs285", "target": "paper:ross2011_dagger", "rel": "covers"}, {"source": "course:cs285", "target": "paper:rlhf_dpo", "rel": "covers"}, {"source": "course:cs285", "target": "paper:world_models", "rel": "covers"}, + {"source": "course:cs285", "target": "paper:kumar2020_cql", "rel": "covers"}, + {"source": "course:cs285", "target": "paper:fujimoto2019_bcq", "rel": "covers"}, + {"source": "course:cs285", "target": "paper:hafner2020_dreamer", "rel": "covers"}, + {"source": "paper:mnih2015_dqn", "target": "paper:fujimoto2019_bcq", "rel": "prereq"}, + {"source": "paper:fujimoto2019_bcq", "target": "paper:kumar2020_cql", "rel": "extends"}, + {"source": "paper:world_models", "target": "paper:hafner2020_dreamer", "rel": "extends"}, {"source": "paper:sutton_barto", "target": "course:zhao_rl", "rel": "parallel"}, {"source": "paper:sutton_barto", "target": "essay:bitter_lesson", "rel": "parallel"}, diff --git a/docs/data/research/argument_chains.json b/docs/data/research/argument_chains.json new file mode 100644 index 0000000..2327043 --- /dev/null +++ 
b/docs/data/research/argument_chains.json @@ -0,0 +1,256 @@ +{ + "$schema": "./schema.json#/node_kinds/argument_chain", + "argument_chains": [ + { + "id": "chain:planning_oriented_query_sharing", + "title": "以规划为最终损失的可微查询共享是否真的为闭环安全带来好处", + "subject_papers": ["paper:2212.10156", "paper:2210.14222", "paper:vadv2"], + "research_gap": "现有端到端工作普遍在开环位移误差上比较,但开环误差对策略诱导分布偏移不敏感,且常常受到自车状态泄漏污染,使得已有结论无法被直接外推到闭环安全。", + "core_claim": "如果在评测协议中显式遮蔽自车状态并在统一的反应式闭环上比较,那么以规划为最终损失共享查询的方法仍然在多种场景下优于把感知与规划解耦的强基线;如果差距收敛甚至消失,则原有论点需要被显著修订。", + "method_mechanism": "在共享查询的端到端架构上分别关闭与开启自车状态输入并在两类闭环评测上做对照,再补充对查询数量与查询类型的细粒度消融来定位规划提升真正来自哪一组件。", + "key_experiments": [ + "在 nuPlan 与 Bench2Drive 上分别比较 UniAD 风格与 PlanT 风格的端到端模型,闭环碰撞率与路线完成度为主指标", + "在去 ego 状态条件下比较开环与闭环表现的相对秩", + "对查询子集做逐组消融,量化感知共享与规划共享的贡献" + ], + "strong_baselines": ["paper:2210.14222", "paper:transfuser", "paper:vadv2"], + "ablations": [ + "关闭跟踪查询与运动查询", + "用对象 token 替换密集 BEV 查询", + "把规划损失替换为独立的轨迹预测损失" + ], + "negative_results": [ + "若关闭自车状态后开环优势消失但闭环优势仍存在,说明问题在评测协议而非方法", + "若两类优势都消失,端到端必要性需要重新论证" + ], + "reviewer_attacks": [ + "BEV-Planner 与 Li 2024 的最尖锐挑战:UniAD 在 nuScenes 上的全部开环优势完全由 ego 状态条件注入造成,而非来自查询共享机制本身", + "评测协议被认为偏向闭环友好的方法", + "自车状态泄漏的修正是否过度", + "种子数量不足导致统计置信不够" + ], + "response_experiments": [ + "在显式关闭 ego 状态输入的条件下复现 UniAD 与 PlanT、VADv2 的分时段 L2_τ,并配合闭环碰撞率联合比较", + "提供基于至少八个种子加 bootstrap 95% 置信区间的统计声明", + "公开评测脚本与基准结果的最低单元", + "在多套闭环协议上同时报告" + ], + "figure_plan": [ + "图一展示两类协议下的相对秩翻转", + "图二展示查询消融的逐组成本收益", + "图三展示闭环失败的场景分布" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:highway_merge_at_speed_differential"], + "related_datasets": ["dataset:nuplan_planning", "dataset:bench2drive"], + "related_metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion"] + }, + { + "id": "chain:dual_system_for_long_tail", + "title": "快慢双系统语言驱动规划是否真正解决长尾而不牺牲实时性", + "subject_papers": ["paper:2402.12289", "paper:2311.10813", "paper:2309.16292"], + 
"research_gap": "语言驱动方法在论文里通常在静态长尾案例上展示成功,但缺乏在统一闭环上同时报告调用频率、延迟、成功率与失败模式的完整证据,难以判断慢系统是否真正带来净收益。", + "core_claim": "在受控的长尾闭环基准上,配合显式可学习门控的快慢双系统在保证延迟预算的前提下显著提升长尾片段成功率;若门控被去掉或语言模型被替换为不可控版本,这一收益会消失。", + "method_mechanism": "把语言模型作为可被门控触发的慢系统并把工具调用与记忆反思视为模块化能力,对门控、工具与记忆做正交消融,并以延迟与成功率的联合 Pareto 前沿作为评测对象。", + "key_experiments": [ + "在 nuScenes 叙事长尾子集上构造从开环到闭环的桥梁评测,作为视觉语言模型最熟悉的中间域", + "再在 Bench2Drive 与 CARLA Town05 Long 长尾片段上对比有无慢系统,作为跨域泛化检验", + "对门控阈值、记忆容量、工具集大小做正交扫描", + "在受限延迟预算下衡量净增益" + ], + "strong_baselines": ["paper:2212.10156", "paper:transfuser", "paper:vadv2"], + "ablations": [ + "去掉门控让慢系统始终触发", + "去掉记忆与反思", + "去掉工具调用层" + ], + "negative_results": [ + "若在延迟预算下成功率提升消失,则双系统在该平台不可部署", + "若失败模式集中在语言幻觉触发的违规行为,说明门控本身需要风险层" + ], + "reviewer_attacks": [ + "慢系统的真实延迟未被诚实公开", + "评测仅在静态长尾案例上做", + "工具集本身已经为评测特化" + ], + "response_experiments": [ + "公开端到端时间分布而不仅是均值", + "在动态闭环上同时报告快慢系统调用率", + "提供工具集的迁移评测" + ], + "figure_plan": [ + "图一显示延迟与成功率的 Pareto 前沿", + "图二显示门控触发率随长尾密度的变化", + "图三显示失败模式分布与归因" + ], + "related_scenarios": ["scenario:long_tail_rare_object_on_road", "scenario:construction_zone_with_cone_lane_shift", "scenario:dense_pedestrian_crosswalk_at_night"], + "related_datasets": ["dataset:bench2drive", "dataset:carla_town05_long"], + "related_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "chain:counterfactual_branches_as_safety_signal", + "title": "反事实分支能否成为视觉语言动作模型的可学习安全信号", + "subject_papers": ["paper:2512.24426", "paper:2402.12289"], + "research_gap": "现有视觉语言动作模型大多在主轨迹上训练与评测,没有把未发生但可能发生的状态作为显式的训练信号;社区缺乏可复现的反事实分支基准。", + "core_claim": "在训练与评测中显式把可生成的反事实分支作为监督信号,可以让模型在低频高风险场景下做出更安全的决策,并且这种收益对反事实分支的真实度具有可量化的敏感性。", + "method_mechanism": "用前向预测器生成多条反事实分支,并在分支上施加风险敏感损失,使模型在主轨迹与分支上的策略联合最优;评测上构建公开的反事实分支测试集并报告分支成功率。", + "key_experiments": [ + "在 Bench2Drive 与 NAVSIM 上比较有无反事实损失的策略", + "对反事实损失权重做阶梯式扫描(0.1 / 0.3 / 1 / 3 / 10)并报告每个权重下的主轨迹与反事实分支性能", + "对反事实分支真实度做阶梯式扰动并量化收益变化", + "在公开反事实评测集上汇报分支成功率与主轨迹性能" + 
], + "strong_baselines": ["paper:2402.12289", "paper:vadv2", "paper:2212.10156"], + "ablations": [ + "去掉反事实损失", + "替换为均匀采样的扰动而非语义反事实", + "限制反事实分支数量" + ], + "negative_results": [ + "若反事实损失导致主轨迹性能显著退化,说明权重设计需要重做", + "若分支真实度低于阈值时收益反向,需要建立分支质量门槛" + ], + "reviewer_attacks": [ + "反事实分支生成器自身的偏差污染训练", + "反事实评测与真实事故分布的相关性未知", + "对主轨迹的影响未充分量化" + ], + "response_experiments": [ + "公开反事实分支生成器的失败率分布", + "提供主轨迹基线的多种子置信区间", + "在反事实评测与真实事故子集上联合比较" + ], + "figure_plan": [ + "图一显示反事实评测集的构建流水线", + "图二显示反事实损失权重的成本收益曲线", + "图三显示主轨迹与反事实分支的联合分布" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:long_tail_rare_object_on_road"], + "related_datasets": ["dataset:bench2drive", "dataset:navsim_planning"], + "related_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"] + }, + { + "id": "chain:offline_rl_versus_imitation_under_distribution_shift", + "title": "离线强化学习与模仿学习在策略诱导分布漂移下谁更稳健", + "subject_papers": ["paper:ross2011_dagger", "paper:kumar2020_cql", "paper:fujimoto2019_bcq"], + "research_gap": "自动驾驶社区在模仿学习与离线强化学习之间长期争论:模仿学习简单可控但有协变量偏移问题,离线强化学习理论上能利用代价信号但易受数据集覆盖空洞影响。缺少同时控制数据集与评测协议的公平对比。", + "core_claim": "在统一数据集与统一闭环评测下,配备保守惩罚的离线强化学习在分布漂移程度可控的子集上稳健性显著优于纯模仿学习;但当数据集覆盖空洞超过阈值时离线强化学习的优势消失,模仿学习反而更稳定。", + "method_mechanism": "用数据集覆盖几何作为自变量,统一训练 BC、DAgger、CQL 与 BCQ 等代表方法,分别在分布内与分布漂移闭环上比较碰撞与回报。", + "key_experiments": [ + "在 lab_cql_offline_minigrid 上构造覆盖几何阶梯并比较多方法", + "在 nuPlan 闭环上比较 BC 与 CQL 风格离线策略", + "用受控数据集移除特定动作分支以制造覆盖空洞" + ], + "strong_baselines": ["paper:ross2011_dagger", "paper:kumar2020_cql", "paper:fujimoto2019_bcq"], + "ablations": [ + "去掉 CQL 的保守惩罚", + "把 DAgger 换为纯 BC", + "在数据集覆盖几何中插入空洞" + ], + "negative_results": [ + "若两类方法在分布漂移下都崩溃,说明分布漂移阈值需要重新定义", + "若 BC 在统一闭环上反而稳健,需要重审离线 RL 的部署假设" + ], + "reviewer_attacks": [ + "D4RL 上 CQL 强于 BC 的差距在多数子集上可能来源于归一化与超参选择而非保守惩罚本身(Fu 等关于 D4RL 的后续讨论)", + "数据集覆盖几何如何被客观量化", + "保守惩罚的强度调度是否对结论敏感", + "评测脚本与种子数是否足以支持统计声明" + ], + "response_experiments": [ + "公开数据集覆盖几何的度量脚本", + "对保守惩罚强度做阶梯扫描并报告", + 
"至少 8 个独立种子并报告 bootstrap 95% 置信区间" + ], + "figure_plan": [ + "图一展示覆盖几何与方法相对秩的关系", + "图二展示保守惩罚强度扫描下的性能曲线", + "图三展示分布漂移闭环下方法的失败分布" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:long_tail_rare_object_on_road"], + "related_datasets": ["dataset:nuplan_planning", "dataset:carla_town05_long"], + "related_metrics": ["metric:closed_loop_collision_rate", "metric:route_completion", "metric:long_tail_success_rate"] + }, + { + "id": "chain:safety_constraint_layering_for_end_to_end_planning", + "title": "在端到端规划上叠加显式安全约束层的代价与收益", + "subject_papers": ["paper:2212.10156", "paper:vadv2"], + "research_gap": "端到端规划在主分布上不断改进,但社区缺乏对显式安全约束层(如基于责任敏感安全的最小让行距离)在闭环上的系统评估,难以判断额外的约束层是真正提高安全还是仅是性能下降的代价。nuPlan PDM 等基于规则的强基线提供了对照锚点。", + "core_claim": "在统一闭环评测下叠加可计算的安全约束层可以让事先标注的安全关键事件碰撞率相对于无约束基线降至少 2 倍,同时主分布闭环 Driving Score 的相对下降处于 nuPlan 官方分数容忍区间之内;若超出该区间则需要重新设计约束的物理参数。", + "method_mechanism": "把端到端规划输出与显式安全验证器组合,形成软约束与硬约束两种实现,比较其在不同场景上的代价收益曲线。", + "key_experiments": [ + "在 nuPlan 与 Bench2Drive 上比较有无安全约束层的端到端方法", + "对约束阈值做阶梯扫描,观察主分布与长尾子集的代价收益", + "在受控扰动下评估约束层是否引入震荡或死锁" + ], + "strong_baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser", "control:nuplan_pdm_rule_based"], + "ablations": [ + "去掉硬约束保留软约束", + "去掉显式安全验证器", + "替换为基于学习的安全分类器" + ], + "negative_results": [ + "若长尾子集碰撞率下降未达主分布 Driving Score 损失的 2 倍以上,说明代价收益曲线不成立", + "若约束层造成新的死锁失败模式(route completion 下降幅度大于主分布闭环 Driving Score 下降幅度),需要把延迟成本写入损失函数" + ], + "reviewer_attacks": [ + "安全验证器自身的可靠性如何保证", + "约束阈值是否对场景分布过度敏感", + "在跨城市部署时约束物理参数是否需要重新校准" + ], + "response_experiments": [ + "公开安全验证器的失败分析", + "对约束阈值做阶梯扫描并报告", + "在多城市数据上验证物理参数的迁移性" + ], + "figure_plan": [ + "图一展示约束阈值与主分布性能的关系", + "图二展示长尾子集上的收益曲线", + "图三展示安全约束引入的新失败模式" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:school_zone_speed_compliance", "scenario:emergency_vehicle_yield"], + "related_datasets": ["dataset:nuplan_planning", "dataset:bench2drive", "dataset:navsim_planning"], + 
"related_metrics": ["metric:closed_loop_collision_rate", "metric:rule_compliance_score", "metric:ride_comfort_index"] + }, + { + "id": "chain:closed_loop_eval_protocol_audit", + "title": "面向端到端自动驾驶的可审计闭环评测协议", + "subject_papers": ["paper:2212.10156", "paper:vadv2", "paper:2210.14222"], + "research_gap": "已有闭环基准在自车状态泄漏、种子数、感知集成与他车反应模型的处理上各自不同,导致跨论文比较失去意义。", + "core_claim": "可以构造一份能自动检测自车状态泄漏、强制最少种子数、强制公开评测脚本并要求同时报告开环与闭环指标的协议;在该协议下,目前公开的多数端到端方法的相对秩与原报告差距显著。", + "method_mechanism": "把评测协议改造成一组自动化检查、统一脚本与最小数据集组合,并在主流方法上同时跑两类协议以量化差距。", + "key_experiments": [ + "在 nuPlan、NAVSIM 与 Bench2Drive 上以新协议复跑公开方法", + "在带与不带 ego 状态泄漏的两组协议上比较相对秩", + "对种子数与他车反应模型做敏感性分析" + ], + "strong_baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser", "paper:2210.14222"], + "ablations": [ + "只用单一种子", + "禁止公开评测脚本", + "允许 ego 状态泄漏" + ], + "negative_results": [ + "若新协议下所有方法都接近基线,说明端到端的真实进展被显著高估", + "若新协议下各种方法的相对秩与原协议完全一致,需要回到协议本身审计是否实际生效,例如 ego 状态泄漏检测的覆盖率是否被高估" + ], + "reviewer_attacks": [ + "协议是否过度收紧以致没有方法能通过", + "对感知集成的处理是否公平", + "他车反应模型的选择是否偏向特定方法" + ], + "response_experiments": [ + "提供逐步放宽的协议变体", + "在不同感知集成等级上分层比较", + "在多种他车模型下复现" + ], + "figure_plan": [ + "图一展示新旧协议下的相对秩翻转", + "图二展示协议各条款对方法分布的影响", + "图三展示评测自动化检查的执行示例" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:highway_merge_at_speed_differential"], + "related_datasets": ["dataset:nuplan_planning", "dataset:navsim_planning", "dataset:bench2drive"], + "related_metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion", "metric:rule_compliance_score"] + } + ] +} diff --git a/docs/data/research/claims.json b/docs/data/research/claims.json new file mode 100644 index 0000000..71cf8e6 --- /dev/null +++ b/docs/data/research/claims.json @@ -0,0 +1,418 @@ +{ + "$schema": "./schema.json#/node_kinds/claim", + "claims": [ + { + "id": "claim:uniad_query_sharing_lowers_planning_l2", + "subject": "paper:2212.10156", + "statement": "在共享一组可微 BEV 
查询的端到端架构下,把检测、跟踪、地图、运动与占用模块的梯度共同导向规划目标,可以让规划在专家分布上的分时段开环位移误差(1 秒、2 秒、3 秒)相对模块化基线分别取得至少 10% 的相对降低。", + "evidence": [ + {"kind": "table", "source": "UniAD CVPR 2023 论文表 4", "finding": "在 nuScenes 验证集上 1 秒、2 秒、3 秒平均位移误差均优于使用分离式预测与规划的 ST-P3 等基线"}, + {"kind": "ablation", "source": "UniAD CVPR 2023 论文表 6", "finding": "去掉 MotionFormer 与 OccFormer 各自带来开环位移误差与碰撞率的可测退化"}, + {"kind": "repro", "source": "OpenDriveLab/UniAD 复现脚本", "finding": "公开权重在 nuScenes 上可复现报告数字"} + ], + "preconditions": [ + "训练与测试使用相同的 nuScenes 完整验证集而非 mini 子集", + "感知模块由 BEVFormer 风格的视觉骨干提供", + "评测使用 ego 轨迹回归损失", + "ego 状态是否进入模型输入需要被显式声明" + ], + "counterexamples": [ + "若在测试时关闭 UniAD 自身的 ego 速度、加速度与横摆率输入,BEV-Planner 与 Li 2024 的复现表明 UniAD 在 nuScenes 上的开环位移优势会显著收敛,1 秒分时段甚至完全消失", + "在跨城市分布或不同传感器配置下,共享查询的收益会被分布漂移抵消,目前没有跨数据集的稳健证据" + ], + "boundaries": [ + "结论限定在开环位移误差与同源分布", + "未直接外推到大幅不同的传感器配置或国家", + "对极端长尾事件没有直接证据" + ], + "reproduction": { + "minimal": "lab03 在合成 2D 场景上对比共用查询与独立查询的分时段轨迹误差,使用至少 8 个独立种子", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 6, + "expected_output": "共用查询的分时段 L2_τ 在合成场景上对 τ ∈ {1s, 2s, 3s} 至少有一段取得不小于 10% 的相对降低,且独立查询基线置信区间不重叠" + }, + "publication_value": "系统改进 + 机制解释", + "dispute_level": 1, + "evidence_strength": 3, + "reproducibility_status": "verified", + "related_claims": ["claim:uniad_open_loop_not_safety", "claim:plant_object_token_sufficient_for_planning"], + "related_failure_modes": ["failure_mode:ego_status_leakage"] + }, + { + "id": "claim:uniad_open_loop_not_safety", + "subject": "paper:2212.10156", + "statement": "UniAD 在 nuScenes 上的开环优势不能直接外推为闭环安全提升,因为开环位移误差对策略诱导的状态分布漂移不敏感。", + "evidence": [ + {"kind": "external_benchmark", "source": "BEV-Planner 复现报告", "finding": "在控制 ego 状态泄漏后开环差距大幅缩小"}, + {"kind": "theorem", "source": "Ross et al. 
2011 DAgger 误差累积论证", "finding": "策略诱导分布偏移导致开环误差对真实风险存在系统性偏差"} + ], + "preconditions": [ + "评测在 nuScenes 上以专家轨迹为参考", + "未对策略做闭环回放或反应式仿真" + ], + "counterexamples": [ + "若能证明 UniAD 的策略对闭环漂移具有显式不变性" + ], + "boundaries": [ + "限于纯 nuScenes 开环评测语境", + "不否定 UniAD 在感知与轨迹回归层的实质贡献" + ], + "reproduction": { + "minimal": "在 nuScenes 上比较 UniAD 与简单速度保持基线在显式遮蔽 ego 状态后的差距", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 3, + "expected_output": "差距大幅收敛,部分子集上甚至消失" + }, + "publication_value": "失败模式发现 + 评测协议改进", + "dispute_level": 2, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:uniad_query_sharing_lowers_planning_l2"], + "related_failure_modes": ["failure_mode:ego_status_leakage", "failure_mode:occlusion_blind_spot_overconfidence"] + }, + { + "id": "claim:plant_object_token_sufficient_for_planning", + "subject": "paper:2210.14222", + "statement": "在城市仿真闭环中只用一组对象级 token 而舍弃稠密 BEV 特征,仍然可以达到与 BEV 端到端方法相当甚至更优的驾驶分数,说明对人类驾驶决策必要的信息可以在对象级表示中被压缩保留。", + "evidence": [ + {"kind": "table", "source": "PlanT NeurIPS 2022 论文表 1 与表 2", "finding": "在 LAV 与 Longest6 基准上 PlanT 的驾驶分数与 TransFuser 等稠密 BEV 方法持平或更高"}, + {"kind": "ablation", "source": "PlanT 论文表 4", "finding": "在仅保留近邻若干 agent token 时性能下降可控,远端 token 收益边际递减"} + ], + "preconditions": [ + "上游对象检测与跟踪足够可靠", + "评测限于 CARLA 城市场景" + ], + "counterexamples": [ + "在依赖密集占用信息的施工或低矮障碍场景下对象级 token 不足以表示安全相关几何", + "对象 token 的真实感来自感知质量,感知误差会破坏对象抽象" + ], + "boundaries": [ + "结论限于规则化对象表示充分的城市闭环", + "未直接外推到高速或复杂博弈场景" + ], + "reproduction": { + "minimal": "lab04 在对象级合成场景上对比 token 数量与规划误差", + "public_data": "dataset:carla_town05_long", + "cost_hours": 8, + "expected_output": "在足够数量 token 下规划误差接近稠密基线,少量 token 即可保留主要规划信号" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 1, + "evidence_strength": 2, + "reproducibility_status": "verified", + "related_claims": ["claim:uniad_query_sharing_lowers_planning_l2"], + "related_failure_modes": ["failure_mode:long_tail_object_recognition_miss"] + }, + { + 
"id": "claim:drivevlm_dual_recovers_long_tail_without_killing_latency", + "subject": "paper:2402.12289", + "statement": "把视觉语言模型作为慢系统并与传统快规划器组成双管线,可以在分布外目标与少见语义场景上恢复显著的规划成功率,同时通过门控仅在必要时调用慢系统而保持闭环延迟可控。", + "evidence": [ + {"kind": "table", "source": "DriveVLM 论文表 3", "finding": "在长尾片段上的成功率显著高于纯端到端基线"}, + {"kind": "ablation", "source": "DriveVLM 论文表 5", "finding": "去掉慢系统的双管线退化为基线性能;去掉快系统则延迟无法支撑闭环"} + ], + "preconditions": [ + "慢系统的调用门控由预设规则或可学习信号控制,且门控统计在评测中被显式公开", + "慢系统在被调用时延迟可被快系统吸收,单帧最坏延迟在协议预算内" + ], + "counterexamples": [ + "在持续高密度长尾的场景下门控可能频繁触发慢系统并耗尽延迟预算", + "慢系统的语言输出与运动控制的对齐失败会造成不可解释的接管" + ], + "boundaries": [ + "结论限于配备相机与短期回放的车端环境", + "对非视觉模态退化不直接保证", + "对慢系统语言幻觉的失败模式没有系统覆盖" + ], + "reproduction": { + "minimal": "lab09 用 Mock 后端模拟双管线门控与延迟预算", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 4, + "expected_output": "在长尾片段子集上门控触发率与成功率均显著提升,单帧延迟不超过阈值" + }, + "publication_value": "系统改进 + 失败模式发现", + "dispute_level": 1, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:agent_driver_tool_use_reduces_planner_dead_ends", "claim:dilu_memory_reflection_reduces_long_tail_failures"], + "related_failure_modes": ["failure_mode:long_tail_object_recognition_miss", "failure_mode:ride_comfort_violation_due_to_late_braking"] + }, + { + "id": "claim:agent_driver_tool_use_reduces_planner_dead_ends", + "subject": "paper:2311.10813", + "statement": "把规划器封装成被语言模型调用的工具集合,让语言模型选择是否查询地图、轨迹预测或风险评估,可以在事先标注的罕见冲突子集上把碰撞率与路线未完成率联合下降到端到端规划基线之下。", + "evidence": [ + {"kind": "ablation", "source": "Agent-Driver 论文表 2", "finding": "去掉工具调用层在罕见冲突子集上的碰撞率与路线未完成率联合上升"}, + {"kind": "table", "source": "Agent-Driver 论文表 4", "finding": "在分布外场景上的通过率超过仅由端到端规划器输出的基线"} + ], + "preconditions": [ + "工具接口稳定且文档化", + "语言模型推理延迟在可接受范围内" + ], + "counterexamples": [ + "工具调用链路本身可能引入新的脆弱失败", + "语言模型的工具选择可能在压力下退化为始终调用同一工具" + ], + "boundaries": [ + "结论限于具备地图查询与预测工具的实验环境", + "未直接量化对乘员舒适度的影响" + ], + "reproduction": { + "minimal": "lab08 用 Mock 
工具集模拟工具调用并在罕见冲突场景脚本上比较有无工具层的碰撞率与路线未完成率", + "public_data": "dataset:carla_town05_long", + "cost_hours": 5, + "expected_output": "在事先标注的罕见冲突子集上碰撞率与路线未完成率联合显著下降且不显著恶化主分布舒适度" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 2, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:drivevlm_dual_recovers_long_tail_without_killing_latency", "claim:dilu_memory_reflection_reduces_long_tail_failures"], + "related_failure_modes": ["failure_mode:closed_loop_deadlock_under_uncertainty", "failure_mode:multi_agent_interaction_indecision"] + }, + { + "id": "claim:dilu_memory_reflection_reduces_long_tail_failures", + "subject": "paper:2309.16292", + "statement": "在语言模型驱动的决策循环中加入显式经验记忆与反思机制,使得模型可以从过往失败的相似场景中检索教训,从而在长尾场景上的决策错误率显著低于无记忆的同类语言模型基线。", + "evidence": [ + {"kind": "table", "source": "DiLu 论文表 2", "finding": "带记忆与反思的循环在 GPT-3.5 与 GPT-4 上均显著优于无记忆基线"}, + {"kind": "ablation", "source": "DiLu 论文表 4", "finding": "去除反思模块后失败率明显回升"} + ], + "preconditions": [ + "记忆库覆盖足够多样的历史失败", + "检索接口可以匹配新场景到相关经验" + ], + "counterexamples": [ + "记忆库被污染或包含错误经验时反思会放大错误", + "记忆检索延迟可能超过实时决策预算" + ], + "boundaries": [ + "结论限于决策频率较低的语言驱动决策循环", + "对端到端连续控制不直接适用", + "对极少见且无相似历史的场景效果有限" + ], + "reproduction": { + "minimal": "lab07 用 Mock 决策后端做端到端流水线测试(Mock 返回确定性回答以隔离记忆机制),并在小规模 GPT-3.5 API 上做对照以确认非平凡的语义信号;Mock 与真实两路必须分别报告", + "public_data": "dataset:carla_town05_long", + "cost_hours": 3, + "expected_output": "Mock 版本上记忆与反思机制带来的错误率下降与真实 API 上的趋势同向,若 Mock 上观察到信号但真实 API 上不存在,应视为记忆机制无关而是流水线伪相关" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 2, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:drivevlm_dual_recovers_long_tail_without_killing_latency", "claim:agent_driver_tool_use_reduces_planner_dead_ends"], + "related_failure_modes": ["failure_mode:long_tail_object_recognition_miss"] + }, + { + "id": "claim:cfvla_counterfactual_branches_close_evaluation_gap", + "subject": "paper:2512.24426", + "statement": 
"把反事实分支显式引入视觉语言动作模型的训练与评测,可以在保留主轨迹规划能力的同时显著提升对低频高风险场景的应对成功率,因为反事实分支强迫模型对未发生但可能发生的状态做出明确决策。", + "evidence": [ + {"kind": "table", "source": "CF-VLA 论文表 2", "finding": "在反事实分支评测集上成功率超过无反事实训练的基线"}, + {"kind": "ablation", "source": "CF-VLA 论文表 5", "finding": "去除反事实分支损失后长尾片段成功率回落"} + ], + "preconditions": [ + "反事实分支生成器具备一定真实感", + "评测协议公开反事实分支并允许复现" + ], + "counterexamples": [ + "如果反事实分支与真实分布偏差过大,训练可能过度悲观", + "反事实评测可能与现实事故分布不一致" + ], + "boundaries": [ + "结论限于具备生成反事实能力的训练流水线", + "对未给出反事实分支的真实事故没有直接保证" + ], + "reproduction": { + "minimal": "lab10 在合成反事实分支上对比有无反事实损失的策略", + "public_data": "dataset:bench2drive", + "cost_hours": 12, + "expected_output": "带反事实损失的策略在反事实评测子集上的成功率显著更高且对主轨迹影响有限" + }, + "publication_value": "机制解释 + 基准构建", + "dispute_level": 2, + "evidence_strength": 1, + "reproducibility_status": "inferred", + "related_claims": ["claim:drivevlm_dual_recovers_long_tail_without_killing_latency"], + "related_failure_modes": ["failure_mode:occlusion_blind_spot_overconfidence", "failure_mode:long_tail_object_recognition_miss"] + }, + { + "id": "claim:ppo_clipped_surrogate_stabilizes_policy_gradient", + "subject": "paper:schulman2017_ppo", + "statement": "PPO 通过对策略概率比施加上下截断的代理目标,可以在重要性比远离 1 时阻止过大的策略更新,从而在多个连续控制基准上取得稳定的策略改进且不需要复杂的二阶信任域优化。", + "evidence": [ + {"kind": "table", "source": "Schulman et al. 
2017 PPO 论文表 1 与图 3(MuJoCo 连续控制)", "finding": "PPO 与 TRPO 在 MuJoCo 各任务上的平均回报曲线接近,但 PPO 实现简单且超参更少"}, + {"kind": "ablation", "source": "PPO 论文图 2 与图 5(裁剪与 KL 惩罚的对比)", "finding": "去掉截断退化为朴素策略梯度,训练曲线出现剧烈震荡或崩溃"}, + {"kind": "repro", "source": "labs/rl_decision/lab_dqn_ppo_sac_cartpole 复现实验", "finding": "在 CartPole 离散与连续动作版本上 PPO 的收敛比 DQN 平稳"} + ], + "preconditions": [ + "环境奖励信号足够密集", + "策略与价值网络共享或拆分都做了独立调参", + "截断阈值 ε 默认取 0.2,论文示例区间在 0.1 至 0.3 之间需根据任务尺度微调" + ], + "counterexamples": [ + "在稀疏奖励或长时延任务上单一截断阈值难以同时兼顾探索与稳定", + "在动作维度高的机器人控制问题上 PPO 仍可能落入次优盆地,需要熵正则或学习率自适应" + ], + "boundaries": [ + "结论限于单 agent 在线策略学习", + "对离线数据回放与多 agent 博弈不直接适用" + ], + "reproduction": { + "minimal": "在 lab_dqn_ppo_sac_cartpole 上对比有无截断的 PPO 训练曲线,至少 5 个种子", + "public_data": "dataset:rl_classic_control_suite", + "cost_hours": 1, + "expected_output": "截断版本平均回报曲线在收敛尾段方差显著低于无截断基线" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 1, + "evidence_strength": 3, + "reproducibility_status": "verified", + "related_claims": ["claim:offline_rl_conservatism_avoids_q_overestimation"], + "related_failure_modes": ["failure_mode:on_policy_rl_sample_inefficiency_for_safety_critical_events"] + }, + { + "id": "claim:offline_rl_conservatism_avoids_q_overestimation", + "subject": "paper:kumar2020_cql", + "statement": "在离线数据集上训练价值函数时显式对未见动作施加 CQL 风格的下界惩罚,可以避免分布外动作的 Q 值过估计,使离线策略在不与环境进一步交互的前提下逼近行为策略上界。", + "evidence": [ + {"kind": "theorem", "source": "Kumar et al. 2020 CQL 论文定理 3.1", "finding": "在数据集分布上学到的 Q 函数是真实 Q 函数的下界"}, + {"kind": "table", "source": "Kumar et al. 
2020 表 1 与表 2", "finding": "在 D4RL MuJoCo 基准上 CQL 全面优于 BC 与无保守惩罚的离线 SAC"}, + {"kind": "repro", "source": "labs/rl_decision/lab_cql_offline_minigrid 复现实验", "finding": "在 MiniGrid 上 CQL 的 Q 值过估计随训练步数显著低于离线 SAC"} + ], + "preconditions": [ + "数据集对相关 (s, a) 对的覆盖足够支撑 CQL 定理 3.1 所需的下界估计", + "behavior policy 的密度估计足够准确,至少在数据集分布上可被合理近似", + "α 调度合理或采用 CQL 的拉格朗日自调", + "策略评估在与训练同分布上进行" + ], + "counterexamples": [ + "数据集对关键安全分支严重不覆盖时保守惩罚导致策略陷入死锁", + "α 过大时策略退化为模仿数据集的平均行为" + ], + "boundaries": [ + "结论限于具备充分覆盖的离线数据集", + "对在线探索阶段不适用", + "对分布外动作的代价没有显式建模时仍可能失败" + ], + "reproduction": { + "minimal": "在 lab_cql_offline_minigrid 上对比 CQL 与无保守惩罚的离线 SAC,统计 Q 过估计与策略回报", + "public_data": "dataset:rl_classic_control_suite", + "cost_hours": 2, + "expected_output": "CQL 在保守策略上回报更高且 Q 值过估计显著较低,差距在多种子上稳定" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 1, + "evidence_strength": 3, + "reproducibility_status": "verified", + "related_claims": ["claim:ppo_clipped_surrogate_stabilizes_policy_gradient", "claim:world_model_imagination_reduces_sample_complexity"], + "related_failure_modes": ["failure_mode:offline_rl_extrapolation_error"] + }, + { + "id": "claim:world_model_imagination_reduces_sample_complexity", + "subject": "paper:hafner2020_dreamer", + "statement": "学习一个能在隐空间预测下一步状态的世界模型,并让策略在该隐空间里做想象式 rollout,可以让控制任务在同样的真实环境样本预算下取得显著更高的最终回报,因为策略实际看到的有效经验包括了想象生成的额外样本。", + "evidence": [ + {"kind": "table", "source": "Hafner et al. 
2020 Dreamer 论文表 1", "finding": "在 DM Control 与 Atari 上 Dreamer 在样本数较少时显著优于无世界模型的 SAC 与 PPO"}, + {"kind": "ablation", "source": "Dreamer 论文表 4", "finding": "关闭隐空间想象的纯环境 rollout 版本回报塌缩"}, + {"kind": "repro", "source": "labs/world_models/lab_dreamer_cartpole_pixels 复现实验", "finding": "在 CartPole 像素观测下 Dreamer 在固定真实步数预算上回报更高"} + ], + "preconditions": [ + "世界模型对训练分布上的状态过渡有良好预测", + "想象 horizon 选择合理,过长会引入复合误差", + "策略在想象与真实数据上做平衡训练" + ], + "counterexamples": [ + "世界模型预测误差较大的任务(例如非平稳环境或视觉极端噪声)想象式 rollout 反而损害策略", + "想象 horizon 过长导致策略学到模型偏差而非环境真实动力" + ], + "boundaries": [ + "结论限于具有强结构的连续控制任务", + "对真实自动驾驶闭环的迁移仍受限于驾驶世界模型的真实感" + ], + "reproduction": { + "minimal": "在 lab_dreamer_cartpole_pixels 上对比有无想象 rollout 的策略学习曲线,至少 5 个种子", + "public_data": "dataset:rl_classic_control_suite", + "cost_hours": 3, + "expected_output": "想象版本在相同真实步数预算下回报显著更高,且曲线在多种子上方差更小" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 1, + "evidence_strength": 3, + "reproducibility_status": "verified", + "related_claims": ["claim:offline_rl_conservatism_avoids_q_overestimation", "claim:bitter_lesson_handcrafted_priors_decline_under_scale"], + "related_failure_modes": ["failure_mode:world_model_compounding_imagination_error"] + }, + { + "id": "claim:bitter_lesson_handcrafted_priors_decline_under_scale", + "subject": "essay:bitter_lesson", + "statement": "当数据与算力规模在某个固定基准上从 X 增长到 10X 量级时,依赖大量人工设计先验的方法相对结构简单但充分利用规模的端到端方法的领先优势会单调收窄,且在自动驾驶的 nuScenes / Bench2Drive / nuPlan 主分布指标上至少出现一次相对秩翻转。", + "evidence": [ + {"kind": "external_benchmark", "source": "ImageNet ILSVRC 历年冠军模型对比 (Krizhevsky 2012 / He 2015 / Tan 2019)", "finding": "数据规模超过百万级、训练算力提升 10 倍以上后,端到端深度网络在 top-5 错误率上从 25% 降至 2% 以下,手工特征流水线退出领先位置"}, + {"kind": "external_benchmark", "source": "AlphaGo (2016) vs AlphaGo Zero (2017) 训练对比", "finding": "去掉人工开局库与人类对局数据,纯自对弈在相同算力下最终强度更高,对应 ELO 提升超过 1000"}, + {"kind": "external_benchmark", "source": "nuScenes / nuPlan 上 UniAD / VADv2 与模块化基线 2022 至 2024 年的逐次相对秩演化", "finding": 
"在端到端方法可用训练算力翻倍后的两年内,模块化基线在多个主分布指标上被赶上或超越"} + ], + "preconditions": [ + "数据与算力规模足以支撑端到端方法的样本复杂度", + "评测协议不被人工先验隐性偏好", + "工程基础设施允许大规模训练" + ], + "counterexamples": [ + "在数据匮乏或安全关键场景下硬编码约束在短期内仍可能优于纯端到端方法", + "若人工先验编码了真实物理不变量则可能不被规模替代", + "在医学影像等长尾且高风险任务上仍未被端到端完全取代" + ], + "boundaries": [ + "断言关于长期趋势而非每个评测点都成立", + "对仍由人工先验主导的安全层未必直接适用", + "趋势依赖工程基础设施持续投入" + ], + "reproduction": { + "minimal": "在 lab02 与 lab03 上以 25% / 50% / 100% nuScenes 数据规模分别训练端到端与模块化基线,画出闭环碰撞率随数据规模的变化曲线", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 12, + "expected_output": "数据规模翻倍至少一次时端到端相对模块化的闭环碰撞率差距在多种子均值上单调收窄至少 20%" + }, + "publication_value": "机制解释", + "dispute_level": 3, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:plant_object_token_sufficient_for_planning", "claim:uniad_query_sharing_lowers_planning_l2"], + "related_failure_modes": ["failure_mode:scaling_bet_failure_on_safety_critical_long_tail"] + }, + { + "id": "claim:vadv2_probabilistic_planning_covers_multimodality", + "subject": "paper:vadv2", + "statement": "向量化端到端架构在规划头上引入概率分布预测可以显式建模专家行为的多模态性,从而在多车交互与未保护转向场景中比单一回归轨迹基线更不容易陷入平均化失败。", + "evidence": [ + {"kind": "table", "source": "VADv2 论文表 1 与表 3", "finding": "在 nuScenes 与 CARLA 上 minADE 与碰撞率均优于回归型 VAD 基线"}, + {"kind": "ablation", "source": "VADv2 论文表 5", "finding": "去掉概率头退化为回归基线"} + ], + "preconditions": [ + "训练数据覆盖足够多模式行为", + "概率头训练采用合理的负样本与温度" + ], + "counterexamples": [ + "在极端低数据子集上概率头可能塌缩为单峰", + "对完全未见过的多模态分布仍可能选择错误模式" + ], + "boundaries": [ + "结论限于矢量化感知输入与有限规划视野", + "未直接外推到非结构化场景" + ], + "reproduction": { + "minimal": "在 nuScenes mini 上对比回归头与概率头的多模态覆盖", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 6, + "expected_output": "概率头在选定多模态子集上模式覆盖与碰撞率均优于回归头" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 1, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:plant_object_token_sufficient_for_planning"], + "related_failure_modes": 
["failure_mode:multi_agent_interaction_indecision"] + } + ] +} diff --git a/docs/data/research/cross_review_round1.md b/docs/data/research/cross_review_round1.md new file mode 100644 index 0000000..32d7fd7 --- /dev/null +++ b/docs/data/research/cross_review_round1.md @@ -0,0 +1,89 @@ +# 第一轮研究层交叉审查报告 + +> 由独立审查代理对 `docs/data/research/` 全部七个结构化文件进行的逐节点审查。所有引用都指向具体节点 id 与字段。本报告作为研究资产保留,便于追溯每一处修订。 + +## 审查范围 + +`claims.json` · `argument_chains.json` · `scenarios.json` · `datasets.json` · `metrics.json` · `failure_modes.json` · `experiment_plans.json` · `schema.json` + +## 标签约定 + +- `[fix]`:发表前必须修订 +- `[strengthen]`:可强化以提升严谨度 +- `[ok]`:通过审查 + +--- + +## claims.json — 可证伪性 + +- `[fix]` **claim:uniad_query_sharing_lowers_planning_l2**:声明使用"显著低于"但未给出数值阈值;`expected_output` 提到 mini 子集(约 80 个序列)远不足以稳定比较两种规划头。 +- `[fix]` **claim:uniad_query_sharing_lowers_planning_l2**:当前反例"若引入 ego 状态泄漏简单基线也能接近 UniAD"误把威胁方向写反;BEV-Planner / Li 2024 的真正反例是 *UniAD 本身* 关闭 ego 状态后开环优势消失。 +- `[fix]` **claim:cfvla_counterfactual_branches_close_evaluation_gap**:引用"CF-VLA 论文表 2 / 表 5"缺少 venue 与 arXiv 参照,`evidence_strength=1`、`reproducibility_status=inferred` 当前不应被其它主张作为同等可靠的证据引用。 +- `[fix]` **claim:vadv2_probabilistic_planning_covers_multimodality**:主体 id `paper:vadv2` 不使用 arXiv 形式,与其它论文 id 风格不一致;"在 nuScenes 与 CARLA 上同时使用表 1 / 表 3"是跨基准强断言,需核对原文 VADv2 的 nuScenes 报告。 +- `[strengthen]` **claim:drivevlm_dual_recovers_long_tail_without_killing_latency**:前提"门控存在显式可学习信号"超出 DriveVLM 公开版本的事实,公开版本使用按需触发,需要改写为"可学习或预设的门控"。 +- `[strengthen]` **claim:agent_driver_tool_use_reduces_planner_dead_ends**:"死锁率"在 Agent-Driver 论文中并未直接定义。需要要么明确诊断协议,要么改写为"在罕见冲突子集上碰撞与路线未完成率"。 +- `[strengthen]` **claim:dilu_memory_reflection_reduces_long_tail_failures**:`cost_hours=3` 若包含真实 GPT-4 调用则严重低估,若用 Mock 后端则 ablation 失去说服力,需明确口径。 +- `[ok]` **claim:plant_object_token_sufficient_for_planning**:反例与边界精准,复现配方与 PlanT 实际公开协议匹配。 + +## argument_chains.json — 论文骨架 + +- `[fix]` **chain:planning_oriented_query_sharing**:`reviewer_attacks` 
缺少最关键的攻击——"UniAD 在 nuScenes 上的优势完全来自 ego 状态条件,而非查询共享"。需要显式添加这一假设以及对应的 response experiment。 +- `[fix]` **chain:dual_system_for_long_tail**:`key_experiments` 直接把 DriveVLM 风格放在 Bench2Drive 与 CARLA Town05 Long 上比较,跳过了从 nuScenes 叙事子集到 CARLA 的中间适配,反而容易得到无法区分的负结果。需在 Tier-2 控制集中加入 nuScenes 叙事子集。 +- `[fix]` **chain:counterfactual_branches_as_safety_signal**:`figure_plan` 提到反事实损失权重的成本收益曲线,但实验列表只有"去掉反事实损失",缺少权重扫描。 +- `[strengthen]` **chain:closed_loop_eval_protocol_audit**:`negative_results` "若相对秩不变则说明已有结论稳健"是确认偏倚陷阱,应改写为"若相对秩完全不变则需要审计协议是否实际生效"。 +- `[ok]` **chain:planning_oriented_query_sharing**:在去 ego 状态条件下区分"开环差距消失但闭环优势仍存"与"两类优势都消失"是真正的二元可证伪设计。 + +## scenarios.json — 粒度与覆盖 + +- `[fix]` 仅有 6 个场景对七篇论文的覆盖不足。至少需要补充:环岛无保护汇入、低速垂直泊车、传感器外参漂移、学区限速合规、紧急车辆让行。 +- `[fix]` **scenario:long_tail_rare_object_on_road** `current_best_methods` 列出 CF-VLA 与 evidence_strength=1 矛盾,应降级或移除。 +- `[fix]` **scenario:heavy_rain_with_camera_lens_droplet** `current_best_methods` 列出的两篇方法均未报告该场景,属于虚构排名,应改为"目前无公开报告"。 +- `[strengthen]` **scenario:dense_pedestrian_crosswalk_at_night** 列出 Waymo Open Motion,但 WOMD 缺乏夜间像素图像,无法支持端到端视觉评测,需要限定为轨迹方法或注明部分覆盖。 +- `[strengthen]` 所有场景缺少定量触发阈值(例如"遮挡占冲突区超过 40% 且持续 3 秒以上"),没有这些阈值场景挖掘无法自动化。 + +## datasets.json — 能与不能证明 + +- `[fix]` **dataset:nuscenes_planning** `supports` "比较协同收益"过度声称,nuScenes 规划评测的已知缺陷使其只能作为初筛信号。 +- `[fix]` **dataset:nuplan_planning** `limits` 缺少最关键的事实——官方非反应式分数对小批量手工场景敏感,且基于规则的 PDM 基线在多数学习方法之上。 +- `[fix]` **dataset:bench2drive** `common_misuses` 缺少两个最常见的误用——以 Dev10 子集声称完整基准成绩、只报告 Driving Score 而隐藏接近零的 Success Rate。 +- `[strengthen]` **dataset:waymo_open_motion** 许可写"学术免费"并不准确,WOMD 需要注册并按团队审批。 +- `[strengthen]` **dataset:carla_town05_long** 不同 CARLA 版本(0.9.10 / 0.9.15)会造成结果偏移,必须固定版本。 +- `[ok]` **dataset:navsim_planning** 准确指出非反应式回放是核心限制。 + +## metrics.json — 公式与前提 + +- `[fix]` **metric:open_loop_l2_displacement** 当前公式 `L2_t` 是对 t 求平均,命名为 `L2_t` 与含义不符;应该重命名为 `L2_avg` 或补充 1s / 2s / 3s 分时段分层公式。 +- `[fix]` **metric:ride_comfort_index** 各项单位不一致(m/s³ + m/s³ + 
m²/s²)且无归一化;应当声明权重承担量纲转换或显式给出归一化常数。 +- `[fix]` **metric:rule_compliance_score** 当前定义下若 violation_rate 大于 1(速度类违规可能)会让乘积出现负值;需要 clip 到 [0,1] 或改写为每段二值概率乘积。 +- `[strengthen]` **metric:closed_loop_collision_rate** `assumptions` 中"随机种子覆盖足以达到统计置信"过于空泛,应指定最少 8 个种子并采用 bootstrap 置信区间。 +- `[strengthen]` **metric:long_tail_success_rate** "合理速度通过"未定义,会在跨实验室时静默漂移。 +- `[ok]` **metric:route_completion** `common_misuses: 通过缩短路线长度人为提升完成度` 是 CARLA 闭环社区的标准审查点。 + +## failure_modes.json — 研究资产质量 + +- `[fix]` **occlusion_blind_spot_overconfidence** 与 **ego_status_leakage** 在 `reproducible_setup` 与 `diagnostic_metrics` 上重合度高,需要把诊断协议拆开(遮挡使用几何掩膜,状态泄漏使用输入消融)。 +- `[fix]` **failure_mode:long_tail_object_recognition_miss** 的 `partial_solutions` 把 OccFormer / OccNet 视为解决方案,但它们与目标方法共享同一长尾稀缺训练分布,残余间隙应明确指出分布根因。 +- `[fix]` 8 条失败模式对 7 篇论文不够。至少需要补充:语言模型规划幻觉(DriveVLM / DiLu)、DiLu 记忆中毒、CF-VLA 反事实分布漂移。 +- `[strengthen]` **failure_mode:ride_comfort_violation_due_to_late_braking** 的 partial_solutions 残余间隙过弱,应给定定量阈值(如在 100ms 预算下慢系统调用率小于 5%)。 +- `[strengthen]` **failure_mode:sensor_degradation_silent_failure** 把镜头水珠与雷达激光雷达回波不稳定混在同一条目,应当拆为信号级(镜头退化)与域级(材质反射)两类。 +- `[ok]` **failure_mode:closed_loop_deadlock_under_uncertainty** 的"碰撞率与规则合规分数表现良好"反向信号是真正的诊断标志。 + +## experiment_plans.json — 三层完整性 + +- `[fix]` Tier-2 算力预算(480 / 240 / 360 / 600 GPU 小时)相对实际任务规模整体偏低 2 至 5 倍。 +- `[fix]` **dual_system_for_long_tail** Tier-1 `runtime_hours=2` 同时使用"Mock 语言模型"自相矛盾——如果是 Mock 不能验证基于真实特征的门控,如果是真实 LLM 则 2 小时不可能。 +- `[fix]` **counterfactual_branches_as_safety_signal** Tier-2 基线列表缺少最关键的"在同等数据预算下不带反事实损失的 VLA"对照,导致 ablation 退化为方法之间比较。 +- `[fix]` **closed_loop_eval_protocol_audit** Tier-1 success_criteria 设置 100% 命中过于脆弱,改为大于等于 95% 召回且小于等于 2% 假阳。 +- `[strengthen]` 所有 plan 的 Tier-3 `latency_budget` 应该锚定到外部部署目标(例如"100 毫秒匹配 nuPlan 官方延迟")。 +- `[strengthen]` 所有 Tier-2 success_criteria 必须显式声明"至少 8 个种子并报告 bootstrap 置信区间"。 +- `[ok]` **dual_system_for_long_tail** Tier-3 的"记忆库注入错误经验"扰动直接对应一类真实失败模式。 + +--- + +## Top 5 最高杠杆修订 + +1. 
重写 UniAD 主张的反例,使 BEV-Planner 的 ego 状态泄漏发现威胁的是 UniAD 本身。 +2. 修复维度不一致或欠定义的指标公式(舒适综合、规则合规、开环位移分时段)。 +3. 补齐缺失的失败模式:语言模型规划幻觉、DiLu 记忆中毒、CF-VLA 反事实分布漂移;并拆分传感器退化。 +4. 把场景由 6 扩到至少 10,并为每个场景添加定量触发阈值以支持自动化挖掘。 +5. 重新校准实验计划 Tier-2 的算力预算,并显式加入"至少 8 个种子加 bootstrap 置信区间"。 diff --git a/docs/data/research/cross_review_round3.md b/docs/data/research/cross_review_round3.md new file mode 100644 index 0000000..941b17d --- /dev/null +++ b/docs/data/research/cross_review_round3.md @@ -0,0 +1,65 @@ +# 第三轮研究层交叉审查报告 + +> 仅对 Round 3 新增内容(4 条 claim · 5 条 failure_mode · 2 条 chain · 2 条 experiment_plan)做敌意审查。Round 1 已覆盖的资产不再重复评估。报告由独立审查代理产出,并作为研究资产保留以便追溯每一处修订。 + +## 标签约定与 Round 1 一致 + +`[fix]` 发表前必须修订 · `[strengthen]` 可强化以提升严谨度 · `[ok]` 通过审查 + +--- + +## claims.json(Round 3 新增) + +- `[fix]` **claim:offline_rl_conservatism_avoids_q_overestimation**:`subject: paper:mnih2015_dqn` 加 `unresolved_subject: true` 是错误的折中。DQN 是 online off-policy,与 CQL 的离线下界证明完全不在同一论证轴上。`unresolved_subject` 字段甚至不在 `schema.json` 中,自动校验会绕过它而非明确处理。应当在 `graph.json` 新增 `paper:kumar2020_cql`(以及配套的 `paper:fujimoto2019_bcq`、`paper:hafner2020_dreamer`),让 subject 指向真正的引用。 +- `[fix]` **claim:world_model_imagination_reduces_sample_complexity**:`subject: paper:world_models` 指向的是 Ha and Schmidhuber 2018,但 evidence 引用的是 Hafner et al. 
2020 Dreamer 的表 1——两者不是同一篇。Dreamer 的样本效率结论与 World Models 2018 的样本效率论断不重合。需新增 `paper:hafner2020_dreamer` 节点并把 subject 改过去。 +- `[fix]` **claim:ppo_clipped_surrogate_stabilizes_policy_gradient**:evidence 把 MuJoCo 基准结果标为"PPO 论文表 3"——这是误引。Schulman 2017 PPO 的 MuJoCo 结果在 Figure 3 与 Table 1。Table 3 实际是 Atari 结果。preconditions 中"截断阈值 ε 通常取 0.1 至 0.3"忽略了 PPO 默认取 0.2 的事实,先验范围给得过宽。 +- `[fix]` 三条 RL claim(PPO / CQL / Dreamer)的 `reproduction.public_data` 全部写 `dataset:carla_town05_long`,但 `reproduction.minimal` 实际是 CartPole / MiniGrid 上的对照。CARLA 与这些 lab 没有任何执行关系,该字段在自动化校验中会立刻暴露为不一致。应改为 `null` 或新增 `dataset:cartpole_classic` / `dataset:minigrid_classic`。 +- `[fix]` **claim:bitter_lesson_handcrafted_priors_decline_under_scale**:声明是关于长期趋势的强普遍宣称,但 evidence 全是 ImageNet 与 AlphaGo 类比加上"逐年进展"。evidence 没有量化(年份、规模、相对差距),属于循环论证。`reproduction.expected_output` "差距同向变化"几乎不可能被证伪。需要明确改写为分规模段(如 50% → 100% 数据规模时端到端相对模块化的相对降幅)。 +- `[strengthen]` **claim:offline_rl_conservatism_avoids_q_overestimation**:preconditions 缺少 CQL 论文核心前提"behavior policy 与数据分布足够支持下界估计"。 +- `[strengthen]` **claim:world_model_imagination_reduces_sample_complexity**:`evidence_strength=2, reproducibility_status=partial` 与 Dreamer 表 1 这种一手证据不匹配;subject 修复后应升级至 `evidence_strength=3, verified`。 +- `[ok]` 四条 claim 的 counterexamples 都给了具体而非通用的反例。 + +## failure_modes.json(Round 3 新增) + +- `[fix]` **failure_mode:on_policy_rl_sample_inefficiency_for_safety_critical_events**:partial_solution 中 Schaul 2016 prioritized experience replay 是为 off-policy DQN 设计的,把它列在 on-policy PPO 的失败模式里在技术上不直接适用。应加一句 residual_gap 说明"PER 在 on-policy 下需改写为带 importance sampling 的优先采样"。 +- `[fix]` **failure_mode:safety_constraint_lagrangian_oscillation**:partial_solutions 中"Tessler 2018 reward-constrained"应是 Tessler et al. 
ICLR 2019。可写为 Tessler 2018 / 2019 或直接给 arXiv id。 +- `[fix]` **failure_mode:scaling_bet_failure_on_safety_critical_long_tail** 与 **failure_mode:offline_rl_extrapolation_error** 的 `residual_gaps` 都是文字定性,没有 Round 1 多次要求的定量阈值。应至少写出可观测数字。 +- `[strengthen]` **failure_mode:world_model_compounding_imagination_error**:trigger_conditions 与对应 claim 的反例高度重叠,应拆分到不同 horizon 阈值或不同任务族。 +- `[ok]` 五条新失败模式与论文角度均有清晰映射,没有与 Round 1 已有失败模式重合。 + +## argument_chains.json(Round 3 新增) + +- `[fix]` **chain:offline_rl_versus_imitation_under_distribution_shift**:`subject_papers: [paper:ross2011_dagger, paper:mnih2015_dqn]` 与 method_mechanism 自相矛盾——method_mechanism 要训练 BC / DAgger / CQL / BCQ,但 subject 不含 CQL / BCQ。strong_baselines 中 PPO 是 on-policy,在 offline RL vs imitation 的语境下错位。 +- `[fix]` **chain:offline_rl_versus_imitation_under_distribution_shift** reviewer_attacks 缺少最锋利的一条:D4RL 上 CQL 强于 BC 的差距在多数子集上来自归一化与超参,而非保守惩罚本身。 +- `[fix]` **chain:safety_constraint_layering_for_end_to_end_planning**:core_claim 引入了魔法数字 3% 但缺少出处。需要要么标明为内部决策,要么引用一个公开数字。 +- `[fix]` **chain:safety_constraint_layering_for_end_to_end_planning** negative_results "若约束层导致主分布性能下降超过 3%,说明物理参数需重新设计"是把自己的目标当作判据,属于确认偏倚陷阱。 +- `[strengthen]` **chain:safety_constraint_layering_for_end_to_end_planning** subject_papers 应包含 nuPlan PDM 作为基于规则的强基线。 +- `[ok]` **chain:offline_rl_versus_imitation_under_distribution_shift** 的 negative_results 二元设计是真正可证伪的设计。 + +## experiment_plans.json(Round 3 新增) + +- `[fix]` **experiment_plan:offline_rl_versus_imitation_under_distribution_shift** Tier-2 baselines 与 chain method_mechanism 严重不一致。Tier-2 必须包含 CQL 与 BCQ 作为基线。 +- `[fix]` **experiment_plan:safety_constraint_layering_for_end_to_end_planning** `compute_budget: 约 2200 GPU 小时`偏低,量级应在 5000+ GPU 小时,Round 1 已警告 Tier-2 算力普遍偏低 2 至 5 倍。 +- `[fix]` **experiment_plan:offline_rl_versus_imitation_under_distribution_shift** `compute_budget: 约 1800 GPU 小时`偏低约 2 倍。 +- `[fix]` Tier-3 `sensor_dropout: 不适用` 与 `counterfactual_branches: 不适用` 是占位逃避,应改为对 BEV 输入做相机遮蔽或显式声明 lab 
只用低维观测。 +- `[fix]` **experiment_plan:safety_constraint_layering_for_end_to_end_planning** Tier-1 success_criteria 中"可被解释"再次是不可证伪表述,需要量化阈值。 +- `[strengthen]` 两条计划的 Tier-3 latency_budget 中之一缺少外部部署目标锚定。 +- `[ok]` 两条计划的 Tier-2 都已显式包含"至少 8 个独立种子并报告 bootstrap 95% 置信区间"。 + +## 跨引用专项 + +应当**新增节点而非保留 `unresolved_subject` 占位**: + +- `paper:kumar2020_cql`(Tier S,topic deep_rl,phase core,labs lab_cql_offline_minigrid) +- `paper:fujimoto2019_bcq`(Tier A) +- `paper:hafner2020_dreamer`(Tier S,labs lab_dreamer_cartpole_pixels,把 `paper:world_models` 留作 prereq parallel) + +随后把 claim / chain / experiment 的 subject 与 baseline 全部指过去。 + +--- + +## Top 3 最高杠杆修订 + +1. **新增 `paper:kumar2020_cql`、`paper:fujimoto2019_bcq`、`paper:hafner2020_dreamer` 三个 graph.json 节点**,移除 `unresolved_subject` 字段,把相关 claim、chain 与 experiment 的 subject / baselines 全部对齐。 +2. **修正 PPO 证据表号与三条 RL claim 的 `public_data` 字段**:CARLA Town05 Long 与 CartPole / MiniGrid lab 没有执行关系,PPO 论文 MuJoCo 结果在 Figure 3 与 Table 1,不是 Table 3。 +3. **去除 Round 3 两条新 chain / plan 的确认偏倚与魔术数字**:把 3% 阈值与"可被解释"成功判据替换为外部锚定的可证伪量化阈值;按 Round 1 的 2 至 5 倍校准 Tier-2 算力。 diff --git a/docs/data/research/datasets.json b/docs/data/research/datasets.json new file mode 100644 index 0000000..e32eb68 --- /dev/null +++ b/docs/data/research/datasets.json @@ -0,0 +1,160 @@ +{ + "$schema": "./schema.json#/node_kinds/dataset", + "datasets": [ + { + "id": "dataset:nuscenes_planning", + "label": "nuScenes 规划评测分卷", + "scale": "1000 段 20 秒驾驶序列,主要采集自波士顿与新加坡两个城市,6 路环视相机加 1 路前向雷达加 1 路 360 度激光雷达,每秒 2 帧关键帧标注。", + "supports": [ + "在已有人类示范条件下评估开环轨迹回归误差,作为方法初筛信号", + "为感知与轨迹预测提供弱监督联合训练数据" + ], + "limits": [ + "只有约 5.5 小时高质量标注数据,长尾事件极度稀缺", + "缺乏闭环回放,所有评测都基于专家轨迹假设", + "城市风格单一,对中国式城区不适用", + "ego 状态作为输入时容易造成评测虚高(BEV-Planner / Li 2024 详细量化了这一问题)", + "不足以独立证明感知与规划模块的协同收益,最多作为初筛信号" + ], + "common_misuses": [ + "把开环位移误差当作部署安全的代理,会忽略分布漂移与累计误差", + "在 mini 子集上做模型选择,再迁移到完整集时表现退化", + "把 ego 速度直接拼接到输入而不在测试时遮蔽" + ], + "covers_scenarios": ["scenario:dense_pedestrian_crosswalk_at_night", 
"scenario:long_tail_rare_object_on_road"], + "license": "CC BY-NC-SA 4.0,禁止商业训练,只允许学术使用与公开基准对比。" + }, + { + "id": "dataset:nuplan_planning", + "label": "nuPlan 闭环规划基准", + "scale": "约 1500 小时美国与新加坡多城市驾驶日志,包含完整 ego 状态、地图、追踪结果,提供基于交互式模拟器的闭环评测协议。", + "supports": [ + "比较纯学习方法与基于规则的强基线在闭环下的真实表现", + "在统一的反应式 agent 模拟器下评估长视野与多车交互稳健性", + "对规划损失做闭环导向的消融实验" + ], + "limits": [ + "模拟器中的他车行为来源于 IDM 或类似规则,可能低估真实复杂度", + "感知噪声不在评测之内,给定真值或预先跟踪的轨迹", + "缺少恶劣天气与传感器故障注入", + "官方非反应式分数对小批量手工调参的场景敏感,且基于规则的 PDM 基线在多数学习方法之上,已经成为社区对 nuPlan 的标准审查点" + ], + "common_misuses": [ + "把基于真值跟踪的闭环成绩声明为端到端能力", + "在固定的他车反应模型下过拟合社会博弈策略", + "用同一城市训练评测而不报告跨城迁移" + ], + "covers_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:highway_merge_at_speed_differential"], + "license": "Motional 自定义许可,学术研究允许,商业用途需协商。" + }, + { + "id": "dataset:waymo_open_motion", + "label": "Waymo Open Motion 数据集", + "scale": "约 10 万段每段 20 秒的高频率轨迹数据,涵盖六个美国大都会城市的高密度交互场景,提供地图与多类道路使用者的语义注释。", + "supports": [ + "评估多 agent 长视野预测与交互建模", + "构建场景挖掘流水线以提取罕见博弈样本", + "迁移到下游 motion-conditioned planning 的预训练" + ], + "limits": [ + "缺乏稠密像素级传感器原始流,主要是后处理后的轨迹", + "夜间与恶劣天气数据较少", + "评测协议偏向预测精度,不直接支持闭环安全度量" + ], + "common_misuses": [ + "把预测 minADE/minFDE 当作规划能力的代理", + "在没有遮挡建模的前提下用真值轨迹做监督训练" + ], + "covers_scenarios": ["scenario:dense_pedestrian_crosswalk_at_night", "scenario:highway_merge_at_speed_differential"], + "license": "Waymo Dataset License。学术使用需要团队注册并通过审批,并非完全免费;商业用途须单独申请。" + }, + { + "id": "dataset:carla_town05_long", + "label": "CARLA Town05 Long 闭环基准", + "scale": "基于 CARLA 0.9.10 的 Town05 地图,10 条长路线覆盖城市、郊区与高速混合,支持注入天气、行人密度与对手车辆。CARLA 0.9.10 与 0.9.15 之间的物理与渲染差异显著,所有论文必须明示具体 CARLA 版本与镜像哈希,否则结果不可比较。", + "supports": [ + "在统一仿真器中比较视觉端到端方法的真闭环表现", + "对天气、传感器故障与他车冲突做受控扰动", + "在不可观测意图下做反事实分支评估" + ], + "limits": [ + "图像真实感与真实路面有视觉域差距", + "他车策略基于规则,对长尾行为缺乏覆盖", + "评测结果对种子敏感,需要多种子统计置信" + ], + "common_misuses": [ + "只跑一两个种子就声明性能优越", + "在固定的官方天气下训练评测,忽略广义化失败", + "把 CARLA 闭环分数直接外推到真实城区" + ], + "covers_scenarios": 
["scenario:unprotected_left_turn_with_occlusion", "scenario:construction_zone_with_cone_lane_shift", "scenario:heavy_rain_with_camera_lens_droplet"], + "license": "MIT,可自由用于学术与商业。" + }, + { + "id": "dataset:bench2drive", + "label": "Bench2Drive 闭环再现基准", + "scale": "基于 CARLA 的 220 条多样化路线,覆盖 12 类典型场景,提供官方的训练分割、评测脚本与基线模型。", + "supports": [ + "在统一种子集合下比较视觉端到端模型的可复现闭环性能", + "做受控的天气、交通与传感器扰动消融", + "评估同一方法在分布内和分布外路线上的差距" + ], + "limits": [ + "依赖 CARLA 真实感上限", + "评测语义偏 driving score,缺少对乘员舒适度的细分", + "种子数量有限时统计可信度不足" + ], + "common_misuses": [ + "只汇报 Driving Score 而隐藏 Success Rate;多数学习方法的 SR 接近零,Driving Score 高仅来源于路线完成度而非真正安全通过", + "在 Dev10 等小子集上做评测并声明覆盖整个 Bench2Drive 基准", + "在评估集上做超参选择", + "用未公开的视觉骨干声明可复现" + ], + "covers_scenarios": ["scenario:construction_zone_with_cone_lane_shift", "scenario:long_tail_rare_object_on_road", "scenario:unprotected_left_turn_with_occlusion"], + "license": "MIT,可自由使用。" + }, + { + "id": "dataset:rl_classic_control_suite", + "label": "强化学习经典控制基准套件(CartPole / MiniGrid / DeepMind Control)", + "scale": "包含 OpenAI Gym 的 CartPole 与 LunarLander、MiniGrid 的网格世界系列、DeepMind Control Suite 的连续控制任务。所有任务为低维或像素观测,单机即可训练。", + "supports": [ + "在受控环境下比较 RL 算法的样本复杂度与稳定性", + "为离线强化学习提供可构造覆盖空洞的基准", + "为世界模型方法提供可对照样本预算的最小机制实验" + ], + "limits": [ + "环境与真实自动驾驶差距巨大,结论不直接外推", + "缺少多 agent 交互", + "感知噪声有限" + ], + "common_misuses": [ + "把经典控制基准上的 RL 算法领先直接外推到自动驾驶闭环", + "在固定种子下声明算法稳健", + "把 CartPole 的小规模数据点伪装成完整基准证据" + ], + "covers_scenarios": ["scenario:unprotected_left_turn_with_occlusion"], + "license": "CartPole 与 LunarLander 属于 OpenAI Gym(MIT),MiniGrid 属于 Farama Foundation(Apache 2.0),DeepMind Control Suite 属于 Apache 2.0。" + }, + { + "id": "dataset:navsim_planning", + "label": "NAVSIM 非反应式闭环规划基准", + "scale": "基于 nuPlan 数据筛选出的高交互片段,提供非反应式的代理模拟与统一的 driving score 评测协议。", + "supports": [ + "在受控的反事实分支下评估规划稳健性", + "比较开环到闭环过渡阶段的指标一致性", + "为基于真实数据的规划评测提供轻量级流水线" + ], + "limits": [ + "代理为非反应式回放,限制了真实多车博弈", + "覆盖城市与时段仍受 nuPlan 自身约束", + "缺乏感知端到端集成评测" + ], + "common_misuses": [ + 
"把非反应式分数当作真实闭环能力", + "未做反事实分支扰动就声明稳健性" + ], + "covers_scenarios": ["scenario:highway_merge_at_speed_differential", "scenario:unprotected_left_turn_with_occlusion"], + "license": "Motional 自定义许可,学术研究允许。" + } + ] +} diff --git a/docs/data/research/experiment_plans.json b/docs/data/research/experiment_plans.json new file mode 100644 index 0000000..eff135a --- /dev/null +++ b/docs/data/research/experiment_plans.json @@ -0,0 +1,197 @@ +{ + "$schema": "./schema.json#/node_kinds/experiment_plan", + "experiment_plans": [ + { + "id": "experiment_plan:planning_oriented_query_sharing", + "title": "以规划为最终损失的查询共享端到端结构的三层实验", + "subject": "chain:planning_oriented_query_sharing", + "tier_1_minimal_mechanism": { + "purpose": "在合成 2D 场景上快速验证共享查询是否真的把上游任务梯度导向下游规划", + "environment": "lab03 风格的离散网格或玩具高速公路", + "model": "轻量级 transformer,五种查询头,规划头只有一层", + "metrics": ["合成场景下的轨迹回归误差", "上游任务保留率"], + "success_criteria": "共享查询的方案在保持上游任务精度的同时显著降低规划误差,与同结构独立查询基线有可视差距", + "runtime_hours": 4, + "expected_signal": "梯度统计显示规划损失对上游查询参数的影响显著" + }, + "tier_2_public_benchmark": { + "purpose": "在 nuPlan 与 Bench2Drive 上以新协议同时报告开环与闭环表现", + "datasets": ["dataset:nuplan_planning", "dataset:bench2drive"], + "baselines": ["paper:2210.14222", "paper:transfuser", "paper:vadv2", "paper:2212.10156"], + "metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion"], + "success_criteria": "在控制 ego 状态泄漏后开环分时段差距与闭环碰撞率改进同时成立,至少在两类协议上重现,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 2400 A100 GPU 小时(UniAD 风格单次完整训练约 200 GPU 小时,叠加 nuPlan 闭环全量评测与多基线、多协议重复,需要 2400 小时量级)", + "expected_signal": "查询共享方法在闭环安全维度仍保留优势" + }, + "tier_3_stress_test": { + "purpose": "评估端到端结构在分布漂移与传感器退化下的稳健性", + "distributions": ["跨城市迁移", "夜间与雨天扰动", "施工临时几何"], + "perturbations": ["镜头水珠噪声", "雷达短暂失效", "他车反应模型替换"], + "latency_budget": "单帧规划延迟不超过 100 毫秒", + "sensor_dropout": "随机遮蔽 1 至 2 路相机", + "counterfactual_branches": "在每个核心片段额外评测 4 条反事实分支", + "success_criteria": "在每类扰动下闭环碰撞率不超过未扰动基线的 1.5 
倍且路线完成度下降可解释" + } + }, + { + "id": "experiment_plan:dual_system_for_long_tail", + "title": "快慢双系统语言驱动规划在长尾上的三层实验", + "subject": "chain:dual_system_for_long_tail", + "tier_1_minimal_mechanism": { + "purpose": "纯流水线验证:用 Mock 语言模型确认门控接口、延迟簿记与降级路径正确,把行为信号留到 Tier-2 由真实语言模型产生", + "environment": "lab07 与 lab08 风格的脚本化决策回合", + "model": "Mock 语言模型(确定性回答)与轻量级快规划器组合", + "metrics": ["门控触发率", "决策正确率(针对 Mock 信号)", "模拟延迟"], + "success_criteria": "门控在指定脚本下触发率与延迟簿记完全可复现,本层不主张 ML 信号,行为有效性由 Tier-2 真实语言模型实验决定", + "runtime_hours": 2, + "expected_signal": "流水线本身在 Mock 后端下完全确定性" + }, + "tier_2_public_benchmark": { + "purpose": "在闭环驾驶基准上同时报告延迟分布、调用率与成功率", + "datasets": ["dataset:bench2drive", "dataset:carla_town05_long"], + "baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser"], + "metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate", "metric:ride_comfort_index"], + "success_criteria": "长尾成功率与碰撞率均显著改进且单帧最坏延迟在预算内,舒适度无明显恶化,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 1200 GPU 小时加额外的真实语言模型推理预算(按多种子与多次场景重放估算)", + "expected_signal": "Pareto 前沿移动到更优区域" + }, + "tier_3_stress_test": { + "purpose": "评估慢系统在压力下的退化与失败模式", + "distributions": ["持续高密度长尾事件", "对抗性语言提示注入"], + "perturbations": ["慢系统延迟抖动", "工具集变更", "记忆库注入错误经验"], + "latency_budget": "单帧最坏延迟不超过 200 毫秒", + "sensor_dropout": "在慢系统判断关键阶段随机遮蔽传感器一路", + "counterfactual_branches": "为每条长尾片段构造两条反事实分支", + "success_criteria": "在每类压力下退化曲线可被解释且不引入新型隐蔽失败" + } + }, + { + "id": "experiment_plan:counterfactual_branches_as_safety_signal", + "title": "反事实分支作为视觉语言动作模型的安全信号的三层实验", + "subject": "chain:counterfactual_branches_as_safety_signal", + "tier_1_minimal_mechanism": { + "purpose": "在合成反事实分支上验证反事实损失对策略行为的塑形效应", + "environment": "lab10 风格的合成场景与分支生成器", + "model": "轻量级视觉语言动作模型与一阶分支生成器", + "metrics": ["主轨迹回归误差", "反事实分支成功率"], + "success_criteria": "反事实分支成功率显著提升,主轨迹误差变化在容忍范围内", + "runtime_hours": 8, + "expected_signal": "策略在分支上做出与主轨迹一致的安全决策" + }, + "tier_2_public_benchmark": { + "purpose": 
"在公开反事实分支基准上比较有无反事实损失的策略", + "datasets": ["dataset:bench2drive", "dataset:navsim_planning"], + "baselines": ["paper:vadv2", "paper:2402.12289", "paper:2212.10156", "control:non_counterfactual_vla_same_data_budget"], + "metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "success_criteria": "反事实分支成功率显著高于基线和同等数据预算的非反事实 VLA 对照,主轨迹性能不显著退化,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 1600 GPU 小时(包含同等数据预算的非反事实 VLA 对照训练)", + "expected_signal": "联合分布上反事实损失带来的收益是稳定的" + }, + "tier_3_stress_test": { + "purpose": "对反事实分支真实度与多样性做敏感性扫描", + "distributions": ["不同语义类别的反事实", "几何上更激进的扰动"], + "perturbations": ["分支生成器输出加噪", "对手策略接管"], + "latency_budget": "训练流水线不超过两倍主流端到端方法", + "sensor_dropout": "在分支评测时随机遮蔽传感器", + "counterfactual_branches": "每片段至少六条反事实分支并报告分支多样性指标", + "success_criteria": "在真实度下降到给定阈值前收益保持,超过阈值后退化曲线可解释" + } + }, + { + "id": "experiment_plan:offline_rl_versus_imitation_under_distribution_shift", + "title": "离线强化学习与模仿学习在策略诱导分布漂移下的三层实验", + "subject": "chain:offline_rl_versus_imitation_under_distribution_shift", + "tier_1_minimal_mechanism": { + "purpose": "在 MiniGrid 上对覆盖几何做阶梯扫描,量化 BC、DAgger、CQL 在分布漂移下的稳健性差异", + "environment": "lab_cql_offline_minigrid 与 lab02 联合", + "model": "轻量级 BC、DAgger 与 CQL 实现", + "metrics": ["分布内回报", "分布漂移回报", "Q 值过估计"], + "success_criteria": "在不同覆盖几何下方法的相对秩可被解释;当覆盖几何超过阈值时 CQL 优势消失", + "runtime_hours": 4, + "expected_signal": "方法相对秩对覆盖几何呈单调依赖" + }, + "tier_2_public_benchmark": { + "purpose": "在 nuPlan 闭环上比较 BC、DAgger 与 CQL、BCQ 风格离线策略", + "datasets": ["dataset:nuplan_planning", "dataset:carla_town05_long"], + "baselines": ["paper:ross2011_dagger", "paper:kumar2020_cql", "paper:fujimoto2019_bcq"], + "metrics": ["metric:closed_loop_collision_rate", "metric:route_completion", "metric:long_tail_success_rate"], + "success_criteria": "在分布内闭环上 CQL 与 BCQ 优势可被复现,在受控分布漂移下崩溃曲线明显差异,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 3600 GPU 小时(在 D4RL / MiniGrid 阶梯上跑 BC、DAgger、CQL、BCQ 多方法多种子,加 nuPlan 闭环子集与 CARLA 
闭环子集的代表性评测)", + "expected_signal": "CQL 与 BCQ 在覆盖良好的子集上优势可被复现并解释" + }, + "tier_3_stress_test": { + "purpose": "评估离线策略与模仿策略在极端覆盖空洞下的退化曲线", + "distributions": ["人为构造的关键分支移除", "数据集顺序扰动"], + "perturbations": ["保守惩罚强度阶梯", "数据集规模缩减"], + "latency_budget": "训练流水线总时长锚定到 nuPlan 官方一次完整 challenge 评测耗时,不得超过两倍", + "sensor_dropout": "本计划仅使用低维或网格状态观测,因此不施加传感器遮蔽;这一选择需在论文中显式声明并把视觉端到端的稳健性放在另一计划", + "counterfactual_branches": "本计划聚焦覆盖空洞而非反事实分支;反事实分支由 experiment_plan:counterfactual_branches_as_safety_signal 单独评估", + "success_criteria": "在每类压力下退化曲线在多种子均值上的相对秩稳定,且不引入新型隐蔽失败" + } + }, + { + "id": "experiment_plan:safety_constraint_layering_for_end_to_end_planning", + "title": "端到端规划之上的显式安全约束层三层实验", + "subject": "chain:safety_constraint_layering_for_end_to_end_planning", + "tier_1_minimal_mechanism": { + "purpose": "在脚本化场景上验证显式安全验证器与端到端规划器输出的协同与冲突", + "environment": "lab04 与 lab09 联合的脚本化场景", + "model": "轻量级端到端规划器加可计算的安全验证器", + "metrics": ["约束满足率", "主奖励回报", "决策延迟"], + "success_criteria": "在指定 32 段冲突脚本上约束满足率不低于 95%,主奖励回报相对未约束基线的相对下降不超过 10%,单帧决策延迟不超过 100 毫秒", + "runtime_hours": 3, + "expected_signal": "约束层在受控冲突脚本上的输出可由验证器静态检查复现" + }, + "tier_2_public_benchmark": { + "purpose": "在 nuPlan 与 Bench2Drive 上比较有无安全约束层的端到端方法", + "datasets": ["dataset:nuplan_planning", "dataset:bench2drive", "dataset:navsim_planning"], + "baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser"], + "metrics": ["metric:closed_loop_collision_rate", "metric:rule_compliance_score", "metric:ride_comfort_index"], + "success_criteria": "事先标注的安全关键事件碰撞率相对无约束基线降低至少 2 倍,且主分布 Driving Score 相对下降处于 nuPlan 官方分数容忍区间内,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 5500 A100 GPU 小时(在 nuPlan、Bench2Drive 与 NAVSIM 上对 3 个基线、约束层与对照各跑 8 种子,包含 UniAD 风格端到端训练与 nuPlan PDM 对照评测)", + "expected_signal": "代价收益曲线在多种子上稳定" + }, + "tier_3_stress_test": { + "purpose": "评估安全约束层在分布漂移与传感器退化下是否引入新失败模式", + "distributions": ["跨城市迁移", "夜间与雨天扰动"], + "perturbations": ["约束阈值随机抖动", "传感器局部失效"], + "latency_budget": "单帧规划延迟不超过 100 毫秒,锚定 nuPlan 
官方延迟限制", + "sensor_dropout": "随机遮蔽 1 至 2 路相机", + "counterfactual_branches": "在关键片段加入反事实分支", + "success_criteria": "在每类扰动下,约束层引入的额外失败(如新死锁、舒适度违规)相对未约束基线的恶化不超过 1.5 倍,否则需重新设计约束物理参数" + } + }, + { + "id": "experiment_plan:closed_loop_eval_protocol_audit", + "title": "可审计闭环评测协议的三层实验", + "subject": "chain:closed_loop_eval_protocol_audit", + "tier_1_minimal_mechanism": { + "purpose": "对 ego 状态泄漏自动检测与种子统计要求做单元测试", + "environment": "tools/validate_research.py 的扩展检查", + "model": "无须模型,纯协议检查", + "metrics": ["检查通过率", "检查覆盖率"], + "success_criteria": "检查在已知泄漏样本上召回不低于 95%,对正常基线假阳不超过 2%,并在每次提交时由 CI 强制执行", + "runtime_hours": 1, + "expected_signal": "协议检查可被复用为提交门禁" + }, + "tier_2_public_benchmark": { + "purpose": "在主流方法上同时跑新旧协议并比较相对秩", + "datasets": ["dataset:nuplan_planning", "dataset:navsim_planning", "dataset:bench2drive"], + "baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser", "paper:2210.14222"], + "metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion", "metric:rule_compliance_score"], + "success_criteria": "新协议下至少有一组方法的相对秩发生显著改变,原因可解释,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 3000 GPU 小时(在三个数据集与四个基线上对两套协议各跑多种子)", + "expected_signal": "协议改造对社区比较具有结构性影响" + }, + "tier_3_stress_test": { + "purpose": "检验协议对不同感知集成与他车反应模型的稳健性", + "distributions": ["不同感知集成等级", "不同他车策略"], + "perturbations": ["种子数量缩减", "时间预算紧张"], + "latency_budget": "评测脚本可在 24 小时内完成单方法全套", + "sensor_dropout": "可选传感器组合下重复评测", + "counterfactual_branches": "在协议中允许嵌入反事实评测扩展", + "success_criteria": "协议在各种扰动下仍能产出一致的相对秩" + } + } + ] +} diff --git a/docs/data/research/failure_modes.json b/docs/data/research/failure_modes.json new file mode 100644 index 0000000..0b4c8f4 --- /dev/null +++ b/docs/data/research/failure_modes.json @@ -0,0 +1,418 @@ +{ + "$schema": "./schema.json#/node_kinds/failure_mode", + "failure_modes": [ + { + "id": "failure_mode:ego_status_leakage", + "label": "自车状态在测试时被显式输入造成评测虚高", + "trigger_conditions": [ + 
"训练与测试都把自车速度、加速度、横摆率作为输入", + "评测协议不显式禁止 ego 状态进入模型" + ], + "manifestation": "开环位移误差在 nuScenes 等基准上看似显著下降,但闭环回放或盲测后性能塌陷至接近基线水平。", + "reproducible_setup": "在 UniAD 或 VAD 代码库上做输入消融:分别打开和关闭 ego 速度、加速度、横摆率三通道并报告 L2_avg 与分时段 L2_τ。需在完整 nuScenes 验证集(约 6000 样本)上做对比,mini 子集统计涨落过大不足以支持结论。", + "diagnostic_metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate"], + "method_weakness": "评测设计与模型输入接口的耦合使得感知与规划的真实贡献被泄漏的状态遮蔽。", + "partial_solutions": [ + {"idea": "在测试时显式遮蔽 ego 状态并报告两组数字", "citation_or_repo": "BEV-Planner 的复现报告", "residual_gap": "仍未形成全社区强制的评测协议"}, + {"idea": "改用闭环反应式评测", "citation_or_repo": "nuPlan / NAVSIM 协议", "residual_gap": "他车反应模型本身仍是规则化代理"} + ], + "open_questions": [ + "怎样在不破坏端到端可训练性的同时强制评测公正", + "如何在公开 leaderboard 上自动检测 ego 状态泄漏" + ], + "publication_angles": [ + "提出可审计的端到端评测协议", + "在多个开源模型上系统量化 ego 状态泄漏的贡献" + ] + }, + { + "id": "failure_mode:closed_loop_deadlock_under_uncertainty", + "label": "高度不确定情境下规划器陷入安全死锁", + "trigger_conditions": [ + "对向车流密集且自车需要主动决策", + "感知置信度较低且规划损失对碰撞惩罚远高于停车惩罚" + ], + "manifestation": "自车长时间停留在保护性停止状态,路线完成度低,但碰撞率与规则合规分数表现良好。", + "reproducible_setup": "在 CARLA Town05 Long 上构造不受保护左转脚本,统计自车通过率与平均等待时间。", + "diagnostic_metrics": ["metric:route_completion", "metric:closed_loop_collision_rate"], + "method_weakness": "对不确定性的处理倾向于过度保守,缺乏对延迟成本的显式建模。", + "partial_solutions": [ + {"idea": "把延迟成本写入规划损失", "citation_or_repo": "nuPlan 基线 PDM", "residual_gap": "调权敏感且无法跨场景迁移"}, + {"idea": "引入显式意图推断模块", "citation_or_repo": "Agent-Driver 推理链路", "residual_gap": "推理延迟与稳定性仍存疑"} + ], + "open_questions": [ + "如何构造对不可观测意图稳健的最优停止理论", + "怎样让多车交互中的让步行为成为可学习目标" + ], + "publication_angles": [ + "提出包含等待成本的闭环规划损失", + "构建专注于过度保守失败的诊断基准" + ] + }, + { + "id": "failure_mode:long_tail_object_recognition_miss", + "label": "长尾稀有物体识别遗漏导致直接碰撞", + "trigger_conditions": [ + "路面出现训练分布之外的可碰撞物体", + "目标尺寸或姿态在训练数据中频率极低" + ], + "manifestation": "检测器不输出该目标或输出极低置信度,规划器据此忽略目标并保持原速度。", + "reproducible_setup": "在 CARLA 中放置非常规物体如锥桶堆、纸箱、低矮障碍,比较多种端到端模型的碰撞率。", + 
"diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "目标先验由训练分布隐式决定,没有显式的开放类别处理。", + "partial_solutions": [ + {"idea": "用 VLM 或视觉基础模型做分布外目标提示", "citation_or_repo": "DriveVLM-Dual", "residual_gap": "推理延迟与可靠性仍受限,且语言模型对低频物理类别也可能输出错误描述"}, + {"idea": "占用预测取代显式检测", "citation_or_repo": "UniAD OccFormer 与 OccNet", "residual_gap": "OccFormer / OccNet 与目标方法共享同一长尾稀缺训练分布,仍然继承类别先验,因此对分布外目标的占用回归本身也会塌缩为乘用车形状"} + ], + "open_questions": [ + "在没有目标类别标签时如何驱动安全规划", + "如何把分布外目标识别量化为可优化目标" + ], + "publication_angles": [ + "构建强制评估开放类别行为的长尾基准", + "提出基于占用的统一安全损失" + ] + }, + { + "id": "failure_mode:ride_comfort_violation_due_to_late_braking", + "label": "决策延迟造成晚刹引发的乘员舒适违规", + "trigger_conditions": [ + "对前车意图或行人意图判断滞后", + "规划频率低于事件演变速率" + ], + "manifestation": "在最后一秒以接近最大减速进行制动,乘员前倾,纵向加加速度峰值远超舒适阈值。", + "reproducible_setup": "在 NAVSIM 上启用乘员舒适评测,对若干主流端到端模型统计加加速度分布。", + "diagnostic_metrics": ["metric:ride_comfort_index", "metric:closed_loop_collision_rate"], + "method_weakness": "规划损失只对碰撞与位置进行约束,缺乏对决策时机的显式塑形。", + "partial_solutions": [ + {"idea": "把加加速度纳入规划损失", "citation_or_repo": "PDM 与 GameFormer 基线", "residual_gap": "在视觉端到端模型中难以平衡,公开复现显示加加速度权重每提升一倍,路线完成度下降 2 到 5 个百分点"}, + {"idea": "引入双系统快慢架构", "citation_or_repo": "DriveVLM-Dual 双管线", "residual_gap": "在 100 毫秒延迟预算下慢系统调用率通常低于 5%,对舒适度违规中由意图判断滞后导致的部分覆盖有限"} + ], + "open_questions": [ + "如何在端到端训练中平衡安全与舒适的多目标优化", + "乘员体感是否需要主观问卷做最终校准" + ], + "publication_angles": [ + "提出兼顾决策时机与运动学的规划损失", + "构建受控的舒适度回归基准" + ] + }, + { + "id": "failure_mode:occlusion_blind_spot_overconfidence", + "label": "遮挡盲区中对对向车意图过度自信", + "trigger_conditions": [ + "前车或建筑物遮挡对向车流", + "感知模型在缺失观测情况下仍输出高置信度的占用预测" + ], + "manifestation": "自车在没有充分让行的情况下进入冲突区域,与遮挡区出现的对向车发生侧面碰撞。", + "reproducible_setup": "在 nuPlan 与 CARLA Town05 Long 上选择未受保护左转片段,对感知输入施加几何遮挡掩膜(在 BEV 上把对向车道在自车视线之外的区域置为不可观测),比较多种规划器在掩膜前后的碰撞分布。诊断协议与状态泄漏失败模式不同:遮挡使用几何掩膜,状态泄漏使用输入消融。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", 
"metric:long_tail_success_rate"], + "method_weakness": "占用预测对未观测区域的不确定性建模过于乐观,缺乏置信度校准。", + "partial_solutions": [ + {"idea": "对不确定区域施加显式悲观先验", "citation_or_repo": "BEV-Planner 与 PDM 风险层", "residual_gap": "校准过强会触发死锁失败"}, + {"idea": "引入显式意图推断与反事实分支", "citation_or_repo": "CF-VLA", "residual_gap": "对推理稳定性有更高要求"} + ], + "open_questions": [ + "如何把感知不确定性与规划风险耦合成可学习信号", + "怎样在评测中显式衡量遮挡盲区的处理质量" + ], + "publication_angles": [ + "提出与遮挡几何耦合的规划风险层", + "构建针对盲区行为的诊断基准" + ] + }, + { + "id": "failure_mode:map_prior_overrides_runtime_observation", + "label": "地图先验压倒运行时观测导致违规行为", + "trigger_conditions": [ + "高清地图与实际车道存在临时偏移", + "模型对地图特征的权重显著高于实时视觉特征" + ], + "manifestation": "在施工改道或临时锥桶布置区域,自车按地图原始几何行驶而忽视实地标志。", + "reproducible_setup": "在 Bench2Drive 施工场景或 CARLA 临时改道脚本上比较有无显式地图融合的端到端模型。", + "diagnostic_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate"], + "method_weakness": "地图被视为强先验而非建议,模型未学到何时应拒绝地图。", + "partial_solutions": [ + {"idea": "引入实时车道感知并允许覆盖地图", "citation_or_repo": "MapTR 与 LaneSegNet 系列", "residual_gap": "在地图与实地都不可信时仍困难"}, + {"idea": "用 VLM 做语义优先级判断", "citation_or_repo": "DriveVLM 与 LINGO-2", "residual_gap": "对响应延迟与可解释性要求高"} + ], + "open_questions": [ + "如何在端到端中学习地图与观测的动态信任度", + "怎样把临时几何作为评测维度纳入主流基准" + ], + "publication_angles": [ + "提出地图信任度自适应的端到端框架", + "构建专注临时几何变化的评测套件" + ] + }, + { + "id": "failure_mode:multi_agent_interaction_indecision", + "label": "多车交互中的犹豫造成路线完成度塌陷", + "trigger_conditions": [ + "高速并线或环岛汇入场景", + "对方让与不让具有高度模糊性" + ], + "manifestation": "自车反复减速加速试图寻找间隙,最终未能完成并线或被迫绕行。", + "reproducible_setup": "在 nuPlan 高速并线片段上启用反应式 agent 评测,比较规划器的并线成功率。", + "diagnostic_metrics": ["metric:route_completion", "metric:ride_comfort_index"], + "method_weakness": "规划器缺乏稳定的博弈策略与意图沟通通道。", + "partial_solutions": [ + {"idea": "显式博弈树搜索并融合到规划损失", "citation_or_repo": "GameFormer 与 PDM 系列", "residual_gap": "搜索深度与算力受限"}, + {"idea": "用语言或符号意图提示替代博弈", "citation_or_repo": "DiLu 与 Agent-Driver", "residual_gap": "意图沟通对其它车辆并不可见"} + ], + "open_questions": [ + 
"怎样把人类驾驶员的让步信号建模为可学习交互", + "在没有显式沟通通道时如何学习稳健博弈" + ], + "publication_angles": [ + "提出带显式博弈先验的端到端规划", + "构建并线犹豫的诊断基准" + ] + }, + { + "id": "failure_mode:camera_signal_degradation_silent_failure", + "label": "相机信号级退化时模型未触发降级处理", + "trigger_conditions": [ + "镜头存在水珠、油膜或灰尘", + "强逆光与高动态范围导致局部曝光异常", + "夜间低光照与多光源混合" + ], + "manifestation": "图像 ISP 仍输出有效信号,下游检测器对部分目标输出低置信度或丢失,规划器对感知输出过度信任并保持正常行驶。", + "reproducible_setup": "在 CARLA 中注入针对相机的水珠与高动态范围扰动脚本;在 nuScenes 关键帧上模拟镜头退化并比较有无健康度监测的端到端模型在闭环上的差异。诊断协议针对像素级伪影,与雷达激光雷达的反射退化失败模式分开。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"], + "method_weakness": "缺乏对像素级图像质量的显式监测以及策略级降级机制。", + "partial_solutions": [ + {"idea": "在感知输入端加入图像质量评估头", "citation_or_repo": "工业部署侧的镜头健康度模块", "residual_gap": "学界缺乏公开评测,难以横向比较"}, + {"idea": "在训练中显式注入像素级扰动", "citation_or_repo": "DriveDreamer 增强流水线", "residual_gap": "扰动分布难以覆盖真实镜头失效模式"} + ], + "open_questions": [ + "如何把图像质量信号作为可学习的降级触发", + "如何评估降级策略的可信度而不仅看主指标" + ], + "publication_angles": [ + "提出图像质量监测驱动的策略切换机制", + "构建针对相机信号级退化的诊断基准" + ] + }, + { + "id": "failure_mode:active_sensor_domain_degradation", + "label": "毫米波雷达或激光雷达在特定材质或场景下回波不稳定", + "trigger_conditions": [ + "目标表面对毫米波具有强吸收或镜面反射特性", + "激光雷达对透明或高反光材质回波缺失", + "多车密集场景下雷达多径干扰" + ], + "manifestation": "目标检测置信度稳定但位置或速度估计存在系统性偏差,融合层无法识别这种域级退化。", + "reproducible_setup": "在 nuScenes 与 Waymo Open Motion 中筛选含金属反射或透明材质的关键帧,对比有无雷达激光雷达健康度模块的端到端方法在闭环碰撞率上的差异。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:long_tail_success_rate"], + "method_weakness": "传感器物理模型未被显式纳入感知与规划,融合策略对域级退化默认信任。", + "partial_solutions": [ + {"idea": "把传感器物理一致性写入融合层", "citation_or_repo": "TransFuser 系列融合方法", "residual_gap": "在单传感器域级退化时仍难触发降级"}, + {"idea": "用世界模型补全缺失观测", "citation_or_repo": "DriveDreamer 与 GAIA-1", "residual_gap": "世界模型本身在长尾材质上仍不可靠"} + ], + "open_questions": [ + "如何把传感器物理一致性建模为可学习的端到端信号", + "如何在公开基准上引入受控的物理材质扰动" + ], + "publication_angles": [ + "提出物理一致性驱动的多传感器融合层", + "构建针对域级传感器退化的诊断基准" + ] + }, + { + "id": 
"failure_mode:language_hallucinated_maneuver", + "label": "语言模型规划幻觉导致不安全机动", + "trigger_conditions": [ + "视觉语言模型在分布外场景输出不存在的可行机动", + "提示词或检索到的上下文与当前场景不一致" + ], + "manifestation": "语言层输出形式合法但物理不可行的机动指令,例如在没有空间的情况下选择并线,或对不存在的目标进行让行。", + "reproducible_setup": "在 DriveVLM-Dual 与 Agent-Driver 公开实现上构造包含相似但物理上排他的多个候选机动的提示集,统计语言层输出与物理可行集之间的偏差率。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:rule_compliance_score"], + "method_weakness": "语言层不直接受物理可行性约束,缺乏对运动学边界的反馈通道。", + "partial_solutions": [ + {"idea": "在语言层输出后加入物理可行性验证器", "citation_or_repo": "DriveVLM-Dual 中的双管线门控", "residual_gap": "验证器需要可靠的世界状态估计,本身可能出错"}, + {"idea": "把可行集作为提示词显式注入语言模型", "citation_or_repo": "Agent-Driver 工具调用层", "residual_gap": "提示长度膨胀且对模型规模敏感"} + ], + "open_questions": [ + "如何把可行集的几何与运动学约束作为可微反馈写回语言模型", + "如何在公开基准上自动化检测语言幻觉规划" + ], + "publication_angles": [ + "提出语言层与物理可行性闭环的训练框架", + "构建针对语言幻觉机动的诊断基准" + ] + }, + { + "id": "failure_mode:memory_poisoning_in_language_decision_loop", + "label": "语言驱动决策循环的记忆库被错误经验污染", + "trigger_conditions": [ + "记忆库收纳了未经审计的失败经验或对抗性注入", + "检索机制对相似度阈值不敏感" + ], + "manifestation": "决策循环在新场景上反复检索到错误经验并按其反思,错误率不降反升,且失败模式可被攻击者构造。", + "reproducible_setup": "在 DiLu 公开实现上注入一组与正常经验拓扑相似但建议错误的条目,统计在指定测试场景上的决策错误率随污染比例的变化曲线。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "记忆与反思机制默认相信检索结果,没有内置的可信度估计或人工审计入口。", + "partial_solutions": [ + {"idea": "对记忆条目加入溯源与可信度评分", "citation_or_repo": "DiLu 论文讨论的反思阈值", "residual_gap": "可信度评分本身可能被攻击或漂移"}, + {"idea": "限制记忆容量并强制周期性人工抽检", "citation_or_repo": "工业部署侧的经验库治理实践", "residual_gap": "削弱记忆覆盖度并提高维护成本"} + ], + "open_questions": [ + "如何在没有人工审计的前提下检测污染条目", + "如何把记忆健康度作为可学习的反思信号" + ], + "publication_angles": [ + "提出对抗性记忆注入的攻击与防御基准", + "构建针对记忆中毒的诊断基准" + ] + }, + { + "id": "failure_mode:on_policy_rl_sample_inefficiency_for_safety_critical_events", + "label": "在线策略强化学习对安全关键稀有事件采样效率低导致策略对这些事件欠拟合", + "trigger_conditions": [ + "环境中安全关键事件频率小于每千步 1 次", + "策略更新只看主分布回报", + 
"训练过程中策略本身能避开稀有事件造成自我筛选" + ], + "manifestation": "策略在主分布上的回报曲线持续上升,但在事先标注的安全关键测试集上表现停滞或下滑。", + "reproducible_setup": "在 CartPole 或简化驾驶环境中插入低频但高代价的危险状态,比较普通 PPO 与带有优先经验回放或事件加权的版本在安全关键测试集上的表现。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "无优先化的均匀采样让低频高代价事件得不到足够梯度信号。", + "partial_solutions": [ + {"idea": "对高代价事件做优先回放", "citation_or_repo": "Schaul 2016 prioritized experience replay(原始版本针对 off-policy DQN)", "residual_gap": "PER 在 on-policy 设置下需改写为带重要性采样修正的优先采样,否则会引入策略梯度偏差"}, + {"idea": "用安全约束显式惩罚危险动作", "citation_or_repo": "Achiam 2017 CPO 与 SafetyGym 系列", "residual_gap": "拉格朗日乘子调度不稳定时会震荡(见 failure_mode:safety_constraint_lagrangian_oscillation)"} + ], + "open_questions": [ + "如何对低频高代价事件做出无偏差的策略改进保证", + "如何在自动驾驶闭环里量化这种失败" + ], + "publication_angles": [ + "提出对安全关键事件加权且具有理论保证的策略梯度方法", + "构建针对该失败的诊断基准" + ] + }, + { + "id": "failure_mode:offline_rl_extrapolation_error", + "label": "离线强化学习对分布外动作的 Q 值过估计造成部署崩溃", + "trigger_conditions": [ + "训练数据集没有覆盖某些关键动作或状态分支", + "价值函数对分布外动作仍输出高值" + ], + "manifestation": "训练阶段策略在验证集上的回报看似稳步提升,但部署到环境后立刻偏离行为策略进入低性能区间,且 Q 值与真实回报背离。", + "reproducible_setup": "在 lab_cql_offline_minigrid 上故意构造存在动作覆盖空洞的数据集,比较 BC、离线 SAC、CQL 在部署回报与 Q 值差距上的差异。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "价值函数对分布外动作没有显式下界估计,缺乏保守惩罚。", + "partial_solutions": [ + {"idea": "对分布外动作施加保守惩罚", "citation_or_repo": "Kumar 2020 CQL", "residual_gap": "α 调度敏感,过强会让策略退化为模仿数据集均值;在 D4RL medium-replay 上 α 从默认 1.0 提升到 5.0 通常带来 5 到 10 分的归一化回报下降"}, + {"idea": "约束策略到数据集分布", "citation_or_repo": "Fujimoto 2019 BCQ", "residual_gap": "BCQ 在 D4RL antmaze 等覆盖空洞较大的子集上落后 CQL 约 10 至 20 分归一化回报"} + ], + "open_questions": [ + "如何在不牺牲策略改进的前提下处理数据集覆盖空洞", + "如何把覆盖空洞作为可量化的数据集质量指标" + ], + "publication_angles": [ + "提出基于数据集覆盖几何的保守惩罚", + "构建针对覆盖空洞的离线强化学习诊断基准" + ] + }, + { + "id": "failure_mode:world_model_compounding_imagination_error", + "label": "世界模型想象 rollout 
复合误差让策略学到模型偏差而非真实动力", + "trigger_conditions": [ + "想象 horizon 较长", + "世界模型在状态分布尾部预测误差较大", + "策略在想象与真实数据上权重失衡" + ], + "manifestation": "策略在想象 rollout 上回报曲线持续上升,但在真实环境上回报停滞或下降,且策略行为表现为利用世界模型的预测漏洞。", + "reproducible_setup": "在 lab_dreamer_cartpole_pixels 上扫描想象 horizon 与真实想象样本比例,统计策略在真实环境与想象环境之间的回报差距。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "世界模型预测误差随 horizon 复合,策略对这种偏差没有自适应折扣。", + "partial_solutions": [ + {"idea": "对想象 rollout 长度自适应裁剪", "citation_or_repo": "DreamerV3 与 MBPO 系列", "residual_gap": "裁剪阈值需要经验调参"}, + {"idea": "把世界模型不确定性写入策略更新", "citation_or_repo": "PETS 与基于集成的模型 RL 方法", "residual_gap": "对不确定性估计本身的偏差敏感"} + ], + "open_questions": [ + "如何在自动驾驶世界模型上量化复合误差", + "如何避免策略学到模型预测漏洞" + ], + "publication_angles": [ + "提出基于复合误差的想象 horizon 自适应", + "构建针对世界模型策略漏洞利用的诊断基准" + ] + }, + { + "id": "failure_mode:scaling_bet_failure_on_safety_critical_long_tail", + "label": "规模押注在长尾安全关键事件上失败", + "trigger_conditions": [ + "训练数据虽然总量增加但长尾分布几乎不变", + "失败代价不能用平均损失抵消" + ], + "manifestation": "随着数据与算力规模增加,端到端方法在主分布指标上稳定改进,但在事先标注的安全关键长尾子集上没有同步改进甚至略有下降。", + "reproducible_setup": "在 nuScenes 与 Bench2Drive 上以阶梯数据规模训练同一端到端骨架,单独跟踪长尾子集表现并对比主分布表现的变化。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "规模导向的训练目标对长尾分布的边际收益有上界,缺乏对安全成本的显式建模。", + "partial_solutions": [ + {"idea": "把长尾事件作为加权监督信号", "citation_or_repo": "Sample-reweighting 与 hard example mining", "residual_gap": "在 nuScenes 标注覆盖的长尾子集上加权可带来约 5 至 10% 的相对改进,但对未被标注的长尾类别几乎无效"}, + {"idea": "在端到端流水线之上叠加安全层", "citation_or_repo": "Shielding 与 Safe-RL 方法", "residual_gap": "安全层把对未见长尾的成本上界托管给硬编码规则,回到人工先验路线,与规模押注的核心论点相冲突"} + ], + "open_questions": [ + "如何把安全成本写入与规模兼容的训练目标", + "如何在不破坏端到端可训练性的同时显式覆盖长尾" + ], + "publication_angles": [ + "提出与规模兼容的长尾加权损失", + "构建针对规模押注失败的诊断基准" + ] + }, + { + "id": "failure_mode:safety_constraint_lagrangian_oscillation", + "label": "安全约束拉格朗日乘子调度震荡导致策略学习不稳定", + "trigger_conditions": 
[ + "对偶变量更新步长选择不当", + "约束违反信号与主奖励信号尺度不匹配", + "环境约束随策略迭代而变化" + ], + "manifestation": "策略训练曲线在满足约束与达到目标之间反复跳变,最终既不稳定满足约束也不在主奖励上达到上界。", + "reproducible_setup": "在 SafetyGym 或简化驾驶环境上比较固定步长与自适应步长的拉格朗日更新,统计约束违反与回报的联合演化。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:rule_compliance_score"], + "method_weakness": "对偶变量的更新缺乏对环境非平稳性与梯度尺度的自适应机制。", + "partial_solutions": [ + {"idea": "对偶变量自适应步长", "citation_or_repo": "Achiam 2017 CPO 与 Tessler 2018/2019 reward-constrained 方法(arXiv:1805.11074,ICLR 2019)", "residual_gap": "在多约束时仍可能交替震荡"}, + {"idea": "约束信号尺度归一化", "citation_or_repo": "Safe-RL 综述", "residual_gap": "归一化常数本身需要经验校准"} + ], + "open_questions": [ + "如何在自动驾驶多约束场景下避免拉格朗日震荡", + "如何把约束违反代价对齐到事故级别" + ], + "publication_angles": [ + "提出多约束自适应拉格朗日方法", + "构建针对约束震荡的诊断基准" + ] + }, + { + "id": "failure_mode:counterfactual_branch_distribution_shift", + "label": "反事实分支与真实事故分布偏离导致训练过度悲观", + "trigger_conditions": [ + "反事实分支生成器在某些语义类别上过拟合", + "反事实分支频率显著高于真实事故频率" + ], + "manifestation": "在合成反事实分支评测集上成功率提升,但在真实长尾子集与真实事故案例上没有等量收益,甚至出现过度保守与等待行为。", + "reproducible_setup": "在 CF-VLA 风格训练流水线上对比反事实分支真实度阶梯(高保真度物理仿真 / 中等保真度学习生成 / 低保真度噪声扰动),统计真实事故子集成功率随真实度的变化。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate", "metric:route_completion"], + "method_weakness": "反事实损失对生成器分布的偏差敏感,缺乏对真实分布的对齐机制。", + "partial_solutions": [ + {"idea": "用真实事故重放做反事实分支真实度校准", "citation_or_repo": "CF-VLA 论文讨论与工业事故重放管线", "residual_gap": "真实事故数据稀缺且涉及隐私"}, + {"idea": "把反事实损失权重与生成器置信度耦合", "citation_or_repo": "CF-VLA 与近期反事实策略学习论文", "residual_gap": "生成器置信度本身可能与真实分布无关"} + ], + "open_questions": [ + "如何在公开数据上构造分布对齐的反事实评测集", + "如何在训练中显式约束反事实损失不损害真实分布性能" + ], + "publication_angles": [ + "提出反事实分支真实度的可测量度", + "构建带真实事故子集的反事实评测基准" + ] + } + ] +} diff --git a/docs/data/research/metrics.json b/docs/data/research/metrics.json new file mode 100644 index 0000000..fae62e1 --- /dev/null +++ b/docs/data/research/metrics.json @@ -0,0 +1,182 @@ +{ + "$schema": 
"./schema.json#/node_kinds/metric", + "metrics": [ + { + "id": "metric:open_loop_l2_displacement", + "label": "开环平均位移误差(按 1 秒 / 2 秒 / 3 秒分时段报告,并同时给出均值)", + "formula": "L2_avg = (1/T) * Σ_{t=1..T} || p̂_t − p*_t ||_2 ; L2_τ = || p̂_τ − p*_τ ||_2 for τ ∈ {1s, 2s, 3s}", + "variables": { + "p̂_t": "模型在第 t 步预测的自车位置", + "p*_t": "专家驾驶在第 t 步的真值位置", + "T": "评测时域内的离散步数", + "τ": "分时段刻度,社区惯例取 1 秒、2 秒、3 秒" + }, + "assumptions": [ + "评测分布与训练分布同源", + "ego 状态未在测试时被显式输入", + "未来若干秒内专家轨迹是唯一安全轨迹" + ], + "what_it_proves": [ + "在专家分布上模型对短时回归的拟合质量", + "感知与轨迹回归层的一致性" + ], + "what_it_cannot_prove": [ + "策略在自身诱导分布上的安全性", + "对长尾事件的鲁棒性", + "多模态意图分布的覆盖率" + ], + "known_misuses": [ + "把更低的开环位移误差等同于更安全的部署效果", + "对模型选择只看开环指标而忽视闭环回归", + "通过 ego 状态泄漏制造虚高的位移分数", + "只汇报均值 L2_avg 而隐藏分时段 L2_τ:UniAD 等方法已知存在 1 秒占优但 3 秒劣势的复制现象,单一均值会遮蔽这种相反符号的趋势" + ], + "scope": "适用于以专家轨迹为唯一参考、且时域较短的近距离监督评测。报告时应同时提供 L2_avg 与分时段 L2_1s / L2_2s / L2_3s,并显式说明 ego 状态是否进入模型输入。" + }, + { + "id": "metric:closed_loop_collision_rate", + "label": "闭环碰撞率", + "formula": "CR = (有碰撞的回合数) / (总回合数)", + "variables": { + "回合": "在仿真器或回放系统中执行一次完整路线", + "碰撞": "自车与其它道路使用者的几何包围盒相交" + }, + "assumptions": [ + "仿真器中他车策略足够真实", + "感知输入与训练时一致", + "至少 8 个独立种子并按 bootstrap 报告 95% 置信区间,否则单点比较不具备统计显著性" + ], + "what_it_proves": [ + "策略在闭环分布上避免碰撞的能力", + "感知预测规划链路在端到端循环中的最严重失败率" + ], + "what_it_cannot_prove": [ + "舒适性与社会礼貌", + "对真实路面感知噪声的稳健性", + "罕见但严重事故的尾部风险" + ], + "known_misuses": [ + "只汇报均值而不汇报分布或最大值", + "在单一种子上得出方法优劣的结论", + "对碰撞定义放宽到几何重叠之外的代理" + ], + "scope": "适用于具备真实他车反应模型与受控扰动注入的闭环评测。" + }, + { + "id": "metric:route_completion", + "label": "路线完成度", + "formula": "RC = 实际通过里程 / 路线总里程", + "variables": { + "通过里程": "自车在合法车道上实际行进的距离", + "总里程": "评测路线的设计总长度" + }, + "assumptions": [ + "路线设计涵盖目标域典型几何", + "中断条件来自规则违反或碰撞而非仿真器异常" + ], + "what_it_proves": [ + "策略完成长视野任务的能力", + "出现失败时是否能挽救并继续行驶" + ], + "what_it_cannot_prove": [ + "完成过程的安全性与舒适性", + "对超出训练分布的施工与异常事件的处理" + ], + "known_misuses": [ + "高路线完成度搭配低安全分数仍被汇报为强方法", + "通过缩短路线长度人为提升完成度", + "把完成度作为唯一终极指标,忽略子任务通过率" + 
], + "scope": "适用于具有明确起终点与子段判定的闭环驾驶基准。" + }, + { + "id": "metric:ride_comfort_index", + "label": "乘员舒适综合指标(量纲归一)", + "formula": "Comfort = w_a * (acc_jerk / a_ref) + w_l * (lateral_jerk / a_ref) + w_v * (speed_var / v_ref^2)", + "variables": { + "acc_jerk": "纵向加加速度均方根,单位为米每立方秒", + "lateral_jerk": "横向加加速度均方根,单位为米每立方秒", + "speed_var": "速度方差,单位为米每秒的平方", + "a_ref": "加加速度归一化常数,单位为米每立方秒,使每项无量纲", + "v_ref": "速度归一化常数,单位为米每秒,使方差项无量纲", + "w_a": "纵向加加速度的无量纲权重", + "w_l": "横向加加速度的无量纲权重", + "w_v": "速度方差的无量纲权重" + }, + "assumptions": [ + "权重选择反映乘员体感而非工程偏好", + "评测路线包含足够多的转弯与起停", + "a_ref 与 v_ref 在所有被比较方法上保持一致,否则跨方法对比无效" + ], + "what_it_proves": [ + "策略输出的运动学平滑度", + "决策犹豫与突然反应造成的乘员代价" + ], + "what_it_cannot_prove": [ + "极端事件下乘员伤害风险", + "心理紧张感与可信度等主观维度" + ], + "known_misuses": [ + "把舒适度作为安全的反向代理", + "在没有乘员问卷的情况下校准权重", + "在没有触发事件的常规巡航上声明舒适提升" + ], + "scope": "适用于结合纵向与横向运动学评测的闭环或回放协议。" + }, + { + "id": "metric:rule_compliance_score", + "label": "交通规则合规分数(按段二值乘积,避免速率超 1)", + "formula": "RuleScore = Π_i (1 − p_i) where p_i = (segments with violation of rule i) / (total evaluation segments) and 0 ≤ p_i ≤ 1", + "variables": { + "p_i": "第 i 类规则在评测段上的违反概率,每段最多记一次", + "i": "覆盖速度、车道、信号、礼让等多类规则" + }, + "assumptions": [ + "规则枚举覆盖目标司法管辖区的核心条款", + "规则违反检测器自身没有显著漏报" + ], + "what_it_proves": [ + "策略在规范驾驶维度上的表现", + "训练目标是否被规则化奖励所引导" + ], + "what_it_cannot_prove": [ + "在规则不完备时的合理行为", + "对模糊规则的人类判断一致性" + ], + "known_misuses": [ + "把所有规则违反等同处理而忽略严重度", + "在不公开的内部规则集上自评", + "通过软化检测阈值制造合规假象" + ], + "scope": "适用于支持显式规则枚举的闭环或回放评测。" + }, + { + "id": "metric:long_tail_success_rate", + "label": "长尾事件成功率", + "formula": "LTSR = 成功通过的长尾片段数 / 总长尾片段数", + "variables": { + "长尾片段": "经过场景挖掘标记的低频高风险序列", + "成功": "片段内无碰撞、无关键规则违反、按合理速度通过" + }, + "assumptions": [ + "长尾片段挖掘协议公开可比", + "片段标注准确", + "失败定义涵盖所有关键模态" + ], + "what_it_proves": [ + "策略在分布外或低频事件上的稳健性", + "在关注成本敏感场景时是否带来真实改进" + ], + "what_it_cannot_prove": [ + "全分布平均性能", + "对未见过的全新长尾类别的迁移能力" + ], + "known_misuses": [ + "把基础分布上的提升伪装成长尾改进", + "片段筛选过程不公开以致结果不可复现", + 
"成功定义过于宽松以致碰撞与违规都被通过" + ], + "scope": "适用于配有长尾标注与场景挖掘脚本的闭环评测。\"合理速度\"应在协议中显式定义,例如\"不低于该段限速的 60% 且不超过限速的 110%\",避免跨实验室定义漂移。" + } + ] +} diff --git a/docs/data/research/node_overlay.json b/docs/data/research/node_overlay.json new file mode 100644 index 0000000..58d50b3 --- /dev/null +++ b/docs/data/research/node_overlay.json @@ -0,0 +1,134 @@ +{ + "generated_by": "tools/build_research_overlay.py", + "version": 1, + "subjects": { + "essay:bitter_lesson": { + "evidence_strength": 2, + "dispute_level": 3, + "reproducibility_status_score": 2, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 2 + }, + "paper:2210.14222": { + "evidence_strength": 2, + "dispute_level": 1, + "reproducibility_status_score": 3, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 3 + }, + "paper:2212.10156": { + "evidence_strength": 3, + "dispute_level": 2, + "reproducibility_status_score": 3, + "failure_boundary_count": 2, + "claim_count": 2, + "maturity": 3 + }, + "paper:2309.16292": { + "evidence_strength": 2, + "dispute_level": 2, + "reproducibility_status_score": 2, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 2 + }, + "paper:2311.10813": { + "evidence_strength": 2, + "dispute_level": 2, + "reproducibility_status_score": 2, + "failure_boundary_count": 2, + "claim_count": 1, + "maturity": 2 + }, + "paper:2402.12289": { + "evidence_strength": 2, + "dispute_level": 1, + "reproducibility_status_score": 2, + "failure_boundary_count": 2, + "claim_count": 1, + "maturity": 2 + }, + "paper:2512.24426": { + "evidence_strength": 1, + "dispute_level": 2, + "reproducibility_status_score": 1, + "failure_boundary_count": 2, + "claim_count": 1, + "maturity": 1 + }, + "paper:hafner2020_dreamer": { + "evidence_strength": 3, + "dispute_level": 1, + "reproducibility_status_score": 3, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 3 + }, + "paper:kumar2020_cql": { + "evidence_strength": 3, + "dispute_level": 1, + "reproducibility_status_score": 
3, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 3 + }, + "paper:schulman2017_ppo": { + "evidence_strength": 3, + "dispute_level": 1, + "reproducibility_status_score": 3, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 3 + }, + "paper:vadv2": { + "evidence_strength": 2, + "dispute_level": 1, + "reproducibility_status_score": 2, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 2 + } + }, + "scenarios": [ + "scenario:unprotected_left_turn_with_occlusion", + "scenario:dense_pedestrian_crosswalk_at_night", + "scenario:highway_merge_at_speed_differential", + "scenario:construction_zone_with_cone_lane_shift", + "scenario:heavy_rain_with_camera_lens_droplet", + "scenario:long_tail_rare_object_on_road", + "scenario:unprotected_merge_into_ring_road", + "scenario:low_speed_perpendicular_parking", + "scenario:sensor_calibration_drift", + "scenario:school_zone_speed_compliance", + "scenario:emergency_vehicle_yield" + ], + "failure_modes": [ + "failure_mode:ego_status_leakage", + "failure_mode:closed_loop_deadlock_under_uncertainty", + "failure_mode:long_tail_object_recognition_miss", + "failure_mode:ride_comfort_violation_due_to_late_braking", + "failure_mode:occlusion_blind_spot_overconfidence", + "failure_mode:map_prior_overrides_runtime_observation", + "failure_mode:multi_agent_interaction_indecision", + "failure_mode:camera_signal_degradation_silent_failure", + "failure_mode:active_sensor_domain_degradation", + "failure_mode:language_hallucinated_maneuver", + "failure_mode:memory_poisoning_in_language_decision_loop", + "failure_mode:on_policy_rl_sample_inefficiency_for_safety_critical_events", + "failure_mode:offline_rl_extrapolation_error", + "failure_mode:world_model_compounding_imagination_error", + "failure_mode:scaling_bet_failure_on_safety_critical_long_tail", + "failure_mode:safety_constraint_lagrangian_oscillation", + "failure_mode:counterfactual_branch_distribution_shift" + ], + "argument_chains": [ + 
"chain:planning_oriented_query_sharing", + "chain:dual_system_for_long_tail", + "chain:counterfactual_branches_as_safety_signal", + "chain:offline_rl_versus_imitation_under_distribution_shift", + "chain:safety_constraint_layering_for_end_to_end_planning", + "chain:closed_loop_eval_protocol_audit" + ] +} diff --git a/docs/data/research/round_summary.md b/docs/data/research/round_summary.md new file mode 100644 index 0000000..9ff4282 --- /dev/null +++ b/docs/data/research/round_summary.md @@ -0,0 +1,65 @@ +# 每轮改造对照表 + +> 每轮合并前都需要回答六个判定问题:是否更接近论文产出、是否减少了空洞叙事、是否增加了可证伪证据、是否帮助用户提出新问题、是否能经受审稿人质疑、是否通过实际页面与截图验证。本文件作为研究资产长期保留。 + +## Round 1(结构化研究层 + 论文产出工作台) + +- 是否更接近论文产出:是。把仓库从"卡片 + 图谱"扩展为可证伪主张、论证链、场景、数据集、指标、失败模式、实验计划七类结构化节点,研究者可以直接按节点组装论文骨架。 +- 是否减少了空洞叙事:是。所有主张都被拆为声明、证据、前提、反例、边界、复现配方、可投稿价值;不允许任何字段空缺。 +- 是否增加了可证伪证据:是。每条主张引用具体表号、消融或可复现脚本,并由 `tools/validate_research.py` 在 CI 中强制覆盖。 +- 是否帮助用户提出新问题:是。失败模式视图直接列出开放问题与可投稿切入点。 +- 是否能经受审稿人质疑:部分。第一轮内容由独立审查代理审过并暴露了若干 Top 5 漏洞(见 cross_review_round1.md)。 +- 是否通过实际页面与截图验证:部分。工作台与三维星图所有路由在本地静态服务下返回 200,但尚未做正式截图回归。 + +## Round 2(落地 Round 1 审计 + 扩场景与失败模式) + +- 是否更接近论文产出:是。把 BEV-Planner 风格的最严肃反例写回 UniAD 主张,把指标公式校正到维度一致,把数据集的能与不能证明明确列出。 +- 是否减少了空洞叙事:是。把"显著低于"等模糊措辞替换为分时段定量阈值;把 nuScenes / nuPlan / Bench2Drive 的标准审稿人攻击点写进 limits / common_misuses。 +- 是否增加了可证伪证据:是。场景数由 6 扩到 11 并加入定量触发阈值,使自动化场景挖掘有可执行判定条件。 +- 是否帮助用户提出新问题:是。新增 3 类失败模式(语言模型规划幻觉、语言驱动决策循环记忆中毒、反事实分支与真实事故分布漂移),每一类都附带可投稿切入点。 +- 是否能经受审稿人质疑:是。所有指标公式现在通过量纲检查;所有 Tier-2 实验加入"至少 8 种子加 bootstrap 95% 置信区间"硬要求。 +- 是否通过实际页面与截图验证:部分。新增的论文聚合视图与视觉编码图例在本地静态服务下表现正常。 + +## Round 3(扩展到 RL / 离线 RL / 世界模型 / 安全约束) + +- 是否更接近论文产出:是。把强化学习骨干、离线强化学习、世界模型、安全约束四个核心方向都纳入结构化研究层,研究者可以选择从任意方向切入。 +- 是否减少了空洞叙事:是。Bitter Lesson 被改写为可证伪叙述,附带具体的可观测变量与单调阈值。 +- 是否增加了可证伪证据:是。新增 5 类失败模式与 2 条论证链直接覆盖原本缺失的研究方向。 +- 是否帮助用户提出新问题:是。每条新失败模式都标出可投稿切入点。 +- 是否能经受审稿人质疑:发表前不行——独立审查代理立刻在 Round 3 添加的内容中找出 PPO 表号误引、CARLA 与 CartPole 数据集错位、subject_papers 与 method_mechanism 内部矛盾、3% 魔术数字等问题(见 cross_review_round3.md)。 +- 
是否通过实际页面与截图验证:部分。所有新数据在本地静态服务下正常加载。 + +## Round 3.5(CI 加固 + 截图回归脚手架 + 工作台分享链接) + +- 是否更接近论文产出:是。新加的 wb-anchor-pulse 与点击标题复制深链使评审讨论可以直接定位证据。 +- 是否减少了空洞叙事:不直接。这一轮是基础设施。 +- 是否增加了可证伪证据:不直接。 +- 是否帮助用户提出新问题:是。screenshot_regression.py 让 UI 变化在 CI 中可被察觉,配置好 Playwright 后会强制每次视觉变更被审阅。 +- 是否能经受审稿人质疑:CI 与门禁层面提升。 +- 是否通过实际页面与截图验证:部分。脚手架已就位,缺少正式 Playwright 安装。 + +## Round 4(落地 Round 3 审计) + +- 是否更接近论文产出:是。把 CQL、BCQ、Dreamer 升格为正式 graph 节点;每条相关主张的 subject 与 reproduction.public_data 都对齐到真实节点。 +- 是否减少了空洞叙事:是。PPO 表号校正,三条 RL claim 的 CARLA 错位修正,Bitter Lesson 从"差距同向变化"改写为"翻倍数据规模时差距单调收窄不少于 20%"。 +- 是否增加了可证伪证据:是。新增 dataset:rl_classic_control_suite 让经典控制基准的 limits 与 misuses 进入结构化研究层。 +- 是否帮助用户提出新问题:是。chain:offline_rl_versus_imitation_under_distribution_shift 加入"D4RL 上 CQL 强于 BC 可能来源于归一化与超参"这一最锋利的审稿人攻击。 +- 是否能经受审稿人质疑:是。chain:safety_constraint 的 3% 魔术数字替换为 nuPlan 官方分数容忍区间锚定,确认偏倚陷阱被改写。 +- 是否通过实际页面与截图验证:部分。全部路由在本地静态服务下返回 200。 + +--- + +## 累计交付(截至 Round 4) + +| 类别 | Round 1 | Round 2 | Round 3 | Round 4 | +|---|---|---|---|---| +| 可证伪主张 | 8 | 8 | 12 | 12 | +| 论文论证链 | 4 | 4 | 6 | 6 | +| 场景 | 6 | 11 | 11 | 11 | +| 数据集 | 6 | 6 | 6 | 7 | +| 指标 | 6 | 6 | 6 | 6 | +| 失败模式 | 8 | 12 | 17 | 17 | +| 实验计划 | 4 | 4 | 6 | 6 | +| graph 节点 | 75 | 75 | 75 | 78 | +| graph 边 | 122 | 122 | 122 | 128 | +| 交叉审查报告 | 1 | 1 | 2 | 2 | diff --git a/docs/data/research/scenarios.json b/docs/data/research/scenarios.json new file mode 100644 index 0000000..41bf671 --- /dev/null +++ b/docs/data/research/scenarios.json @@ -0,0 +1,115 @@ +{ + "$schema": "./schema.json#/node_kinds/scenario", + "scenarios": [ + { + "id": "scenario:unprotected_left_turn_with_occlusion", + "label": "未受保护左转且对向车被前车遮挡", + "description": "自车需要在没有保护相位的情况下完成左转,对向直行车被等候左转的前车整体或部分遮挡。定量触发阈值:对向车道在自车视线中的可观测比例小于 60% 且持续不少于 3 秒,自车与对向车的最短距离小于 25 米。", + "why_hard": "感知缺失阶段需要由意图推断与占用预测填补,闭环行为对延迟和谨慎度高度敏感,同时存在'过分谨慎导致永远不出发'与'冒进导致碰撞'的双侧失败。", + "current_best_methods": ["paper:2212.10156", "paper:2402.12289", "paper:vadv2"], + "open_failure_modes": 
["failure_mode:occlusion_blind_spot_overconfidence", "failure_mode:closed_loop_deadlock_under_uncertainty"], + "available_datasets": ["dataset:nuplan_planning", "dataset:carla_town05_long", "dataset:bench2drive"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:route_completion", "metric:long_tail_success_rate"] + }, + { + "id": "scenario:dense_pedestrian_crosswalk_at_night", + "label": "夜间或弱光下的密集人行横道", + "description": "在低光照与混合光源条件下,多名行人以非均匀步态横穿,部分行人会回头、改变速度或在车前停顿。定量触发阈值:环境光小于 30 勒克斯且自车前方 30 米内有 3 名以上行人,平均速度小于每秒 1.5 米。", + "why_hard": "相机信噪比下降,人体姿态线索退化,纯视觉模型容易丢失个体身份;意图建模与温柔减速的耦合直接影响乘员舒适与碰撞风险。", + "current_best_methods": ["paper:2212.10156", "paper:transfuser"], + "open_failure_modes": ["failure_mode:camera_signal_degradation_silent_failure", "failure_mode:ride_comfort_violation_due_to_late_braking"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index", "metric:rule_compliance_score"] + }, + { + "id": "scenario:highway_merge_at_speed_differential", + "label": "高速公路上速度差显著的并线汇入", + "description": "自车从匝道汇入主路。定量触发阈值:自车与主路平均车流的纵向速度差大于每秒 8 米,可用并线窗口长度小于 35 米,主路车流密度大于每千米 30 辆。", + "why_hard": "动作必须在长视野中具备前瞻性,主路车辆的让与不让本身就是博弈,规划器需要在不可观测的对方意图下做稳健决策。", + "current_best_methods": ["paper:2210.14222", "paper:vadv2"], + "open_failure_modes": ["failure_mode:multi_agent_interaction_indecision"], + "available_datasets": ["dataset:waymo_open_motion", "dataset:nuplan_planning"], + "evaluation_metrics": ["metric:route_completion", "metric:closed_loop_collision_rate", "metric:rule_compliance_score"] + }, + { + "id": "scenario:construction_zone_with_cone_lane_shift", + "label": "施工区临时锥桶车道偏移", + "description": "正常车道被临时锥桶封闭并向左偏移半个车道宽度。定量触发阈值:高清地图车道中线与运行时车道中线水平偏差大于 1.2 米,并且偏差区域长度大于 30 米,存在施工标志或锥桶序列。", + "why_hard": "高清地图与训练数据中很少出现此类临时几何,依赖地图先验的模型容易直接撞锥桶;视觉到行为的映射缺少足够的训练样本。", + "current_best_methods": ["paper:2212.10156", "paper:2402.12289"], + 
"open_failure_modes": ["failure_mode:map_prior_overrides_runtime_observation", "failure_mode:long_tail_object_recognition_miss"], + "available_datasets": ["dataset:carla_town05_long", "dataset:bench2drive"], + "evaluation_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate", "metric:long_tail_success_rate"] + }, + { + "id": "scenario:heavy_rain_with_camera_lens_droplet", + "label": "暴雨且相机镜头存在水珠", + "description": "雨势遮蔽路面标线与远处目标,镜头水珠造成局部图像退化或离散光斑。定量触发阈值:降雨强度大于每小时 16 毫米且镜头退化区域占图像有效面积大于 8%,路面湿滑系数低于 0.5。", + "why_hard": "纯视觉端到端模型对镜头退化敏感,水珠形成的伪边缘可能被检测器误识别为目标;需要在不可靠观测下保持合理速度而非急停。", + "current_best_methods": [], + "open_failure_modes": ["failure_mode:camera_signal_degradation_silent_failure"], + "available_datasets": ["dataset:carla_town05_long", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "scenario:long_tail_rare_object_on_road", + "label": "路面长尾稀有物体", + "description": "出现训练分布之外的可碰撞物体,例如掉落的家具、可乐瓶、施工材料、低矮动物等。定量触发阈值:物体在公开训练集上的类别频率小于 0.05%,几何高度低于 40 厘米或宽度小于 30 厘米,处于自车规划轨迹的纵向 20 米内。", + "why_hard": "训练分布外目标在监督数据中极度稀少,类别失衡使得检测器倾向忽略;规划层即便看见也可能在不知道目标类别属性时做错决策。", + "current_best_methods": ["paper:2402.12289", "paper:2311.10813"], + "open_failure_modes": ["failure_mode:long_tail_object_recognition_miss"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:bench2drive"], + "evaluation_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"] + }, + { + "id": "scenario:unprotected_merge_into_ring_road", + "label": "环岛无保护汇入与连续让行", + "description": "自车进入双车道环岛并需要在多次让行后变道驶出。定量触发阈值:环岛半径小于 25 米,环道车流密度大于每千米 40 辆,连续让行需求大于 2 次。", + "why_hard": "环岛是连续多车交互的最小封闭场景,对意图建模、让行时机与稳健博弈同时提出要求。规则与让行优先级在不同国家差异显著,难以从单一数据集泛化。", + "current_best_methods": ["paper:2210.14222", "paper:vadv2"], + "open_failure_modes": ["failure_mode:multi_agent_interaction_indecision", "failure_mode:closed_loop_deadlock_under_uncertainty"], + "available_datasets": 
["dataset:nuplan_planning", "dataset:carla_town05_long"], + "evaluation_metrics": ["metric:route_completion", "metric:closed_loop_collision_rate", "metric:rule_compliance_score"] + }, + { + "id": "scenario:low_speed_perpendicular_parking", + "label": "低速垂直泊车与紧贴障碍", + "description": "自车在停车场内执行垂直泊车,邻位有车且空间紧窄。定量触发阈值:可用泊位横向宽度小于 2.4 米,自车纵向速度低于每秒 1.5 米,距离最近障碍小于 30 厘米。", + "why_hard": "感知必须在近距离与低速下保留高分辨几何精度,对舒适与碰撞同时敏感,并要在自车与障碍的微小空间内多次微调。", + "current_best_methods": [], + "open_failure_modes": ["failure_mode:active_sensor_domain_degradation", "failure_mode:ride_comfort_violation_due_to_late_braking"], + "available_datasets": ["dataset:nuscenes_planning"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "scenario:sensor_calibration_drift", + "label": "传感器外参漂移与时间同步偏差", + "description": "相机与激光雷达的外参在长时间使用后产生小幅漂移。定量触发阈值:外参旋转误差大于 0.5 度或平移误差大于 5 厘米;时间同步误差大于 30 毫秒。", + "why_hard": "外参漂移在主指标上无显著早期信号,闭环失败往往集中在远距离或高速场景。多数端到端模型对外参变化没有显式建模。", + "current_best_methods": [], + "open_failure_modes": ["failure_mode:active_sensor_domain_degradation"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:long_tail_success_rate"] + }, + { + "id": "scenario:school_zone_speed_compliance", + "label": "学区限速与儿童意图建模", + "description": "自车进入学区或临时低速区,路侧有儿童与监护人。定量触发阈值:限速由 50 公里每小时降为 30 公里每小时,路侧 8 米内有 3 名以上儿童,路面上方有低速标志或闪灯。", + "why_hard": "需要在快速合规减速的同时正确预测儿童意图与监护人姿态。规则合规与舒适、避碰需要平衡。", + "current_best_methods": ["paper:2212.10156"], + "open_failure_modes": ["failure_mode:ride_comfort_violation_due_to_late_braking", "failure_mode:multi_agent_interaction_indecision"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "scenario:emergency_vehicle_yield", + "label": 
"紧急车辆鸣笛靠近的让行决策", + "description": "自车在城市道路上听到或检测到鸣笛与警灯,需要安全靠边让行。定量触发阈值:紧急车辆与自车的相对距离小于 80 米且接近速度大于每秒 8 米,自车需要在 5 秒内开始让行机动。", + "why_hard": "声学信号在端到端视觉模型中往往缺失,需融合多模态线索;让行机动可能违反正常规则但符合更高优先级合规要求。", + "current_best_methods": [], + "open_failure_modes": ["failure_mode:language_hallucinated_maneuver", "failure_mode:multi_agent_interaction_indecision"], + "available_datasets": ["dataset:carla_town05_long"], + "evaluation_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate", "metric:route_completion"] + } + ] +} diff --git a/docs/data/research/schema.json b/docs/data/research/schema.json new file mode 100644 index 0000000..7602a6d --- /dev/null +++ b/docs/data/research/schema.json @@ -0,0 +1,118 @@ +{ + "$comment": "Schema for the falsifiable research layer. Each file under docs/data/research/ obeys one of the shapes below. Quality gate: tools/validate_research.py.", + "version": 1, + "node_kinds": { + "claim": { + "purpose": "把节点拆成可审查的研究主张。每条主张必须能被某种证据证伪。", + "required_fields": { + "id": "唯一标识,形如 claim:uniad_query_sharing_helps_planning", + "subject": "主张归属的节点 id,例如 paper:2212.10156", + "statement": "这项工作或洞察真正声称的内容,一句完整通顺的中文。", + "evidence": "支撑材料数组。每项是 {kind: ablation|table|theorem|repro|external_benchmark, source: 文献页码/表号/复现仓库, finding: 一句概括}。", + "preconditions": "结论成立所需要的数据、场景、传感器、模型规模或训练设定数组。", + "counterexamples": "在哪些条件下结论可能不成立,数组。", + "boundaries": "适用范围到哪里为止,数组。", + "reproduction": "{minimal: 最小复现实验, public_data: 公开数据集 id, cost_hours: 预计 GPU 小时, expected_output: 预期可观察到的现象}。", + "publication_value": "可投稿价值:机制解释 / 系统改进 / 基准构建 / 失败模式发现 之一或多项。", + "dispute_level": "0..3 数值,0=社区共识,3=高度争议。", + "evidence_strength": "0..3 数值,0=主张,3=有强公开基准与可复现脚本支撑。", + "reproducibility_status": "verified | partial | inferred | speculative。", + "related_claims": "关联主张数组,可为空。", + "related_failure_modes": "相关失败模式 id 数组,可为空。" + } + }, + "argument_chain": { + "purpose": "把每个成熟选题写成完整论文骨架。", + "required_fields": { + "id": "唯一标识,形如 chain:uniad_query_sharing_for_planning", 
+ "title": "选题中文标题,一句完整表述。", + "subject_papers": "主要论证标的节点 id 数组。", + "research_gap": "现有工作为什么不够,一段叙述。", + "core_claim": "本文要证明什么,一段叙述。", + "method_mechanism": "新方法为什么应该有效,一段叙述。", + "key_experiments": "哪些实验能证明机制,数组。", + "strong_baselines": "必须击败或解释的已有方法节点 id 数组。", + "ablations": "去掉哪些组件能验证贡献,数组。", + "negative_results": "哪些失败结果反而能澄清边界,数组。", + "reviewer_attacks": "最可能被质疑的地方,数组。", + "response_experiments": "如何提前堵住质疑,数组。", + "figure_plan": "图或表对应论证链哪一环,数组。", + "related_scenarios": "scenario 节点 id 数组。", + "related_datasets": "dataset 节点 id 数组。", + "related_metrics": "metric 节点 id 数组。" + } + }, + "scenario": { + "purpose": "把研究从方法名比较改为场景与证据比较。", + "required_fields": { + "id": "形如 scenario:occluded_left_turn_intersection", + "label": "场景中文标签,一句完整表述。", + "description": "触发条件、典型几何与时序结构。", + "why_hard": "为什么这个场景对当前方法依然困难。", + "current_best_methods": "目前在该场景表现最强的方法节点 id 数组。", + "open_failure_modes": "在该场景上仍然存在的失败模式 id 数组。", + "available_datasets": "支持该场景研究的 dataset id 数组。", + "evaluation_metrics": "应使用的 metric id 数组。" + } + }, + "dataset": { + "purpose": "明确每个数据集能证明什么、不能证明什么。", + "required_fields": { + "id": "形如 dataset:nuscenes_planning", + "label": "数据集名称与版本。", + "scale": "采集小时数、城市、传感器配置。", + "supports": "数据集能够支撑的研究主张类型数组。", + "limits": "数据集的边界与系统性盲点数组。", + "common_misuses": "常见误用模式与其后果数组。", + "covers_scenarios": "scenario id 数组。", + "license": "许可与商用条款。" + } + }, + "metric": { + "purpose": "明确每个指标能证明什么、不能证明什么、常见误用是什么。", + "required_fields": { + "id": "形如 metric:open_loop_l2_displacement", + "label": "指标中文名称。", + "formula": "公式或定义。变量必须给出含义。", + "variables": "{符号: 含义} 字典。", + "assumptions": "公式成立的前提数组。", + "what_it_proves": "这个指标能支撑的结论类型数组。", + "what_it_cannot_prove": "这个指标不能支撑的结论数组。", + "known_misuses": "常见误用与后果数组。", + "scope": "适用范围说明。" + } + }, + "failure_mode": { + "purpose": "把失败从附带说明升级为研究资产。", + "required_fields": { + "id": "形如 failure_mode:ego_status_leakage", + "label": "失败模式中文标题,一句完整表述。", + "trigger_conditions": "触发条件数组。", + "manifestation": "失败的可观察表现。", + 
"reproducible_setup": "可复现场景:数据集 / 闭环 / 单元测试。", + "diagnostic_metrics": "用来发现该失败的指标 id 或诊断协议数组。", + "method_weakness": "对应方法的根本短板。", + "partial_solutions": "已有半解数组,每项是 {idea, citation_or_repo, residual_gap}。", + "open_questions": "仍未解决的问题数组。", + "publication_angles": "可形成论文的切入点数组。" + } + }, + "experiment_plan": { + "purpose": "每个核心方向至少形成三层实验。", + "required_fields": { + "id": "形如 experiment_plan:planning_oriented_query_sharing", + "title": "实验计划中文标题。", + "subject": "对应的 argument_chain id 或 paper id。", + "tier_1_minimal_mechanism": "{purpose, environment, model, metrics, success_criteria, runtime_hours, expected_signal}。", + "tier_2_public_benchmark": "{purpose, datasets, baselines, metrics, success_criteria, compute_budget, expected_signal}。", + "tier_3_stress_test": "{purpose, distributions, perturbations, latency_budget, sensor_dropout, counterfactual_branches, success_criteria}。" + } + } + }, + "shared_axes": { + "evidence_strength": {"0": "尚无公开证据", "1": "单篇论文表格", "2": "多篇论文 + 复现", "3": "强公开基准 + 可复现脚本"}, + "dispute_level": {"0": "社区共识", "1": "主流但有反例", "2": "明显分歧", "3": "高度争议"}, + "reproducibility_status": ["verified", "partial", "inferred", "speculative"], + "maturity": {"0": "推测", "1": "原型", "2": "公开实现", "3": "已在多基准复现"} + } +} diff --git a/docs/index.html b/docs/index.html index dd0a9b4..7ccd00f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -53,10 +53,11 @@

- + 论文产出工作台 + 2D
@@ -115,6 +116,17 @@

关系连线

    +
    +

    视觉编码 · 研究维度

    +
      +
    • 节点尺寸 · 证据强度与研究成熟度
    • +
    • 饱和度降低 · 争议程度更高
    • +
    • 暖色偏移 · 失败边界数更多
    • +
    • 未被结构化研究层覆盖的节点
    • +
    +

    详细的可证伪主张、失败模式与实验计划请进入 论文产出工作台

    +
    +

    时间轴

    diff --git a/docs/js/atlas-main.js b/docs/js/atlas-main.js index 89d7f11..9a199ac 100644 --- a/docs/js/atlas-main.js +++ b/docs/js/atlas-main.js @@ -22,6 +22,7 @@ import { CardRenderer } from "./atlas-cards.js"; import { AtlasUI } from "./atlas-ui.js"; const GRAPH_PATH = "data/graph_extended.json"; +const RESEARCH_OVERLAY_PATH = "data/research/node_overlay.json"; async function loadGraph() { let r; @@ -36,6 +37,25 @@ async function loadGraph() { } } +// Decorate the graph's nodes with research-substance metadata so that +// visual encoding can reflect evidence strength, dispute level, +// reproducibility maturity and failure-boundary count. Silent on errors — +// the overlay is optional and the atlas must still render without it. +async function attachResearchOverlay(graph) { + try { + const r = await fetch(RESEARCH_OVERLAY_PATH); + if (!r.ok) return; + const overlay = await r.json(); + const subjects = overlay?.subjects || {}; + for (const node of graph.nodes) { + const info = subjects[node.id]; + if (info) node.research = info; + } + } catch (e) { + // overlay is optional; the atlas keeps working without it. 
+ } +} + (async function main() { const status = (msg) => { const el = document.getElementById("loadingStatus"); @@ -43,6 +63,7 @@ async function loadGraph() { }; status("正在加载图谱数据……"); const graph = await loadGraph(); + await attachResearchOverlay(graph); status(`已就绪 ${graph.nodes.length} 节点 · ${graph.edges.length} 条关系,正在搭建星图……`); // ---------- Scene, renderer, camera ---------- diff --git a/docs/js/atlas-render.js b/docs/js/atlas-render.js index 10050f0..e4acafe 100644 --- a/docs/js/atlas-render.js +++ b/docs/js/atlas-render.js @@ -119,7 +119,13 @@ function createInstancedFamily(family, nodes, topicPalette) { for (let i = 0; i < count; i++) { const node = nodes[i]; const tierMul = TIER_SIZE[node.tier] || 1.0; - const size = (visual.baseSize || 1.0) * tierMul; + // Research-substance visual binding: maturity inflates size, dispute + // dims saturation, failure-boundary count is signalled by a red lift on + // the emissive channel. Nodes without research data render unchanged. + const r = node.research || null; + const maturityMul = r ? (1.0 + 0.10 * (r.maturity || 0)) : 1.0; + const evidenceMul = r ? (1.0 + 0.06 * (r.evidence_strength || 0)) : 1.0; + const size = (visual.baseSize || 1.0) * tierMul * maturityMul * evidenceMul; node._visualSize = size; node._visualFamily = family; m.makeScale(size, size, size); @@ -132,6 +138,16 @@ function createInstancedFamily(family, nodes, topicPalette) { colorTmp.r = Math.min(1, colorTmp.r + lift); colorTmp.g = Math.min(1, colorTmp.g + lift); colorTmp.b = Math.min(1, colorTmp.b + lift); + if (r) { + // Dispute desaturates toward grey; failure boundaries push toward warm. 
+ const disputeFade = Math.min(0.5, 0.12 * (r.dispute_level || 0)); + const fbWarm = Math.min(0.25, 0.06 * (r.failure_boundary_count || 0)); + const grey = 0.55; + colorTmp.r = colorTmp.r * (1 - disputeFade) + grey * disputeFade + fbWarm; + colorTmp.g = colorTmp.g * (1 - disputeFade) + grey * disputeFade; + colorTmp.b = colorTmp.b * (1 - disputeFade) + grey * disputeFade; + colorTmp.r = Math.min(1, colorTmp.r); + } mesh.setColorAt(i, colorTmp); } mesh.instanceMatrix.needsUpdate = true; diff --git a/docs/js/atlas-ui.js b/docs/js/atlas-ui.js index abf233c..5b6908c 100644 --- a/docs/js/atlas-ui.js +++ b/docs/js/atlas-ui.js @@ -16,7 +16,7 @@ export class AtlasUI { activeYearMax: 2026, searchQuery: "", layer: "galaxy", - autoSpin: true, + autoSpin: false, showEdges: true, showLabels: true, }; diff --git a/docs/js/workbench.js b/docs/js/workbench.js new file mode 100644 index 0000000..52b8fa6 --- /dev/null +++ b/docs/js/workbench.js @@ -0,0 +1,772 @@ +// Paper Production Workbench — research-first UI. +// Reads the structured research layer (claims, argument chains, scenarios, +// datasets, metrics, failure modes, experiment plans) and renders the views +// described in workbench.html. No 3D, no auto-spin, no decorative animation. + +const RESEARCH = { + claims: "data/research/claims.json", + chains: "data/research/argument_chains.json", + scenarios: "data/research/scenarios.json", + datasets: "data/research/datasets.json", + metrics: "data/research/metrics.json", + failures: "data/research/failure_modes.json", + experiments: "data/research/experiment_plans.json", +}; + +const BASKET_STORAGE_KEY = "wb-basket-v1"; + +function loadBasketFromStorage() { + try { + const raw = localStorage.getItem(BASKET_STORAGE_KEY); + if (!raw) return new Set(); + const arr = JSON.parse(raw); + return Array.isArray(arr) ? 
new Set(arr) : new Set(); + } catch { return new Set(); } +} + +function saveBasketToStorage(basket) { + try { + localStorage.setItem(BASKET_STORAGE_KEY, JSON.stringify(Array.from(basket))); + } catch { /* ignore quota errors */ } +} + +const state = { + view: "claims", + subview: "scenarios", + search: "", + evidenceFilter: new Set(["0", "1", "2", "3"]), + disputeFilter: new Set(["0", "1", "2", "3"]), + reproFilter: new Set(["verified", "partial", "inferred", "speculative"]), + paperFilter: new Set(), + basket: loadBasketFromStorage(), + data: {}, +}; + +async function loadAll() { + const entries = Object.entries(RESEARCH); + const results = await Promise.all(entries.map(async ([key, url]) => { + const r = await fetch(url); + return [key, await r.json()]; + })); + for (const [key, val] of results) state.data[key] = val; +} + +function $(sel, root = document) { return root.querySelector(sel); } +function $$(sel, root = document) { return Array.from(root.querySelectorAll(sel)); } + +function escapeHtml(s) { + return String(s ?? "").replace(/[<>&"]/g, c => ({ "<":"<",">":">","&":"&","\"":""" }[c])); +} + +function mdInline(text) { + // Light inline rendering: allow bold/italic/code/links via marked, sanitize via DOMPurify. + if (window.marked && window.DOMPurify) { + return window.DOMPurify.sanitize(window.marked.parseInline(String(text ?? ""))); + } + return escapeHtml(text); +} + +function renderMath(scope = document) { + if (window.renderMathInElement) { + window.renderMathInElement(scope, { + delimiters: [ + { left: "$$", right: "$$", display: true }, + { left: "$", right: "$", display: false }, + ], + throwOnError: false, + }); + } +} + +// ---------- Filters ---------- +function passesFilters(claim) { + if (state.evidenceFilter.size && !state.evidenceFilter.has(String(claim.evidence_strength ?? 0))) return false; + if (state.disputeFilter.size && !state.disputeFilter.has(String(claim.dispute_level ?? 
0))) return false; + if (state.reproFilter.size && !state.reproFilter.has(claim.reproducibility_status)) return false; + if (state.paperFilter.size && claim.subject && !state.paperFilter.has(claim.subject)) return false; + if (state.search) { + const hay = [ + claim.statement, claim.id, claim.subject, claim.publication_value, + ...(claim.preconditions || []), ...(claim.counterexamples || []), ...(claim.boundaries || []), + ...((claim.evidence || []).map(e => `${e.kind} ${e.source} ${e.finding}`)), + ].join(" ").toLowerCase(); + if (!hay.includes(state.search.toLowerCase())) return false; + } + return true; +} + +// ---------- Renderers ---------- +function evidenceTag(level) { + const labels = ["推测", "单一来源", "多来源", "强公开复现"]; + const lv = Math.max(0, Math.min(3, level || 0)); + return `证据 · ${labels[lv]}`; +} +function disputeTag(level) { + const labels = ["共识", "主流", "明显分歧", "高度争议"]; + const lv = Math.max(0, Math.min(3, level || 0)); + return `争议 · ${labels[lv]}`; +} +function reproTag(status) { + const labels = { verified: "已复现", partial: "部分复现", inferred: "推断", speculative: "尚待验证" }; + return `复现 · ${labels[status] || status}`; +} + +function pickBtn(id, kind) { + const picked = state.basket.has(`${kind}:${id}`); + return ``; +} + +function collapsible(label, body, open = false) { + return ` +
    +
    + ${escapeHtml(label)} + ${open ? "▾" : "▸"} +
    +
    ${body}
    +
    `; +} + +function renderClaimCard(c) { + const subjectLabel = c.subject ? c.subject : "—"; + return ` +
    +
    +
    +
    ${mdInline(c.statement)}
    +
    + 主体 · ${escapeHtml(subjectLabel)} + ${evidenceTag(c.evidence_strength)} + ${disputeTag(c.dispute_level)} + ${reproTag(c.reproducibility_status)} + 价值 · ${escapeHtml(c.publication_value || "—")} +
    +
    + ${pickBtn(c.id, "claim")} +
    + + ${collapsible("证据", ` +
    ${(c.evidence || []).map(ev => ` +
    + ${escapeHtml(ev.kind || "—")} + ${mdInline(ev.finding || "")} +
    来源:${escapeHtml(ev.source || "—")}
    +
    `).join("") || "

    暂无

    "}
    + `, true)} + + ${collapsible("前提", `
      ${(c.preconditions || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("反例", `
      ${(c.counterexamples || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("边界", `
      ${(c.boundaries || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("可复现实验", ` +

    最小实验:${mdInline(c.reproduction?.minimal || "—")}

    +

    公开数据:${escapeHtml(c.reproduction?.public_data || "—")}

    +

    算力预算:${escapeHtml(String(c.reproduction?.cost_hours ?? "—"))} GPU 小时

    +

    预期信号:${mdInline(c.reproduction?.expected_output || "—")}

    + `)} + ${(c.related_claims?.length || c.related_failure_modes?.length) ? collapsible("关联", ` + ${c.related_claims?.length ? `

    关联主张:${c.related_claims.map(id => `${escapeHtml(id)}`).join(",")}

    ` : ""} + ${c.related_failure_modes?.length ? `

    相关失败模式:${c.related_failure_modes.map(id => `${escapeHtml(id)}`).join(",")}

    ` : ""} + `) : ""} +
    `; +} + +function renderChainCard(ch) { + return ` +
    +
    +
    +
    ${mdInline(ch.title)}
    +
    + ${(ch.subject_papers || []).map(p => `论文 · ${escapeHtml(p)}`).join("")} +
    +
    + ${pickBtn(ch.id, "chain")} +
    + ${collapsible("研究缺口", `

    ${mdInline(ch.research_gap || "")}

    `, true)} + ${collapsible("核心主张", `

    ${mdInline(ch.core_claim || "")}

    `, true)} + ${collapsible("方法机制", `

    ${mdInline(ch.method_mechanism || "")}

    `)} + ${collapsible("关键实验", `
      ${(ch.key_experiments || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("强基线", `
      ${(ch.strong_baselines || []).map(x => `
    • ${escapeHtml(x)}
    • `).join("")}
    `)} + ${collapsible("消融实验", `
      ${(ch.ablations || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("可能的负结果", `
      ${(ch.negative_results || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("审稿人攻击点", `
      ${(ch.reviewer_attacks || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("回应实验", `
      ${(ch.response_experiments || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("图表计划", `
      ${(ch.figure_plan || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("关联场景与数据", ` +

    场景:${(ch.related_scenarios || []).map(x => `${escapeHtml(x)}`).join(",") || "—"}

    +

    数据集:${(ch.related_datasets || []).map(x => `${escapeHtml(x)}`).join(",") || "—"}

    +

    指标:${(ch.related_metrics || []).map(x => `${escapeHtml(x)}`).join(",") || "—"}

    + `)} +
    `; +} + +function renderScenarioCard(s) { + return ` +
    +
    +
    +
    ${mdInline(s.label)}
    +
    + ${(s.current_best_methods || []).slice(0, 3).map(m => `当前最强 · ${escapeHtml(m)}`).join("")} +
    +
    + ${pickBtn(s.id, "scenario")} +
    + ${collapsible("场景描述", `

    ${mdInline(s.description || "")}

    `, true)} + ${collapsible("为什么困难", `

    ${mdInline(s.why_hard || "")}

    `, true)} + ${collapsible("开放失败模式", `
      ${(s.open_failure_modes || []).map(x => `
    • ${escapeHtml(x)}
    • `).join("")}
    `)} + ${collapsible("可用数据集", `
      ${(s.available_datasets || []).map(x => `
    • ${escapeHtml(x)}
    • `).join("")}
    `)} + ${collapsible("应用指标", `
      ${(s.evaluation_metrics || []).map(x => `
    • ${escapeHtml(x)}
    • `).join("")}
    `)} +
    `; +} + +function renderDatasetCard(d) { + return ` +
    +
    +
    +
    ${mdInline(d.label)}
    +
    许可 · ${escapeHtml(d.license || "—")}
    +
    + ${pickBtn(d.id, "dataset")} +
    + ${collapsible("规模与传感器", `

    ${mdInline(d.scale || "")}

    `, true)} + ${collapsible("能支撑什么", `
      ${(d.supports || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `, true)} + ${collapsible("边界与盲点", `
      ${(d.limits || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `, true)} + ${collapsible("常见误用", `
      ${(d.common_misuses || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("覆盖场景", `
      ${(d.covers_scenarios || []).map(x => `
    • ${escapeHtml(x)}
    • `).join("")}
    `)} +
    `; +} + +function renderMetricCard(m) { + return ` +
    +
    +
    +
    ${mdInline(m.label)}
    +
    适用范围 · ${escapeHtml((m.scope || "—").slice(0, 24))}
    +
    + ${pickBtn(m.id, "metric")} +
    + ${collapsible("公式与变量", ` +

    $$${escapeHtml(m.formula || "")}$$

    +
      ${Object.entries(m.variables || {}).map(([k, v]) => `
    • ${escapeHtml(k)}:${mdInline(v)}
    • `).join("")}
    + `, true)} + ${collapsible("公式前提", `
      ${(m.assumptions || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("能证明什么", `
      ${(m.what_it_proves || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `, true)} + ${collapsible("不能证明什么", `
      ${(m.what_it_cannot_prove || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `, true)} + ${collapsible("常见误用", `
      ${(m.known_misuses || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} +
    `; +} + +function renderFailureCard(f) { + return ` +
    +
    +
    +
    ${mdInline(f.label)}
    +
    + ${(f.diagnostic_metrics || []).slice(0, 3).map(x => `诊断 · ${escapeHtml(x)}`).join("")} +
    +
    + ${pickBtn(f.id, "failure")} +
    + ${collapsible("触发条件", `
      ${(f.trigger_conditions || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `, true)} + ${collapsible("失败表现", `

    ${mdInline(f.manifestation || "")}

    `, true)} + ${collapsible("可复现脚本", `

    ${mdInline(f.reproducible_setup || "")}

    `)} + ${collapsible("方法短板", `

    ${mdInline(f.method_weakness || "")}

    `, true)} + ${collapsible("已有半解", `${(f.partial_solutions || []).map(s => ` +
    +
    ${mdInline(s.idea || "")}
    +
    参考:${escapeHtml(s.citation_or_repo || "—")}
    +
    残余间隙:${mdInline(s.residual_gap || "—")}
    +
    `).join("")}`)} + ${collapsible("开放问题", `
      ${(f.open_questions || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} + ${collapsible("可投稿切入点", `
      ${(f.publication_angles || []).map(x => `
    • ${mdInline(x)}
    • `).join("")}
    `)} +
    `; +} + +function renderExperimentCard(ex) { + function tier(label, t) { + if (!t) return ""; + const entries = Object.entries(t).map(([k, v]) => { + const value = Array.isArray(v) ? `
      ${v.map(x => `
    • ${mdInline(x)}
    • `).join("")}
    ` : + (typeof v === "object" ? `
    ${escapeHtml(JSON.stringify(v, null, 2))}
    ` : `

    ${mdInline(v)}

    `); + return `
    ${escapeHtml(k.replace(/_/g, " "))}
    ${value}
    `; + }).join(""); + return collapsible(label, entries, true); + } + return ` +
    +
    +
    +
    ${mdInline(ex.title)}
    +
    归属 · ${escapeHtml(ex.subject || "—")}
    +
    + ${pickBtn(ex.id, "experiment")} +
    + ${tier("第一层 · 最小机制实验", ex.tier_1_minimal_mechanism)} + ${tier("第二层 · 公开基准实验", ex.tier_2_public_benchmark)} + ${tier("第三层 · 压力测试实验", ex.tier_3_stress_test)} +
    `; +} + +// ---------- Lists ---------- +function renderClaims() { + const list = (state.data.claims?.claims || []).filter(passesFilters); + $("#claimsList").innerHTML = list.map(renderClaimCard).join("") || "

    当前筛选下无匹配主张。

    "; +} +function renderChains() { + const list = (state.data.chains?.argument_chains || []).filter(ch => { + if (!state.search) return true; + const hay = JSON.stringify(ch).toLowerCase(); + return hay.includes(state.search.toLowerCase()); + }); + $("#chainsList").innerHTML = list.map(renderChainCard).join("") || "

    当前筛选下无匹配选题。

    "; +} +function renderScenarios() { + const list = (state.data.scenarios?.scenarios || []).filter(s => { + if (!state.search) return true; + return JSON.stringify(s).toLowerCase().includes(state.search.toLowerCase()); + }); + $("#scenariosList").innerHTML = list.map(renderScenarioCard).join("") || "

    当前筛选下无匹配场景。

    "; +} +function renderDatasets() { + const list = (state.data.datasets?.datasets || []).filter(d => !state.search || JSON.stringify(d).toLowerCase().includes(state.search.toLowerCase())); + $("#datasetsList").innerHTML = list.map(renderDatasetCard).join("") || "

    当前筛选下无匹配数据集。

    "; +} +function renderMetrics() { + const list = (state.data.metrics?.metrics || []).filter(m => !state.search || JSON.stringify(m).toLowerCase().includes(state.search.toLowerCase())); + const html = list.map(renderMetricCard).join("") || "

    当前筛选下无匹配指标。

    "; + $("#metricsList").innerHTML = html; + renderMath($("#metricsList")); +} +function renderFailures() { + const list = (state.data.failures?.failure_modes || []).filter(f => !state.search || JSON.stringify(f).toLowerCase().includes(state.search.toLowerCase())); + $("#failuresList").innerHTML = list.map(renderFailureCard).join("") || "

    当前筛选下无匹配失败模式。

    "; +} +function renderExperiments() { + const list = (state.data.experiments?.experiment_plans || []).filter(e => !state.search || JSON.stringify(e).toLowerCase().includes(state.search.toLowerCase())); + $("#experimentsList").innerHTML = list.map(renderExperimentCard).join("") || "

    当前筛选下无匹配实验计划。

    "; +} + +function renderGaps() { + const scenarios = state.data.scenarios?.scenarios || []; + const claims = state.data.claims?.claims || []; + const failures = state.data.failures?.failure_modes || []; + const datasets = state.data.datasets?.datasets || []; + const metrics = state.data.metrics?.metrics || []; + + // 1. Scenarios with no current best method. + const orphanScenarios = scenarios.filter(s => !(s.current_best_methods || []).length); + + // 2. Papers (claim subjects) without any claim-level related_failure_modes. + const papersWithFailures = new Set(); + const papersWithClaims = new Set(); + for (const c of claims) { + if (!c.subject) continue; + papersWithClaims.add(c.subject); + for (const fm of c.related_failure_modes || []) papersWithFailures.add(c.subject); + } + const orphanPapers = Array.from(papersWithClaims).filter(p => !papersWithFailures.has(p)); + + // 3. Datasets / metrics not referenced by any claim.reproduction.public_data or scenario.evaluation_metrics. + const referencedDatasets = new Set(); + const referencedMetrics = new Set(); + for (const c of claims) if (c.reproduction?.public_data) referencedDatasets.add(c.reproduction.public_data); + for (const s of scenarios) { + for (const d of s.available_datasets || []) referencedDatasets.add(d); + for (const m of s.evaluation_metrics || []) referencedMetrics.add(m); + } + const orphanDatasets = datasets.filter(d => !referencedDatasets.has(d.id)); + const orphanMetrics = metrics.filter(m => !referencedMetrics.has(m.id)); + + // 4. Failure modes not referenced by any claim or scenario. + const referencedFailures = new Set(); + for (const c of claims) for (const fm of c.related_failure_modes || []) referencedFailures.add(fm); + for (const s of scenarios) for (const fm of s.open_failure_modes || []) referencedFailures.add(fm); + const orphanFailures = failures.filter(f => !referencedFailures.has(f.id)); + + function section(title, hint, items, render) { + return ` +
    +
    ${escapeHtml(title)} ${items.length}
    +

    ${escapeHtml(hint)}

    + ${items.length ? `
      ${items.map(render).join("")}
    ` : "

    该类缺口当前为空。

    "} +
    `; + } + + const html = ` +
    + ${section("尚无可信公开报告的场景", "current_best_methods 为空。直接对应一篇基准构建或方法首报论文。", orphanScenarios, s => `
  • ${escapeHtml(s.label)}
    触发条件:${escapeHtml((s.description || "").slice(0, 120))}…
  • `)} + ${section("有主张但无关联失败模式的论文", "结构化研究层把'方法节点必须有失败边界'作为硬要求。如果某论文出现在这里,意味着可以专门写一篇该论文的失败模式诊断论文。", orphanPapers, p => `
  • ${escapeHtml(p)}
  • `)} + ${section("未被任何主张或场景引用的数据集", "意味着这些数据集尚未被纳入研究层主线证据,可以作为补充基准评估的切入点。", orphanDatasets, d => `
  • ${escapeHtml(d.label)}
  • `)} + ${section("未被任何主张或场景引用的指标", "意味着这些指标没有被任何主张采用,可能反映指标定义不清或社区还没把它纳入主流评测。", orphanMetrics, m => `
  • ${escapeHtml(m.label)}
  • `)} + ${section("未被任何主张或场景引用的失败模式", "失败模式应当被至少一条主张或场景引用。出现在这里说明该模式仍是孤立的研究资产,可投稿。", orphanFailures, f => `
  • ${escapeHtml(f.label)}
  • `)} +
    `; + $("#gapsList").innerHTML = html; +} + +function renderCoverage() { + const scenarios = state.data.scenarios?.scenarios || []; + const datasets = state.data.datasets?.datasets || []; + const allPapers = new Set(); + for (const s of scenarios) for (const p of s.current_best_methods || []) allPapers.add(p); + // Build a scenario × dataset coverage table. + function tableSection(title, hint, colItems, isCovered, colLabel) { + const head = `场景${colItems.map(c => `${escapeHtml(colLabel(c))}`).join("")}`; + const rows = scenarios.map(s => { + const cells = colItems.map(c => isCovered(s, c) ? `${escapeHtml(c.id || c)}` : ``).join(""); + return `${escapeHtml(s.label)}${cells}`; + }).join(""); + return ` +
    +

    ${escapeHtml(title)}

    +

    ${escapeHtml(hint)}

    + ${head}${rows}
    +
    `; + } + const datasetGrid = tableSection( + "场景 × 数据集", + "横轴为可用数据集,竖轴为场景。空白格说明该数据集对该场景没有声明覆盖,是研究者拓展基准的切入点。", + datasets, + (s, d) => (s.available_datasets || []).includes(d.id), + (d) => d.label + ); + const paperList = Array.from(allPapers).sort(); + const paperGrid = tableSection( + "场景 × 当前最强方法", + "横轴为社区当前最常被引用为该场景上最强的方法,空白格代表该场景下没有可信公开报告。", + paperList.length ? paperList : ["—"], + (s, p) => (s.current_best_methods || []).includes(p), + (p) => p + ); + $("#coverageGrids").innerHTML = datasetGrid + paperGrid; +} + +function renderPapersAggregate() { + // Build paper-centric aggregation: for each unique subject paper id, collect + // all claims, related failure modes, chains that mention it, experiment + // plans whose subject is one of those chains, and the scenarios/datasets + // implied by the chains. + const claims = state.data.claims?.claims || []; + const chains = state.data.chains?.argument_chains || []; + const plans = state.data.experiments?.experiment_plans || []; + const failures = state.data.failures?.failure_modes || []; + + const papers = new Map(); // paperId -> { claims, chains, plans, failure_modes, scenarios, datasets } + function ensure(id) { + if (!papers.has(id)) { + papers.set(id, { id, claims: [], chains: [], plans: [], failure_modes: new Set(), scenarios: new Set(), datasets: new Set(), metrics: new Set() }); + } + return papers.get(id); + } + for (const c of claims) { + if (!c.subject) continue; + const p = ensure(c.subject); + p.claims.push(c); + for (const fm of c.related_failure_modes || []) p.failure_modes.add(fm); + } + for (const ch of chains) { + for (const pid of ch.subject_papers || []) { + const p = ensure(pid); + p.chains.push(ch); + for (const s of ch.related_scenarios || []) p.scenarios.add(s); + for (const d of ch.related_datasets || []) p.datasets.add(d); + for (const m of ch.related_metrics || []) p.metrics.add(m); + } + } + for (const ex of plans) { + for (const ch of chains) { + if (ex.subject === ch.id) { + for 
(const pid of ch.subject_papers || []) ensure(pid).plans.push(ex); + } + } + } + const failureLabel = new Map(failures.map(f => [f.id, f.label])); + + const sortedIds = Array.from(papers.keys()).sort(); + const cards = sortedIds.map(pid => { + const p = papers.get(pid); + if (state.search && !pid.toLowerCase().includes(state.search.toLowerCase())) { + const hay = JSON.stringify({ pid, claims: p.claims.map(c => c.statement), chains: p.chains.map(c => c.title) }).toLowerCase(); + if (!hay.includes(state.search.toLowerCase())) return ""; + } + const claimsHtml = p.claims.map(c => `
  • ${evidenceTag(c.evidence_strength)} ${mdInline(c.statement)}
  • `).join(""); + const failureHtml = Array.from(p.failure_modes).map(id => `
  • ${escapeHtml(id)} · ${escapeHtml(failureLabel.get(id) || "")}
  • `).join(""); + const chainsHtml = p.chains.map(ch => `
  • ${mdInline(ch.title)} (论证链:${escapeHtml(ch.id)})
  • `).join(""); + const plansHtml = p.plans.map(ex => `
  • ${mdInline(ex.title)} (实验计划:${escapeHtml(ex.id)})
  • `).join(""); + const scenariosHtml = Array.from(p.scenarios).map(id => `
  • ${escapeHtml(id)}
  • `).join(""); + return ` +
    +
    +
    +
    ${escapeHtml(pid)}
    +
    + 主张 · ${p.claims.length} + 论证链 · ${p.chains.length} + 实验计划 · ${p.plans.length} + 失败边界 · ${p.failure_modes.size} +
    +
    +
    + ${collapsible("围绕本论文的可证伪主张", `
      ${claimsHtml || "
    • 暂无
    • "}
    `, true)} + ${collapsible("被诊断到的失败模式", `
      ${failureHtml || "
    • 暂无
    • "}
    `, true)} + ${collapsible("涉及的论文论证链", `
      ${chainsHtml || "
    • 暂无
    • "}
    `)} + ${collapsible("配套实验计划", `
      ${plansHtml || "
    • 暂无
    • "}
    `)} + ${collapsible("延伸场景", `
      ${scenariosHtml || "
    • 暂无
    • "}
    `)} +
    `; + }).filter(Boolean).join(""); + + $("#papersList").innerHTML = cards || "

    当前筛选下无匹配论文。

    "; +} + +// ---------- Basket ---------- +function basketCount() { + $("#wbBasketCount").textContent = String(state.basket.size); + const enabled = state.basket.size >= 2; + $("#wbCompareBtn").disabled = !enabled; + $("#wbCommonPrereqBtn").disabled = !enabled; + $("#wbDivergeBtn").disabled = !enabled; +} +function togglePick(key) { + if (state.basket.has(key)) state.basket.delete(key); else state.basket.add(key); + saveBasketToStorage(state.basket); + basketCount(); +} +function lookupBasketItem(key) { + const [kind, id] = key.split(":", 2); + const restId = key.slice(kind.length + 1); + if (kind === "claim") return (state.data.claims?.claims || []).find(c => c.id === restId); + if (kind === "chain") return (state.data.chains?.argument_chains || []).find(c => c.id === restId); + if (kind === "scenario") return (state.data.scenarios?.scenarios || []).find(s => s.id === restId); + if (kind === "dataset") return (state.data.datasets?.datasets || []).find(d => d.id === restId); + if (kind === "metric") return (state.data.metrics?.metrics || []).find(m => m.id === restId); + if (kind === "failure") return (state.data.failures?.failure_modes || []).find(f => f.id === restId); + if (kind === "experiment") return (state.data.experiments?.experiment_plans || []).find(e => e.id === restId); + return null; +} + +function renderBasketCompare() { + const items = Array.from(state.basket).map(k => ({ key: k, kind: k.split(":")[0], obj: lookupBasketItem(k) })).filter(x => x.obj); + if (items.length === 0) { + $("#basketDetail").innerHTML = "

    选择篮为空。请到任意视图加入两个或以上对象后回到这里。

    "; + return; + } + // Determine common keys across selected claims (or generic objects) + function flat(obj) { + return [ + ...Object.entries(obj).filter(([k, v]) => typeof v === "string").map(([k, v]) => `${k}=${v}`), + ...Object.entries(obj).filter(([k, v]) => Array.isArray(v)).flatMap(([k, v]) => v.filter(x => typeof x === "string").map(x => `${k}=${x}`)), + ]; + } + const sets = items.map(it => new Set(flat(it.obj))); + const intersection = sets.reduce((acc, s) => acc.size === 0 ? new Set(s) : new Set([...acc].filter(x => s.has(x))), new Set()); + + const cols = items.map(it => ` +
    +

    ${mdInline(it.obj.label || it.obj.statement || it.obj.title || it.key)}

    +
    归属

    ${escapeHtml(it.kind)} · ${escapeHtml(it.obj.subject || it.obj.id || "—")}

    + ${it.obj.evidence ? `
    证据
      ${(it.obj.evidence || []).map(e => `
    • ${mdInline(e.finding)} (${escapeHtml(e.source || "—")})
    • `).join("")}
    ` : ""} + ${it.obj.preconditions ? `
    前提
      ${it.obj.preconditions.map(x => `
    • ${mdInline(x)}
    • `).join("")}
    ` : ""} + ${it.obj.counterexamples ? `
    反例
      ${it.obj.counterexamples.map(x => `
    • ${mdInline(x)}
    • `).join("")}
    ` : ""} + ${it.obj.boundaries ? `
    边界
      ${it.obj.boundaries.map(x => `
    • ${mdInline(x)}
    • `).join("")}
    ` : ""} + ${it.obj.limits ? `
    边界与盲点
      ${it.obj.limits.map(x => `
    • ${mdInline(x)}
    • `).join("")}
    ` : ""} + ${it.obj.what_it_cannot_prove ? `
    不能证明
      ${it.obj.what_it_cannot_prove.map(x => `
    • ${mdInline(x)}
    • `).join("")}
    ` : ""} +
    + `).join(""); + + const commons = Array.from(intersection).map(s => `
  • ${escapeHtml(s)}
  • `).join(""); + $("#basketDetail").innerHTML = ` +
    +

    并排对比

    +
    ${cols}
    +
    +
    +

    共同前置

    +

    所选对象在字段级别上共享的字符串属性:

    +
      ${commons || "
    • 未发现共同字段。
    • "}
    +
    +
    +

    分歧路径

    +

    每个对象独占的关键字段(反例 / 边界 / 不能证明等),凸显它们在论文论证中的真实差异:

    +
    + ${items.map(it => { + const own = [...(it.obj.counterexamples || []), ...(it.obj.boundaries || []), ...(it.obj.what_it_cannot_prove || []), ...(it.obj.limits || [])]; + return `

    ${mdInline(it.obj.label || it.obj.statement || it.obj.title || it.key)}

    +
      ${own.length ? own.map(x => `
    • ${mdInline(x)}
    • `).join("") : "
    • "}
    `; + }).join("")} +
    +
    + `; + renderMath($("#basketDetail")); +} + +// ---------- Wiring ---------- +function showView(view) { + state.view = view; + $$(".wb-tab").forEach(t => t.classList.toggle("active", t.dataset.view === view)); + for (const v of ["claims", "chains", "scenarios", "failures", "experiments", "papers", "coverage", "gaps", "basket"]) { + const el = document.getElementById("view" + v[0].toUpperCase() + v.slice(1)); + if (el) el.hidden = view !== v; + } + if (view === "claims") renderClaims(); + if (view === "chains") renderChains(); + if (view === "scenarios") { + renderScenarios(); renderDatasets(); renderMetrics(); + } + if (view === "failures") renderFailures(); + if (view === "experiments") renderExperiments(); + if (view === "papers") renderPapersAggregate(); + if (view === "coverage") renderCoverage(); + if (view === "gaps") renderGaps(); + if (view === "basket") renderBasketCompare(); + // permalink + const params = new URLSearchParams(window.location.search); + params.set("view", view); + window.history.replaceState({}, "", `${window.location.pathname}?${params.toString()}`); +} + +function buildPaperFilter() { + const ids = new Set(); + for (const c of (state.data.claims?.claims || [])) if (c.subject) ids.add(c.subject); + const host = $("#wbPaperFilter"); + host.innerHTML = Array.from(ids).map(id => ``).join(""); + state.paperFilter = new Set(ids); +} + +function wireEvents() { + // View tabs + $$(".wb-tab").forEach(t => t.addEventListener("click", () => showView(t.dataset.view))); + // Subnav (scenarios / datasets / metrics) + $$(".wb-subtab").forEach(t => t.addEventListener("click", () => { + $$(".wb-subtab").forEach(x => x.classList.toggle("active", x === t)); + state.subview = t.dataset.sub; + $("#scenariosList").hidden = state.subview !== "scenarios"; + $("#datasetsList").hidden = state.subview !== "datasets"; + $("#metricsList").hidden = state.subview !== "metrics"; + })); + // Search + $("#wbSearch").addEventListener("input", e => { + state.search = 
e.target.value || ""; + showView(state.view); + }); + // Evidence filter + $("#wbEvidenceFilter").addEventListener("click", e => { + const chip = e.target.closest("[data-evidence]"); if (!chip) return; + chip.classList.toggle("active"); + state.evidenceFilter = new Set($$("#wbEvidenceFilter .wb-chip.active").map(c => c.dataset.evidence)); + if (state.view === "claims") renderClaims(); + }); + $("#wbDisputeFilter").addEventListener("click", e => { + const chip = e.target.closest("[data-dispute]"); if (!chip) return; + chip.classList.toggle("active"); + state.disputeFilter = new Set($$("#wbDisputeFilter .wb-chip.active").map(c => c.dataset.dispute)); + if (state.view === "claims") renderClaims(); + }); + $("#wbReproFilter").addEventListener("click", e => { + const chip = e.target.closest("[data-repro]"); if (!chip) return; + chip.classList.toggle("active"); + state.reproFilter = new Set($$("#wbReproFilter .wb-chip.active").map(c => c.dataset.repro)); + if (state.view === "claims") renderClaims(); + }); + $("#wbPaperFilter").addEventListener("click", e => { + const chip = e.target.closest("[data-paper]"); if (!chip) return; + chip.classList.toggle("active"); + state.paperFilter = new Set($$("#wbPaperFilter .wb-chip.active").map(c => c.dataset.paper)); + if (state.view === "claims") renderClaims(); + }); + // Card events delegated on main + $("#wbMain").addEventListener("click", e => { + const pick = e.target.closest("[data-pick]"); + if (pick) { + togglePick(pick.dataset.pick); + pick.classList.toggle("picked"); + pick.textContent = pick.classList.contains("picked") ? "已选" : "加入选择篮"; + return; + } + const toggle = e.target.closest("[data-toggle]"); + if (toggle) { + const body = toggle.parentElement.querySelector(".wb-section-body"); + if (body) { + body.hidden = !body.hidden; + const arrow = toggle.querySelector("span:last-child"); + if (arrow) arrow.textContent = body.hidden ? "▸" : "▾"; + } + } + // Click on a card title copies a sharable permalink to clipboard. 
+ const titleClick = e.target.closest(".wb-card-title"); + if (titleClick) { + const card = titleClick.closest(".wb-card"); + const id = card && card.getAttribute("data-id"); + if (id) { + const params = new URLSearchParams(window.location.search); + const url = `${window.location.origin}${window.location.pathname}?${params.toString()}#${encodeURIComponent(id)}`; + try { navigator.clipboard?.writeText(url); } catch {} + window.history.replaceState({}, "", `${window.location.pathname}?${params.toString()}#${encodeURIComponent(id)}`); + } + } + }); + // Basket actions + $("#wbBasketClear").addEventListener("click", () => { + state.basket.clear(); + saveBasketToStorage(state.basket); + basketCount(); + $$(".wb-pick.picked").forEach(p => { p.classList.remove("picked"); p.textContent = "加入选择篮"; }); + }); + $("#wbCompareBtn").addEventListener("click", () => showView("basket")); + $("#wbCommonPrereqBtn").addEventListener("click", () => showView("basket")); + $("#wbDivergeBtn").addEventListener("click", () => showView("basket")); +} + +function anchorScroll() { + const hash = decodeURIComponent(window.location.hash || "").slice(1); + if (!hash) return; + // Determine which view contains this anchor and switch to it. + const targetView = (() => { + if (hash.startsWith("claim:")) return "claims"; + if (hash.startsWith("chain:")) return "chains"; + if (hash.startsWith("scenario:")) return "scenarios"; + if (hash.startsWith("dataset:")) return "scenarios"; + if (hash.startsWith("metric:")) return "scenarios"; + if (hash.startsWith("failure_mode:")) return "failures"; + if (hash.startsWith("experiment_plan:")) return "experiments"; + return null; + })(); + if (!targetView) return; + showView(targetView); + // Switch subnav if needed. + if (targetView === "scenarios") { + const sub = hash.startsWith("dataset:") ? "datasets" : hash.startsWith("metric:") ? 
"metrics" : "scenarios"; + $$(".wb-subtab").forEach(x => x.classList.toggle("active", x.dataset.sub === sub)); + state.subview = sub; + $("#scenariosList").hidden = sub !== "scenarios"; + $("#datasetsList").hidden = sub !== "datasets"; + $("#metricsList").hidden = sub !== "metrics"; + } + setTimeout(() => { + const card = document.querySelector(`[data-id="${CSS.escape(hash)}"]`); + if (card) { + card.scrollIntoView({ behavior: "smooth", block: "start" }); + card.classList.add("wb-anchor-pulse"); + setTimeout(() => card.classList.remove("wb-anchor-pulse"), 2400); + } + }, 80); +} + +(async function main() { + try { + await loadAll(); + } catch (err) { + $("#wbMain").innerHTML = `

    研究层数据加载失败:${escapeHtml(err.message || String(err))}

    `; + return; + } + buildPaperFilter(); + wireEvents(); + basketCount(); + const params = new URLSearchParams(window.location.search); + const startView = params.get("view") || "claims"; + showView(["claims", "chains", "scenarios", "failures", "experiments", "papers", "coverage", "gaps", "basket"].includes(startView) ? startView : "claims"); + anchorScroll(); + window.addEventListener("hashchange", anchorScroll); +})(); diff --git a/docs/workbench.css b/docs/workbench.css new file mode 100644 index 0000000..9d760fd --- /dev/null +++ b/docs/workbench.css @@ -0,0 +1,271 @@ +/* Paper Production Workbench — utilitarian, evidence-first, low chrome. */ + +:root { + --wb-bg: #0b1220; + --wb-bg-1: #111a2c; + --wb-bg-2: #18243a; + --wb-fg: #e5edff; + --wb-fg-dim: #93a5c4; + --wb-fg-muted: #6c7d9b; + --wb-line: #25344f; + --wb-line-strong: #36507a; + --wb-accent: #a7f3d0; + --wb-accent-2: #fcd34d; + --wb-warn: #fca5a5; + --wb-cite: #c4b5fd; + --wb-chip-bg: #1a2942; + --wb-chip-active: #2c4675; + --wb-radius: 8px; +} + +* { box-sizing: border-box; } + +html, body { + margin: 0; padding: 0; + font-family: -apple-system, BlinkMacSystemFont, "PingFang SC", "Hiragino Sans GB", + "Source Han Sans", "Noto Sans CJK SC", "Segoe UI", Arial, sans-serif; + background: var(--wb-bg); + color: var(--wb-fg); + line-height: 1.55; + font-size: 14.5px; +} + +body.workbench { + display: grid; + grid-template-columns: 280px 1fr; + grid-template-rows: auto 1fr auto; + grid-template-areas: + "topbar topbar" + "sidebar main" + "footer footer"; + min-height: 100vh; +} + +/* Topbar */ +.wb-topbar { + grid-area: topbar; + display: flex; align-items: center; justify-content: space-between; + gap: 12px; + padding: 12px 18px; + border-bottom: 1px solid var(--wb-line); + background: linear-gradient(180deg, #0d1626, #0a1120); + position: sticky; top: 0; z-index: 5; +} +.wb-brand { display: flex; align-items: center; gap: 12px; } +.wb-logo { + display: inline-flex; align-items: center; justify-content: 
center; + width: 32px; height: 32px; border-radius: 6px; + background: var(--wb-bg-2); color: var(--wb-accent); font-size: 18px; +} +.wb-title-block { display: flex; flex-direction: column; line-height: 1.2; } +.wb-title { font-weight: 700; font-size: 16px; } +.wb-subtitle { font-size: 12.5px; color: var(--wb-fg-dim); } + +.wb-nav { display: flex; gap: 4px; flex-wrap: wrap; } +.wb-tab { + background: transparent; color: var(--wb-fg-dim); + border: 1px solid transparent; border-radius: var(--wb-radius); + padding: 6px 12px; font-size: 13.5px; cursor: pointer; +} +.wb-tab:hover { color: var(--wb-fg); border-color: var(--wb-line); } +.wb-tab.active { color: var(--wb-fg); background: var(--wb-bg-2); border-color: var(--wb-line-strong); } + +.wb-topbar-right { display: flex; gap: 6px; } +.wb-link { + color: var(--wb-fg-dim); text-decoration: none; font-size: 13px; + padding: 4px 10px; border-radius: var(--wb-radius); border: 1px solid var(--wb-line); +} +.wb-link:hover { color: var(--wb-fg); border-color: var(--wb-line-strong); } + +/* Sidebar */ +.wb-sidebar { + grid-area: sidebar; + border-right: 1px solid var(--wb-line); + padding: 14px 14px 28px; + background: var(--wb-bg-1); + overflow-y: auto; + position: sticky; top: 57px; + height: calc(100vh - 57px); +} +.wb-pane { margin-bottom: 18px; } +.wb-pane h3 { font-size: 12.5px; letter-spacing: 0.04em; color: var(--wb-fg-dim); text-transform: uppercase; margin: 0 0 8px; } +#wbSearch { + width: 100%; padding: 7px 10px; background: var(--wb-bg-2); + color: var(--wb-fg); border: 1px solid var(--wb-line); border-radius: var(--wb-radius); + font: inherit; font-size: 13px; +} +#wbSearch:focus { outline: 1px solid var(--wb-accent); } + +.wb-chip-row { display: flex; flex-wrap: wrap; gap: 6px; } +.wb-chip { + background: var(--wb-chip-bg); color: var(--wb-fg-dim); + border: 1px solid var(--wb-line); border-radius: 999px; + padding: 3px 10px; font-size: 12.5px; cursor: pointer; +} +.wb-chip.active { background: 
var(--wb-chip-active); color: var(--wb-fg); border-color: var(--wb-line-strong); } +.wb-chip:hover { color: var(--wb-fg); } + +.wb-basket-pane { padding: 12px; background: var(--wb-bg-2); border: 1px solid var(--wb-line); border-radius: var(--wb-radius); } +.wb-basket-count { + background: var(--wb-accent); color: #0b1220; padding: 1px 8px; border-radius: 999px; + font-size: 11px; font-weight: 700; margin-left: 4px; +} +.wb-hint { font-size: 12.5px; color: var(--wb-fg-muted); margin: 0 0 8px; } +.wb-basket-actions { display: flex; flex-wrap: wrap; gap: 6px; } +.wb-action { + background: var(--wb-accent); color: #0b1220; border: 0; border-radius: var(--wb-radius); + padding: 6px 10px; font-size: 12.5px; font-weight: 600; cursor: pointer; +} +.wb-action:disabled { background: var(--wb-line); color: var(--wb-fg-muted); cursor: not-allowed; } +.wb-action.wb-secondary { background: transparent; color: var(--wb-fg-dim); border: 1px solid var(--wb-line); } + +/* Main */ +.wb-main { grid-area: main; padding: 22px 28px 60px; max-width: 1400px; } +.wb-view-header h2 { margin: 0 0 4px; font-size: 22px; } +.wb-view-header p { margin: 0 0 18px; color: var(--wb-fg-dim); } + +.wb-subnav { display: flex; gap: 6px; margin-bottom: 14px; } +.wb-subtab { + background: var(--wb-chip-bg); color: var(--wb-fg-dim); + border: 1px solid var(--wb-line); border-radius: var(--wb-radius); + padding: 5px 12px; font-size: 13px; cursor: pointer; +} +.wb-subtab.active { background: var(--wb-chip-active); color: var(--wb-fg); border-color: var(--wb-line-strong); } + +.wb-cards-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(420px, 1fr)); + gap: 16px; +} + +.wb-card { + background: var(--wb-bg-1); + border: 1px solid var(--wb-line); + border-radius: var(--wb-radius); + padding: 14px 14px 12px; + display: flex; flex-direction: column; gap: 10px; + position: relative; +} +.wb-card:hover { border-color: var(--wb-line-strong); } +.wb-card.wb-anchor-pulse { border-color: 
var(--wb-accent); box-shadow: 0 0 0 2px rgba(167,243,208,0.25); transition: border-color 0.3s ease, box-shadow 0.3s ease; } +.wb-card-head { + display: flex; align-items: flex-start; justify-content: space-between; gap: 10px; +} +.wb-card-title { font-weight: 600; font-size: 15px; color: var(--wb-fg); cursor: pointer; } +.wb-card-title:hover { color: var(--wb-accent); } +.wb-card-title:hover::after { content: " · 点击复制可分享链接"; color: var(--wb-fg-muted); font-size: 11px; font-weight: 400; } +.wb-card-meta { display: flex; gap: 6px; flex-wrap: wrap; align-items: center; } +.wb-tag { + font-size: 11.5px; padding: 2px 8px; border-radius: 999px; + background: var(--wb-bg-2); color: var(--wb-fg-dim); + border: 1px solid var(--wb-line); +} +.wb-tag.evidence-0 { color: var(--wb-warn); border-color: #4a2630; } +.wb-tag.evidence-1 { color: #fde68a; border-color: #4a3d22; } +.wb-tag.evidence-2 { color: #bbf7d0; border-color: #1f4a35; } +.wb-tag.evidence-3 { color: var(--wb-accent); border-color: #1f4a45; } +.wb-tag.dispute-2, .wb-tag.dispute-3 { color: var(--wb-warn); border-color: #4a2630; } + +.wb-pick { + font-size: 11.5px; padding: 3px 10px; border-radius: 999px; + background: transparent; color: var(--wb-fg-dim); border: 1px solid var(--wb-line); + cursor: pointer; +} +.wb-pick.picked { background: var(--wb-accent); color: #0b1220; border-color: var(--wb-accent); font-weight: 600; } + +.wb-statement { font-size: 14.5px; color: var(--wb-fg); } +.wb-card-section { + border-top: 1px dashed var(--wb-line); padding-top: 8px; +} +.wb-section-h { + display: flex; justify-content: space-between; align-items: center; cursor: pointer; + font-size: 12px; letter-spacing: 0.06em; color: var(--wb-fg-dim); text-transform: uppercase; +} +.wb-section-body { margin-top: 6px; } +.wb-section-body ul { margin: 4px 0; padding-left: 18px; } +.wb-section-body li { margin-bottom: 3px; } +.wb-section-body p { margin: 4px 0; } +.wb-section-body[hidden] { display: none; } + +.wb-evidence-item { + 
background: var(--wb-bg-2); border: 1px solid var(--wb-line); border-radius: 6px; + padding: 8px 10px; margin-bottom: 6px; font-size: 13px; +} +.wb-evidence-kind { + display: inline-block; font-size: 11px; padding: 1px 7px; border-radius: 999px; + background: #1f3252; color: var(--wb-cite); margin-right: 6px; +} +.wb-evidence-source { color: var(--wb-fg-dim); font-size: 12px; margin-top: 3px; } + +.wb-card-footer { display: flex; justify-content: space-between; gap: 6px; font-size: 12px; color: var(--wb-fg-muted); } +.wb-card-footer a { color: var(--wb-fg-dim); text-decoration: none; border-bottom: 1px dashed var(--wb-line-strong); } +.wb-card-footer a:hover { color: var(--wb-accent); } + +/* Basket detail */ +.wb-basket-detail { display: flex; flex-direction: column; gap: 18px; } +.wb-compare-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); gap: 14px; } +.wb-compare-col h4 { margin: 0 0 8px; font-size: 14px; } +.wb-compare-row { + background: var(--wb-bg-1); border: 1px solid var(--wb-line); border-radius: var(--wb-radius); + padding: 10px 12px; margin-bottom: 8px; +} +.wb-compare-row h5 { font-size: 12px; letter-spacing: 0.04em; color: var(--wb-fg-dim); margin: 0 0 4px; text-transform: uppercase; } +.wb-empty { color: var(--wb-fg-muted); font-size: 13px; padding: 24px 0; text-align: center; } + +/* Footer */ +.wb-footer { + grid-area: footer; + border-top: 1px solid var(--wb-line); + padding: 10px 18px; font-size: 12px; color: var(--wb-fg-muted); + background: #08101e; +} + +@media (max-width: 900px) { + body.workbench { + grid-template-columns: 1fr; + grid-template-rows: auto auto 1fr auto; + grid-template-areas: + "topbar" + "sidebar" + "main" + "footer"; + } + .wb-sidebar { position: static; height: auto; border-right: 0; border-bottom: 1px solid var(--wb-line); } + .wb-nav { display: none; } + .wb-cards-grid { grid-template-columns: 1fr; } + .wb-main { padding: 16px; } + .wb-topbar { flex-wrap: wrap; } + .wb-mobile-tabs { 
display: flex; gap: 4px; padding: 8px 14px; overflow-x: auto; } +} + +@media (max-width: 600px) { + .wb-title { font-size: 15px; } + .wb-subtitle { font-size: 11.5px; } + .wb-card-title { font-size: 14px; } +} + +/* KaTeX font sizing on small screens */ +.katex { font-size: 1.02em; } +@media (max-width: 600px) { .katex { font-size: 0.95em; } } + +/* Coverage matrix view */ +.wb-coverage-grids { display: flex; flex-direction: column; gap: 28px; } +.wb-coverage-grid { overflow-x: auto; } +.wb-coverage-grid h3 { margin: 0 0 8px; font-size: 15px; color: var(--wb-fg); } +.wb-coverage-grid p { margin: 0 0 8px; font-size: 12.5px; color: var(--wb-fg-dim); } +.wb-coverage-table { border-collapse: collapse; min-width: 100%; font-size: 12.5px; } +.wb-coverage-table th, .wb-coverage-table td { + border: 1px solid var(--wb-line); padding: 6px 8px; text-align: left; + vertical-align: top; max-width: 220px; +} +.wb-coverage-table th { background: var(--wb-bg-2); color: var(--wb-fg-dim); font-weight: 500; white-space: nowrap; } +.wb-coverage-table th.wb-row-head { position: sticky; left: 0; z-index: 1; background: var(--wb-bg-2); } +.wb-coverage-table td.covered { background: rgba(167,243,208,0.10); color: var(--wb-fg); } +.wb-coverage-table td.uncovered { background: rgba(252,165,165,0.05); color: var(--wb-fg-muted); font-style: italic; } +.wb-coverage-table td.uncovered::after { content: "—"; } + +/* Mobile fallback nav tabs at the top of main */ +.wb-mobile-tabs { display: none; } +@media (max-width: 900px) { + .wb-mobile-tabs { display: flex; padding: 6px 14px 0; gap: 4px; overflow-x: auto; } +} diff --git a/docs/workbench.html b/docs/workbench.html new file mode 100644 index 0000000..3dd695f --- /dev/null +++ b/docs/workbench.html @@ -0,0 +1,178 @@ + + + + + +论文产出工作台 / Paper Production Workbench + + + + + + + + + + + +
    +
    + +
    + 论文产出工作台 + 围绕主张、证据、场景、失败模式与可复现实验组织自动驾驶研究 +
    +
    + + +
    + + + +
    +
    +
    +

    可证伪主张

    +

    每条主张都被拆解为声明、证据、前提、反例、边界、可复现实验与可投稿价值,使审稿人可以逐条审查。

    +
    +
    +
    + + + + + + + + + + + + + + + + +
    + +
    + 这是论文产出工作台。视觉只服务于研究结构:证据强度、争议程度、可复现状态、失败边界、研究成熟度。 +
    + + + + diff --git a/tools/build_research_overlay.py b/tools/build_research_overlay.py new file mode 100644 index 0000000..c021d33 --- /dev/null +++ b/tools/build_research_overlay.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +"""Build docs/data/research/node_overlay.json from the structured research layer. + +The 3D atlas reads this overlay to bind visual encoding to research substance: + - evidence_strength (0..3) — how well-supported the node's strongest claim is + - dispute_level (0..3) — community disagreement + - reproducibility_status — verified / partial / inferred / speculative + - failure_boundary_count — number of failure modes that diagnose this node + - maturity (0..3) — derived from reproducibility + evidence + +This script is idempotent: re-running it after editing the source JSONs will +refresh the overlay. CI re-runs it to make sure the overlay tracks reality. +""" +from __future__ import annotations + +import json +from collections import defaultdict +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +RESEARCH = ROOT / "docs" / "data" / "research" +OUT = RESEARCH / "node_overlay.json" + +REPRO_TO_MATURITY = {"verified": 3, "partial": 2, "inferred": 1, "speculative": 0} + + +def main() -> int: + claims = json.loads((RESEARCH / "claims.json").read_text(encoding="utf-8")).get("claims", []) + failures = json.loads((RESEARCH / "failure_modes.json").read_text(encoding="utf-8")).get("failure_modes", []) + scenarios = json.loads((RESEARCH / "scenarios.json").read_text(encoding="utf-8")).get("scenarios", []) + chains = json.loads((RESEARCH / "argument_chains.json").read_text(encoding="utf-8")).get("argument_chains", []) + + by_subject_max_ev: dict[str, int] = defaultdict(int) + by_subject_max_disp: dict[str, int] = defaultdict(int) + by_subject_best_repro: dict[str, int] = defaultdict(int) + by_subject_claim_count: dict[str, int] = defaultdict(int) + for c in claims: + sid = c.get("subject") + if not sid: + continue + 
by_subject_max_ev[sid] = max(by_subject_max_ev[sid], int(c.get("evidence_strength") or 0)) + by_subject_max_disp[sid] = max(by_subject_max_disp[sid], int(c.get("dispute_level") or 0)) + by_subject_best_repro[sid] = max(by_subject_best_repro[sid], REPRO_TO_MATURITY.get(c.get("reproducibility_status"), 0)) + by_subject_claim_count[sid] += 1 + + failure_for_subject: dict[str, set[str]] = defaultdict(set) + for c in claims: + sid = c.get("subject") + if not sid: + continue + for fm in c.get("related_failure_modes") or []: + failure_for_subject[sid].add(fm) + + overlay = { + "generated_by": "tools/build_research_overlay.py", + "version": 1, + "subjects": {}, + "scenarios": [s.get("id") for s in scenarios], + "failure_modes": [f.get("id") for f in failures], + "argument_chains": [ch.get("id") for ch in chains], + } + all_subjects = set(by_subject_max_ev) | set(failure_for_subject) + for sid in sorted(all_subjects): + overlay["subjects"][sid] = { + "evidence_strength": by_subject_max_ev.get(sid, 0), + "dispute_level": by_subject_max_disp.get(sid, 0), + "reproducibility_status_score": by_subject_best_repro.get(sid, 0), + "failure_boundary_count": len(failure_for_subject.get(sid, set())), + "claim_count": by_subject_claim_count.get(sid, 0), + "maturity": max(by_subject_best_repro.get(sid, 0), by_subject_max_ev.get(sid, 0)), + } + OUT.write_text(json.dumps(overlay, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + n = len(overlay["subjects"]) + print(f"OK wrote {OUT.relative_to(ROOT)} with overlay for {n} subject nodes") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/screenshot_regression.py b/tools/screenshot_regression.py new file mode 100644 index 0000000..65bee89 --- /dev/null +++ b/tools/screenshot_regression.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +"""Screenshot regression scaffold for the workbench and 3D atlas pages. 
+ +This script captures page screenshots at two canonical viewports (desktop +1440x900 and mobile 390x844) and compares them against baseline PNGs stored +under `docs/data/research/baselines/`. It is designed to fail loudly when a +known-good baseline is missing or differs. + +Behaviour matrix: + - If Playwright is not installed, the script prints a clear setup message + and exits 0 (do not block CI on missing optional dependency). + - If Playwright is installed and `--bake` is passed, baselines are + regenerated. Useful when intentional visual changes happen. + - If Playwright is installed without `--bake`, each captured frame is + compared to its baseline. Any pixel-difference > threshold fails. + +Run from repo root: `python tools/screenshot_regression.py [--bake]`. +""" +from __future__ import annotations + +import argparse +import contextlib +import http.server +import socketserver +import subprocess +import sys +import threading +import time +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +DOCS = ROOT / "docs" +BASELINE_DIR = DOCS / "data" / "research" / "baselines" +PORT = 8766 + +VIEWPORTS = [ + ("desktop", 1440, 900), + ("mobile", 390, 844), +] +ROUTES = [ + ("workbench", "/workbench.html"), + ("workbench_papers", "/workbench.html?view=papers"), + ("workbench_failures", "/workbench.html?view=failures"), + ("atlas3d", "/index.html"), +] + + +def _try_import_playwright(): + try: + from playwright.sync_api import sync_playwright # noqa: F401 + return True + except Exception: + return False + + +def _start_server() -> tuple[socketserver.TCPServer, threading.Thread]: + handler = http.server.SimpleHTTPRequestHandler + # The handler serves files relative to CWD; chdir into docs before binding. 
+ server = socketserver.TCPServer(("127.0.0.1", PORT), handler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + return server, thread + + +@contextlib.contextmanager +def _docs_cwd(): + import os + prev = os.getcwd() + os.chdir(DOCS) + try: + yield + finally: + os.chdir(prev) + + +def _capture(playwright, base_url: str, out_dir: Path, bake: bool) -> tuple[int, list[str]]: + """Capture every (route, viewport) pair into out_dir; return (mismatch_count, error_list).""" + out_dir.mkdir(parents=True, exist_ok=True) + mismatches = 0 + errors: list[str] = [] + chromium = playwright.chromium.launch() + for vp_name, w, h in VIEWPORTS: + context = chromium.new_context(viewport={"width": w, "height": h}) + page = context.new_page() + for route_name, path in ROUTES: + url = f"{base_url}{path}" + try: + page.goto(url, wait_until="networkidle", timeout=15000) + # Give the workbench JS a moment to render the cards (no transitions). + page.wait_for_timeout(900) + except Exception as e: + errors.append(f"navigation failure for {url}: {e}") + continue + fname = f"{route_name}__{vp_name}.png" + target = out_dir / fname + page.screenshot(path=str(target), full_page=False) + if bake: + continue + baseline = BASELINE_DIR / fname + if not baseline.exists(): + errors.append(f"missing baseline {fname}; rerun with --bake after visual review") + mismatches += 1 + continue + if baseline.read_bytes() != target.read_bytes(): + errors.append(f"visual diff detected: {fname}") + mismatches += 1 + context.close() + chromium.close() + return mismatches, errors + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--bake", action="store_true", help="Regenerate baselines (use after intentional visual change)") + args = parser.parse_args() + + if not _try_import_playwright(): + print("screenshot_regression: Playwright not installed.") + print(" to enable visual regression: pip install playwright && python -m playwright install 
chromium") + print(" this scaffold exits 0 by design so CI is not blocked on the optional dependency.") + return 0 + + from playwright.sync_api import sync_playwright + + with _docs_cwd(): + server, _thread = _start_server() + try: + time.sleep(0.4) + BASELINE_DIR.mkdir(parents=True, exist_ok=True) + with sync_playwright() as p: + tmp_dir = ROOT / "tmp_screenshots" + target_dir = BASELINE_DIR if args.bake else tmp_dir + mismatches, errors = _capture(p, f"http://127.0.0.1:{PORT}", target_dir, args.bake) + if args.bake: + print(f"baked {len(VIEWPORTS) * len(ROUTES)} baseline screenshots into {BASELINE_DIR.relative_to(ROOT)}") + return 0 + if mismatches: + print(f"SCREENSHOT REGRESSION FAILED with {mismatches} mismatch(es):") + for e in errors: + print(f" - {e}") + return 1 + print(f"OK {len(VIEWPORTS) * len(ROUTES)} screenshots match baselines") + return 0 + finally: + server.shutdown() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/validate_research.py b/tools/validate_research.py new file mode 100644 index 0000000..1c7f20f --- /dev/null +++ b/tools/validate_research.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python3 +"""Validate the structured research layer under docs/data/research/. + +Quality gates enforced (exit 1 on any failure): + + 1. Every claim has all required fields: id, subject, statement, evidence, + preconditions, counterexamples, boundaries, reproduction, publication_value, + dispute_level, evidence_strength, reproducibility_status. + 2. Every claim's `evidence` is a non-empty array; each item has kind, source, + finding. Kinds limited to the controlled vocabulary. + 3. Every argument chain has all 10 required argumentative fields plus related + scenarios/datasets/metrics. + 4. Every dataset has supports + limits + common_misuses (each non-empty). + 5. Every metric has formula + variables + assumptions + what_it_proves + + what_it_cannot_prove + known_misuses. 
The formula must reference each + declared variable symbol (or the symbol must appear inside another). + 6. Every failure mode has trigger_conditions + manifestation + + reproducible_setup + diagnostic_metrics + method_weakness + partial_solutions + + open_questions + publication_angles, all non-empty. + 7. Every experiment plan has all three tiers; each tier specifies purpose, + metrics or success_criteria, and an expected signal or compute budget. + 8. Cross-references resolve: a claim's subject must be either a known paper + node in docs/data/graph_extended.json (or graph.json), or marked as + `unresolved_subject: true` in the claim. Same for related_failure_modes + pointing to actual failure_mode ids. Argument chains' related_* ids + must resolve to known scenario/dataset/metric ids. + 9. No claim leaves preconditions, counterexamples or boundaries empty. +10. No formula contains stray TeX errors that we can detect cheaply + (mismatched $$ or empty $...$). +11. Every dataset that appears in a claim/argument chain must list at least + one covers_scenarios entry; every metric must list at least one + what_it_cannot_prove entry. 
# Run: `python tools/validate_research.py`
#
# Structural validator for the JSON files under docs/data/research.
# Every validate_* function appends human-readable messages to a shared
# ``errors`` list instead of raising, so one run reports every problem found.
from __future__ import annotations

import json
import re
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[1]
RESEARCH = ROOT / "docs" / "data" / "research"
GRAPH = ROOT / "docs" / "data" / "graph.json"
GRAPH_EXT = ROOT / "docs" / "data" / "graph_extended.json"

ALLOWED_EVIDENCE_KINDS = {"ablation", "table", "theorem", "repro", "external_benchmark"}
ALLOWED_REPRO = {"verified", "partial", "inferred", "speculative"}

# ``$$ ... $$`` display blocks are matched first so that their delimiters are
# never mistaken for an empty inline ``$...$`` block.
_DISPLAY_MATH = re.compile(r"\$\$(.*?)\$\$", re.DOTALL)
_INLINE_MATH = re.compile(r"\$([^$]*)\$")


def _load(path: Path):
    """Parse *path* as UTF-8 JSON and return the decoded value."""
    with path.open(encoding="utf-8") as f:
        return json.load(f)


def _load_entries(path: Path, key: str, errors: list[str], *, required: bool = True) -> list[dict]:
    """Return the object entries stored under ``key`` in the JSON file *path*.

    A missing file yields ``missing <path>`` (only when *required*) and an
    empty list. Unparseable JSON, a non-object top level, a non-list section
    and non-object entries are reported in *errors* rather than raised, and
    the offending items are left out of the result. A missing ``key`` is
    treated as an empty section.
    """
    if not path.exists():
        if required:
            errors.append(f"missing {path}")
        return []
    try:
        data = _load(path)
    except (OSError, ValueError) as exc:  # JSONDecodeError is a ValueError
        errors.append(f"cannot parse {path}: {exc}")
        return []
    if not isinstance(data, dict):
        errors.append(f"{path} must contain a JSON object at top level")
        return []
    section = data.get(key, [])
    if not isinstance(section, list):
        errors.append(f"{path} field {key} must be a list")
        return []
    entries: list[dict] = []
    for index, entry in enumerate(section):
        if isinstance(entry, dict):
            entries.append(entry)
        else:
            errors.append(f"{path} {key}[{index}] must be an object")
    return entries


def _known_paper_ids(errors: list[str] | None = None) -> set[str]:
    """Collect the truthy ``id`` of every node in graph.json and graph_extended.json.

    Files that do not exist are skipped. When *errors* is given, an
    unreadable or unparseable graph file is reported there; when it is
    omitted the exception propagates.
    """
    ids: set[str] = set()
    for path in (GRAPH, GRAPH_EXT):
        if not path.exists():
            continue
        try:
            data = _load(path)
        except (OSError, ValueError) as exc:
            if errors is None:
                raise
            errors.append(f"cannot parse {path}: {exc}")
            continue
        nodes = _as_list(data.get("nodes")) if isinstance(data, dict) else []
        for node in nodes:
            if not isinstance(node, dict):
                continue
            node_id = node.get("id")
            # Only hashable scalar ids can be stored in (or looked up from) a set.
            if node_id and isinstance(node_id, (str, int, float)):
                ids.add(node_id)
    return ids


def _expect(cond: bool, msg: str, errors: list[str]) -> None:
    """Append *msg* to *errors* when *cond* is falsy."""
    if not cond:
        errors.append(msg)


def _is_non_empty_str(v) -> bool:
    """Return True when *v* is a string with at least one non-whitespace character."""
    return isinstance(v, str) and v.strip() != ""


def _is_non_empty_list(v) -> bool:
    """Return True when *v* is a list with at least one element."""
    return isinstance(v, list) and len(v) > 0


def _is_known(value, known) -> bool:
    """Return ``value in known``, treating unhashable values as not present."""
    try:
        return value in known
    except TypeError:  # a JSON list/dict can never be a member of a set
        return False


def _is_int_in_range(v, low: int, high: int) -> bool:
    """Return True for a real integer within [low, high]; bools are rejected."""
    # bool is a subclass of int, so JSON true/false must be excluded explicitly.
    return isinstance(v, int) and not isinstance(v, bool) and low <= v <= high


def _is_number(v) -> bool:
    """Return True for an int or float that is not a bool."""
    return isinstance(v, (int, float)) and not isinstance(v, bool)


def _as_list(v) -> list:
    """Return *v* when it is a list, otherwise an empty list."""
    return v if isinstance(v, list) else []


def _optional_list(entry: dict, field: str, owner: str, errors: list[str]) -> list:
    """Return the optional list ``entry[field]``; report it when it has another type.

    Absent or falsy values count as an empty list, as before.
    """
    value = entry.get(field)
    if not value:
        return []
    if isinstance(value, list):
        return value
    errors.append(f"{owner} {field} must be a list")
    return []


def _register_id(kind: str, entry: dict, ids: set[str], errors: list[str]):
    """Add the entry's id to *ids*, or report it as missing; return the raw id.

    Only non-empty string ids are registered, so an entry without an id can no
    longer make the empty string resolve as a valid cross-reference.
    """
    ident = entry.get("id", "")
    if _is_non_empty_str(ident):
        ids.add(ident)
    else:
        errors.append(f"{kind} missing id: {entry}")
    return ident


def _has_empty_math_block(formula: str) -> bool:
    """Return True when *formula* contains a ``$$..$$`` or ``$..$`` block with a blank body."""
    display_bodies = _DISPLAY_MATH.findall(formula)
    # Drop the display blocks (and any unpaired ``$$``, which is reported
    # separately as unbalanced) before looking for inline blocks.
    remainder = _DISPLAY_MATH.sub(" ", formula).replace("$$", " ")
    inline_bodies = _INLINE_MATH.findall(remainder)
    return any(not body.strip() for body in display_bodies + inline_bodies)


def validate_claims(errors: list[str], known_papers: set[str], known_fms: set[str], known_metrics: set[str], known_datasets: set[str]) -> int:
    """Validate claims.json and return the number of claim objects checked.

    Checks required text/list fields, enumerations, the 0..3 integer scores,
    evidence and reproduction sub-objects, and cross-references to papers,
    datasets and failure modes. Paper and dataset cross-references are skipped
    when the corresponding known set is empty. ``known_metrics`` is accepted
    for signature compatibility and is not used.
    """
    claims = _load_entries(RESEARCH / "claims.json", "claims", errors)
    seen_ids: set = set()
    # subject -> whether any of its claims lists a failure mode (insertion-ordered)
    has_failure_mode: dict = {}
    for claim in claims:
        cid = claim.get("id", "")
        _expect(_is_non_empty_str(cid), f"claim missing id: {claim}", errors)
        try:
            duplicate = cid in seen_ids
            seen_ids.add(cid)
        except TypeError:  # unhashable id; already reported as missing above
            duplicate = False
        _expect(not duplicate, f"claim duplicate id: {cid}", errors)

        for field in ("subject", "statement", "publication_value", "reproducibility_status"):
            _expect(_is_non_empty_str(claim.get(field)), f"claim {cid} missing {field}", errors)
        for field in ("preconditions", "counterexamples", "boundaries", "evidence"):
            _expect(_is_non_empty_list(claim.get(field)), f"claim {cid} field {field} must be non-empty list", errors)

        status = claim.get("reproducibility_status")
        _expect(_is_known(status, ALLOWED_REPRO), f"claim {cid} bad reproducibility_status {status!r}", errors)
        _expect(_is_int_in_range(claim.get("evidence_strength"), 0, 3), f"claim {cid} evidence_strength must be int 0..3", errors)
        _expect(_is_int_in_range(claim.get("dispute_level"), 0, 3), f"claim {cid} dispute_level must be int 0..3", errors)

        for ev in _as_list(claim.get("evidence")):
            if not isinstance(ev, dict):
                errors.append(f"claim {cid} evidence entry must be an object")
                continue
            kind = ev.get("kind")
            _expect(_is_known(kind, ALLOWED_EVIDENCE_KINDS), f"claim {cid} evidence has bad kind {kind!r}", errors)
            _expect(_is_non_empty_str(ev.get("source")), f"claim {cid} evidence missing source", errors)
            _expect(_is_non_empty_str(ev.get("finding")), f"claim {cid} evidence missing finding", errors)

        repro = claim.get("reproduction")
        if not isinstance(repro, dict):
            repro = {}  # every reproduction.* check below then reports "missing"
        for part in ("minimal", "public_data", "expected_output"):
            _expect(_is_non_empty_str(repro.get(part)), f"claim {cid} reproduction.{part} missing", errors)
        _expect(_is_number(repro.get("cost_hours")), f"claim {cid} reproduction.cost_hours missing", errors)

        # cross-ref reproduction.public_data -> known dataset
        dataset_ref = repro.get("public_data")
        if dataset_ref and known_datasets and not _is_known(dataset_ref, known_datasets):
            errors.append(f"claim {cid} reproduction.public_data {dataset_ref!r} not declared in datasets.json")

        # cross-ref subject -> known graph node, unless explicitly marked unresolved
        subject = claim.get("subject")
        if subject and known_papers and not _is_known(subject, known_papers) and not claim.get("unresolved_subject"):
            errors.append(
                f"claim {cid} subject {subject} not in known nodes; "
                "mark unresolved_subject=true if intentional"
            )

        related = _optional_list(claim, "related_failure_modes", f"claim {cid}", errors)
        for fm in related:
            _expect(_is_known(fm, known_fms), f"claim {cid} related_failure_modes {fm} not declared", errors)

        if subject:
            try:
                has_failure_mode[subject] = has_failure_mode.get(subject, False) or bool(related)
            except TypeError:  # unhashable subject; already reported as missing
                pass

    # Every paper that appears as a claim subject must touch at least one
    # failure mode somewhere among its claims. This enforces the rule
    # "every method node must have a failure boundary".
    for subject, covered in has_failure_mode.items():
        if not covered:
            errors.append(
                f"paper {subject} has claims but no related_failure_modes across them "
                "(every method node must declare a failure boundary)"
            )
    return len(claims)


def validate_chains(errors: list[str], known_papers: set[str], known_scenarios: set[str], known_datasets: set[str], known_metrics: set[str]) -> int:
    """Validate argument_chains.json and return the number of chain objects checked.

    Checks the required text and list fields, that ``subject_papers`` is
    non-empty and (when any paper ids are known) resolves, and that every
    ``related_*`` id is declared.
    """
    chains = _load_entries(RESEARCH / "argument_chains.json", "argument_chains", errors)
    required_text = ("research_gap", "core_claim", "method_mechanism")
    required_lists = ("key_experiments", "strong_baselines", "ablations", "negative_results", "reviewer_attacks", "response_experiments", "figure_plan")
    reference_fields = (
        ("related_scenarios", known_scenarios),
        ("related_datasets", known_datasets),
        ("related_metrics", known_metrics),
    )
    for chain in chains:
        cid = chain.get("id", "")
        for field in required_text:
            _expect(_is_non_empty_str(chain.get(field)), f"chain {cid} missing {field}", errors)
        for field in required_lists:
            _expect(_is_non_empty_list(chain.get(field)), f"chain {cid} field {field} must be non-empty list", errors)
        _expect(_is_non_empty_list(chain.get("subject_papers")), f"chain {cid} subject_papers must be non-empty", errors)
        for paper in _as_list(chain.get("subject_papers")):
            if known_papers and not _is_known(paper, known_papers):
                errors.append(f"chain {cid} subject_papers references unknown node {paper}")
        for field, known in reference_fields:
            for ref in _optional_list(chain, field, f"chain {cid}", errors):
                _expect(_is_known(ref, known), f"chain {cid} {field} {ref} not declared", errors)
    return len(chains)


def validate_scenarios(errors: list[str]) -> tuple[int, set[str]]:
    """Validate scenarios.json; return (number of scenarios, set of declared ids)."""
    scenarios = _load_entries(RESEARCH / "scenarios.json", "scenarios", errors)
    ids: set[str] = set()
    for scenario in scenarios:
        sid = _register_id("scenario", scenario, ids, errors)
        for field in ("label", "description", "why_hard"):
            _expect(_is_non_empty_str(scenario.get(field)), f"scenario {sid} missing {field}", errors)
        for field in ("available_datasets", "evaluation_metrics"):
            _expect(_is_non_empty_list(scenario.get(field)), f"scenario {sid} {field} must be non-empty", errors)
        # current_best_methods may be empty: an empty list intentionally signals
        # that no public method has been credibly benchmarked on this scenario
        # yet. Treat that absence as research-relevant data, not a missing field.
        _expect(
            isinstance(scenario.get("current_best_methods"), list),
            f"scenario {sid} current_best_methods must be a list "
            "(possibly empty when no public report exists)",
            errors,
        )
    return len(scenarios), ids


def validate_datasets(errors: list[str], known_scenarios: set[str]) -> tuple[int, set[str]]:
    """Validate datasets.json; return (number of datasets, set of declared ids)."""
    datasets = _load_entries(RESEARCH / "datasets.json", "datasets", errors)
    ids: set[str] = set()
    for dataset in datasets:
        did = _register_id("dataset", dataset, ids, errors)
        for field in ("label", "scale", "license"):
            _expect(_is_non_empty_str(dataset.get(field)), f"dataset {did} missing {field}", errors)
        for field in ("supports", "limits", "common_misuses", "covers_scenarios"):
            _expect(_is_non_empty_list(dataset.get(field)), f"dataset {did} {field} must be non-empty", errors)
        for scenario_id in _as_list(dataset.get("covers_scenarios")):
            _expect(_is_known(scenario_id, known_scenarios), f"dataset {did} covers_scenarios {scenario_id} not declared", errors)
    return len(datasets), ids


def validate_metrics(errors: list[str]) -> tuple[int, set[str]]:
    """Validate metrics.json; return (number of metrics, set of declared ids).

    Besides the required fields, the formula gets a cheap TeX sanity check
    (even number of ``$$`` delimiters, no blank math block) and must contain
    at least one of the declared variable symbols literally.
    """
    metrics = _load_entries(RESEARCH / "metrics.json", "metrics", errors)
    ids: set[str] = set()
    for metric in metrics:
        mid = _register_id("metric", metric, ids, errors)
        for field in ("label", "formula", "scope"):
            _expect(_is_non_empty_str(metric.get(field)), f"metric {mid} missing {field}", errors)
        variables = metric.get("variables")
        has_variables = isinstance(variables, dict) and bool(variables)
        _expect(has_variables, f"metric {mid} variables must be a non-empty dict", errors)
        for field in ("assumptions", "what_it_proves", "what_it_cannot_prove", "known_misuses"):
            _expect(_is_non_empty_list(metric.get(field)), f"metric {mid} {field} must be non-empty", errors)

        formula = metric.get("formula", "")
        if not isinstance(formula, str):
            continue  # already reported as "missing formula"; nothing to scan
        if formula.count("$$") % 2 != 0:
            errors.append(f"metric {mid} formula has unbalanced $$ delimiters")
        if _has_empty_math_block(formula):
            errors.append(f"metric {mid} formula has empty $...$ block")
        # at least one declared variable symbol must literally appear in the formula
        if has_variables and not any(symbol in formula for symbol in variables):
            errors.append(
                f"metric {mid} formula references none of its declared variable symbols; "
                "the formula and the variables dictionary are out of sync"
            )
    return len(metrics), ids


def validate_scenarios_cross_refs(errors: list[str], known_datasets: set[str], known_metrics: set[str], known_fms: set[str]) -> None:
    """Check that scenario references to datasets, metrics and failure modes resolve.

    Runs as a second pass over scenarios.json because the known id sets are
    only available after the other files have been validated. A reference
    kind is skipped when its known set is empty. File-level problems are not
    reported here; validate_scenarios already reports them.
    """
    scenarios = _load_entries(RESEARCH / "scenarios.json", "scenarios", [], required=False)
    for scenario in scenarios:
        sid = scenario.get("id", "")
        references = (
            ("available_datasets", _as_list(scenario.get("available_datasets")), known_datasets, "datasets.json"),
            ("evaluation_metrics", _as_list(scenario.get("evaluation_metrics")), known_metrics, "metrics.json"),
            ("open_failure_modes", _optional_list(scenario, "open_failure_modes", f"scenario {sid}", errors), known_fms, "failure_modes.json"),
        )
        for field, refs, known, filename in references:
            for ref in refs:
                if known and not _is_known(ref, known):
                    errors.append(f"scenario {sid} {field} {ref} not declared in {filename}")


def validate_failures(errors: list[str], known_metrics: set[str]) -> tuple[int, set[str]]:
    """Validate failure_modes.json; return (number of failure modes, set of declared ids)."""
    modes = _load_entries(RESEARCH / "failure_modes.json", "failure_modes", errors)
    ids: set[str] = set()
    for mode in modes:
        fid = _register_id("failure_mode", mode, ids, errors)
        for key in ("label", "manifestation", "reproducible_setup", "method_weakness"):
            _expect(_is_non_empty_str(mode.get(key)), f"failure_mode {fid} missing {key}", errors)
        for key in ("trigger_conditions", "diagnostic_metrics", "partial_solutions", "open_questions", "publication_angles"):
            _expect(_is_non_empty_list(mode.get(key)), f"failure_mode {fid} {key} must be non-empty", errors)
        for metric_id in _as_list(mode.get("diagnostic_metrics")):
            if known_metrics and not _is_known(metric_id, known_metrics):
                errors.append(f"failure_mode {fid} diagnostic_metrics {metric_id} not declared")
        for solution in _as_list(mode.get("partial_solutions")):
            if not isinstance(solution, dict):
                errors.append(f"failure_mode {fid} partial_solution must be an object")
                continue
            _expect(_is_non_empty_str(solution.get("idea")), f"failure_mode {fid} partial_solution missing idea", errors)
            _expect(_is_non_empty_str(solution.get("residual_gap")), f"failure_mode {fid} partial_solution missing residual_gap", errors)
    return len(modes), ids


def validate_experiments(errors: list[str]) -> int:
    """Validate experiment_plans.json and return the number of plan objects checked.

    Each plan needs a title, a subject and, for each of the three tiers, a
    non-empty ``purpose`` and ``success_criteria``.
    """
    plans = _load_entries(RESEARCH / "experiment_plans.json", "experiment_plans", errors)
    tiers = ("tier_1_minimal_mechanism", "tier_2_public_benchmark", "tier_3_stress_test")
    for plan in plans:
        pid = plan.get("id", "")
        for field in ("title", "subject"):
            _expect(_is_non_empty_str(plan.get(field)), f"experiment_plan {pid} missing {field}", errors)
        for tier_name in tiers:
            tier = plan.get(tier_name)
            if not isinstance(tier, dict):
                tier = {}  # a missing or malformed tier reports both fields as missing
            _expect(_is_non_empty_str(tier.get("purpose")), f"experiment_plan {pid} {tier_name}.purpose missing", errors)
            _expect(_is_non_empty_str(tier.get("success_criteria")), f"experiment_plan {pid} {tier_name}.success_criteria missing", errors)
    return len(plans)


def main() -> int:
    """Run every validator; print a report and return 1 on errors, 0 otherwise."""
    errors: list[str] = []
    known_papers = _known_paper_ids(errors)
    # Order matters: each step produces the id sets the later steps resolve against.
    scen_count, known_scenarios = validate_scenarios(errors)
    metric_count, known_metrics = validate_metrics(errors)
    fm_count, known_fms = validate_failures(errors, known_metrics)
    ds_count, known_datasets = validate_datasets(errors, known_scenarios)
    claim_count = validate_claims(errors, known_papers, known_fms, known_metrics, known_datasets)
    chain_count = validate_chains(errors, known_papers, known_scenarios, known_datasets, known_metrics)
    plan_count = validate_experiments(errors)
    validate_scenarios_cross_refs(errors, known_datasets, known_metrics, known_fms)

    if errors:
        print(f"RESEARCH VALIDATION FAILED with {len(errors)} error(s):")
        for e in errors:
            print(f" - {e}")
        return 1
    print("OK research layer:")
    print(
        f" claims={claim_count} chains={chain_count} scenarios={scen_count} "
        f"datasets={ds_count} metrics={metric_count} failure_modes={fm_count} "
        f"experiment_plans={plan_count}"
    )
    return 0


if __name__ == "__main__":
    sys.exit(main())