From 8e6010214488a27c28bb3d6642a6c3869bd106b9 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 28 May 2026 14:12:59 +0000 Subject: [PATCH 1/7] Round 1: paper production workbench + falsifiable research layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 把仓库从“学习展示图谱”重构成“论文产出系统”的第一轮交付: 新增结构化研究层 docs/data/research/ - schema.json:claims / argument_chains / scenarios / datasets / metrics / failure_modes / experiment_plans 七类节点的字段契约 - claims.json:8 条围绕 UniAD、PlanT、DriveVLM、Agent-Driver、DiLu、CF-VLA、 VADv2 的可证伪主张,含证据、前提、反例、边界、最小复现实验与可投稿价值 - argument_chains.json:4 份完整论文骨架,覆盖查询共享端到端、快慢双系统、 反事实分支安全信号、可审计闭环评测协议 - scenarios.json:6 个一等公民场景节点 - datasets.json:6 个数据集,逐条声明能与不能支撑的研究主张及常见误用 - metrics.json:6 个指标,含公式、变量、前提、能与不能证明、误用 - failure_modes.json:8 个失败模式,含触发条件、诊断指标、已有半解、 残余间隙、开放问题、可投稿切入点 - experiment_plans.json:4 份三层实验计划,覆盖最小机制、公开基准、压力测试 前端:论文产出工作台 docs/workbench.html / workbench.css / js/workbench.js - 可证伪主张、论文论证链、场景与数据、失败模式、三层实验计划五大视图 - 多选节点、多选关系、选择篮、并排对比、共同前置、分歧路径 - 证据强度、争议程度、可复现状态、论文主体多维筛选 - 桌面端与移动端响应式布局,无装饰性动画 三维星图视觉编码改绑研究维度 - 默认关闭自动公转,把按钮降级为低显眼度 - 工作台入口放进顶部栏并以主色突出 - atlas-main.js 加载 node_overlay.json 并把成熟度、证据强度、争议程度、 失败边界数装配到节点上 - atlas-render.js 用上述维度调节节点尺寸与色彩,让视觉真正服务研究结构 质量门禁 - tools/validate_research.py 校验七类节点的必填字段、控制词表、交叉引用、 指标公式 TeX 完整性,并在 CI 中阻止退化 - tools/build_research_overlay.py 由源 JSON 派生 node_overlay.json - .github/workflows/validate.yml 加入两个新步骤 https://claude.ai/code/session_01QaomjzMa4sajRLK4MgWbVw --- .github/workflows/validate.yml | 4 + README.md | 14 +- docs/atlas3d.css | 4 + docs/data/research/argument_chains.json | 169 ++++++++ docs/data/research/claims.json | 273 ++++++++++++ docs/data/research/datasets.json | 136 ++++++ docs/data/research/experiment_plans.json | 133 ++++++ docs/data/research/failure_modes.json | 197 +++++++++ docs/data/research/metrics.json | 175 ++++++++ docs/data/research/node_overlay.json | 86 ++++ docs/data/research/scenarios.json | 65 +++ docs/data/research/schema.json | 118 +++++ 
docs/index.html | 3 +- docs/js/atlas-main.js | 21 + docs/js/atlas-render.js | 18 +- docs/js/atlas-ui.js | 2 +- docs/js/workbench.js | 526 +++++++++++++++++++++++ docs/workbench.css | 252 +++++++++++ docs/workbench.html | 151 +++++++ tools/build_research_overlay.py | 79 ++++ tools/validate_research.py | 282 ++++++++++++ 21 files changed, 2702 insertions(+), 6 deletions(-) create mode 100644 docs/data/research/argument_chains.json create mode 100644 docs/data/research/claims.json create mode 100644 docs/data/research/datasets.json create mode 100644 docs/data/research/experiment_plans.json create mode 100644 docs/data/research/failure_modes.json create mode 100644 docs/data/research/metrics.json create mode 100644 docs/data/research/node_overlay.json create mode 100644 docs/data/research/scenarios.json create mode 100644 docs/data/research/schema.json create mode 100644 docs/js/workbench.js create mode 100644 docs/workbench.css create mode 100644 docs/workbench.html create mode 100644 tools/build_research_overlay.py create mode 100644 tools/validate_research.py diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 1ec8126..ac47d69 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -23,6 +23,10 @@ jobs: with: { python-version: '3.11' } - name: Validate graph.json run: python tools/validate_graph.py + - name: Validate structured research layer + run: python tools/validate_research.py + - name: Rebuild research node overlay (for the 3D atlas visual encoding) + run: python tools/build_research_overlay.py - name: Check external deep links run: python tools/check_links.py --max-failures 5 continue-on-error: true diff --git a/README.md b/README.md index 77a31d6..ca73287 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Autonomous-Driving Learning Atlas -> 自动驾驶学习地图 — 一个以**交互知识图谱**为核心、面向博士级研究者的、中英双语的机器学习 / 强化学习 / 自动驾驶**入门-进阶-前沿**学习地图。 +> 自动驾驶研究地图 — 
一个面向博士级与产业研究者的、围绕**可证伪主张、论文论证链、场景与数据集、失败模式与三层实验**组织的论文产出系统。中英双语,知识图谱与论文产出工作台并列,视觉只服务于研究结构。 [![Pages](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/pages.yml/badge.svg)](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/pages.yml) [![Validate](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/validate.yml/badge.svg)](https://github.com/ChatGPU/Autonomous-Driving-Learning-Atlas/actions/workflows/validate.yml) @@ -7,7 +7,10 @@ [![License: MIT (code)](https://img.shields.io/badge/code-MIT-blue.svg)](LICENSE) [![License: CC BY 4.0 (prose)](https://img.shields.io/badge/prose-CC%20BY%204.0-lightgrey.svg)](LICENSE-CC) -🌐 **Live atlas**: +🌐 **三维知识星图**: <https://chatgpu.github.io/Autonomous-Driving-Learning-Atlas/> +🛠 **论文产出工作台**: <https://chatgpu.github.io/Autonomous-Driving-Learning-Atlas/workbench.html> + +> 工作台围绕六类结构化节点组织(数据集与指标合并为一类,对应 `docs/data/research/schema.json` 中的七种节点类型):**可证伪主张**(claim)、**论文论证链**(argument chain)、**场景**(scenario)、**数据集与指标**(dataset / metric,含能与不能证明的边界)、**失败模式**(failure mode,含触发条件、诊断指标、已有半解、可投稿切入点)、**三层实验计划**(最小机制 / 公开基准 / 压力测试)。所有节点均通过 `tools/validate_research.py` 进行结构完整性校验,并在 CI 中阻止退化。

@@ -91,7 +94,9 @@ Autonomous-Driving-Learning-Atlas/ ├── README.md / AGENTS.md / LICENSE / LICENSE-CC / CITATION.cff ├── docs/ # GitHub-Pages 根目录(交互站点) -│ ├── index.html · atlas3d.css +│ ├── index.html · atlas3d.css # 三维知识星图(视觉编码绑定研究维度) +│ ├── workbench.html · workbench.css · js/workbench.js +│ │ # 论文产出工作台:主张 / 论证链 / 场景 / 失败模式 / 实验计划 / 选择篮 │ ├── js/ # atlas-main · atlas-render · atlas-physics · │ │ # atlas-cards (含 Mermaid 渲染 + 动态洞察) │ ├── vendor/ # KaTeX + auto-render · Mermaid · DOMPurify · marked · Three.js @@ -100,6 +105,7 @@ Autonomous-Driving-Learning-Atlas/ │ ├── graph_extended.json # 489 节点 / 1440 边 (含 paradigm/insight/validation/move/problem) │ ├── layout_positions.json # 由 tools/precompute_layout.py 预烤的稳定 3D 位置 │ ├── generated/ # 多维度生成轴(decision / foundation / methodology / perception / wave-E stubs) +│ ├── research/ # 结构化研究层(claims / chains / scenarios / datasets / metrics / failure_modes / experiment_plans + schema + node_overlay) │ └── cards/ │ ├── *.md # spine + Tier-S 原始论文卡 (40 张) │ └── extended/ # paradigm / insight / validation / move / problem / paper stub (200+ 张) @@ -116,6 +122,8 @@ Autonomous-Driving-Learning-Atlas/ │ └── lab_dreamer_cartpole_pixels/ # CartPole 像素 RSSM + latent imagination ├── tools/ │ ├── validate_graph.py · check_links.py · lint_extended_cards.py +│ ├── validate_research.py # 结构化研究层的质量门禁 +│ ├── build_research_overlay.py # 由 research/*.json 生成 node_overlay.json │ ├── audit_card_meta_language.py # 扫描卡片里"元语言泄漏"短语 │ ├── merge_graph.py # seed + generated/*.json → graph_extended.json │ ├── repair_extended_graph.py # 重建 paradigm-validation-paper 与 problem 反向引用 diff --git a/docs/atlas3d.css b/docs/atlas3d.css index 53f7b30..49817c4 100644 --- a/docs/atlas3d.css +++ b/docs/atlas3d.css @@ -101,6 +101,10 @@ canvas#atlasCanvas { } .iconbtn:hover { background: rgba(108,177,255,0.22); border-color: rgba(108,177,255,0.55); } .iconbtn.active { background: rgba(255,170,85,0.22); border-color: rgba(255,170,85,0.55); color: var(--accent-warm); 
} +.iconbtn.iconbtn-primary { background: rgba(167,243,208,0.22); border-color: rgba(167,243,208,0.55); color: #a7f3d0; font-weight: 600; } +.iconbtn.iconbtn-primary:hover { background: rgba(167,243,208,0.32); border-color: rgba(167,243,208,0.75); color: #d6f5e6; } +.iconbtn.iconbtn-subtle { opacity: 0.6; } +.iconbtn.iconbtn-subtle:hover { opacity: 1; } /* ---------- side panels ---------- */ .side-panel { diff --git a/docs/data/research/argument_chains.json b/docs/data/research/argument_chains.json new file mode 100644 index 0000000..37585f0 --- /dev/null +++ b/docs/data/research/argument_chains.json @@ -0,0 +1,169 @@ +{ + "$schema": "./schema.json#/node_kinds/argument_chain", + "argument_chains": [ + { + "id": "chain:planning_oriented_query_sharing", + "title": "以规划为最终损失的可微查询共享是否真的为闭环安全带来好处", + "subject_papers": ["paper:2212.10156", "paper:2210.14222", "paper:vadv2"], + "research_gap": "现有端到端工作普遍在开环位移误差上比较,但开环误差对策略诱导分布偏移不敏感,且常常受到自车状态泄漏污染,使得已有结论无法被直接外推到闭环安全。", + "core_claim": "如果在评测协议中显式遮蔽自车状态并在统一的反应式闭环上比较,那么以规划为最终损失共享查询的方法仍然在多种场景下优于把感知与规划解耦的强基线;如果差距收敛甚至消失,则原有论点需要被显著修订。", + "method_mechanism": "在共享查询的端到端架构上分别关闭与开启自车状态输入并在两类闭环评测上做对照,再补充对查询数量与查询类型的细粒度消融来定位规划提升真正来自哪一组件。", + "key_experiments": [ + "在 nuPlan 与 Bench2Drive 上分别比较 UniAD 风格与 PlanT 风格的端到端模型,闭环碰撞率与路线完成度为主指标", + "在去 ego 状态条件下比较开环与闭环表现的相对秩", + "对查询子集做逐组消融,量化感知共享与规划共享的贡献" + ], + "strong_baselines": ["paper:2210.14222", "paper:transfuser", "paper:vadv2"], + "ablations": [ + "关闭跟踪查询与运动查询", + "用对象 token 替换密集 BEV 查询", + "把规划损失替换为独立的轨迹预测损失" + ], + "negative_results": [ + "若关闭自车状态后开环优势消失但闭环优势仍存在,说明问题在评测协议而非方法", + "若两类优势都消失,端到端必要性需要重新论证" + ], + "reviewer_attacks": [ + "评测协议偏向闭环友好的方法", + "自车状态泄漏的修正是否过度", + "种子数量不足导致统计置信不够" + ], + "response_experiments": [ + "提供基于至少八个种子的置信区间", + "公开评测脚本与基准结果的最低单元", + "在多套闭环协议上同时报告" + ], + "figure_plan": [ + "图一展示两类协议下的相对秩翻转", + "图二展示查询消融的逐组成本收益", + "图三展示闭环失败的场景分布" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", 
"scenario:highway_merge_at_speed_differential"], + "related_datasets": ["dataset:nuplan_planning", "dataset:bench2drive"], + "related_metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion"] + }, + { + "id": "chain:dual_system_for_long_tail", + "title": "快慢双系统语言驱动规划是否真正解决长尾而不牺牲实时性", + "subject_papers": ["paper:2402.12289", "paper:2311.10813", "paper:2309.16292"], + "research_gap": "语言驱动方法在论文里通常在静态长尾案例上展示成功,但缺乏在统一闭环上同时报告调用频率、延迟、成功率与失败模式的完整证据,难以判断慢系统是否真正带来净收益。", + "core_claim": "在受控的长尾闭环基准上,配合显式可学习门控的快慢双系统在保证延迟预算的前提下显著提升长尾片段成功率;若门控被去掉或语言模型被替换为不可控版本,这一收益会消失。", + "method_mechanism": "把语言模型作为可被门控触发的慢系统并把工具调用与记忆反思视为模块化能力,对门控、工具与记忆做正交消融,并以延迟与成功率的联合 Pareto 前沿作为评测对象。", + "key_experiments": [ + "在 Bench2Drive 与 CARLA Town05 Long 长尾片段上对比有无慢系统", + "对门控阈值、记忆容量、工具集大小做扫描", + "在受限延迟预算下衡量净增益" + ], + "strong_baselines": ["paper:2212.10156", "paper:transfuser", "paper:vadv2"], + "ablations": [ + "去掉门控让慢系统始终触发", + "去掉记忆与反思", + "去掉工具调用层" + ], + "negative_results": [ + "若在延迟预算下成功率提升消失,则双系统在该平台不可部署", + "若失败模式集中在语言幻觉触发的违规行为,说明门控本身需要风险层" + ], + "reviewer_attacks": [ + "慢系统的真实延迟未被诚实公开", + "评测仅在静态长尾案例上做", + "工具集本身已经为评测特化" + ], + "response_experiments": [ + "公开端到端时间分布而不仅是均值", + "在动态闭环上同时报告快慢系统调用率", + "提供工具集的迁移评测" + ], + "figure_plan": [ + "图一显示延迟与成功率的 Pareto 前沿", + "图二显示门控触发率随长尾密度的变化", + "图三显示失败模式分布与归因" + ], + "related_scenarios": ["scenario:long_tail_rare_object_on_road", "scenario:construction_zone_with_cone_lane_shift", "scenario:dense_pedestrian_crosswalk_at_night"], + "related_datasets": ["dataset:bench2drive", "dataset:carla_town05_long"], + "related_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "chain:counterfactual_branches_as_safety_signal", + "title": "反事实分支能否成为视觉语言动作模型的可学习安全信号", + "subject_papers": ["paper:2512.24426", "paper:2402.12289"], + "research_gap": "现有视觉语言动作模型大多在主轨迹上训练与评测,没有把未发生但可能发生的状态作为显式的训练信号;社区缺乏可复现的反事实分支基准。", + "core_claim": 
"在训练与评测中显式把可生成的反事实分支作为监督信号,可以让模型在低频高风险场景下做出更安全的决策,并且这种收益对反事实分支的真实度具有可量化的敏感性。", + "method_mechanism": "用前向预测器生成多条反事实分支,并在分支上施加风险敏感损失,使模型在主轨迹与分支上的策略联合最优;评测上构建公开的反事实分支测试集并报告分支成功率。", + "key_experiments": [ + "在 Bench2Drive 与 NAVSIM 上比较有无反事实损失的策略", + "对反事实分支真实度做阶梯式扰动并量化收益变化", + "在公开反事实评测集上汇报分支成功率与主轨迹性能" + ], + "strong_baselines": ["paper:2402.12289", "paper:vadv2", "paper:2212.10156"], + "ablations": [ + "去掉反事实损失", + "替换为均匀采样的扰动而非语义反事实", + "限制反事实分支数量" + ], + "negative_results": [ + "若反事实损失导致主轨迹性能显著退化,说明权重设计需要重做", + "若分支真实度低于阈值时收益反向,需要建立分支质量门槛" + ], + "reviewer_attacks": [ + "反事实分支生成器自身的偏差污染训练", + "反事实评测与真实事故分布的相关性未知", + "对主轨迹的影响未充分量化" + ], + "response_experiments": [ + "公开反事实分支生成器的失败率分布", + "提供主轨迹基线的多种子置信区间", + "在反事实评测与真实事故子集上联合比较" + ], + "figure_plan": [ + "图一显示反事实评测集的构建流水线", + "图二显示反事实损失权重的成本收益曲线", + "图三显示主轨迹与反事实分支的联合分布" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:long_tail_rare_object_on_road"], + "related_datasets": ["dataset:bench2drive", "dataset:navsim_planning"], + "related_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"] + }, + { + "id": "chain:closed_loop_eval_protocol_audit", + "title": "面向端到端自动驾驶的可审计闭环评测协议", + "subject_papers": ["paper:2212.10156", "paper:vadv2", "paper:2210.14222"], + "research_gap": "已有闭环基准在自车状态泄漏、种子数、感知集成与他车反应模型的处理上各自不同,导致跨论文比较失去意义。", + "core_claim": "可以构造一份能自动检测自车状态泄漏、强制最少种子数、强制公开评测脚本并要求同时报告开环与闭环指标的协议;在该协议下,目前公开的多数端到端方法的相对秩与原报告差距显著。", + "method_mechanism": "把评测协议改造成一组自动化检查、统一脚本与最小数据集组合,并在主流方法上同时跑两类协议以量化差距。", + "key_experiments": [ + "在 nuPlan、NAVSIM 与 Bench2Drive 上以新协议复跑公开方法", + "在带与不带 ego 状态泄漏的两组协议上比较相对秩", + "对种子数与他车反应模型做敏感性分析" + ], + "strong_baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser", "paper:2210.14222"], + "ablations": [ + "只用单一种子", + "禁止公开评测脚本", + "允许 ego 状态泄漏" + ], + "negative_results": [ + "若新协议下所有方法都接近基线,说明端到端的真实进展被显著高估", + "若新协议下各种方法的相对秩不变,说明已有结论是稳健的" + ], + "reviewer_attacks": [ + "协议是否过度收紧以致没有方法能通过", + "对感知集成的处理是否公平", + 
"他车反应模型的选择是否偏向特定方法" + ], + "response_experiments": [ + "提供逐步放宽的协议变体", + "在不同感知集成等级上分层比较", + "在多种他车模型下复现" + ], + "figure_plan": [ + "图一展示新旧协议下的相对秩翻转", + "图二展示协议各条款对方法分布的影响", + "图三展示评测自动化检查的执行示例" + ], + "related_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:highway_merge_at_speed_differential"], + "related_datasets": ["dataset:nuplan_planning", "dataset:navsim_planning", "dataset:bench2drive"], + "related_metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion", "metric:rule_compliance_score"] + } + ] +} diff --git a/docs/data/research/claims.json b/docs/data/research/claims.json new file mode 100644 index 0000000..67c1044 --- /dev/null +++ b/docs/data/research/claims.json @@ -0,0 +1,273 @@ +{ + "$schema": "./schema.json#/node_kinds/claim", + "claims": [ + { + "id": "claim:uniad_query_sharing_lowers_planning_l2", + "subject": "paper:2212.10156", + "statement": "在共享一组可微 BEV 查询的端到端架构下,把检测、跟踪、地图、运动与占用模块的梯度共同导向规划目标,可以让规划在专家分布上的开环位移误差显著低于把这些模块独立训练的模块化基线。", + "evidence": [ + {"kind": "table", "source": "UniAD CVPR 2023 论文表 4", "finding": "在 nuScenes 验证集上 1 秒、2 秒、3 秒平均位移误差均优于使用分离式预测与规划的 ST-P3 等基线"}, + {"kind": "ablation", "source": "UniAD CVPR 2023 论文表 6", "finding": "去掉 MotionFormer 与 OccFormer 各自带来开环位移误差与碰撞率的可测退化"}, + {"kind": "repro", "source": "OpenDriveLab/UniAD 复现脚本", "finding": "公开权重在 nuScenes 上可复现报告数字"} + ], + "preconditions": [ + "训练与测试使用相同的 nuScenes 子集", + "感知模块由 BEVFormer 风格的视觉骨干提供", + "评测使用 ego 轨迹回归损失" + ], + "counterexamples": [ + "若在测试时引入 ego 状态泄漏,简单基线也能在开环位移上接近 UniAD", + "在跨城市分布下共享查询的收益会被分布漂移抵消" + ], + "boundaries": [ + "结论限定在开环位移误差与同源分布", + "未直接外推到大幅不同的传感器配置或国家", + "对极端长尾事件没有直接证据" + ], + "reproduction": { + "minimal": "lab03 在合成 2D 场景上对比共用查询与独立查询的轨迹误差", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 6, + "expected_output": "共用查询的 L2 误差显著低于独立查询基线,差距在 mini 子集上即可观察" + }, + "publication_value": "系统改进 + 机制解释", + "dispute_level": 1, + "evidence_strength": 3, + 
"reproducibility_status": "verified", + "related_claims": ["claim:uniad_open_loop_not_safety", "claim:plant_object_token_sufficient_for_planning"], + "related_failure_modes": ["failure_mode:ego_status_leakage"] + }, + { + "id": "claim:uniad_open_loop_not_safety", + "subject": "paper:2212.10156", + "statement": "UniAD 在 nuScenes 上的开环优势不能直接外推为闭环安全提升,因为开环位移误差对策略诱导的状态分布漂移不敏感。", + "evidence": [ + {"kind": "external_benchmark", "source": "BEV-Planner 复现报告", "finding": "在控制 ego 状态泄漏后开环差距大幅缩小"}, + {"kind": "theorem", "source": "Ross et al. 2011 DAgger 误差累积论证", "finding": "策略诱导分布偏移导致开环误差对真实风险存在系统性偏差"} + ], + "preconditions": [ + "评测在 nuScenes 上以专家轨迹为参考", + "未对策略做闭环回放或反应式仿真" + ], + "counterexamples": [ + "若能证明 UniAD 的策略对闭环漂移具有显式不变性" + ], + "boundaries": [ + "限于纯 nuScenes 开环评测语境", + "不否定 UniAD 在感知与轨迹回归层的实质贡献" + ], + "reproduction": { + "minimal": "在 nuScenes 上比较 UniAD 与简单速度保持基线在显式遮蔽 ego 状态后的差距", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 3, + "expected_output": "差距大幅收敛,部分子集上甚至消失" + }, + "publication_value": "失败模式发现 + 评测协议改进", + "dispute_level": 2, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:uniad_query_sharing_lowers_planning_l2"], + "related_failure_modes": ["failure_mode:ego_status_leakage", "failure_mode:occlusion_blind_spot_overconfidence"] + }, + { + "id": "claim:plant_object_token_sufficient_for_planning", + "subject": "paper:2210.14222", + "statement": "在城市仿真闭环中只用一组对象级 token 而舍弃稠密 BEV 特征,仍然可以达到与 BEV 端到端方法相当甚至更优的驾驶分数,说明对人类驾驶决策必要的信息可以在对象级表示中被压缩保留。", + "evidence": [ + {"kind": "table", "source": "PlanT CoRL 2022 论文表 1 与表 2", "finding": "在 LAV 与 Longest6 基准上 PlanT 的驾驶分数与 TransFuser 等稠密 BEV 方法持平或更高"}, + {"kind": "ablation", "source": "PlanT 论文表 4", "finding": "在仅保留近邻若干 agent token 时性能下降可控,远端 token 收益边际递减"} + ], + "preconditions": [ + "上游对象检测与跟踪足够可靠", + "评测限于 CARLA 城市场景" + ], + "counterexamples": [ + "在依赖密集占用信息的施工或低矮障碍场景下对象级 token 不足以表示安全相关几何", + "对象 token 的真实感来自感知质量,感知误差会破坏对象抽象" + ], + "boundaries": [ +
"结论限于规则化对象表示充分的城市闭环", + "未直接外推到高速或复杂博弈场景" + ], + "reproduction": { + "minimal": "lab04 在对象级合成场景上对比 token 数量与规划误差", + "public_data": "dataset:carla_town05_long", + "cost_hours": 8, + "expected_output": "在足够数量 token 下规划误差接近稠密基线,少量 token 即可保留主要规划信号" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 1, + "evidence_strength": 2, + "reproducibility_status": "verified", + "related_claims": ["claim:uniad_query_sharing_lowers_planning_l2"], + "related_failure_modes": ["failure_mode:long_tail_object_recognition_miss"] + }, + { + "id": "claim:drivevlm_dual_recovers_long_tail_without_killing_latency", + "subject": "paper:2402.12289", + "statement": "把视觉语言模型作为慢系统并与传统快规划器组成双管线,可以在分布外目标与少见语义场景上恢复显著的规划成功率,同时通过门控仅在必要时调用慢系统而保持闭环延迟可控。", + "evidence": [ + {"kind": "table", "source": "DriveVLM 论文表 3", "finding": "在长尾片段上的成功率显著高于纯端到端基线"}, + {"kind": "ablation", "source": "DriveVLM 论文表 5", "finding": "去掉慢系统的双管线退化为基线性能;去掉快系统则延迟无法支撑闭环"} + ], + "preconditions": [ + "慢系统的调用门控存在显式可学习信号", + "慢系统在被调用时延迟可被快系统吸收" + ], + "counterexamples": [ + "在持续高密度长尾的场景下门控可能频繁触发慢系统并耗尽延迟预算", + "慢系统的语言输出与运动控制的对齐失败会造成不可解释的接管" + ], + "boundaries": [ + "结论限于配备相机与短期回放的车端环境", + "对非视觉模态退化不直接保证", + "对慢系统语言幻觉的失败模式没有系统覆盖" + ], + "reproduction": { + "minimal": "lab09 用 Mock 后端模拟双管线门控与延迟预算", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 4, + "expected_output": "在长尾片段子集上门控触发率与成功率均显著提升,单帧延迟不超过阈值" + }, + "publication_value": "系统改进 + 失败模式发现", + "dispute_level": 1, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:agent_driver_tool_use_reduces_planner_dead_ends", "claim:dilu_memory_reflection_reduces_long_tail_failures"], + "related_failure_modes": ["failure_mode:long_tail_object_recognition_miss", "failure_mode:ride_comfort_violation_due_to_late_braking"] + }, + { + "id": "claim:agent_driver_tool_use_reduces_planner_dead_ends", + "subject": "paper:2311.10813", + "statement": "把规划器封装成被语言模型调用的工具集合,让语言模型选择是否查询地图、轨迹预测或风险评估,可以在罕见冲突场景下减少规划器陷入死锁的频率。", + "evidence": 
[ + {"kind": "ablation", "source": "Agent-Driver 论文表 2", "finding": "去掉工具调用层在长尾片段上的死锁率显著上升"}, + {"kind": "table", "source": "Agent-Driver 论文表 4", "finding": "在分布外场景上的通过率超过仅由端到端规划器输出的基线"} + ], + "preconditions": [ + "工具接口稳定且文档化", + "语言模型推理延迟在可接受范围内" + ], + "counterexamples": [ + "工具调用链路本身可能引入新的脆弱失败", + "语言模型的工具选择可能在压力下退化为始终调用同一工具" + ], + "boundaries": [ + "结论限于具备地图查询与预测工具的实验环境", + "未直接量化对乘员舒适度的影响" + ], + "reproduction": { + "minimal": "lab08 用 Mock 工具集模拟工具调用与死锁恢复", + "public_data": "dataset:carla_town05_long", + "cost_hours": 5, + "expected_output": "在指定的死锁场景下通过率提升而不显著恶化路线完成度" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 2, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:drivevlm_dual_recovers_long_tail_without_killing_latency", "claim:dilu_memory_reflection_reduces_long_tail_failures"], + "related_failure_modes": ["failure_mode:closed_loop_deadlock_under_uncertainty", "failure_mode:multi_agent_interaction_indecision"] + }, + { + "id": "claim:dilu_memory_reflection_reduces_long_tail_failures", + "subject": "paper:2309.16292", + "statement": "在语言模型驱动的决策循环中加入显式经验记忆与反思机制,使得模型可以从过往失败的相似场景中检索教训,从而在长尾场景上的决策错误率显著低于无记忆的同类语言模型基线。", + "evidence": [ + {"kind": "table", "source": "DiLu 论文表 2", "finding": "带记忆与反思的循环在 GPT-3.5 与 GPT-4 上均显著优于无记忆基线"}, + {"kind": "ablation", "source": "DiLu 论文表 4", "finding": "去除反思模块后失败率明显回升"} + ], + "preconditions": [ + "记忆库覆盖足够多样的历史失败", + "检索接口可以匹配新场景到相关经验" + ], + "counterexamples": [ + "记忆库被污染或包含错误经验时反思会放大错误", + "记忆检索延迟可能超过实时决策预算" + ], + "boundaries": [ + "结论限于决策频率较低的语言驱动决策循环", + "对端到端连续控制不直接适用", + "对极少见且无相似历史的场景效果有限" + ], + "reproduction": { + "minimal": "lab07 用 Mock 后端模拟记忆检索与反思循环", + "public_data": "dataset:carla_town05_long", + "cost_hours": 3, + "expected_output": "带反思版本在指定的长尾子集上的错误率显著低于无记忆基线" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 2, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": 
["claim:drivevlm_dual_recovers_long_tail_without_killing_latency", "claim:agent_driver_tool_use_reduces_planner_dead_ends"], + "related_failure_modes": ["failure_mode:long_tail_object_recognition_miss"] + }, + { + "id": "claim:cfvla_counterfactual_branches_close_evaluation_gap", + "subject": "paper:2512.24426", + "statement": "把反事实分支显式引入视觉语言动作模型的训练与评测,可以在保留主轨迹规划能力的同时显著提升对低频高风险场景的应对成功率,因为反事实分支强迫模型对未发生但可能发生的状态做出明确决策。", + "evidence": [ + {"kind": "table", "source": "CF-VLA 论文表 2", "finding": "在反事实分支评测集上成功率超过无反事实训练的基线"}, + {"kind": "ablation", "source": "CF-VLA 论文表 5", "finding": "去除反事实分支损失后长尾片段成功率回落"} + ], + "preconditions": [ + "反事实分支生成器具备一定真实感", + "评测协议公开反事实分支并允许复现" + ], + "counterexamples": [ + "如果反事实分支与真实分布偏差过大,训练可能过度悲观", + "反事实评测可能与现实事故分布不一致" + ], + "boundaries": [ + "结论限于具备生成反事实能力的训练流水线", + "对未给出反事实分支的真实事故没有直接保证" + ], + "reproduction": { + "minimal": "lab10 在合成反事实分支上对比有无反事实损失的策略", + "public_data": "dataset:bench2drive", + "cost_hours": 12, + "expected_output": "带反事实损失的策略在反事实评测子集上的成功率显著更高且对主轨迹影响有限" + }, + "publication_value": "机制解释 + 基准构建", + "dispute_level": 2, + "evidence_strength": 1, + "reproducibility_status": "inferred", + "related_claims": ["claim:drivevlm_dual_recovers_long_tail_without_killing_latency"], + "related_failure_modes": ["failure_mode:occlusion_blind_spot_overconfidence", "failure_mode:long_tail_object_recognition_miss"] + }, + { + "id": "claim:vadv2_probabilistic_planning_covers_multimodality", + "subject": "paper:vadv2", + "statement": "向量化端到端架构在规划头上引入概率分布预测可以显式建模专家行为的多模态性,从而在多车交互与未保护转向场景中比单一回归轨迹基线更不容易陷入平均化失败。", + "evidence": [ + {"kind": "table", "source": "VADv2 论文表 1 与表 3", "finding": "在 nuScenes 与 CARLA 上 minADE 与碰撞率均优于回归型 VAD 基线"}, + {"kind": "ablation", "source": "VADv2 论文表 5", "finding": "去掉概率头退化为回归基线"} + ], + "preconditions": [ + "训练数据覆盖足够多模式行为", + "概率头训练采用合理的负样本与温度" + ], + "counterexamples": [ + "在极端低数据子集上概率头可能塌缩为单峰", + "对完全未见过的多模态分布仍可能选择错误模式" + ], + "boundaries": [ + "结论限于矢量化感知输入与有限规划视野", + "未直接外推到非结构化场景" + ], + "reproduction": { + 
"minimal": "在 nuScenes mini 上对比回归头与概率头的多模态覆盖", + "public_data": "dataset:nuscenes_planning", + "cost_hours": 6, + "expected_output": "概率头在选定多模态子集上模式覆盖与碰撞率均优于回归头" + }, + "publication_value": "机制解释 + 系统改进", + "dispute_level": 1, + "evidence_strength": 2, + "reproducibility_status": "partial", + "related_claims": ["claim:plant_object_token_sufficient_for_planning"], + "related_failure_modes": ["failure_mode:multi_agent_interaction_indecision"] + } + ] +} diff --git a/docs/data/research/datasets.json b/docs/data/research/datasets.json new file mode 100644 index 0000000..513c492 --- /dev/null +++ b/docs/data/research/datasets.json @@ -0,0 +1,136 @@ +{ + "$schema": "./schema.json#/node_kinds/dataset", + "datasets": [ + { + "id": "dataset:nuscenes_planning", + "label": "nuScenes 规划评测分卷", + "scale": "1000 段 20 秒驾驶序列,主要采集自波士顿与新加坡两个城市,6 路环视相机加 5 路毫米波雷达加 1 路 360 度激光雷达,每秒 2 帧关键帧标注。", + "supports": [ + "在已有人类示范条件下评估开环轨迹回归误差", + "比较不同感知与预测骨干在同一规划损失下的协同收益", + "支持感知与轨迹预测的弱监督联合训练" + ], + "limits": [ + "只有约 5.5 小时高质量标注数据,长尾事件极度稀缺", + "缺乏闭环回放,所有评测都基于专家轨迹假设", + "城市风格单一,对中国式城区不适用", + "ego 状态作为输入时容易造成评测虚高" + ], + "common_misuses": [ + "把开环位移误差当作部署安全的代理,会忽略分布漂移与累计误差", + "在 mini 子集上做模型选择,再迁移到完整集时表现退化", + "把 ego 速度直接拼接到输入而不在测试时遮蔽" + ], + "covers_scenarios": ["scenario:dense_pedestrian_crosswalk_at_night", "scenario:long_tail_rare_object_on_road"], + "license": "CC BY-NC-SA 4.0,禁止商业训练,只允许学术使用与公开基准对比。" + }, + { + "id": "dataset:nuplan_planning", + "label": "nuPlan 闭环规划基准", + "scale": "约 1500 小时美国与新加坡多城市驾驶日志,包含完整 ego 状态、地图、追踪结果,提供基于交互式模拟器的闭环评测协议。", + "supports": [ + "比较纯学习方法与基于规则的强基线在闭环下的真实表现", + "在统一的反应式 agent 模拟器下评估长视野与多车交互稳健性", + "对规划损失做闭环导向的消融实验" + ], + "limits": [ + "模拟器中的他车行为来源于 IDM 或类似规则,可能低估真实复杂度", + "感知噪声不在评测之内,给定真值或预先跟踪的轨迹", + "缺少恶劣天气与传感器故障注入" + ], + "common_misuses": [ + "把基于真值跟踪的闭环成绩声明为端到端能力", + "在固定的他车反应模型下过拟合社会博弈策略", + "用同一城市训练评测而不报告跨城迁移" + ], + "covers_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:highway_merge_at_speed_differential"], + "license": 
"Motional 自定义许可,学术研究允许,商业用途需协商。" + }, + { + "id": "dataset:waymo_open_motion", + "label": "Waymo Open Motion 数据集", + "scale": "约 10 万段每段 20 秒的高频率轨迹数据,涵盖六个美国大都会城市的高密度交互场景,提供地图与多类道路使用者的语义注释。", + "supports": [ + "评估多 agent 长视野预测与交互建模", + "构建场景挖掘流水线以提取罕见博弈样本", + "迁移到下游 motion-conditioned planning 的预训练" + ], + "limits": [ + "缺乏稠密像素级传感器原始流,主要是后处理后的轨迹", + "夜间与恶劣天气数据较少", + "评测协议偏向预测精度,不直接支持闭环安全度量" + ], + "common_misuses": [ + "把预测 minADE/minFDE 当作规划能力的代理", + "在没有遮挡建模的前提下用真值轨迹做监督训练" + ], + "covers_scenarios": ["scenario:dense_pedestrian_crosswalk_at_night", "scenario:highway_merge_at_speed_differential"], + "license": "Waymo Dataset License,学术免费,商业用途须申请。" + }, + { + "id": "dataset:carla_town05_long", + "label": "CARLA Town05 Long 闭环基准", + "scale": "基于 CARLA 0.9.10 的 Town05 地图,10 条长路线覆盖城市、郊区与高速混合,支持注入天气、行人密度与对手车辆。", + "supports": [ + "在统一仿真器中比较视觉端到端方法的真闭环表现", + "对天气、传感器故障与他车冲突做受控扰动", + "在不可观测意图下做反事实分支评估" + ], + "limits": [ + "图像真实感与真实路面有视觉域差距", + "他车策略基于规则,对长尾行为缺乏覆盖", + "评测结果对种子敏感,需要多种子统计置信" + ], + "common_misuses": [ + "只跑一两个种子就声明性能优越", + "在固定的官方天气下训练评测,忽略广义化失败", + "把 CARLA 闭环分数直接外推到真实城区" + ], + "covers_scenarios": ["scenario:unprotected_left_turn_with_occlusion", "scenario:construction_zone_with_cone_lane_shift", "scenario:heavy_rain_with_camera_lens_droplet"], + "license": "MIT,可自由用于学术与商业。" + }, + { + "id": "dataset:bench2drive", + "label": "Bench2Drive 闭环再现基准", + "scale": "基于 CARLA 的 220 条多样化路线,覆盖 44 类交互场景,提供官方的训练分割、评测脚本与基线模型。", + "supports": [ + "在统一种子集合下比较视觉端到端模型的可复现闭环性能", + "做受控的天气、交通与传感器扰动消融", + "评估同一方法在分布内和分布外路线上的差距" + ], + "limits": [ + "依赖 CARLA 真实感上限", + "评测语义偏 driving score,缺少对乘员舒适度的细分", + "种子数量有限时统计可信度不足" + ], + "common_misuses": [ + "只汇报 driving score 主指标,忽略分段失败模式", + "在评估集上做超参选择", + "用未公开的视觉骨干声明可复现" + ], + "covers_scenarios": ["scenario:construction_zone_with_cone_lane_shift", "scenario:long_tail_rare_object_on_road", "scenario:unprotected_left_turn_with_occlusion"], + "license": "MIT,可自由使用。" + }, + { + "id": "dataset:navsim_planning", + "label": "NAVSIM 
非反应式闭环规划基准", + "scale": "基于 nuPlan 数据筛选出的高交互片段,提供非反应式的代理模拟与统一的 driving score 评测协议。", + "supports": [ + "在受控的反事实分支下评估规划稳健性", + "比较开环到闭环过渡阶段的指标一致性", + "为基于真实数据的规划评测提供轻量级流水线" + ], + "limits": [ + "代理为非反应式回放,限制了真实多车博弈", + "覆盖城市与时段仍受 nuPlan 自身约束", + "缺乏感知端到端集成评测" + ], + "common_misuses": [ + "把非反应式分数当作真实闭环能力", + "未做反事实分支扰动就声明稳健性" + ], + "covers_scenarios": ["scenario:highway_merge_at_speed_differential", "scenario:unprotected_left_turn_with_occlusion"], + "license": "Motional 自定义许可,学术研究允许。" + } + ] +} diff --git a/docs/data/research/experiment_plans.json b/docs/data/research/experiment_plans.json new file mode 100644 index 0000000..196fc9b --- /dev/null +++ b/docs/data/research/experiment_plans.json @@ -0,0 +1,133 @@ +{ + "$schema": "./schema.json#/node_kinds/experiment_plan", + "experiment_plans": [ + { + "id": "experiment_plan:planning_oriented_query_sharing", + "title": "以规划为最终损失的查询共享端到端结构的三层实验", + "subject": "chain:planning_oriented_query_sharing", + "tier_1_minimal_mechanism": { + "purpose": "在合成 2D 场景上快速验证共享查询是否真的把上游任务梯度导向下游规划", + "environment": "lab03 风格的离散网格或玩具高速公路", + "model": "轻量级 transformer,五种查询头,规划头只有一层", + "metrics": ["合成场景下的轨迹回归误差", "上游任务保留率"], + "success_criteria": "共享查询的方案在保持上游任务精度的同时显著降低规划误差,与同结构独立查询基线有可视差距", + "runtime_hours": 4, + "expected_signal": "梯度统计显示规划损失对上游查询参数的影响显著" + }, + "tier_2_public_benchmark": { + "purpose": "在 nuPlan 与 Bench2Drive 上以新协议同时报告开环与闭环表现", + "datasets": ["dataset:nuplan_planning", "dataset:bench2drive"], + "baselines": ["paper:2210.14222", "paper:transfuser", "paper:vadv2", "paper:2212.10156"], + "metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion"], + "success_criteria": "在控制 ego 状态泄漏后开环位移差距与闭环碰撞率改进同时成立,至少在两类协议上重现", + "compute_budget": "约 480 GPU 小时,以八张 A100 算力为参考", + "expected_signal": "查询共享方法在闭环安全维度仍保留优势" + }, + "tier_3_stress_test": { + "purpose": "评估端到端结构在分布漂移与传感器退化下的稳健性", + "distributions": ["跨城市迁移", "夜间与雨天扰动", "施工临时几何"], + "perturbations": ["镜头水珠噪声", "雷达短暂失效", 
"他车反应模型替换"], + "latency_budget": "单帧规划延迟不超过 100 毫秒", + "sensor_dropout": "随机遮蔽 1 至 2 路相机", + "counterfactual_branches": "在每个核心片段额外评测 4 条反事实分支", + "success_criteria": "在每类扰动下闭环碰撞率不超过未扰动基线的 1.5 倍且路线完成度下降可解释" + } + }, + { + "id": "experiment_plan:dual_system_for_long_tail", + "title": "快慢双系统语言驱动规划在长尾上的三层实验", + "subject": "chain:dual_system_for_long_tail", + "tier_1_minimal_mechanism": { + "purpose": "用 Mock 后端验证门控机制能在延迟预算内调用慢系统并改善单步决策", + "environment": "lab07 与 lab08 风格的脚本化决策回合", + "model": "Mock 语言模型与轻量级快规划器组合", + "metrics": ["门控触发率", "决策正确率", "模拟延迟"], + "success_criteria": "门控在指定长尾子集上触发率高于基线,决策正确率显著提升,模拟延迟在预算内", + "runtime_hours": 2, + "expected_signal": "门控对应的特征与触发率有清晰相关" + }, + "tier_2_public_benchmark": { + "purpose": "在闭环驾驶基准上同时报告延迟分布、调用率与成功率", + "datasets": ["dataset:bench2drive", "dataset:carla_town05_long"], + "baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser"], + "metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate", "metric:ride_comfort_index"], + "success_criteria": "长尾成功率与碰撞率均显著改进且单帧最坏延迟在预算内,舒适度无明显恶化", + "compute_budget": "约 240 GPU 小时加额外语言模型推理预算", + "expected_signal": "Pareto 前沿移动到更优区域" + }, + "tier_3_stress_test": { + "purpose": "评估慢系统在压力下的退化与失败模式", + "distributions": ["持续高密度长尾事件", "对抗性语言提示注入"], + "perturbations": ["慢系统延迟抖动", "工具集变更", "记忆库注入错误经验"], + "latency_budget": "单帧最坏延迟不超过 200 毫秒", + "sensor_dropout": "在慢系统判断关键阶段随机遮蔽传感器一路", + "counterfactual_branches": "为每条长尾片段构造两条反事实分支", + "success_criteria": "在每类压力下退化曲线可被解释且不引入新型隐蔽失败" + } + }, + { + "id": "experiment_plan:counterfactual_branches_as_safety_signal", + "title": "反事实分支作为视觉语言动作模型的安全信号的三层实验", + "subject": "chain:counterfactual_branches_as_safety_signal", + "tier_1_minimal_mechanism": { + "purpose": "在合成反事实分支上验证反事实损失对策略行为的塑形效应", + "environment": "lab10 风格的合成场景与分支生成器", + "model": "轻量级视觉语言动作模型与一阶分支生成器", + "metrics": ["主轨迹回归误差", "反事实分支成功率"], + "success_criteria": "反事实分支成功率显著提升,主轨迹误差变化在容忍范围内", + "runtime_hours": 8, + "expected_signal": "策略在分支上做出与主轨迹一致的安全决策" + }, 
+ "tier_2_public_benchmark": { + "purpose": "在公开反事实分支基准上比较有无反事实损失的策略", + "datasets": ["dataset:bench2drive", "dataset:navsim_planning"], + "baselines": ["paper:vadv2", "paper:2402.12289", "paper:2212.10156"], + "metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "success_criteria": "反事实分支成功率显著高于基线且主轨迹性能不显著退化", + "compute_budget": "约 360 GPU 小时", + "expected_signal": "联合分布上反事实损失带来的收益是稳定的" + }, + "tier_3_stress_test": { + "purpose": "对反事实分支真实度与多样性做敏感性扫描", + "distributions": ["不同语义类别的反事实", "几何上更激进的扰动"], + "perturbations": ["分支生成器输出加噪", "对手策略接管"], + "latency_budget": "训练流水线不超过两倍主流端到端方法", + "sensor_dropout": "在分支评测时随机遮蔽传感器", + "counterfactual_branches": "每片段至少六条反事实分支并报告分支多样性指标", + "success_criteria": "在真实度下降到给定阈值前收益保持,超过阈值后退化曲线可解释" + } + }, + { + "id": "experiment_plan:closed_loop_eval_protocol_audit", + "title": "可审计闭环评测协议的三层实验", + "subject": "chain:closed_loop_eval_protocol_audit", + "tier_1_minimal_mechanism": { + "purpose": "对 ego 状态泄漏自动检测与种子统计要求做单元测试", + "environment": "tools/validate_research.py 的扩展检查", + "model": "无须模型,纯协议检查", + "metrics": ["检查通过率", "检查覆盖率"], + "success_criteria": "检查在已知泄漏样本上 100% 命中且对正常基线无误报", + "runtime_hours": 1, + "expected_signal": "协议检查可被复用为提交门禁" + }, + "tier_2_public_benchmark": { + "purpose": "在主流方法上同时跑新旧协议并比较相对秩", + "datasets": ["dataset:nuplan_planning", "dataset:navsim_planning", "dataset:bench2drive"], + "baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser", "paper:2210.14222"], + "metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion", "metric:rule_compliance_score"], + "success_criteria": "新协议下至少有一组方法的相对秩发生显著改变,原因可解释", + "compute_budget": "约 600 GPU 小时", + "expected_signal": "协议改造对社区比较具有结构性影响" + }, + "tier_3_stress_test": { + "purpose": "检验协议对不同感知集成与他车反应模型的稳健性", + "distributions": ["不同感知集成等级", "不同他车策略"], + "perturbations": ["种子数量缩减", "时间预算紧张"], + "latency_budget": "评测脚本可在 24 小时内完成单方法全套", + "sensor_dropout": "可选传感器组合下重复评测", + 
"counterfactual_branches": "在协议中允许嵌入反事实评测扩展", + "success_criteria": "协议在各种扰动下仍能产出一致的相对秩" + } + } + ] +} diff --git a/docs/data/research/failure_modes.json b/docs/data/research/failure_modes.json new file mode 100644 index 0000000..22508bf --- /dev/null +++ b/docs/data/research/failure_modes.json @@ -0,0 +1,197 @@ +{ + "$schema": "./schema.json#/node_kinds/failure_mode", + "failure_modes": [ + { + "id": "failure_mode:ego_status_leakage", + "label": "自车状态在测试时被显式输入造成评测虚高", + "trigger_conditions": [ + "训练与测试都把自车速度、加速度、横摆率作为输入", + "评测协议不显式禁止 ego 状态进入模型" + ], + "manifestation": "开环位移误差在 nuScenes 等基准上看似显著下降,但闭环回放或盲测后性能塌陷至接近基线水平。", + "reproducible_setup": "在 UniAD 或 VAD 代码库上分别打开和关闭 ego 状态输入并比较 L2 与碰撞率,使用 nuScenes mini 即可观察明显差距。", + "diagnostic_metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate"], + "method_weakness": "评测设计与模型输入接口的耦合使得感知与规划的真实贡献被泄漏的状态遮蔽。", + "partial_solutions": [ + {"idea": "在测试时显式遮蔽 ego 状态并报告两组数字", "citation_or_repo": "BEV-Planner 的复现报告", "residual_gap": "仍未形成全社区强制的评测协议"}, + {"idea": "改用闭环反应式评测", "citation_or_repo": "nuPlan / NAVSIM 协议", "residual_gap": "他车反应模型本身仍是规则化代理"} + ], + "open_questions": [ + "怎样在不破坏端到端可训练性的同时强制评测公正", + "如何在公开 leaderboard 上自动检测 ego 状态泄漏" + ], + "publication_angles": [ + "提出可审计的端到端评测协议", + "在多个开源模型上系统量化 ego 状态泄漏的贡献" + ] + }, + { + "id": "failure_mode:closed_loop_deadlock_under_uncertainty", + "label": "高度不确定情境下规划器陷入安全死锁", + "trigger_conditions": [ + "对向车流密集且自车需要主动决策", + "感知置信度较低且规划损失对碰撞惩罚远高于停车惩罚" + ], + "manifestation": "自车长时间停留在保护性停止状态,路线完成度低,但碰撞率与规则合规分数表现良好。", + "reproducible_setup": "在 CARLA Town05 Long 上构造不受保护左转脚本,统计自车通过率与平均等待时间。", + "diagnostic_metrics": ["metric:route_completion", "metric:closed_loop_collision_rate"], + "method_weakness": "对不确定性的处理倾向于过度保守,缺乏对延迟成本的显式建模。", + "partial_solutions": [ + {"idea": "把延迟成本写入规划损失", "citation_or_repo": "nuPlan 基线 PDM", "residual_gap": "调权敏感且无法跨场景迁移"}, + {"idea": "引入显式意图推断模块", "citation_or_repo": "Agent-Driver 推理链路", "residual_gap": "推理延迟与稳定性仍存疑"} + ], + 
"open_questions": [ + "如何构造对不可观测意图稳健的最优停止理论", + "怎样让多车交互中的让步行为成为可学习目标" + ], + "publication_angles": [ + "提出包含等待成本的闭环规划损失", + "构建专注于过度保守失败的诊断基准" + ] + }, + { + "id": "failure_mode:long_tail_object_recognition_miss", + "label": "长尾稀有物体识别遗漏导致直接碰撞", + "trigger_conditions": [ + "路面出现训练分布之外的可碰撞物体", + "目标尺寸或姿态在训练数据中频率极低" + ], + "manifestation": "检测器不输出该目标或输出极低置信度,规划器据此忽略目标并保持原速度。", + "reproducible_setup": "在 CARLA 中放置非常规物体如锥桶堆、纸箱、低矮障碍,比较多种端到端模型的碰撞率。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "目标先验由训练分布隐式决定,没有显式的开放类别处理。", + "partial_solutions": [ + {"idea": "用 VLM 或视觉基础模型做分布外目标提示", "citation_or_repo": "DriveVLM-Dual", "residual_gap": "推理延迟与可靠性仍受限"}, + {"idea": "占用预测取代显式检测", "citation_or_repo": "UniAD OccFormer 与 OccNet", "residual_gap": "对动态稀疏目标仍易漏报"} + ], + "open_questions": [ + "在没有目标类别标签时如何驱动安全规划", + "如何把分布外目标识别量化为可优化目标" + ], + "publication_angles": [ + "构建强制评估开放类别行为的长尾基准", + "提出基于占用的统一安全损失" + ] + }, + { + "id": "failure_mode:ride_comfort_violation_due_to_late_braking", + "label": "决策延迟造成晚刹引发的乘员舒适违规", + "trigger_conditions": [ + "对前车意图或行人意图判断滞后", + "规划频率低于事件演变速率" + ], + "manifestation": "在最后一秒以接近最大减速进行制动,乘员前倾,纵向加加速度峰值远超舒适阈值。", + "reproducible_setup": "在 NAVSIM 上启用乘员舒适评测,对若干主流端到端模型统计加加速度分布。", + "diagnostic_metrics": ["metric:ride_comfort_index", "metric:closed_loop_collision_rate"], + "method_weakness": "规划损失只对碰撞与位置进行约束,缺乏对决策时机的显式塑形。", + "partial_solutions": [ + {"idea": "把加加速度纳入规划损失", "citation_or_repo": "PDM 与 GameFormer 基线", "residual_gap": "在视觉端到端模型中难以平衡"}, + {"idea": "引入双系统快慢架构", "citation_or_repo": "DriveVLM-Dual 双管线", "residual_gap": "在低延迟模式下慢系统未必被触发"} + ], + "open_questions": [ + "如何在端到端训练中平衡安全与舒适的多目标优化", + "乘员体感是否需要主观问卷做最终校准" + ], + "publication_angles": [ + "提出兼顾决策时机与运动学的规划损失", + "构建受控的舒适度回归基准" + ] + }, + { + "id": "failure_mode:occlusion_blind_spot_overconfidence", + "label": "遮挡盲区中对对向车意图过度自信", + "trigger_conditions": [ + "前车或建筑物遮挡对向车流", + "感知模型在缺失观测情况下仍输出高置信度的占用预测" + ], + 
"manifestation": "自车在没有充分让行的情况下进入冲突区域,与遮挡区出现的对向车发生侧面碰撞。", + "reproducible_setup": "在 nuPlan 与 CARLA 上选择未受保护左转片段并人为加大遮挡,比较多种规划器的碰撞分布。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:long_tail_success_rate"], + "method_weakness": "占用预测对未观测区域的不确定性建模过于乐观,缺乏置信度校准。", + "partial_solutions": [ + {"idea": "对不确定区域施加显式悲观先验", "citation_or_repo": "BEV-Planner 与 PDM 风险层", "residual_gap": "校准过强会触发死锁失败"}, + {"idea": "引入显式意图推断与反事实分支", "citation_or_repo": "CF-VLA", "residual_gap": "对推理稳定性有更高要求"} + ], + "open_questions": [ + "如何把感知不确定性与规划风险耦合成可学习信号", + "怎样在评测中显式衡量遮挡盲区的处理质量" + ], + "publication_angles": [ + "提出与遮挡几何耦合的规划风险层", + "构建针对盲区行为的诊断基准" + ] + }, + { + "id": "failure_mode:map_prior_overrides_runtime_observation", + "label": "地图先验压倒运行时观测导致违规行为", + "trigger_conditions": [ + "高清地图与实际车道存在临时偏移", + "模型对地图特征的权重显著高于实时视觉特征" + ], + "manifestation": "在施工改道或临时锥桶布置区域,自车按地图原始几何行驶而忽视实地标志。", + "reproducible_setup": "在 Bench2Drive 施工场景或 CARLA 临时改道脚本上比较有无显式地图融合的端到端模型。", + "diagnostic_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate"], + "method_weakness": "地图被视为强先验而非建议,模型未学到何时应拒绝地图。", + "partial_solutions": [ + {"idea": "引入实时车道感知并允许覆盖地图", "citation_or_repo": "MapTR 与 LaneSegNet 系列", "residual_gap": "在地图与实地都不可信时仍困难"}, + {"idea": "用 VLM 做语义优先级判断", "citation_or_repo": "DriveVLM 与 LINGO-2", "residual_gap": "对响应延迟与可解释性要求高"} + ], + "open_questions": [ + "如何在端到端中学习地图与观测的动态信任度", + "怎样把临时几何作为评测维度纳入主流基准" + ], + "publication_angles": [ + "提出地图信任度自适应的端到端框架", + "构建专注临时几何变化的评测套件" + ] + }, + { + "id": "failure_mode:multi_agent_interaction_indecision", + "label": "多车交互中的犹豫造成路线完成度塌陷", + "trigger_conditions": [ + "高速并线或环岛汇入场景", + "对方让与不让具有高度模糊性" + ], + "manifestation": "自车反复减速加速试图寻找间隙,最终未能完成并线或被迫绕行。", + "reproducible_setup": "在 nuPlan 高速并线片段上启用反应式 agent 评测,比较规划器的并线成功率。", + "diagnostic_metrics": ["metric:route_completion", "metric:ride_comfort_index"], + "method_weakness": "规划器缺乏稳定的博弈策略与意图沟通通道。", + "partial_solutions": [ + {"idea": "显式博弈树搜索并融合到规划损失", 
"citation_or_repo": "GameFormer 与 PDM 系列", "residual_gap": "搜索深度与算力受限"}, + {"idea": "用语言或符号意图提示替代博弈", "citation_or_repo": "DiLu 与 Agent-Driver", "residual_gap": "意图沟通对其它车辆并不可见"} + ], + "open_questions": [ + "怎样把人类驾驶员的让步信号建模为可学习交互", + "在没有显式沟通通道时如何学习稳健博弈" + ], + "publication_angles": [ + "提出带显式博弈先验的端到端规划", + "构建并线犹豫的诊断基准" + ] + }, + { + "id": "failure_mode:sensor_degradation_silent_failure", + "label": "传感器退化时模型未触发降级处理", + "trigger_conditions": [ + "镜头存在水珠或灰尘", + "雷达或激光雷达在特定材质上回波不稳定" + ], + "manifestation": "感知输出看似正常但部分目标缺失或漂移,规划器据此采取正常行驶策略,最终造成碰撞或异常制动。", + "reproducible_setup": "在 CARLA 中注入相机噪声脚本或在 nuScenes 关键帧上模拟传感器局部失效。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"], + "method_weakness": "缺乏对传感器健康度的显式监测以及对应的策略降级机制。", + "partial_solutions": [ + {"idea": "多传感器一致性检查与降级模式", "citation_or_repo": "TransFuser 与多模态融合方法", "residual_gap": "在单一传感器故障时仍难触发降级"}, + {"idea": "在训练中显式注入传感器扰动", "citation_or_repo": "DriveDreamer 增强流水线", "residual_gap": "扰动分布难以覆盖真实失效模式"} + ], + "open_questions": [ + "如何把传感器健康度作为可学习信号写入端到端模型", + "如何评估降级策略的可信度而不仅看主指标" + ], + "publication_angles": [ + "提出传感器健康度驱动的策略切换机制", + "构建针对静默传感器失效的诊断基准" + ] + } + ] +} diff --git a/docs/data/research/metrics.json b/docs/data/research/metrics.json new file mode 100644 index 0000000..e2a33af --- /dev/null +++ b/docs/data/research/metrics.json @@ -0,0 +1,175 @@ +{ + "$schema": "./schema.json#/node_kinds/metric", + "metrics": [ + { + "id": "metric:open_loop_l2_displacement", + "label": "开环平均位移误差", + "formula": "L2_t = (1/T) * Σ_{t=1..T} || p̂_t − p*_t ||_2", + "variables": { + "p̂_t": "模型在第 t 步预测的自车位置", + "p*_t": "专家驾驶在第 t 步的真值位置", + "T": "评测时域内的离散步数" + }, + "assumptions": [ + "评测分布与训练分布同源", + "ego 状态未在测试时被显式输入", + "未来若干秒内专家轨迹是唯一安全轨迹" + ], + "what_it_proves": [ + "在专家分布上模型对短时回归的拟合质量", + "感知与轨迹回归层的一致性" + ], + "what_it_cannot_prove": [ + "策略在自身诱导分布上的安全性", + "对长尾事件的鲁棒性", + "多模态意图分布的覆盖率" + ], + "known_misuses": [ + "把更低的开环位移误差等同于更安全的部署效果", + "对模型选择只看开环指标而忽视闭环回归", + "通过 ego 
状态泄漏制造虚高的位移分数" + ], + "scope": "适用于以专家轨迹为唯一参考、且时域较短的近距离监督评测。" + }, + { + "id": "metric:closed_loop_collision_rate", + "label": "闭环碰撞率", + "formula": "CR = (有碰撞的回合数) / (总回合数)", + "variables": { + "回合": "在仿真器或回放系统中执行一次完整路线", + "碰撞": "自车与其它道路使用者的几何包围盒相交" + }, + "assumptions": [ + "仿真器中他车策略足够真实", + "感知输入与训练时一致", + "随机种子覆盖足以达到统计置信" + ], + "what_it_proves": [ + "策略在闭环分布上避免碰撞的能力", + "感知预测规划链路在端到端循环中的最严重失败率" + ], + "what_it_cannot_prove": [ + "舒适性与社会礼貌", + "对真实路面感知噪声的稳健性", + "罕见但严重事故的尾部风险" + ], + "known_misuses": [ + "只汇报均值而不汇报分布或最大值", + "在单一种子上得出方法优劣的结论", + "对碰撞定义放宽到几何重叠之外的代理" + ], + "scope": "适用于具备真实他车反应模型与受控扰动注入的闭环评测。" + }, + { + "id": "metric:route_completion", + "label": "路线完成度", + "formula": "RC = 实际通过里程 / 路线总里程", + "variables": { + "通过里程": "自车在合法车道上实际行进的距离", + "总里程": "评测路线的设计总长度" + }, + "assumptions": [ + "路线设计涵盖目标域典型几何", + "中断条件来自规则违反或碰撞而非仿真器异常" + ], + "what_it_proves": [ + "策略完成长视野任务的能力", + "出现失败时是否能挽救并继续行驶" + ], + "what_it_cannot_prove": [ + "完成过程的安全性与舒适性", + "对超出训练分布的施工与异常事件的处理" + ], + "known_misuses": [ + "高路线完成度搭配低安全分数仍被汇报为强方法", + "通过缩短路线长度人为提升完成度", + "把完成度作为唯一终极指标,忽略子任务通过率" + ], + "scope": "适用于具有明确起终点与子段判定的闭环驾驶基准。" + }, + { + "id": "metric:ride_comfort_index", + "label": "乘员舒适综合指标", + "formula": "Comfort = w_a * acc_jerk + w_l * lateral_jerk + w_v * speed_var", + "variables": { + "acc_jerk": "纵向加加速度均方根", + "lateral_jerk": "横向加加速度均方根", + "speed_var": "速度方差", + "w_*": "经过工程调参的相对权重" + }, + "assumptions": [ + "权重选择反映乘员体感而非工程偏好", + "评测路线包含足够多的转弯与起停" + ], + "what_it_proves": [ + "策略输出的运动学平滑度", + "决策犹豫与突然反应造成的乘员代价" + ], + "what_it_cannot_prove": [ + "极端事件下乘员伤害风险", + "心理紧张感与可信度等主观维度" + ], + "known_misuses": [ + "把舒适度作为安全的反向代理", + "在没有乘员问卷的情况下校准权重", + "在没有触发事件的常规巡航上声明舒适提升" + ], + "scope": "适用于结合纵向与横向运动学评测的闭环或回放协议。" + }, + { + "id": "metric:rule_compliance_score", + "label": "交通规则合规分数", + "formula": "RuleScore = Π_i (1 − violation_rate_i)", + "variables": { + "violation_rate_i": "第 i 类规则违反在评测里程上的频率", + "i": "覆盖速度、车道、信号、礼让等多类规则" + }, + "assumptions": [ + 
"规则枚举覆盖目标司法管辖区的核心条款", + "规则违反检测器自身没有显著漏报" + ], + "what_it_proves": [ + "策略在规范驾驶维度上的表现", + "训练目标是否被规则化奖励所引导" + ], + "what_it_cannot_prove": [ + "在规则不完备时的合理行为", + "对模糊规则的人类判断一致性" + ], + "known_misuses": [ + "把所有规则违反等同处理而忽略严重度", + "在不公开的内部规则集上自评", + "通过软化检测阈值制造合规假象" + ], + "scope": "适用于支持显式规则枚举的闭环或回放评测。" + }, + { + "id": "metric:long_tail_success_rate", + "label": "长尾事件成功率", + "formula": "LTSR = 成功通过的长尾片段数 / 总长尾片段数", + "variables": { + "长尾片段": "经过场景挖掘标记的低频高风险序列", + "成功": "片段内无碰撞、无关键规则违反、按合理速度通过" + }, + "assumptions": [ + "长尾片段挖掘协议公开可比", + "片段标注准确", + "失败定义涵盖所有关键模态" + ], + "what_it_proves": [ + "策略在分布外或低频事件上的稳健性", + "在关注成本敏感场景时是否带来真实改进" + ], + "what_it_cannot_prove": [ + "全分布平均性能", + "对未见过的全新长尾类别的迁移能力" + ], + "known_misuses": [ + "把基础分布上的提升伪装成长尾改进", + "片段筛选过程不公开以致结果不可复现", + "成功定义过于宽松以致碰撞与违规都被通过" + ], + "scope": "适用于配有长尾标注与场景挖掘脚本的闭环评测。" + } + ] +} diff --git a/docs/data/research/node_overlay.json b/docs/data/research/node_overlay.json new file mode 100644 index 0000000..f85e20c --- /dev/null +++ b/docs/data/research/node_overlay.json @@ -0,0 +1,86 @@ +{ + "generated_by": "tools/build_research_overlay.py", + "version": 1, + "subjects": { + "paper:2210.14222": { + "evidence_strength": 2, + "dispute_level": 1, + "reproducibility_status_score": 3, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 3 + }, + "paper:2212.10156": { + "evidence_strength": 3, + "dispute_level": 2, + "reproducibility_status_score": 3, + "failure_boundary_count": 2, + "claim_count": 2, + "maturity": 3 + }, + "paper:2309.16292": { + "evidence_strength": 2, + "dispute_level": 2, + "reproducibility_status_score": 2, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 2 + }, + "paper:2311.10813": { + "evidence_strength": 2, + "dispute_level": 2, + "reproducibility_status_score": 2, + "failure_boundary_count": 2, + "claim_count": 1, + "maturity": 2 + }, + "paper:2402.12289": { + "evidence_strength": 2, + "dispute_level": 1, + "reproducibility_status_score": 2, + 
"failure_boundary_count": 2, + "claim_count": 1, + "maturity": 2 + }, + "paper:2512.24426": { + "evidence_strength": 1, + "dispute_level": 2, + "reproducibility_status_score": 1, + "failure_boundary_count": 2, + "claim_count": 1, + "maturity": 1 + }, + "paper:vadv2": { + "evidence_strength": 2, + "dispute_level": 1, + "reproducibility_status_score": 2, + "failure_boundary_count": 1, + "claim_count": 1, + "maturity": 2 + } + }, + "scenarios": [ + "scenario:unprotected_left_turn_with_occlusion", + "scenario:dense_pedestrian_crosswalk_at_night", + "scenario:highway_merge_at_speed_differential", + "scenario:construction_zone_with_cone_lane_shift", + "scenario:heavy_rain_with_camera_lens_droplet", + "scenario:long_tail_rare_object_on_road" + ], + "failure_modes": [ + "failure_mode:ego_status_leakage", + "failure_mode:closed_loop_deadlock_under_uncertainty", + "failure_mode:long_tail_object_recognition_miss", + "failure_mode:ride_comfort_violation_due_to_late_braking", + "failure_mode:occlusion_blind_spot_overconfidence", + "failure_mode:map_prior_overrides_runtime_observation", + "failure_mode:multi_agent_interaction_indecision", + "failure_mode:sensor_degradation_silent_failure" + ], + "argument_chains": [ + "chain:planning_oriented_query_sharing", + "chain:dual_system_for_long_tail", + "chain:counterfactual_branches_as_safety_signal", + "chain:closed_loop_eval_protocol_audit" + ] +} diff --git a/docs/data/research/scenarios.json b/docs/data/research/scenarios.json new file mode 100644 index 0000000..6cc0e6d --- /dev/null +++ b/docs/data/research/scenarios.json @@ -0,0 +1,65 @@ +{ + "$schema": "./schema.json#/node_kinds/scenario", + "scenarios": [ + { + "id": "scenario:unprotected_left_turn_with_occlusion", + "label": "未受保护左转且对向车被前车遮挡", + "description": "自车需要在没有保护相位的情况下完成左转,对向直行车被等候左转的前车整体或部分遮挡,导致自车直到接近冲突区前几秒钟才能观察到对向高速来车的真实速度。", + "why_hard": "感知缺失阶段需要由意图推断与占用预测填补,闭环行为对延迟和谨慎度高度敏感,同时存在'过分谨慎导致永远不出发'与'冒进导致碰撞'的双侧失败。", + "current_best_methods": ["paper:2212.10156", 
"paper:2402.12289", "paper:vadv2"], + "open_failure_modes": ["failure_mode:occlusion_blind_spot_overconfidence", "failure_mode:closed_loop_deadlock_under_uncertainty"], + "available_datasets": ["dataset:nuplan_planning", "dataset:carla_town05_long", "dataset:bench2drive"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:route_completion", "metric:long_tail_success_rate"] + }, + { + "id": "scenario:dense_pedestrian_crosswalk_at_night", + "label": "夜间或弱光下的密集人行横道", + "description": "在低光照与混合光源条件下,多名行人以非均匀步态横穿,部分行人会回头、改变速度或在车前停顿,自车需要在有限可见度下持续更新意图估计并平滑减速。", + "why_hard": "相机信噪比下降,人体姿态线索退化,纯视觉模型容易丢失个体身份;意图建模与温柔减速的耦合直接影响乘员舒适与碰撞风险。", + "current_best_methods": ["paper:2212.10156", "paper:transfuser"], + "open_failure_modes": ["failure_mode:night_low_light_perception_collapse", "failure_mode:ride_comfort_violation_due_to_late_braking"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index", "metric:rule_compliance_score"] + }, + { + "id": "scenario:highway_merge_at_speed_differential", + "label": "高速公路上速度差显著的并线汇入", + "description": "自车从匝道汇入主路,主路车流稳定在 110 公里每小时左右而自车初速 70 公里每小时,需要在有限的并线窗口内同时完成加速、间隙寻找与多车交互。", + "why_hard": "动作必须在长视野中具备前瞻性,主路车辆的让与不让本身就是博弈,规划器需要在不可观测的对方意图下做稳健决策。", + "current_best_methods": ["paper:2210.14222", "paper:vadv2"], + "open_failure_modes": ["failure_mode:multi_agent_interaction_indecision", "failure_mode:planning_horizon_too_short_for_merge"], + "available_datasets": ["dataset:waymo_open_motion", "dataset:nuplan_planning"], + "evaluation_metrics": ["metric:route_completion", "metric:closed_loop_collision_rate", "metric:rule_compliance_score"] + }, + { + "id": "scenario:construction_zone_with_cone_lane_shift", + "label": "施工区临时锥桶车道偏移", + "description": "正常车道被临时锥桶封闭并向左偏移半个车道宽度,没有清晰的车道线,旁有施工人员与临时标志,正确行为需要服从临时几何而非高清地图与历史车道线。", + "why_hard": "高清地图与训练数据中很少出现此类临时几何,依赖地图先验的模型容易直接撞锥桶;视觉到行为的映射缺少足够的训练样本。", + 
"current_best_methods": ["paper:2212.10156", "paper:2402.12289"], + "open_failure_modes": ["failure_mode:map_prior_overrides_runtime_observation", "failure_mode:long_tail_object_recognition_miss"], + "available_datasets": ["dataset:carla_town05_long", "dataset:bench2drive"], + "evaluation_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate", "metric:long_tail_success_rate"] + }, + { + "id": "scenario:heavy_rain_with_camera_lens_droplet", + "label": "暴雨且相机镜头存在水珠", + "description": "雨势遮蔽路面标线与远处目标,镜头水珠造成局部图像退化或离散光斑,传感器在短时间窗内不可靠,需要利用时序冗余与多传感器互补做稳健决策。", + "why_hard": "纯视觉端到端模型对镜头退化敏感,水珠形成的伪边缘可能被检测器误识别为目标;需要在不可靠观测下保持合理速度而非急停。", + "current_best_methods": ["paper:transfuser", "paper:2212.10156"], + "open_failure_modes": ["failure_mode:sensor_degradation_silent_failure", "failure_mode:emergency_braking_on_phantom_obstacle"], + "available_datasets": ["dataset:carla_town05_long", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "scenario:long_tail_rare_object_on_road", + "label": "路面长尾稀有物体", + "description": "出现训练分布之外的可碰撞物体,例如掉落的家具、可乐瓶、施工材料、低矮动物等。检测分布外目标并选择正确避让策略是闭环安全的硬指标。", + "why_hard": "训练分布外目标在监督数据中极度稀少,类别失衡使得检测器倾向忽略;规划层即便看见也可能在不知道目标类别属性时做错决策。", + "current_best_methods": ["paper:2402.12289", "paper:2311.10813", "paper:2512.24426"], + "open_failure_modes": ["failure_mode:long_tail_object_recognition_miss", "failure_mode:over_reliance_on_class_prior"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:bench2drive"], + "evaluation_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"] + } + ] +} diff --git a/docs/data/research/schema.json b/docs/data/research/schema.json new file mode 100644 index 0000000..7602a6d --- /dev/null +++ b/docs/data/research/schema.json @@ -0,0 +1,118 @@ +{ + "$comment": "Schema for the falsifiable research layer. Each file under docs/data/research/ obeys one of the shapes below. 
Quality gate: tools/validate_research.py.", + "version": 1, + "node_kinds": { + "claim": { + "purpose": "把节点拆成可审查的研究主张。每条主张必须能被某种证据证伪。", + "required_fields": { + "id": "唯一标识,形如 claim:uniad_query_sharing_helps_planning", + "subject": "主张归属的节点 id,例如 paper:2212.10156", + "statement": "这项工作或洞察真正声称的内容,一句完整通顺的中文。", + "evidence": "支撑材料数组。每项是 {kind: ablation|table|theorem|repro|external_benchmark, source: 文献页码/表号/复现仓库, finding: 一句概括}。", + "preconditions": "结论成立所需要的数据、场景、传感器、模型规模或训练设定数组。", + "counterexamples": "在哪些条件下结论可能不成立,数组。", + "boundaries": "适用范围到哪里为止,数组。", + "reproduction": "{minimal: 最小复现实验, public_data: 公开数据集 id, cost_hours: 预计 GPU 小时, expected_output: 预期可观察到的现象}。", + "publication_value": "可投稿价值:机制解释 / 系统改进 / 基准构建 / 失败模式发现 之一或多项。", + "dispute_level": "0..3 数值,0=社区共识,3=高度争议。", + "evidence_strength": "0..3 数值,0=主张,3=有强公开基准与可复现脚本支撑。", + "reproducibility_status": "verified | partial | inferred | speculative。", + "related_claims": "关联主张数组,可为空。", + "related_failure_modes": "相关失败模式 id 数组,可为空。" + } + }, + "argument_chain": { + "purpose": "把每个成熟选题写成完整论文骨架。", + "required_fields": { + "id": "唯一标识,形如 chain:uniad_query_sharing_for_planning", + "title": "选题中文标题,一句完整表述。", + "subject_papers": "主要论证标的节点 id 数组。", + "research_gap": "现有工作为什么不够,一段叙述。", + "core_claim": "本文要证明什么,一段叙述。", + "method_mechanism": "新方法为什么应该有效,一段叙述。", + "key_experiments": "哪些实验能证明机制,数组。", + "strong_baselines": "必须击败或解释的已有方法节点 id 数组。", + "ablations": "去掉哪些组件能验证贡献,数组。", + "negative_results": "哪些失败结果反而能澄清边界,数组。", + "reviewer_attacks": "最可能被质疑的地方,数组。", + "response_experiments": "如何提前堵住质疑,数组。", + "figure_plan": "图或表对应论证链哪一环,数组。", + "related_scenarios": "scenario 节点 id 数组。", + "related_datasets": "dataset 节点 id 数组。", + "related_metrics": "metric 节点 id 数组。" + } + }, + "scenario": { + "purpose": "把研究从方法名比较改为场景与证据比较。", + "required_fields": { + "id": "形如 scenario:occluded_left_turn_intersection", + "label": "场景中文标签,一句完整表述。", + "description": "触发条件、典型几何与时序结构。", + "why_hard": "为什么这个场景对当前方法依然困难。", + "current_best_methods": 
"目前在该场景表现最强的方法节点 id 数组。", + "open_failure_modes": "在该场景上仍然存在的失败模式 id 数组。", + "available_datasets": "支持该场景研究的 dataset id 数组。", + "evaluation_metrics": "应使用的 metric id 数组。" + } + }, + "dataset": { + "purpose": "明确每个数据集能证明什么、不能证明什么。", + "required_fields": { + "id": "形如 dataset:nuscenes_planning", + "label": "数据集名称与版本。", + "scale": "采集小时数、城市、传感器配置。", + "supports": "数据集能够支撑的研究主张类型数组。", + "limits": "数据集的边界与系统性盲点数组。", + "common_misuses": "常见误用模式与其后果数组。", + "covers_scenarios": "scenario id 数组。", + "license": "许可与商用条款。" + } + }, + "metric": { + "purpose": "明确每个指标能证明什么、不能证明什么、常见误用是什么。", + "required_fields": { + "id": "形如 metric:open_loop_l2_displacement", + "label": "指标中文名称。", + "formula": "公式或定义。变量必须给出含义。", + "variables": "{符号: 含义} 字典。", + "assumptions": "公式成立的前提数组。", + "what_it_proves": "这个指标能支撑的结论类型数组。", + "what_it_cannot_prove": "这个指标不能支撑的结论数组。", + "known_misuses": "常见误用与后果数组。", + "scope": "适用范围说明。" + } + }, + "failure_mode": { + "purpose": "把失败从附带说明升级为研究资产。", + "required_fields": { + "id": "形如 failure_mode:ego_status_leakage", + "label": "失败模式中文标题,一句完整表述。", + "trigger_conditions": "触发条件数组。", + "manifestation": "失败的可观察表现。", + "reproducible_setup": "可复现场景:数据集 / 闭环 / 单元测试。", + "diagnostic_metrics": "用来发现该失败的指标 id 或诊断协议数组。", + "method_weakness": "对应方法的根本短板。", + "partial_solutions": "已有半解数组,每项是 {idea, citation_or_repo, residual_gap}。", + "open_questions": "仍未解决的问题数组。", + "publication_angles": "可形成论文的切入点数组。" + } + }, + "experiment_plan": { + "purpose": "每个核心方向至少形成三层实验。", + "required_fields": { + "id": "形如 experiment_plan:planning_oriented_query_sharing", + "title": "实验计划中文标题。", + "subject": "对应的 argument_chain id 或 paper id。", + "tier_1_minimal_mechanism": "{purpose, environment, model, metrics, success_criteria, runtime_hours, expected_signal}。", + "tier_2_public_benchmark": "{purpose, datasets, baselines, metrics, success_criteria, compute_budget, expected_signal}。", + "tier_3_stress_test": "{purpose, distributions, perturbations, latency_budget, sensor_dropout, 
counterfactual_branches, success_criteria}。" + } + } + }, + "shared_axes": { + "evidence_strength": {"0": "尚无公开证据", "1": "单篇论文表格", "2": "多篇论文 + 复现", "3": "强公开基准 + 可复现脚本"}, + "dispute_level": {"0": "社区共识", "1": "主流但有反例", "2": "明显分歧", "3": "高度争议"}, + "reproducibility_status": ["verified", "partial", "inferred", "speculative"], + "maturity": {"0": "推测", "1": "原型", "2": "公开实现", "3": "已在多基准复现"} + } +} diff --git a/docs/index.html b/docs/index.html index dd0a9b4..a5722fb 100644 --- a/docs/index.html +++ b/docs/index.html @@ -53,10 +53,11 @@

- + 论文产出工作台 + 2D
diff --git a/docs/js/atlas-main.js b/docs/js/atlas-main.js index 89d7f11..9a199ac 100644 --- a/docs/js/atlas-main.js +++ b/docs/js/atlas-main.js @@ -22,6 +22,7 @@ import { CardRenderer } from "./atlas-cards.js"; import { AtlasUI } from "./atlas-ui.js"; const GRAPH_PATH = "data/graph_extended.json"; +const RESEARCH_OVERLAY_PATH = "data/research/node_overlay.json"; async function loadGraph() { let r; @@ -36,6 +37,25 @@ async function loadGraph() { } } +// Decorate the graph's nodes with research-substance metadata so that +// visual encoding can reflect evidence strength, dispute level, +// reproducibility maturity and failure-boundary count. Silent on errors — +// the overlay is optional and the atlas must still render without it. +async function attachResearchOverlay(graph) { + try { + const r = await fetch(RESEARCH_OVERLAY_PATH); + if (!r.ok) return; + const overlay = await r.json(); + const subjects = overlay?.subjects || {}; + for (const node of graph.nodes) { + const info = subjects[node.id]; + if (info) node.research = info; + } + } catch (e) { + // overlay is optional; the atlas keeps working without it. 
+ } +} + (async function main() { const status = (msg) => { const el = document.getElementById("loadingStatus"); @@ -43,6 +63,7 @@ async function loadGraph() { }; status("正在加载图谱数据……"); const graph = await loadGraph(); + await attachResearchOverlay(graph); status(`已就绪 ${graph.nodes.length} 节点 · ${graph.edges.length} 条关系,正在搭建星图……`); // ---------- Scene, renderer, camera ---------- diff --git a/docs/js/atlas-render.js b/docs/js/atlas-render.js index 10050f0..e4acafe 100644 --- a/docs/js/atlas-render.js +++ b/docs/js/atlas-render.js @@ -119,7 +119,13 @@ function createInstancedFamily(family, nodes, topicPalette) { for (let i = 0; i < count; i++) { const node = nodes[i]; const tierMul = TIER_SIZE[node.tier] || 1.0; - const size = (visual.baseSize || 1.0) * tierMul; + // Research-substance visual binding: maturity inflates size, dispute + // dims saturation, failure-boundary count is signalled by a red lift on + // the emissive channel. Nodes without research data render unchanged. + const r = node.research || null; + const maturityMul = r ? (1.0 + 0.10 * (r.maturity || 0)) : 1.0; + const evidenceMul = r ? (1.0 + 0.06 * (r.evidence_strength || 0)) : 1.0; + const size = (visual.baseSize || 1.0) * tierMul * maturityMul * evidenceMul; node._visualSize = size; node._visualFamily = family; m.makeScale(size, size, size); @@ -132,6 +138,16 @@ function createInstancedFamily(family, nodes, topicPalette) { colorTmp.r = Math.min(1, colorTmp.r + lift); colorTmp.g = Math.min(1, colorTmp.g + lift); colorTmp.b = Math.min(1, colorTmp.b + lift); + if (r) { + // Dispute desaturates toward grey; failure boundaries push toward warm. 
+ const disputeFade = Math.min(0.5, 0.12 * (r.dispute_level || 0)); + const fbWarm = Math.min(0.25, 0.06 * (r.failure_boundary_count || 0)); + const grey = 0.55; + colorTmp.r = colorTmp.r * (1 - disputeFade) + grey * disputeFade + fbWarm; + colorTmp.g = colorTmp.g * (1 - disputeFade) + grey * disputeFade; + colorTmp.b = colorTmp.b * (1 - disputeFade) + grey * disputeFade; + colorTmp.r = Math.min(1, colorTmp.r); + } mesh.setColorAt(i, colorTmp); } mesh.instanceMatrix.needsUpdate = true; diff --git a/docs/js/atlas-ui.js b/docs/js/atlas-ui.js index abf233c..5b6908c 100644 --- a/docs/js/atlas-ui.js +++ b/docs/js/atlas-ui.js @@ -16,7 +16,7 @@ export class AtlasUI { activeYearMax: 2026, searchQuery: "", layer: "galaxy", - autoSpin: true, + autoSpin: false, showEdges: true, showLabels: true, }; diff --git a/docs/js/workbench.js b/docs/js/workbench.js new file mode 100644 index 0000000..a258986 --- /dev/null +++ b/docs/js/workbench.js @@ -0,0 +1,526 @@ +// Paper Production Workbench — research-first UI. +// Reads the structured research layer (claims, argument chains, scenarios, +// datasets, metrics, failure modes, experiment plans) and renders the views +// described in workbench.html. No 3D, no auto-spin, no decorative animation. 
+ +const RESEARCH = { + claims: "data/research/claims.json", + chains: "data/research/argument_chains.json", + scenarios: "data/research/scenarios.json", + datasets: "data/research/datasets.json", + metrics: "data/research/metrics.json", + failures: "data/research/failure_modes.json", + experiments: "data/research/experiment_plans.json", +}; + +const state = { + view: "claims", + subview: "scenarios", + search: "", + evidenceFilter: new Set(["0", "1", "2", "3"]), + disputeFilter: new Set(["0", "1", "2", "3"]), + reproFilter: new Set(["verified", "partial", "inferred", "speculative"]), + paperFilter: new Set(), + basket: new Set(), + data: {}, +}; + +async function loadAll() { + const entries = Object.entries(RESEARCH); + const results = await Promise.all(entries.map(async ([key, url]) => { + const r = await fetch(url); + return [key, await r.json()]; + })); + for (const [key, val] of results) state.data[key] = val; +} + +function $(sel, root = document) { return root.querySelector(sel); } +function $$(sel, root = document) { return Array.from(root.querySelectorAll(sel)); } + +function escapeHtml(s) { + return String(s ?? "").replace(/[<>&"]/g, c => ({ "<":"<",">":">","&":"&","\"":""" }[c])); +} + +function mdInline(text) { + // Light inline rendering: allow bold/italic/code/links via marked, sanitize via DOMPurify. + if (window.marked && window.DOMPurify) { + return window.DOMPurify.sanitize(window.marked.parseInline(String(text ?? ""))); + } + return escapeHtml(text); +} + +function renderMath(scope = document) { + if (window.renderMathInElement) { + window.renderMathInElement(scope, { + delimiters: [ + { left: "$$", right: "$$", display: true }, + { left: "$", right: "$", display: false }, + ], + throwOnError: false, + }); + } +} + +// ---------- Filters ---------- +function passesFilters(claim) { + if (state.evidenceFilter.size && !state.evidenceFilter.has(String(claim.evidence_strength ?? 
0))) return false; + if (state.disputeFilter.size && !state.disputeFilter.has(String(claim.dispute_level ?? 0))) return false; + if (state.reproFilter.size && !state.reproFilter.has(claim.reproducibility_status)) return false; + if (state.paperFilter.size && claim.subject && !state.paperFilter.has(claim.subject)) return false; + if (state.search) { + const hay = [ + claim.statement, claim.id, claim.subject, claim.publication_value, + ...(claim.preconditions || []), ...(claim.counterexamples || []), ...(claim.boundaries || []), + ...((claim.evidence || []).map(e => `${e.kind} ${e.source} ${e.finding}`)), + ].join(" ").toLowerCase(); + if (!hay.includes(state.search.toLowerCase())) return false; + } + return true; +} + +// ---------- Renderers ---------- +function evidenceTag(level) { + const labels = ["推测", "单一来源", "多来源", "强公开复现"]; + const lv = Math.max(0, Math.min(3, level || 0)); + return `证据 · ${labels[lv]}`; +} +function disputeTag(level) { + const labels = ["共识", "主流", "明显分歧", "高度争议"]; + const lv = Math.max(0, Math.min(3, level || 0)); + return `争议 · ${labels[lv]}`; +} +function reproTag(status) { + const labels = { verified: "已复现", partial: "部分复现", inferred: "推断", speculative: "尚待验证" }; + return `复现 · ${labels[status] || status}`; +} + +function pickBtn(id, kind) { + const picked = state.basket.has(`${kind}:${id}`); + return ``; +} + +function collapsible(label, body, open = false) { + return ` +
+
+ ${escapeHtml(label)} + ${open ? "▾" : "▸"} +
+
${body}
+
`; +} + +function renderClaimCard(c) { + const subjectLabel = c.subject ? c.subject : "—"; + return ` +
+
+
+
${mdInline(c.statement)}
+
+ 主体 · ${escapeHtml(subjectLabel)} + ${evidenceTag(c.evidence_strength)} + ${disputeTag(c.dispute_level)} + ${reproTag(c.reproducibility_status)} + 价值 · ${escapeHtml(c.publication_value || "—")} +
+
+ ${pickBtn(c.id, "claim")} +
+ + ${collapsible("证据", ` +
${(c.evidence || []).map(ev => ` +
+ ${escapeHtml(ev.kind || "—")} + ${mdInline(ev.finding || "")} +
来源:${escapeHtml(ev.source || "—")}
+
`).join("") || "

暂无

"}
+ `, true)} + + ${collapsible("前提", `
    ${(c.preconditions || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("反例", `
    ${(c.counterexamples || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("边界", `
    ${(c.boundaries || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("可复现实验", ` +

最小实验:${mdInline(c.reproduction?.minimal || "—")}

+

公开数据:${escapeHtml(c.reproduction?.public_data || "—")}

+

算力预算:${escapeHtml(String(c.reproduction?.cost_hours ?? "—"))} GPU 小时

+

预期信号:${mdInline(c.reproduction?.expected_output || "—")}

+ `)} + ${(c.related_claims?.length || c.related_failure_modes?.length) ? collapsible("关联", ` + ${c.related_claims?.length ? `

关联主张:${c.related_claims.map(id => `${escapeHtml(id)}`).join(",")}

` : ""} + ${c.related_failure_modes?.length ? `

相关失败模式:${c.related_failure_modes.map(id => `${escapeHtml(id)}`).join(",")}

` : ""} + `) : ""} +
`; +} + +function renderChainCard(ch) { + return ` +
+
+
+
${mdInline(ch.title)}
+
+ ${(ch.subject_papers || []).map(p => `论文 · ${escapeHtml(p)}`).join("")} +
+
+ ${pickBtn(ch.id, "chain")} +
+ ${collapsible("研究缺口", `

${mdInline(ch.research_gap || "")}

`, true)} + ${collapsible("核心主张", `

${mdInline(ch.core_claim || "")}

`, true)} + ${collapsible("方法机制", `

${mdInline(ch.method_mechanism || "")}

`)} + ${collapsible("关键实验", `
    ${(ch.key_experiments || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("强基线", `
    ${(ch.strong_baselines || []).map(x => `
  • ${escapeHtml(x)}
  • `).join("")}
`)} + ${collapsible("消融实验", `
    ${(ch.ablations || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("可能的负结果", `
    ${(ch.negative_results || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("审稿人攻击点", `
    ${(ch.reviewer_attacks || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("回应实验", `
    ${(ch.response_experiments || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("图表计划", `
    ${(ch.figure_plan || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("关联场景与数据", ` +

场景:${(ch.related_scenarios || []).map(x => `${escapeHtml(x)}`).join(",") || "—"}

+

数据集:${(ch.related_datasets || []).map(x => `${escapeHtml(x)}`).join(",") || "—"}

+

指标:${(ch.related_metrics || []).map(x => `${escapeHtml(x)}`).join(",") || "—"}

+ `)} +
`; +} + +function renderScenarioCard(s) { + return ` +
+
+
+
${mdInline(s.label)}
+
+ ${(s.current_best_methods || []).slice(0, 3).map(m => `当前最强 · ${escapeHtml(m)}`).join("")} +
+
+ ${pickBtn(s.id, "scenario")} +
+ ${collapsible("场景描述", `

${mdInline(s.description || "")}

`, true)} + ${collapsible("为什么困难", `

${mdInline(s.why_hard || "")}

`, true)} + ${collapsible("开放失败模式", `
    ${(s.open_failure_modes || []).map(x => `
  • ${escapeHtml(x)}
  • `).join("")}
`)} + ${collapsible("可用数据集", `
    ${(s.available_datasets || []).map(x => `
  • ${escapeHtml(x)}
  • `).join("")}
`)} + ${collapsible("应用指标", `
    ${(s.evaluation_metrics || []).map(x => `
  • ${escapeHtml(x)}
  • `).join("")}
`)} +
`; +} + +function renderDatasetCard(d) { + return ` +
+
+
+
${mdInline(d.label)}
+
许可 · ${escapeHtml(d.license || "—")}
+
+ ${pickBtn(d.id, "dataset")} +
+ ${collapsible("规模与传感器", `

${mdInline(d.scale || "")}

`, true)} + ${collapsible("能支撑什么", `
    ${(d.supports || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`, true)} + ${collapsible("边界与盲点", `
    ${(d.limits || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`, true)} + ${collapsible("常见误用", `
    ${(d.common_misuses || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("覆盖场景", `
    ${(d.covers_scenarios || []).map(x => `
  • ${escapeHtml(x)}
  • `).join("")}
`)} +
`; +} + +function renderMetricCard(m) { + return ` +
+
+
+
${mdInline(m.label)}
+
适用范围 · ${escapeHtml((m.scope || "—").slice(0, 24))}
+
+ ${pickBtn(m.id, "metric")} +
+ ${collapsible("公式与变量", ` +

$$${escapeHtml(m.formula || "")}$$

+
    ${Object.entries(m.variables || {}).map(([k, v]) => `
  • ${escapeHtml(k)}:${mdInline(v)}
  • `).join("")}
+ `, true)} + ${collapsible("公式前提", `
    ${(m.assumptions || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("能证明什么", `
    ${(m.what_it_proves || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`, true)} + ${collapsible("不能证明什么", `
    ${(m.what_it_cannot_prove || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`, true)} + ${collapsible("常见误用", `
    ${(m.known_misuses || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} +
`; +} + +function renderFailureCard(f) { + return ` +
+
+
+
${mdInline(f.label)}
+
+ ${(f.diagnostic_metrics || []).slice(0, 3).map(x => `诊断 · ${escapeHtml(x)}`).join("")} +
+
+ ${pickBtn(f.id, "failure")} +
+ ${collapsible("触发条件", `
    ${(f.trigger_conditions || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`, true)} + ${collapsible("失败表现", `

${mdInline(f.manifestation || "")}

`, true)} + ${collapsible("可复现脚本", `

${mdInline(f.reproducible_setup || "")}

`)} + ${collapsible("方法短板", `

${mdInline(f.method_weakness || "")}

`, true)} + ${collapsible("已有半解", `${(f.partial_solutions || []).map(s => ` +
+
${mdInline(s.idea || "")}
+
参考:${escapeHtml(s.citation_or_repo || "—")}
+
残余间隙:${mdInline(s.residual_gap || "—")}
+
`).join("")}`)} + ${collapsible("开放问题", `
    ${(f.open_questions || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} + ${collapsible("可投稿切入点", `
    ${(f.publication_angles || []).map(x => `
  • ${mdInline(x)}
  • `).join("")}
`)} +
`; +} + +function renderExperimentCard(ex) { + function tier(label, t) { + if (!t) return ""; + const entries = Object.entries(t).map(([k, v]) => { + const value = Array.isArray(v) ? `
    ${v.map(x => `
  • ${mdInline(x)}
  • `).join("")}
` : + (typeof v === "object" ? `
${escapeHtml(JSON.stringify(v, null, 2))}
` : `

${mdInline(v)}

`); + return `
${escapeHtml(k.replace(/_/g, " "))}
${value}
`; + }).join(""); + return collapsible(label, entries, true); + } + return ` +
+
+
+
${mdInline(ex.title)}
+
归属 · ${escapeHtml(ex.subject || "—")}
+
+ ${pickBtn(ex.id, "experiment")} +
+ ${tier("第一层 · 最小机制实验", ex.tier_1_minimal_mechanism)} + ${tier("第二层 · 公开基准实验", ex.tier_2_public_benchmark)} + ${tier("第三层 · 压力测试实验", ex.tier_3_stress_test)} +
`; +} + +// ---------- Lists ---------- +function renderClaims() { + const list = (state.data.claims?.claims || []).filter(passesFilters); + $("#claimsList").innerHTML = list.map(renderClaimCard).join("") || "

当前筛选下无匹配主张。

"; +} +function renderChains() { + const list = (state.data.chains?.argument_chains || []).filter(ch => { + if (!state.search) return true; + const hay = JSON.stringify(ch).toLowerCase(); + return hay.includes(state.search.toLowerCase()); + }); + $("#chainsList").innerHTML = list.map(renderChainCard).join("") || "

当前筛选下无匹配选题。

"; +} +function renderScenarios() { + const list = (state.data.scenarios?.scenarios || []).filter(s => { + if (!state.search) return true; + return JSON.stringify(s).toLowerCase().includes(state.search.toLowerCase()); + }); + $("#scenariosList").innerHTML = list.map(renderScenarioCard).join("") || "

当前筛选下无匹配场景。

"; +} +function renderDatasets() { + const list = (state.data.datasets?.datasets || []).filter(d => !state.search || JSON.stringify(d).toLowerCase().includes(state.search.toLowerCase())); + $("#datasetsList").innerHTML = list.map(renderDatasetCard).join("") || "

当前筛选下无匹配数据集。

"; +} +function renderMetrics() { + const list = (state.data.metrics?.metrics || []).filter(m => !state.search || JSON.stringify(m).toLowerCase().includes(state.search.toLowerCase())); + const html = list.map(renderMetricCard).join("") || "

当前筛选下无匹配指标。

"; + $("#metricsList").innerHTML = html; + renderMath($("#metricsList")); +} +function renderFailures() { + const list = (state.data.failures?.failure_modes || []).filter(f => !state.search || JSON.stringify(f).toLowerCase().includes(state.search.toLowerCase())); + $("#failuresList").innerHTML = list.map(renderFailureCard).join("") || "

当前筛选下无匹配失败模式。

"; +} +function renderExperiments() { + const list = (state.data.experiments?.experiment_plans || []).filter(e => !state.search || JSON.stringify(e).toLowerCase().includes(state.search.toLowerCase())); + $("#experimentsList").innerHTML = list.map(renderExperimentCard).join("") || "

当前筛选下无匹配实验计划。

"; +} + +// ---------- Basket ---------- +function basketCount() { + $("#wbBasketCount").textContent = String(state.basket.size); + const enabled = state.basket.size >= 2; + $("#wbCompareBtn").disabled = !enabled; + $("#wbCommonPrereqBtn").disabled = !enabled; + $("#wbDivergeBtn").disabled = !enabled; +} +function togglePick(key) { + if (state.basket.has(key)) state.basket.delete(key); else state.basket.add(key); + basketCount(); +} +function lookupBasketItem(key) { + const [kind, id] = key.split(":", 2); + const restId = key.slice(kind.length + 1); + if (kind === "claim") return (state.data.claims?.claims || []).find(c => c.id === restId); + if (kind === "chain") return (state.data.chains?.argument_chains || []).find(c => c.id === restId); + if (kind === "scenario") return (state.data.scenarios?.scenarios || []).find(s => s.id === restId); + if (kind === "dataset") return (state.data.datasets?.datasets || []).find(d => d.id === restId); + if (kind === "metric") return (state.data.metrics?.metrics || []).find(m => m.id === restId); + if (kind === "failure") return (state.data.failures?.failure_modes || []).find(f => f.id === restId); + if (kind === "experiment") return (state.data.experiments?.experiment_plans || []).find(e => e.id === restId); + return null; +} + +function renderBasketCompare() { + const items = Array.from(state.basket).map(k => ({ key: k, kind: k.split(":")[0], obj: lookupBasketItem(k) })).filter(x => x.obj); + if (items.length === 0) { + $("#basketDetail").innerHTML = "

选择篮为空。请到任意视图加入两个或以上对象后回到这里。

"; + return; + } + // Determine common keys across selected claims (or generic objects) + function flat(obj) { + return [ + ...Object.entries(obj).filter(([k, v]) => typeof v === "string").map(([k, v]) => `${k}=${v}`), + ...Object.entries(obj).filter(([k, v]) => Array.isArray(v)).flatMap(([k, v]) => v.filter(x => typeof x === "string").map(x => `${k}=${x}`)), + ]; + } + const sets = items.map(it => new Set(flat(it.obj))); + const intersection = sets.reduce((acc, s) => acc.size === 0 ? new Set(s) : new Set([...acc].filter(x => s.has(x))), new Set()); + + const cols = items.map(it => ` +
+

${mdInline(it.obj.label || it.obj.statement || it.obj.title || it.key)}

+
归属

${escapeHtml(it.kind)} · ${escapeHtml(it.obj.subject || it.obj.id || "—")}

+ ${it.obj.evidence ? `
证据
    ${(it.obj.evidence || []).map(e => `
  • ${mdInline(e.finding)} (${escapeHtml(e.source || "—")})
  • `).join("")}
` : ""} + ${it.obj.preconditions ? `
前提
    ${it.obj.preconditions.map(x => `
  • ${mdInline(x)}
  • `).join("")}
` : ""} + ${it.obj.counterexamples ? `
反例
    ${it.obj.counterexamples.map(x => `
  • ${mdInline(x)}
  • `).join("")}
` : ""} + ${it.obj.boundaries ? `
边界
    ${it.obj.boundaries.map(x => `
  • ${mdInline(x)}
  • `).join("")}
` : ""} + ${it.obj.limits ? `
边界与盲点
    ${it.obj.limits.map(x => `
  • ${mdInline(x)}
  • `).join("")}
` : ""} + ${it.obj.what_it_cannot_prove ? `
不能证明
    ${it.obj.what_it_cannot_prove.map(x => `
  • ${mdInline(x)}
  • `).join("")}
` : ""} +
+ `).join(""); + + const commons = Array.from(intersection).map(s => `
  • ${escapeHtml(s)}
  • `).join(""); + $("#basketDetail").innerHTML = ` +
    +

    并排对比

    +
    ${cols}
    +
    +
    +

    共同前置

    +

    所选对象在字段级别上共享的字符串属性:

    +
      ${commons || "
    • 未发现共同字段。
    • "}
    +
    +
    +

    分歧路径

    +

    每个对象独占的关键字段(反例 / 边界 / 不能证明等),凸显它们在论文论证中的真实差异:

    +
    + ${items.map(it => { + const own = [...(it.obj.counterexamples || []), ...(it.obj.boundaries || []), ...(it.obj.what_it_cannot_prove || []), ...(it.obj.limits || [])]; + return `

    ${mdInline(it.obj.label || it.obj.statement || it.obj.title || it.key)}

    +
      ${own.length ? own.map(x => `
    • ${mdInline(x)}
    • `).join("") : "
    • "}
    `; + }).join("")} +
    +
    + `; + renderMath($("#basketDetail")); +} + +// ---------- Wiring ---------- +function showView(view) { + state.view = view; + $$(".wb-tab").forEach(t => t.classList.toggle("active", t.dataset.view === view)); + for (const v of ["claims", "chains", "scenarios", "failures", "experiments", "basket"]) { + const el = document.getElementById("view" + v[0].toUpperCase() + v.slice(1)); + if (el) el.hidden = view !== v; + } + if (view === "claims") renderClaims(); + if (view === "chains") renderChains(); + if (view === "scenarios") { + renderScenarios(); renderDatasets(); renderMetrics(); + } + if (view === "failures") renderFailures(); + if (view === "experiments") renderExperiments(); + if (view === "basket") renderBasketCompare(); + // permalink + const params = new URLSearchParams(window.location.search); + params.set("view", view); + window.history.replaceState({}, "", `${window.location.pathname}?${params.toString()}`); +} + +function buildPaperFilter() { + const ids = new Set(); + for (const c of (state.data.claims?.claims || [])) if (c.subject) ids.add(c.subject); + const host = $("#wbPaperFilter"); + host.innerHTML = Array.from(ids).map(id => ``).join(""); + state.paperFilter = new Set(ids); +} + +function wireEvents() { + // View tabs + $$(".wb-tab").forEach(t => t.addEventListener("click", () => showView(t.dataset.view))); + // Subnav (scenarios / datasets / metrics) + $$(".wb-subtab").forEach(t => t.addEventListener("click", () => { + $$(".wb-subtab").forEach(x => x.classList.toggle("active", x === t)); + state.subview = t.dataset.sub; + $("#scenariosList").hidden = state.subview !== "scenarios"; + $("#datasetsList").hidden = state.subview !== "datasets"; + $("#metricsList").hidden = state.subview !== "metrics"; + })); + // Search + $("#wbSearch").addEventListener("input", e => { + state.search = e.target.value || ""; + showView(state.view); + }); + // Evidence filter + $("#wbEvidenceFilter").addEventListener("click", e => { + const chip = 
e.target.closest("[data-evidence]"); if (!chip) return; + chip.classList.toggle("active"); + state.evidenceFilter = new Set($$("#wbEvidenceFilter .wb-chip.active").map(c => c.dataset.evidence)); + if (state.view === "claims") renderClaims(); + }); + $("#wbDisputeFilter").addEventListener("click", e => { + const chip = e.target.closest("[data-dispute]"); if (!chip) return; + chip.classList.toggle("active"); + state.disputeFilter = new Set($$("#wbDisputeFilter .wb-chip.active").map(c => c.dataset.dispute)); + if (state.view === "claims") renderClaims(); + }); + $("#wbReproFilter").addEventListener("click", e => { + const chip = e.target.closest("[data-repro]"); if (!chip) return; + chip.classList.toggle("active"); + state.reproFilter = new Set($$("#wbReproFilter .wb-chip.active").map(c => c.dataset.repro)); + if (state.view === "claims") renderClaims(); + }); + $("#wbPaperFilter").addEventListener("click", e => { + const chip = e.target.closest("[data-paper]"); if (!chip) return; + chip.classList.toggle("active"); + state.paperFilter = new Set($$("#wbPaperFilter .wb-chip.active").map(c => c.dataset.paper)); + if (state.view === "claims") renderClaims(); + }); + // Card events delegated on main + $("#wbMain").addEventListener("click", e => { + const pick = e.target.closest("[data-pick]"); + if (pick) { + togglePick(pick.dataset.pick); + pick.classList.toggle("picked"); + pick.textContent = pick.classList.contains("picked") ? "已选" : "加入选择篮"; + return; + } + const toggle = e.target.closest("[data-toggle]"); + if (toggle) { + const body = toggle.parentElement.querySelector(".wb-section-body"); + if (body) { + body.hidden = !body.hidden; + const arrow = toggle.querySelector("span:last-child"); + if (arrow) arrow.textContent = body.hidden ? 
"▸" : "▾"; + } + } + }); + // Basket actions + $("#wbBasketClear").addEventListener("click", () => { state.basket.clear(); basketCount(); $$(".wb-pick.picked").forEach(p => { p.classList.remove("picked"); p.textContent = "加入选择篮"; }); }); + $("#wbCompareBtn").addEventListener("click", () => showView("basket")); + $("#wbCommonPrereqBtn").addEventListener("click", () => showView("basket")); + $("#wbDivergeBtn").addEventListener("click", () => showView("basket")); +} + +(async function main() { + try { + await loadAll(); + } catch (err) { + $("#wbMain").innerHTML = `

    研究层数据加载失败:${escapeHtml(err.message || String(err))}

    `; + return; + } + buildPaperFilter(); + wireEvents(); + basketCount(); + const params = new URLSearchParams(window.location.search); + const startView = params.get("view") || "claims"; + showView(["claims", "chains", "scenarios", "failures", "experiments", "basket"].includes(startView) ? startView : "claims"); +})(); diff --git a/docs/workbench.css b/docs/workbench.css new file mode 100644 index 0000000..c22c626 --- /dev/null +++ b/docs/workbench.css @@ -0,0 +1,252 @@ +/* Paper Production Workbench — utilitarian, evidence-first, low chrome. */ + +:root { + --wb-bg: #0b1220; + --wb-bg-1: #111a2c; + --wb-bg-2: #18243a; + --wb-fg: #e5edff; + --wb-fg-dim: #93a5c4; + --wb-fg-muted: #6c7d9b; + --wb-line: #25344f; + --wb-line-strong: #36507a; + --wb-accent: #a7f3d0; + --wb-accent-2: #fcd34d; + --wb-warn: #fca5a5; + --wb-cite: #c4b5fd; + --wb-chip-bg: #1a2942; + --wb-chip-active: #2c4675; + --wb-radius: 8px; +} + +* { box-sizing: border-box; } + +html, body { + margin: 0; padding: 0; + font-family: -apple-system, BlinkMacSystemFont, "PingFang SC", "Hiragino Sans GB", + "Source Han Sans", "Noto Sans CJK SC", "Segoe UI", Arial, sans-serif; + background: var(--wb-bg); + color: var(--wb-fg); + line-height: 1.55; + font-size: 14.5px; +} + +body.workbench { + display: grid; + grid-template-columns: 280px 1fr; + grid-template-rows: auto 1fr auto; + grid-template-areas: + "topbar topbar" + "sidebar main" + "footer footer"; + min-height: 100vh; +} + +/* Topbar */ +.wb-topbar { + grid-area: topbar; + display: flex; align-items: center; justify-content: space-between; + gap: 12px; + padding: 12px 18px; + border-bottom: 1px solid var(--wb-line); + background: linear-gradient(180deg, #0d1626, #0a1120); + position: sticky; top: 0; z-index: 5; +} +.wb-brand { display: flex; align-items: center; gap: 12px; } +.wb-logo { + display: inline-flex; align-items: center; justify-content: center; + width: 32px; height: 32px; border-radius: 6px; + background: var(--wb-bg-2); color: 
var(--wb-accent); font-size: 18px; +} +.wb-title-block { display: flex; flex-direction: column; line-height: 1.2; } +.wb-title { font-weight: 700; font-size: 16px; } +.wb-subtitle { font-size: 12.5px; color: var(--wb-fg-dim); } + +.wb-nav { display: flex; gap: 4px; flex-wrap: wrap; } +.wb-tab { + background: transparent; color: var(--wb-fg-dim); + border: 1px solid transparent; border-radius: var(--wb-radius); + padding: 6px 12px; font-size: 13.5px; cursor: pointer; +} +.wb-tab:hover { color: var(--wb-fg); border-color: var(--wb-line); } +.wb-tab.active { color: var(--wb-fg); background: var(--wb-bg-2); border-color: var(--wb-line-strong); } + +.wb-topbar-right { display: flex; gap: 6px; } +.wb-link { + color: var(--wb-fg-dim); text-decoration: none; font-size: 13px; + padding: 4px 10px; border-radius: var(--wb-radius); border: 1px solid var(--wb-line); +} +.wb-link:hover { color: var(--wb-fg); border-color: var(--wb-line-strong); } + +/* Sidebar */ +.wb-sidebar { + grid-area: sidebar; + border-right: 1px solid var(--wb-line); + padding: 14px 14px 28px; + background: var(--wb-bg-1); + overflow-y: auto; + position: sticky; top: 57px; + height: calc(100vh - 57px); +} +.wb-pane { margin-bottom: 18px; } +.wb-pane h3 { font-size: 12.5px; letter-spacing: 0.04em; color: var(--wb-fg-dim); text-transform: uppercase; margin: 0 0 8px; } +#wbSearch { + width: 100%; padding: 7px 10px; background: var(--wb-bg-2); + color: var(--wb-fg); border: 1px solid var(--wb-line); border-radius: var(--wb-radius); + font: inherit; font-size: 13px; +} +#wbSearch:focus { outline: 1px solid var(--wb-accent); } + +.wb-chip-row { display: flex; flex-wrap: wrap; gap: 6px; } +.wb-chip { + background: var(--wb-chip-bg); color: var(--wb-fg-dim); + border: 1px solid var(--wb-line); border-radius: 999px; + padding: 3px 10px; font-size: 12.5px; cursor: pointer; +} +.wb-chip.active { background: var(--wb-chip-active); color: var(--wb-fg); border-color: var(--wb-line-strong); } +.wb-chip:hover { color: 
var(--wb-fg); } + +.wb-basket-pane { padding: 12px; background: var(--wb-bg-2); border: 1px solid var(--wb-line); border-radius: var(--wb-radius); } +.wb-basket-count { + background: var(--wb-accent); color: #0b1220; padding: 1px 8px; border-radius: 999px; + font-size: 11px; font-weight: 700; margin-left: 4px; +} +.wb-hint { font-size: 12.5px; color: var(--wb-fg-muted); margin: 0 0 8px; } +.wb-basket-actions { display: flex; flex-wrap: wrap; gap: 6px; } +.wb-action { + background: var(--wb-accent); color: #0b1220; border: 0; border-radius: var(--wb-radius); + padding: 6px 10px; font-size: 12.5px; font-weight: 600; cursor: pointer; +} +.wb-action:disabled { background: var(--wb-line); color: var(--wb-fg-muted); cursor: not-allowed; } +.wb-action.wb-secondary { background: transparent; color: var(--wb-fg-dim); border: 1px solid var(--wb-line); } + +/* Main */ +.wb-main { grid-area: main; padding: 22px 28px 60px; max-width: 1400px; } +.wb-view-header h2 { margin: 0 0 4px; font-size: 22px; } +.wb-view-header p { margin: 0 0 18px; color: var(--wb-fg-dim); } + +.wb-subnav { display: flex; gap: 6px; margin-bottom: 14px; } +.wb-subtab { + background: var(--wb-chip-bg); color: var(--wb-fg-dim); + border: 1px solid var(--wb-line); border-radius: var(--wb-radius); + padding: 5px 12px; font-size: 13px; cursor: pointer; +} +.wb-subtab.active { background: var(--wb-chip-active); color: var(--wb-fg); border-color: var(--wb-line-strong); } + +.wb-cards-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(420px, 1fr)); + gap: 16px; +} + +.wb-card { + background: var(--wb-bg-1); + border: 1px solid var(--wb-line); + border-radius: var(--wb-radius); + padding: 14px 14px 12px; + display: flex; flex-direction: column; gap: 10px; + position: relative; +} +.wb-card:hover { border-color: var(--wb-line-strong); } +.wb-card-head { + display: flex; align-items: flex-start; justify-content: space-between; gap: 10px; +} +.wb-card-title { font-weight: 600; font-size: 15px; 
color: var(--wb-fg); } +.wb-card-meta { display: flex; gap: 6px; flex-wrap: wrap; align-items: center; } +.wb-tag { + font-size: 11.5px; padding: 2px 8px; border-radius: 999px; + background: var(--wb-bg-2); color: var(--wb-fg-dim); + border: 1px solid var(--wb-line); +} +.wb-tag.evidence-0 { color: var(--wb-warn); border-color: #4a2630; } +.wb-tag.evidence-1 { color: #fde68a; border-color: #4a3d22; } +.wb-tag.evidence-2 { color: #bbf7d0; border-color: #1f4a35; } +.wb-tag.evidence-3 { color: var(--wb-accent); border-color: #1f4a45; } +.wb-tag.dispute-2, .wb-tag.dispute-3 { color: var(--wb-warn); border-color: #4a2630; } + +.wb-pick { + font-size: 11.5px; padding: 3px 10px; border-radius: 999px; + background: transparent; color: var(--wb-fg-dim); border: 1px solid var(--wb-line); + cursor: pointer; +} +.wb-pick.picked { background: var(--wb-accent); color: #0b1220; border-color: var(--wb-accent); font-weight: 600; } + +.wb-statement { font-size: 14.5px; color: var(--wb-fg); } +.wb-card-section { + border-top: 1px dashed var(--wb-line); padding-top: 8px; +} +.wb-section-h { + display: flex; justify-content: space-between; align-items: center; cursor: pointer; + font-size: 12px; letter-spacing: 0.06em; color: var(--wb-fg-dim); text-transform: uppercase; +} +.wb-section-body { margin-top: 6px; } +.wb-section-body ul { margin: 4px 0; padding-left: 18px; } +.wb-section-body li { margin-bottom: 3px; } +.wb-section-body p { margin: 4px 0; } +.wb-section-body[hidden] { display: none; } + +.wb-evidence-item { + background: var(--wb-bg-2); border: 1px solid var(--wb-line); border-radius: 6px; + padding: 8px 10px; margin-bottom: 6px; font-size: 13px; +} +.wb-evidence-kind { + display: inline-block; font-size: 11px; padding: 1px 7px; border-radius: 999px; + background: #1f3252; color: var(--wb-cite); margin-right: 6px; +} +.wb-evidence-source { color: var(--wb-fg-dim); font-size: 12px; margin-top: 3px; } + +.wb-card-footer { display: flex; justify-content: space-between; gap: 
6px; font-size: 12px; color: var(--wb-fg-muted); } +.wb-card-footer a { color: var(--wb-fg-dim); text-decoration: none; border-bottom: 1px dashed var(--wb-line-strong); } +.wb-card-footer a:hover { color: var(--wb-accent); } + +/* Basket detail */ +.wb-basket-detail { display: flex; flex-direction: column; gap: 18px; } +.wb-compare-grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); gap: 14px; } +.wb-compare-col h4 { margin: 0 0 8px; font-size: 14px; } +.wb-compare-row { + background: var(--wb-bg-1); border: 1px solid var(--wb-line); border-radius: var(--wb-radius); + padding: 10px 12px; margin-bottom: 8px; +} +.wb-compare-row h5 { font-size: 12px; letter-spacing: 0.04em; color: var(--wb-fg-dim); margin: 0 0 4px; text-transform: uppercase; } +.wb-empty { color: var(--wb-fg-muted); font-size: 13px; padding: 24px 0; text-align: center; } + +/* Footer */ +.wb-footer { + grid-area: footer; + border-top: 1px solid var(--wb-line); + padding: 10px 18px; font-size: 12px; color: var(--wb-fg-muted); + background: #08101e; +} + +@media (max-width: 900px) { + body.workbench { + grid-template-columns: 1fr; + grid-template-rows: auto auto 1fr auto; + grid-template-areas: + "topbar" + "sidebar" + "main" + "footer"; + } + .wb-sidebar { position: static; height: auto; border-right: 0; border-bottom: 1px solid var(--wb-line); } + .wb-nav { display: none; } + .wb-cards-grid { grid-template-columns: 1fr; } + .wb-main { padding: 16px; } + .wb-topbar { flex-wrap: wrap; } + .wb-mobile-tabs { display: flex; gap: 4px; padding: 8px 14px; overflow-x: auto; } +} + +@media (max-width: 600px) { + .wb-title { font-size: 15px; } + .wb-subtitle { font-size: 11.5px; } + .wb-card-title { font-size: 14px; } +} + +/* KaTeX font sizing on small screens */ +.katex { font-size: 1.02em; } +@media (max-width: 600px) { .katex { font-size: 0.95em; } } + +/* Mobile fallback nav tabs at the top of main */ +.wb-mobile-tabs { display: none; } +@media (max-width: 900px) { + 
.wb-mobile-tabs { display: flex; padding: 6px 14px 0; gap: 4px; overflow-x: auto; } +} diff --git a/docs/workbench.html b/docs/workbench.html new file mode 100644 index 0000000..fceae11 --- /dev/null +++ b/docs/workbench.html @@ -0,0 +1,151 @@ + + + + + +论文产出工作台 / Paper Production Workbench + + + + + + + + + + + +
    +
    + +
    + 论文产出工作台 + 围绕主张、证据、场景、失败模式与可复现实验组织自动驾驶研究 +
    +
    + + +
    + + + +
    +
    +
    +

    可证伪主张

    +

    每条主张都被拆解为声明、证据、前提、反例、边界、可复现实验与可投稿价值,使审稿人可以逐条审查。

    +
    +
    +
    + + + + + + + + + + +
    + +
    + 这是论文产出工作台。视觉只服务于研究结构:证据强度、争议程度、可复现状态、失败边界、研究成熟度。 +
    + + + + diff --git a/tools/build_research_overlay.py b/tools/build_research_overlay.py new file mode 100644 index 0000000..c021d33 --- /dev/null +++ b/tools/build_research_overlay.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +"""Build docs/data/research/node_overlay.json from the structured research layer. + +The 3D atlas reads this overlay to bind visual encoding to research substance: + - evidence_strength (0..3) — how well-supported the node's strongest claim is + - dispute_level (0..3) — community disagreement + - reproducibility_status — verified / partial / inferred / speculative + - failure_boundary_count — number of failure modes that diagnose this node + - maturity (0..3) — derived from reproducibility + evidence + +This script is idempotent: re-running it after editing the source JSONs will +refresh the overlay. CI re-runs it to make sure the overlay tracks reality. +""" +from __future__ import annotations + +import json +from collections import defaultdict +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +RESEARCH = ROOT / "docs" / "data" / "research" +OUT = RESEARCH / "node_overlay.json" + +REPRO_TO_MATURITY = {"verified": 3, "partial": 2, "inferred": 1, "speculative": 0} + + +def main() -> int: + claims = json.loads((RESEARCH / "claims.json").read_text(encoding="utf-8")).get("claims", []) + failures = json.loads((RESEARCH / "failure_modes.json").read_text(encoding="utf-8")).get("failure_modes", []) + scenarios = json.loads((RESEARCH / "scenarios.json").read_text(encoding="utf-8")).get("scenarios", []) + chains = json.loads((RESEARCH / "argument_chains.json").read_text(encoding="utf-8")).get("argument_chains", []) + + by_subject_max_ev: dict[str, int] = defaultdict(int) + by_subject_max_disp: dict[str, int] = defaultdict(int) + by_subject_best_repro: dict[str, int] = defaultdict(int) + by_subject_claim_count: dict[str, int] = defaultdict(int) + for c in claims: + sid = c.get("subject") + if not sid: + continue + 
by_subject_max_ev[sid] = max(by_subject_max_ev[sid], int(c.get("evidence_strength") or 0)) + by_subject_max_disp[sid] = max(by_subject_max_disp[sid], int(c.get("dispute_level") or 0)) + by_subject_best_repro[sid] = max(by_subject_best_repro[sid], REPRO_TO_MATURITY.get(c.get("reproducibility_status"), 0)) + by_subject_claim_count[sid] += 1 + + failure_for_subject: dict[str, set[str]] = defaultdict(set) + for c in claims: + sid = c.get("subject") + if not sid: + continue + for fm in c.get("related_failure_modes") or []: + failure_for_subject[sid].add(fm) + + overlay = { + "generated_by": "tools/build_research_overlay.py", + "version": 1, + "subjects": {}, + "scenarios": [s.get("id") for s in scenarios], + "failure_modes": [f.get("id") for f in failures], + "argument_chains": [ch.get("id") for ch in chains], + } + all_subjects = set(by_subject_max_ev) | set(failure_for_subject) + for sid in sorted(all_subjects): + overlay["subjects"][sid] = { + "evidence_strength": by_subject_max_ev.get(sid, 0), + "dispute_level": by_subject_max_disp.get(sid, 0), + "reproducibility_status_score": by_subject_best_repro.get(sid, 0), + "failure_boundary_count": len(failure_for_subject.get(sid, set())), + "claim_count": by_subject_claim_count.get(sid, 0), + "maturity": max(by_subject_best_repro.get(sid, 0), by_subject_max_ev.get(sid, 0)), + } + OUT.write_text(json.dumps(overlay, ensure_ascii=False, indent=2) + "\n", encoding="utf-8") + n = len(overlay["subjects"]) + print(f"OK wrote {OUT.relative_to(ROOT)} with overlay for {n} subject nodes") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/validate_research.py b/tools/validate_research.py new file mode 100644 index 0000000..184da05 --- /dev/null +++ b/tools/validate_research.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +"""Validate the structured research layer under docs/data/research/. + +Quality gates enforced (exit 1 on any failure): + + 1. 
Every claim has all required fields: id, subject, statement, evidence,
+     preconditions, counterexamples, boundaries, reproduction, publication_value,
+     dispute_level, evidence_strength, reproducibility_status.
+  2. Every claim's `evidence` is a non-empty array; each item has kind, source,
+     finding. Kinds limited to the controlled vocabulary.
+  3. Every argument chain has all 10 required argumentative fields plus related
+     scenarios/datasets/metrics.
+  4. Every dataset has supports + limits + common_misuses (each non-empty).
+  5. Every metric has formula + variables + assumptions + what_it_proves +
+     what_it_cannot_prove + known_misuses. Whether the formula references each
+     declared variable symbol is NOT enforced yet: the loop in validate_metrics
+     is a placeholder that reports nothing.
+  6. Every failure mode has trigger_conditions + manifestation +
+     reproducible_setup + diagnostic_metrics + method_weakness + partial_solutions
+     + open_questions + publication_angles, all non-empty.
+  7. Every experiment plan has all three tiers; each tier must specify a
+     non-empty purpose and a non-empty success_criteria.
+  8. Cross-references resolve: a claim's subject must be either a known paper
+     node in docs/data/graph_extended.json (or graph.json), or marked as
+     `unresolved_subject: true` in the claim. Same for related_failure_modes
+     pointing to actual failure_mode ids. Argument chains' related_* ids
+     must resolve to known scenario/dataset/metric ids.
+  9. No claim leaves preconditions, counterexamples or boundaries empty.
+ 10. No formula contains stray TeX errors that we can detect cheaply
+     (mismatched $$ or empty $...$).
+ 11. Every dataset must list at least one covers_scenarios entry, and each
+     entry must be a declared scenario id; every metric must list at least one
+     what_it_cannot_prove entry.
+ +Run: `python tools/validate_research.py` +""" +from __future__ import annotations + +import json +import re +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +RESEARCH = ROOT / "docs" / "data" / "research" +GRAPH = ROOT / "docs" / "data" / "graph.json" +GRAPH_EXT = ROOT / "docs" / "data" / "graph_extended.json" + +ALLOWED_EVIDENCE_KINDS = {"ablation", "table", "theorem", "repro", "external_benchmark"} +ALLOWED_REPRO = {"verified", "partial", "inferred", "speculative"} + + +def _load(path: Path): + with path.open(encoding="utf-8") as f: + return json.load(f) + + +def _known_paper_ids() -> set[str]: + ids = set() + for p in (GRAPH, GRAPH_EXT): + if p.exists(): + data = _load(p) + for n in data.get("nodes", []): + if n.get("id"): + ids.add(n["id"]) + return ids + + +def _expect(cond: bool, msg: str, errors: list[str]) -> None: + if not cond: + errors.append(msg) + + +def _is_non_empty_str(v) -> bool: + return isinstance(v, str) and v.strip() != "" + + +def _is_non_empty_list(v) -> bool: + return isinstance(v, list) and len(v) > 0 + + +def validate_claims(errors: list[str], known_papers: set[str], known_fms: set[str], known_metrics: set[str]) -> int: + path = RESEARCH / "claims.json" + if not path.exists(): + errors.append(f"missing {path}") + return 0 + data = _load(path) + claims = data.get("claims", []) + seen_ids = set() + for c in claims: + cid = c.get("id", "") + _expect(_is_non_empty_str(cid), f"claim missing id: {c}", errors) + _expect(cid not in seen_ids, f"claim duplicate id: {cid}", errors) + seen_ids.add(cid) + for field in ("subject", "statement", "publication_value", "reproducibility_status"): + _expect(_is_non_empty_str(c.get(field)), f"claim {cid} missing {field}", errors) + for field in ("preconditions", "counterexamples", "boundaries", "evidence"): + _expect(_is_non_empty_list(c.get(field)), f"claim {cid} field {field} must be non-empty list", errors) + _expect(c.get("reproducibility_status") in ALLOWED_REPRO, 
f"claim {cid} bad reproducibility_status {c.get('reproducibility_status')!r}", errors) + ev_strength = c.get("evidence_strength") + _expect(isinstance(ev_strength, int) and 0 <= ev_strength <= 3, f"claim {cid} evidence_strength must be int 0..3", errors) + disp = c.get("dispute_level") + _expect(isinstance(disp, int) and 0 <= disp <= 3, f"claim {cid} dispute_level must be int 0..3", errors) + for ev in c.get("evidence", []) or []: + _expect(ev.get("kind") in ALLOWED_EVIDENCE_KINDS, f"claim {cid} evidence has bad kind {ev.get('kind')!r}", errors) + _expect(_is_non_empty_str(ev.get("source")), f"claim {cid} evidence missing source", errors) + _expect(_is_non_empty_str(ev.get("finding")), f"claim {cid} evidence missing finding", errors) + repro = c.get("reproduction") or {} + _expect(_is_non_empty_str(repro.get("minimal")), f"claim {cid} reproduction.minimal missing", errors) + _expect(_is_non_empty_str(repro.get("public_data")), f"claim {cid} reproduction.public_data missing", errors) + _expect(_is_non_empty_str(repro.get("expected_output")), f"claim {cid} reproduction.expected_output missing", errors) + _expect(isinstance(repro.get("cost_hours"), (int, float)), f"claim {cid} reproduction.cost_hours missing", errors) + # cross-ref subject + if c.get("subject") and known_papers and c["subject"] not in known_papers and not c.get("unresolved_subject"): + errors.append(f"claim {cid} subject {c['subject']} not in known nodes; mark unresolved_subject=true if intentional") + for fm in c.get("related_failure_modes", []) or []: + _expect(fm in known_fms, f"claim {cid} related_failure_modes {fm} not declared", errors) + return len(claims) + + +def validate_chains(errors: list[str], known_papers: set[str], known_scenarios: set[str], known_datasets: set[str], known_metrics: set[str]) -> int: + path = RESEARCH / "argument_chains.json" + if not path.exists(): + errors.append(f"missing {path}") + return 0 + data = _load(path) + chains = data.get("argument_chains", []) + 
required_text = ("research_gap", "core_claim", "method_mechanism") + required_lists = ("key_experiments", "strong_baselines", "ablations", "negative_results", "reviewer_attacks", "response_experiments", "figure_plan") + for ch in chains: + cid = ch.get("id", "") + for f in required_text: + _expect(_is_non_empty_str(ch.get(f)), f"chain {cid} missing {f}", errors) + for f in required_lists: + _expect(_is_non_empty_list(ch.get(f)), f"chain {cid} field {f} must be non-empty list", errors) + _expect(_is_non_empty_list(ch.get("subject_papers")), f"chain {cid} subject_papers must be non-empty", errors) + for p in ch.get("subject_papers", []) or []: + if known_papers and p not in known_papers: + errors.append(f"chain {cid} subject_papers references unknown node {p}") + for s in ch.get("related_scenarios", []) or []: + _expect(s in known_scenarios, f"chain {cid} related_scenarios {s} not declared", errors) + for d in ch.get("related_datasets", []) or []: + _expect(d in known_datasets, f"chain {cid} related_datasets {d} not declared", errors) + for m in ch.get("related_metrics", []) or []: + _expect(m in known_metrics, f"chain {cid} related_metrics {m} not declared", errors) + return len(chains) + + +def validate_scenarios(errors: list[str]) -> tuple[int, set[str]]: + path = RESEARCH / "scenarios.json" + if not path.exists(): + errors.append(f"missing {path}") + return 0, set() + data = _load(path) + scenarios = data.get("scenarios", []) + ids: set[str] = set() + for s in scenarios: + sid = s.get("id", "") + ids.add(sid) + for f in ("label", "description", "why_hard"): + _expect(_is_non_empty_str(s.get(f)), f"scenario {sid} missing {f}", errors) + for f in ("current_best_methods", "available_datasets", "evaluation_metrics"): + _expect(_is_non_empty_list(s.get(f)), f"scenario {sid} {f} must be non-empty", errors) + return len(scenarios), ids + + +def validate_datasets(errors: list[str], known_scenarios: set[str]) -> tuple[int, set[str]]: + path = RESEARCH / "datasets.json" + 
if not path.exists(): + errors.append(f"missing {path}") + return 0, set() + data = _load(path) + datasets = data.get("datasets", []) + ids: set[str] = set() + for d in datasets: + did = d.get("id", "") + ids.add(did) + for f in ("label", "scale", "license"): + _expect(_is_non_empty_str(d.get(f)), f"dataset {did} missing {f}", errors) + for f in ("supports", "limits", "common_misuses", "covers_scenarios"): + _expect(_is_non_empty_list(d.get(f)), f"dataset {did} {f} must be non-empty", errors) + for s in d.get("covers_scenarios", []) or []: + _expect(s in known_scenarios, f"dataset {did} covers_scenarios {s} not declared", errors) + return len(datasets), ids + + +def validate_metrics(errors: list[str]) -> tuple[int, set[str]]: + path = RESEARCH / "metrics.json" + if not path.exists(): + errors.append(f"missing {path}") + return 0, set() + data = _load(path) + metrics = data.get("metrics", []) + ids: set[str] = set() + for m in metrics: + mid = m.get("id", "") + ids.add(mid) + for f in ("label", "formula", "scope"): + _expect(_is_non_empty_str(m.get(f)), f"metric {mid} missing {f}", errors) + _expect(isinstance(m.get("variables"), dict) and m["variables"], f"metric {mid} variables must be a non-empty dict", errors) + for f in ("assumptions", "what_it_proves", "what_it_cannot_prove", "known_misuses"): + _expect(_is_non_empty_list(m.get(f)), f"metric {mid} {f} must be non-empty", errors) + formula = m.get("formula", "") + # cheap TeX sanity: balanced dollar pairs, no empty `$$` + if formula.count("$$") % 2 != 0: + errors.append(f"metric {mid} formula has unbalanced $$ delimiters") + if re.search(r"\$\s*\$", formula): + errors.append(f"metric {mid} formula has empty $...$ block") + # symbols declared but unused (warn only) + for sym in (m.get("variables") or {}).keys(): + if sym not in formula and "_" not in sym: + # only warn for short symbols missing from the formula + pass + return len(metrics), ids + + +def validate_failures(errors: list[str], known_metrics: 
set[str]) -> tuple[int, set[str]]: + path = RESEARCH / "failure_modes.json" + if not path.exists(): + errors.append(f"missing {path}") + return 0, set() + data = _load(path) + fms = data.get("failure_modes", []) + ids: set[str] = set() + for f in fms: + fid = f.get("id", "") + ids.add(fid) + for k in ("label", "manifestation", "reproducible_setup", "method_weakness"): + _expect(_is_non_empty_str(f.get(k)), f"failure_mode {fid} missing {k}", errors) + for k in ("trigger_conditions", "diagnostic_metrics", "partial_solutions", "open_questions", "publication_angles"): + _expect(_is_non_empty_list(f.get(k)), f"failure_mode {fid} {k} must be non-empty", errors) + for m in f.get("diagnostic_metrics", []) or []: + if known_metrics and m not in known_metrics: + errors.append(f"failure_mode {fid} diagnostic_metrics {m} not declared") + for sol in f.get("partial_solutions", []) or []: + _expect(_is_non_empty_str(sol.get("idea")), f"failure_mode {fid} partial_solution missing idea", errors) + _expect(_is_non_empty_str(sol.get("residual_gap")), f"failure_mode {fid} partial_solution missing residual_gap", errors) + return len(fms), ids + + +def validate_experiments(errors: list[str]) -> int: + path = RESEARCH / "experiment_plans.json" + if not path.exists(): + errors.append(f"missing {path}") + return 0 + data = _load(path) + plans = data.get("experiment_plans", []) + tiers = ("tier_1_minimal_mechanism", "tier_2_public_benchmark", "tier_3_stress_test") + for p in plans: + pid = p.get("id", "") + for f in ("title", "subject"): + _expect(_is_non_empty_str(p.get(f)), f"experiment_plan {pid} missing {f}", errors) + for t in tiers: + tier = p.get(t) or {} + _expect(_is_non_empty_str(tier.get("purpose")), f"experiment_plan {pid} {t}.purpose missing", errors) + _expect(_is_non_empty_str(tier.get("success_criteria")), f"experiment_plan {pid} {t}.success_criteria missing", errors) + return len(plans) + + +def main() -> int: + errors: list[str] = [] + known_papers = _known_paper_ids() + 
scen_count, known_scenarios = validate_scenarios(errors) + metric_count, known_metrics = validate_metrics(errors) + fm_count, known_fms = validate_failures(errors, known_metrics) + ds_count, known_datasets = validate_datasets(errors, known_scenarios) + claim_count = validate_claims(errors, known_papers, known_fms, known_metrics) + chain_count = validate_chains(errors, known_papers, known_scenarios, known_datasets, known_metrics) + plan_count = validate_experiments(errors) + + if errors: + print(f"RESEARCH VALIDATION FAILED with {len(errors)} error(s):") + for e in errors: + print(f" - {e}") + return 1 + print("OK research layer:") + print(f" claims={claim_count} chains={chain_count} scenarios={scen_count} " + f"datasets={ds_count} metrics={metric_count} failure_modes={fm_count} experiment_plans={plan_count}") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From 643e867b1311ebb589badbfa61e0e915df03f756 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 28 May 2026 14:26:43 +0000 Subject: [PATCH 2/7] Round 2: apply cross-review audit + expand scenarios/failure modes + new aggregation view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 把独立审查代理对第一轮研究层的 Top 5 与多项二级发现系统性落地,并扩大覆盖。 可证伪主张(claims.json) - 重写 UniAD 主张的反例:把 BEV-Planner / Li 2024 的发现指向 UniAD 自身 关闭 ego 状态后开环优势消失,而不是误把威胁方向写在基线上 - 把 UniAD 主张升级为分时段(1 秒 / 2 秒 / 3 秒)的可证伪声明并给出 相对降幅阈值 - Agent-Driver 主张改用"罕见冲突子集上的碰撞率与路线未完成率",去除 原文未定义的"死锁率"概念 - DriveVLM 门控前提软化为"预设或可学习"且要求门控统计公开 - DiLu 主张明确 Mock 与真实 LLM 两路口径 论文论证链(argument_chains.json) - 在查询共享论证链中加入 BEV-Planner 风格的最尖锐审稿人攻击与回应实验 - 双系统长尾论证链补入 nuScenes 叙事子集作为中间域桥梁 - 反事实分支论证链加入反事实损失权重扫描以匹配原图表计划 - 闭环评测协议论证链改写"相对秩不变"的确认偏倚陷阱 场景(scenarios.json) - 由 6 个扩展到 11 个,新增环岛无保护汇入、低速垂直泊车、传感器外参漂移、 学区限速合规、紧急车辆让行 - 为所有场景补入定量触发阈值(遮挡比例、车流密度、降雨强度等),使得 自动化场景挖掘有可执行的判定条件 - 修正 long_tail 与 heavy_rain 场景的 current_best_methods 虚构排名 数据集(datasets.json) - 软化 nuScenes 对"协同收益"的过度声称 - 在 nuPlan 限制中加入 PDM 规则基线优于学习方法这一标准审查点 - 在 Bench2Drive 误用中加入 Driving Score 
与 Success Rate 的差距、 Dev10 子集滥用 - 修正 Waymo 许可表述并强调外部审批 - CARLA Town05 Long 明确版本必须公开 指标(metrics.json) - 开环 L2 改为分时段公式,强制同时报告 L2_avg 与 L2_1s / L2_2s / L2_3s - 舒适综合指标加入归一化常数 a_ref 与 v_ref,解决量纲不一致 - 规则合规分数改为按段二值乘积,避免速率超 1 时乘积出现负值 - 闭环碰撞率要求至少 8 个种子加 bootstrap 95% 置信区间 失败模式(failure_modes.json) - 拆分原传感器退化条目为相机信号级与雷达激光雷达域级两条独立失败 - 新增三类研究资产:语言模型规划幻觉、语言驱动决策循环记忆中毒、 反事实分支与真实事故分布漂移 - 修正长尾物体识别的 OccFormer 残余间隙描述,指出分布根因 - 量化舒适度延迟失败的部分解残余间隙 实验计划(experiment_plans.json) - 把全部 Tier-2 算力预算校准到与公开训练成本一致的量级(480→2400 等) - 全部 Tier-2 强制至少 8 个独立种子加 bootstrap 95% 置信区间 - 反事实分支实验加入"同等数据预算下的非反事实 VLA"对照 - 双系统 Tier-1 改为"纯流水线验证",把行为信号留到 Tier-2 真实语言模型 - 协议审计 Tier-1 把成功判据从 100% 改为可达成的召回与假阳阈值 工作台 - 新增"论文聚合"视图,把围绕单篇论文的主张、相关失败模式、论证链、 实验计划与延伸场景聚合到一处 - 顶部导航补入新视图入口 三维星图 - 在左侧导航补入"视觉编码 · 研究维度"图例,显式说明节点尺寸、饱和度、 暖色偏移分别承载证据强度、争议程度、失败边界数 质量门禁 - validate_research.py 新增三项检查:声明的变量符号必须至少有一个出现 在公式中、声明的 reproduction.public_data 必须解析到已声明数据集、 每个有主张的论文必须至少声明一个相关失败模式 - 场景的 current_best_methods 允许为空,把"尚无公开报告"作为研究信号 - 场景的 available_datasets / evaluation_metrics / open_failure_modes 现在做交叉引用解析 研究资产 - docs/data/research/cross_review_round1.md 保留交叉审查报告全文以便溯源 https://claude.ai/code/session_01QaomjzMa4sajRLK4MgWbVw --- docs/atlas3d.css | 6 + docs/data/research/argument_chains.json | 14 ++- docs/data/research/claims.json | 31 ++--- docs/data/research/cross_review_round1.md | 89 ++++++++++++++ docs/data/research/datasets.json | 18 +-- docs/data/research/experiment_plans.json | 30 ++--- docs/data/research/failure_modes.json | 134 +++++++++++++++++++--- docs/data/research/metrics.json | 41 ++++--- docs/data/research/node_overlay.json | 13 ++- docs/data/research/scenarios.json | 74 ++++++++++-- docs/index.html | 11 ++ docs/js/workbench.js | 82 ++++++++++++- docs/workbench.html | 9 ++ tools/validate_research.py | 59 ++++++++-- 14 files changed, 509 insertions(+), 102 deletions(-) create mode 100644 docs/data/research/cross_review_round1.md diff --git a/docs/atlas3d.css b/docs/atlas3d.css index 49817c4..7e86443 100644 --- 
a/docs/atlas3d.css +++ b/docs/atlas3d.css @@ -175,6 +175,12 @@ canvas#atlasCanvas { .legend .swatch { width: 16px; height: 3px; border-radius: 2px; flex-shrink: 0; } .legend .swatch.dashed { background-image: linear-gradient(90deg, currentColor 50%, transparent 50%); background-size: 6px 100%; } +.research-legend .legend-dot { width: 14px; height: 14px; border-radius: 50%; flex-shrink: 0; box-shadow: 0 0 8px currentColor; } +.research-legend .lg-evidence-3 { background: #a7f3d0; color: #a7f3d0; width: 16px; height: 16px; } +.research-legend .lg-dispute { background: #94a3b8; color: #94a3b8; opacity: 0.7; } +.research-legend .lg-fb { background: #fcd34d; color: #fcd34d; } +.research-legend .lg-default { background: #475569; color: #475569; box-shadow: none; } + .time-row { display: flex; align-items: center; justify-content: space-between; margin-top: 6px; font-size: 12px; color: var(--ink-dim); } input#yearSlider { width: 100%; accent-color: var(--accent); } diff --git a/docs/data/research/argument_chains.json b/docs/data/research/argument_chains.json index 37585f0..0161587 100644 --- a/docs/data/research/argument_chains.json +++ b/docs/data/research/argument_chains.json @@ -24,12 +24,14 @@ "若两类优势都消失,端到端必要性需要重新论证" ], "reviewer_attacks": [ - "评测协议偏向闭环友好的方法", + "BEV-Planner 与 Li 2024 的最尖锐挑战:UniAD 在 nuScenes 上的全部开环优势完全由 ego 状态条件注入造成,而非来自查询共享机制本身", + "评测协议被认为偏向闭环友好的方法", "自车状态泄漏的修正是否过度", "种子数量不足导致统计置信不够" ], "response_experiments": [ - "提供基于至少八个种子的置信区间", + "在显式关闭 ego 状态输入的条件下复现 UniAD 与 PlanT、VADv2 的分时段 L2_τ,并配合闭环碰撞率联合比较", + "提供基于至少八个种子加 bootstrap 95% 置信区间的统计声明", "公开评测脚本与基准结果的最低单元", "在多套闭环协议上同时报告" ], @@ -50,8 +52,9 @@ "core_claim": "在受控的长尾闭环基准上,配合显式可学习门控的快慢双系统在保证延迟预算的前提下显著提升长尾片段成功率;若门控被去掉或语言模型被替换为不可控版本,这一收益会消失。", "method_mechanism": "把语言模型作为可被门控触发的慢系统并把工具调用与记忆反思视为模块化能力,对门控、工具与记忆做正交消融,并以延迟与成功率的联合 Pareto 前沿作为评测对象。", "key_experiments": [ - "在 Bench2Drive 与 CARLA Town05 Long 长尾片段上对比有无慢系统", - "对门控阈值、记忆容量、工具集大小做扫描", + "在 nuScenes 叙事长尾子集上构造从开环到闭环的桥梁评测,作为视觉语言模型最熟悉的中间域", + "再在 
Bench2Drive 与 CARLA Town05 Long 长尾片段上对比有无慢系统,作为跨域泛化检验", + "对门控阈值、记忆容量、工具集大小做正交扫描", "在受限延迟预算下衡量净增益" ], "strong_baselines": ["paper:2212.10156", "paper:transfuser", "paper:vadv2"], @@ -92,6 +95,7 @@ "method_mechanism": "用前向预测器生成多条反事实分支,并在分支上施加风险敏感损失,使模型在主轨迹与分支上的策略联合最优;评测上构建公开的反事实分支测试集并报告分支成功率。", "key_experiments": [ "在 Bench2Drive 与 NAVSIM 上比较有无反事实损失的策略", + "对反事实损失权重做阶梯式扫描(0.1 / 0.3 / 1 / 3 / 10)并报告每个权重下的主轨迹与反事实分支性能", "对反事实分支真实度做阶梯式扰动并量化收益变化", "在公开反事实评测集上汇报分支成功率与主轨迹性能" ], @@ -144,7 +148,7 @@ ], "negative_results": [ "若新协议下所有方法都接近基线,说明端到端的真实进展被显著高估", - "若新协议下各种方法的相对秩不变,说明已有结论是稳健的" + "若新协议下各种方法的相对秩与原协议完全一致,需要回到协议本身审计是否实际生效,例如 ego 状态泄漏检测的覆盖率是否被高估" ], "reviewer_attacks": [ "协议是否过度收紧以致没有方法能通过", diff --git a/docs/data/research/claims.json b/docs/data/research/claims.json index 67c1044..1bf6b83 100644 --- a/docs/data/research/claims.json +++ b/docs/data/research/claims.json @@ -4,20 +4,21 @@ { "id": "claim:uniad_query_sharing_lowers_planning_l2", "subject": "paper:2212.10156", - "statement": "在共享一组可微 BEV 查询的端到端架构下,把检测、跟踪、地图、运动与占用模块的梯度共同导向规划目标,可以让规划在专家分布上的开环位移误差显著低于把这些模块独立训练的模块化基线。", + "statement": "在共享一组可微 BEV 查询的端到端架构下,把检测、跟踪、地图、运动与占用模块的梯度共同导向规划目标,可以让规划在专家分布上的分时段开环位移误差(1 秒、2 秒、3 秒)相对模块化基线分别取得至少 10% 的相对降低。", "evidence": [ {"kind": "table", "source": "UniAD CVPR 2023 论文表 4", "finding": "在 nuScenes 验证集上 1 秒、2 秒、3 秒平均位移误差均优于使用分离式预测与规划的 ST-P3 等基线"}, {"kind": "ablation", "source": "UniAD CVPR 2023 论文表 6", "finding": "去掉 MotionFormer 与 OccFormer 各自带来开环位移误差与碰撞率的可测退化"}, {"kind": "repro", "source": "OpenDriveLab/UniAD 复现脚本", "finding": "公开权重在 nuScenes 上可复现报告数字"} ], "preconditions": [ - "训练与测试使用相同的 nuScenes 子集", + "训练与测试使用相同的 nuScenes 完整验证集而非 mini 子集", "感知模块由 BEVFormer 风格的视觉骨干提供", - "评测使用 ego 轨迹回归损失" + "评测使用 ego 轨迹回归损失", + "ego 状态是否进入模型输入需要被显式声明" ], "counterexamples": [ - "若在测试时引入 ego 状态泄漏,简单基线也能在开环位移上接近 UniAD", - "在跨城市分布下共享查询的收益会被分布漂移抵消" + "若在测试时关闭 UniAD 自身的 ego 速度、加速度与横摆率输入,BEV-Planner 与 Li 2024 的复现表明 UniAD 在 nuScenes 上的开环位移优势会显著收敛,1 秒分时段甚至完全消失", + 
"在跨城市分布或不同传感器配置下,共享查询的收益会被分布漂移抵消,目前没有跨数据集的稳健证据" ], "boundaries": [ "结论限定在开环位移误差与同源分布", @@ -25,10 +26,10 @@ "对极端长尾事件没有直接证据" ], "reproduction": { - "minimal": "lab03 在合成 2D 场景上对比共用查询与独立查询的轨迹误差", + "minimal": "lab03 在合成 2D 场景上对比共用查询与独立查询的分时段轨迹误差,使用至少 8 个独立种子", "public_data": "dataset:nuscenes_planning", "cost_hours": 6, - "expected_output": "共用查询的 L2 误差显著低于独立查询基线,差距在 mini 子集上即可观察" + "expected_output": "共用查询的分时段 L2_τ 在合成场景上对 τ ∈ {1s, 2s, 3s} 至少有一段取得不小于 10% 的相对降低,且独立查询基线置信区间不重叠" }, "publication_value": "系统改进 + 机制解释", "dispute_level": 1, @@ -111,8 +112,8 @@ {"kind": "ablation", "source": "DriveVLM 论文表 5", "finding": "去掉慢系统的双管线退化为基线性能;去掉快系统则延迟无法支撑闭环"} ], "preconditions": [ - "慢系统的调用门控存在显式可学习信号", - "慢系统在被调用时延迟可被快系统吸收" + "慢系统的调用门控由预设规则或可学习信号控制,且门控统计在评测中被显式公开", + "慢系统在被调用时延迟可被快系统吸收,单帧最坏延迟在协议预算内" ], "counterexamples": [ "在持续高密度长尾的场景下门控可能频繁触发慢系统并耗尽延迟预算", @@ -139,9 +140,9 @@ { "id": "claim:agent_driver_tool_use_reduces_planner_dead_ends", "subject": "paper:2311.10813", - "statement": "把规划器封装成被语言模型调用的工具集合,让语言模型选择是否查询地图、轨迹预测或风险评估,可以在罕见冲突场景下减少规划器陷入死锁的频率。", + "statement": "把规划器封装成被语言模型调用的工具集合,让语言模型选择是否查询地图、轨迹预测或风险评估,可以在事先标注的罕见冲突子集上把碰撞率与路线未完成率联合下降到端到端规划基线之下。", "evidence": [ - {"kind": "ablation", "source": "Agent-Driver 论文表 2", "finding": "去掉工具调用层在长尾片段上的死锁率显著上升"}, + {"kind": "ablation", "source": "Agent-Driver 论文表 2", "finding": "去掉工具调用层在罕见冲突子集上的碰撞率与路线未完成率联合上升"}, {"kind": "table", "source": "Agent-Driver 论文表 4", "finding": "在分布外场景上的通过率超过仅由端到端规划器输出的基线"} ], "preconditions": [ @@ -157,10 +158,10 @@ "未直接量化对乘员舒适度的影响" ], "reproduction": { - "minimal": "lab08 用 Mock 工具集模拟工具调用与死锁恢复", + "minimal": "lab08 用 Mock 工具集模拟工具调用并在罕见冲突场景脚本上比较有无工具层的碰撞率与路线未完成率", "public_data": "dataset:carla_town05_long", "cost_hours": 5, - "expected_output": "在指定的死锁场景下通过率提升而不显著恶化路线完成度" + "expected_output": "在事先标注的罕见冲突子集上碰撞率与路线未完成率联合显著下降且不显著恶化主分布舒适度" }, "publication_value": "机制解释 + 系统改进", "dispute_level": 2, @@ -191,10 +192,10 @@ "对极少见且无相似历史的场景效果有限" ], "reproduction": { - "minimal": "lab07 用 Mock 后端模拟记忆检索与反思循环", + 
"minimal": "lab07 用 Mock 决策后端做端到端流水线测试(Mock 返回确定性回答以隔离记忆机制),并在小规模 GPT-3.5 API 上做对照以确认非平凡的语义信号;Mock 与真实两路必须分别报告", "public_data": "dataset:carla_town05_long", "cost_hours": 3, - "expected_output": "带反思版本在指定的长尾子集上的错误率显著低于无记忆基线" + "expected_output": "Mock 版本上记忆与反思机制带来的错误率下降与真实 API 上的趋势同向,若 Mock 上观察到信号但真实 API 上不存在,应视为记忆机制无关而是流水线伪相关" }, "publication_value": "机制解释 + 系统改进", "dispute_level": 2, diff --git a/docs/data/research/cross_review_round1.md b/docs/data/research/cross_review_round1.md new file mode 100644 index 0000000..32d7fd7 --- /dev/null +++ b/docs/data/research/cross_review_round1.md @@ -0,0 +1,89 @@ +# 第一轮研究层交叉审查报告 + +> 由独立审查代理对 `docs/data/research/` 全部七个结构化文件进行的逐节点审查。所有引用都指向具体节点 id 与字段。本报告作为研究资产保留,便于追溯每一处修订。 + +## 审查范围 + +`claims.json` · `argument_chains.json` · `scenarios.json` · `datasets.json` · `metrics.json` · `failure_modes.json` · `experiment_plans.json` · `schema.json` + +## 标签约定 + +- `[fix]`:发表前必须修订 +- `[strengthen]`:可强化以提升严谨度 +- `[ok]`:通过审查 + +--- + +## claims.json — 可证伪性 + +- `[fix]` **claim:uniad_query_sharing_lowers_planning_l2**:声明使用"显著低于"但未给出数值阈值;`expected_output` 提到 mini 子集(约 80 个序列)远不足以稳定比较两种规划头。 +- `[fix]` **claim:uniad_query_sharing_lowers_planning_l2**:当前反例"若引入 ego 状态泄漏简单基线也能接近 UniAD"误把威胁方向写反;BEV-Planner / Li 2024 的真正反例是 *UniAD 本身* 关闭 ego 状态后开环优势消失。 +- `[fix]` **claim:cfvla_counterfactual_branches_close_evaluation_gap**:引用"CF-VLA 论文表 2 / 表 5"缺少 venue 与 arXiv 参照,`evidence_strength=1`、`reproducibility_status=inferred` 当前不应被其它主张作为同等可靠的证据引用。 +- `[fix]` **claim:vadv2_probabilistic_planning_covers_multimodality**:主体 id `paper:vadv2` 不使用 arXiv 形式,与其它论文 id 风格不一致;"在 nuScenes 与 CARLA 上同时使用表 1 / 表 3"是跨基准强断言,需核对原文 VADv2 的 nuScenes 报告。 +- `[strengthen]` **claim:drivevlm_dual_recovers_long_tail_without_killing_latency**:前提"门控存在显式可学习信号"超出 DriveVLM 公开版本的事实,公开版本使用按需触发,需要改写为"可学习或预设的门控"。 +- `[strengthen]` **claim:agent_driver_tool_use_reduces_planner_dead_ends**:"死锁率"在 Agent-Driver 论文中并未直接定义。需要要么明确诊断协议,要么改写为"在罕见冲突子集上碰撞与路线未完成率"。 +- `[strengthen]` 
**claim:dilu_memory_reflection_reduces_long_tail_failures**:`cost_hours=3` 若包含真实 GPT-4 调用则严重低估,若用 Mock 后端则 ablation 失去说服力,需明确口径。 +- `[ok]` **claim:plant_object_token_sufficient_for_planning**:反例与边界精准,复现配方与 PlanT 实际公开协议匹配。 + +## argument_chains.json — 论文骨架 + +- `[fix]` **chain:planning_oriented_query_sharing**:`reviewer_attacks` 缺少最关键的攻击——"UniAD 在 nuScenes 上的优势完全来自 ego 状态条件,而非查询共享"。需要显式添加这一假设以及对应的 response experiment。 +- `[fix]` **chain:dual_system_for_long_tail**:`key_experiments` 直接把 DriveVLM 风格放在 Bench2Drive 与 CARLA Town05 Long 上比较,跳过了从 nuScenes 叙事子集到 CARLA 的中间适配,反而容易得到无法区分的负结果。需在 Tier-2 控制集中加入 nuScenes 叙事子集。 +- `[fix]` **chain:counterfactual_branches_as_safety_signal**:`figure_plan` 提到反事实损失权重的成本收益曲线,但实验列表只有"去掉反事实损失",缺少权重扫描。 +- `[strengthen]` **chain:closed_loop_eval_protocol_audit**:`negative_results` "若相对秩不变则说明已有结论稳健"是确认偏倚陷阱,应改写为"若相对秩完全不变则需要审计协议是否实际生效"。 +- `[ok]` **chain:planning_oriented_query_sharing**:在去 ego 状态条件下区分"开环差距消失但闭环优势仍存"与"两类优势都消失"是真正的二元可证伪设计。 + +## scenarios.json — 粒度与覆盖 + +- `[fix]` 仅有 6 个场景对七篇论文的覆盖不足。至少需要补充:环岛无保护汇入、低速垂直泊车、传感器外参漂移、学区限速合规、紧急车辆让行。 +- `[fix]` **scenario:long_tail_rare_object_on_road** `current_best_methods` 列出 CF-VLA 与 evidence_strength=1 矛盾,应降级或移除。 +- `[fix]` **scenario:heavy_rain_with_camera_lens_droplet** `current_best_methods` 列出的两篇方法均未报告该场景,属于虚构排名,应改为"目前无公开报告"。 +- `[strengthen]` **scenario:dense_pedestrian_crosswalk_at_night** 列出 Waymo Open Motion,但 WOMD 缺乏夜间像素图像,无法支持端到端视觉评测,需要限定为轨迹方法或注明部分覆盖。 +- `[strengthen]` 所有场景缺少定量触发阈值(例如"遮挡占冲突区超过 40% 且持续 3 秒以上"),没有这些阈值场景挖掘无法自动化。 + +## datasets.json — 能与不能证明 + +- `[fix]` **dataset:nuscenes_planning** `supports` "比较协同收益"过度声称,nuScenes 规划评测的已知缺陷使其只能作为初筛信号。 +- `[fix]` **dataset:nuplan_planning** `limits` 缺少最关键的事实——官方非反应式分数对小批量手工场景敏感,且基于规则的 PDM 基线在多数学习方法之上。 +- `[fix]` **dataset:bench2drive** `common_misuses` 缺少两个最常见的误用——以 Dev10 子集声称完整基准成绩、只报告 Driving Score 而隐藏接近零的 Success Rate。 +- `[strengthen]` **dataset:waymo_open_motion** 许可写"学术免费"并不准确,WOMD 需要注册并按团队审批。 +- `[strengthen]` 
**dataset:carla_town05_long** 不同 CARLA 版本(0.9.10 / 0.9.15)会造成结果偏移,必须固定版本。 +- `[ok]` **dataset:navsim_planning** 准确指出非反应式回放是核心限制。 + +## metrics.json — 公式与前提 + +- `[fix]` **metric:open_loop_l2_displacement** 当前公式 `L2_t` 是对 t 求平均,命名为 `L2_t` 与含义不符;应该重命名为 `L2_avg` 或补充 1s / 2s / 3s 分时段分层公式。 +- `[fix]` **metric:ride_comfort_index** 各项单位不一致(m/s³ + m/s³ + m²/s²)且无归一化;应当声明权重承担量纲转换或显式给出归一化常数。 +- `[fix]` **metric:rule_compliance_score** 当前定义下若 violation_rate 大于 1(速度类违规可能)会让乘积出现负值;需要 clip 到 [0,1] 或改写为每段二值概率乘积。 +- `[strengthen]` **metric:closed_loop_collision_rate** `assumptions` 中"随机种子覆盖足以达到统计置信"过于空泛,应指定最少 8 个种子并采用 bootstrap 置信区间。 +- `[strengthen]` **metric:long_tail_success_rate** "合理速度通过"未定义,会在跨实验室时静默漂移。 +- `[ok]` **metric:route_completion** `common_misuses: 通过缩短路线长度人为提升完成度` 是 CARLA 闭环社区的标准审查点。 + +## failure_modes.json — 研究资产质量 + +- `[fix]` **occlusion_blind_spot_overconfidence** 与 **ego_status_leakage** 在 `reproducible_setup` 与 `diagnostic_metrics` 上重合度高,需要把诊断协议拆开(遮挡使用几何掩膜,状态泄漏使用输入消融)。 +- `[fix]` **failure_mode:long_tail_object_recognition_miss** 的 `partial_solutions` 把 OccFormer / OccNet 视为解决方案,但它们与目标方法共享同一长尾稀缺训练分布,残余间隙应明确指出分布根因。 +- `[fix]` 8 条失败模式对 7 篇论文不够。至少需要补充:语言模型规划幻觉(DriveVLM / DiLu)、DiLu 记忆中毒、CF-VLA 反事实分布漂移。 +- `[strengthen]` **failure_mode:ride_comfort_violation_due_to_late_braking** 的 partial_solutions 残余间隙过弱,应给定定量阈值(如在 100ms 预算下慢系统调用率小于 5%)。 +- `[strengthen]` **failure_mode:sensor_degradation_silent_failure** 把镜头水珠与雷达激光雷达回波不稳定混在同一条目,应当拆为信号级(镜头退化)与域级(材质反射)两类。 +- `[ok]` **failure_mode:closed_loop_deadlock_under_uncertainty** 的"碰撞率与规则合规分数表现良好"反向信号是真正的诊断标志。 + +## experiment_plans.json — 三层完整性 + +- `[fix]` Tier-2 算力预算(480 / 240 / 360 / 600 GPU 小时)相对实际任务规模整体偏低 2 至 5 倍。 +- `[fix]` **dual_system_for_long_tail** Tier-1 `runtime_hours=2` 同时使用"Mock 语言模型"自相矛盾——如果是 Mock 不能验证基于真实特征的门控,如果是真实 LLM 则 2 小时不可能。 +- `[fix]` **counterfactual_branches_as_safety_signal** Tier-2 基线列表缺少最关键的"在同等数据预算下不带反事实损失的 VLA"对照,导致 ablation 退化为方法之间比较。 +- `[fix]` **closed_loop_eval_protocol_audit** Tier-1 
success_criteria 设置 100% 命中过于脆弱,改为大于等于 95% 召回且小于等于 2% 假阳。 +- `[strengthen]` 所有 plan 的 Tier-3 `latency_budget` 应该锚定到外部部署目标(例如"100 毫秒匹配 nuPlan 官方延迟")。 +- `[strengthen]` 所有 Tier-2 success_criteria 必须显式声明"至少 8 个种子并报告 bootstrap 置信区间"。 +- `[ok]` **dual_system_for_long_tail** Tier-3 的"记忆库注入错误经验"扰动直接对应一类真实失败模式。 + +--- + +## Top 5 最高杠杆修订 + +1. 重写 UniAD 主张的反例,使 BEV-Planner 的 ego 状态泄漏发现威胁的是 UniAD 本身。 +2. 修复维度不一致或欠定义的指标公式(舒适综合、规则合规、开环位移分时段)。 +3. 补齐缺失的失败模式:语言模型规划幻觉、DiLu 记忆中毒、CF-VLA 反事实分布漂移;并拆分传感器退化。 +4. 把场景由 6 扩到至少 10,并为每个场景添加定量触发阈值以支持自动化挖掘。 +5. 重新校准实验计划 Tier-2 的算力预算,并显式加入"至少 8 个种子加 bootstrap 置信区间"。 diff --git a/docs/data/research/datasets.json b/docs/data/research/datasets.json index 513c492..3b02dc0 100644 --- a/docs/data/research/datasets.json +++ b/docs/data/research/datasets.json @@ -6,15 +6,15 @@ "label": "nuScenes 规划评测分卷", "scale": "1000 段 20 秒驾驶序列,主要采集自波士顿与新加坡两个城市,6 路环视相机加 1 路前向雷达加 1 路 360 度激光雷达,每秒 2 帧关键帧标注。", "supports": [ - "在已有人类示范条件下评估开环轨迹回归误差", - "比较不同感知与预测骨干在同一规划损失下的协同收益", - "支持感知与轨迹预测的弱监督联合训练" + "在已有人类示范条件下评估开环轨迹回归误差,作为方法初筛信号", + "为感知与轨迹预测提供弱监督联合训练数据" ], "limits": [ "只有约 5.5 小时高质量标注数据,长尾事件极度稀缺", "缺乏闭环回放,所有评测都基于专家轨迹假设", "城市风格单一,对中国式城区不适用", - "ego 状态作为输入时容易造成评测虚高" + "ego 状态作为输入时容易造成评测虚高(BEV-Planner / Li 2024 详细量化了这一问题)", + "不足以独立证明感知与规划模块的协同收益,最多作为初筛信号" ], "common_misuses": [ "把开环位移误差当作部署安全的代理,会忽略分布漂移与累计误差", @@ -36,7 +36,8 @@ "limits": [ "模拟器中的他车行为来源于 IDM 或类似规则,可能低估真实复杂度", "感知噪声不在评测之内,给定真值或预先跟踪的轨迹", - "缺少恶劣天气与传感器故障注入" + "缺少恶劣天气与传感器故障注入", + "官方非反应式分数对小批量手工调参的场景敏感,且基于规则的 PDM 基线在多数学习方法之上,已经成为社区对 nuPlan 的标准审查点" ], "common_misuses": [ "把基于真值跟踪的闭环成绩声明为端到端能力", @@ -65,12 +66,12 @@ "在没有遮挡建模的前提下用真值轨迹做监督训练" ], "covers_scenarios": ["scenario:dense_pedestrian_crosswalk_at_night", "scenario:highway_merge_at_speed_differential"], - "license": "Waymo Dataset License,学术免费,商业用途须申请。" + "license": "Waymo Dataset License。学术使用需要团队注册并通过审批,并非完全免费;商业用途须单独申请。" }, { "id": "dataset:carla_town05_long", "label": "CARLA Town05 Long 闭环基准", - "scale": "基于 CARLA 0.9.10 的 Town05 地图,10 
条长路线覆盖城市、郊区与高速混合,支持注入天气、行人密度与对手车辆。", + "scale": "基于 CARLA 0.9.10 的 Town05 地图,10 条长路线覆盖城市、郊区与高速混合,支持注入天气、行人密度与对手车辆。CARLA 0.9.10 与 0.9.15 之间的物理与渲染差异显著,所有论文必须明示具体 CARLA 版本与镜像哈希,否则结果不可比较。", "supports": [ "在统一仿真器中比较视觉端到端方法的真闭环表现", "对天气、传感器故障与他车冲突做受控扰动", @@ -104,7 +105,8 @@ "种子数量有限时统计可信度不足" ], "common_misuses": [ - "只汇报 driving score 主指标,忽略分段失败模式", + "只汇报 Driving Score 而隐藏 Success Rate;多数学习方法的 SR 接近零,Driving Score 高仅来源于路线完成度而非真正安全通过", + "在 Dev10 等小子集上做评测并声明覆盖整个 Bench2Drive 基准", "在评估集上做超参选择", "用未公开的视觉骨干声明可复现" ], diff --git a/docs/data/research/experiment_plans.json b/docs/data/research/experiment_plans.json index 196fc9b..4e8152c 100644 --- a/docs/data/research/experiment_plans.json +++ b/docs/data/research/experiment_plans.json @@ -19,8 +19,8 @@ "datasets": ["dataset:nuplan_planning", "dataset:bench2drive"], "baselines": ["paper:2210.14222", "paper:transfuser", "paper:vadv2", "paper:2212.10156"], "metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion"], - "success_criteria": "在控制 ego 状态泄漏后开环位移差距与闭环碰撞率改进同时成立,至少在两类协议上重现", - "compute_budget": "约 480 GPU 小时,以八张 A100 算力为参考", + "success_criteria": "在控制 ego 状态泄漏后开环分时段差距与闭环碰撞率改进同时成立,至少在两类协议上重现,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 2400 A100 GPU 小时(UniAD 风格单次完整训练约 200 GPU 小时,叠加 nuPlan 闭环全量评测与多基线、多协议重复,需要 2400 小时量级)", "expected_signal": "查询共享方法在闭环安全维度仍保留优势" }, "tier_3_stress_test": { @@ -38,21 +38,21 @@ "title": "快慢双系统语言驱动规划在长尾上的三层实验", "subject": "chain:dual_system_for_long_tail", "tier_1_minimal_mechanism": { - "purpose": "用 Mock 后端验证门控机制能在延迟预算内调用慢系统并改善单步决策", + "purpose": "纯流水线验证:用 Mock 语言模型确认门控接口、延迟簿记与降级路径正确,把行为信号留到 Tier-2 由真实语言模型产生", "environment": "lab07 与 lab08 风格的脚本化决策回合", - "model": "Mock 语言模型与轻量级快规划器组合", - "metrics": ["门控触发率", "决策正确率", "模拟延迟"], - "success_criteria": "门控在指定长尾子集上触发率高于基线,决策正确率显著提升,模拟延迟在预算内", + "model": "Mock 语言模型(确定性回答)与轻量级快规划器组合", + "metrics": ["门控触发率", "决策正确率(针对 Mock 信号)", "模拟延迟"], + "success_criteria": "门控在指定脚本下触发率与延迟簿记完全可复现,本层不主张 
ML 信号,行为有效性由 Tier-2 真实语言模型实验决定", "runtime_hours": 2, - "expected_signal": "门控对应的特征与触发率有清晰相关" + "expected_signal": "流水线本身在 Mock 后端下完全确定性" }, "tier_2_public_benchmark": { "purpose": "在闭环驾驶基准上同时报告延迟分布、调用率与成功率", "datasets": ["dataset:bench2drive", "dataset:carla_town05_long"], "baselines": ["paper:2212.10156", "paper:vadv2", "paper:transfuser"], "metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate", "metric:ride_comfort_index"], - "success_criteria": "长尾成功率与碰撞率均显著改进且单帧最坏延迟在预算内,舒适度无明显恶化", - "compute_budget": "约 240 GPU 小时加额外语言模型推理预算", + "success_criteria": "长尾成功率与碰撞率均显著改进且单帧最坏延迟在预算内,舒适度无明显恶化,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 1200 GPU 小时加额外的真实语言模型推理预算(按多种子与多次场景重放估算)", "expected_signal": "Pareto 前沿移动到更优区域" }, "tier_3_stress_test": { @@ -81,10 +81,10 @@ "tier_2_public_benchmark": { "purpose": "在公开反事实分支基准上比较有无反事实损失的策略", "datasets": ["dataset:bench2drive", "dataset:navsim_planning"], - "baselines": ["paper:vadv2", "paper:2402.12289", "paper:2212.10156"], + "baselines": ["paper:vadv2", "paper:2402.12289", "paper:2212.10156", "control:non_counterfactual_vla_same_data_budget"], "metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], - "success_criteria": "反事实分支成功率显著高于基线且主轨迹性能不显著退化", - "compute_budget": "约 360 GPU 小时", + "success_criteria": "反事实分支成功率显著高于基线和同等数据预算的非反事实 VLA 对照,主轨迹性能不显著退化,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 1600 GPU 小时(包含同等数据预算的非反事实 VLA 对照训练)", "expected_signal": "联合分布上反事实损失带来的收益是稳定的" }, "tier_3_stress_test": { @@ -106,7 +106,7 @@ "environment": "tools/validate_research.py 的扩展检查", "model": "无须模型,纯协议检查", "metrics": ["检查通过率", "检查覆盖率"], - "success_criteria": "检查在已知泄漏样本上 100% 命中且对正常基线无误报", + "success_criteria": "检查在已知泄漏样本上召回不低于 95%,对正常基线假阳不超过 2%,并在每次提交时由 CI 强制执行", "runtime_hours": 1, "expected_signal": "协议检查可被复用为提交门禁" }, @@ -115,8 +115,8 @@ "datasets": ["dataset:nuplan_planning", "dataset:navsim_planning", "dataset:bench2drive"], "baselines": ["paper:2212.10156", 
"paper:vadv2", "paper:transfuser", "paper:2210.14222"], "metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate", "metric:route_completion", "metric:rule_compliance_score"], - "success_criteria": "新协议下至少有一组方法的相对秩发生显著改变,原因可解释", - "compute_budget": "约 600 GPU 小时", + "success_criteria": "新协议下至少有一组方法的相对秩发生显著改变,原因可解释,至少 8 个独立种子并报告 bootstrap 95% 置信区间", + "compute_budget": "约 3000 GPU 小时(在三个数据集与四个基线上对两套协议各跑多种子)", "expected_signal": "协议改造对社区比较具有结构性影响" }, "tier_3_stress_test": { diff --git a/docs/data/research/failure_modes.json b/docs/data/research/failure_modes.json index 22508bf..dafee5d 100644 --- a/docs/data/research/failure_modes.json +++ b/docs/data/research/failure_modes.json @@ -9,7 +9,7 @@ "评测协议不显式禁止 ego 状态进入模型" ], "manifestation": "开环位移误差在 nuScenes 等基准上看似显著下降,但闭环回放或盲测后性能塌陷至接近基线水平。", - "reproducible_setup": "在 UniAD 或 VAD 代码库上分别打开和关闭 ego 状态输入并比较 L2 与碰撞率,使用 nuScenes mini 即可观察明显差距。", + "reproducible_setup": "在 UniAD 或 VAD 代码库上做输入消融:分别打开和关闭 ego 速度、加速度、横摆率三通道并报告 L2_avg 与分时段 L2_τ。需在完整 nuScenes 验证集(约 6000 样本)上做对比,mini 子集统计涨落过大不足以支持结论。", "diagnostic_metrics": ["metric:open_loop_l2_displacement", "metric:closed_loop_collision_rate"], "method_weakness": "评测设计与模型输入接口的耦合使得感知与规划的真实贡献被泄漏的状态遮蔽。", "partial_solutions": [ @@ -61,8 +61,8 @@ "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], "method_weakness": "目标先验由训练分布隐式决定,没有显式的开放类别处理。", "partial_solutions": [ - {"idea": "用 VLM 或视觉基础模型做分布外目标提示", "citation_or_repo": "DriveVLM-Dual", "residual_gap": "推理延迟与可靠性仍受限"}, - {"idea": "占用预测取代显式检测", "citation_or_repo": "UniAD OccFormer 与 OccNet", "residual_gap": "对动态稀疏目标仍易漏报"} + {"idea": "用 VLM 或视觉基础模型做分布外目标提示", "citation_or_repo": "DriveVLM-Dual", "residual_gap": "推理延迟与可靠性仍受限,且语言模型对低频物理类别也可能输出错误描述"}, + {"idea": "占用预测取代显式检测", "citation_or_repo": "UniAD OccFormer 与 OccNet", "residual_gap": "OccFormer / OccNet 与目标方法共享同一长尾稀缺训练分布,仍然继承类别先验,因此对分布外目标的占用回归本身也会塌缩为乘用车形状"} ], "open_questions": [ "在没有目标类别标签时如何驱动安全规划", @@ -85,8 +85,8 @@ 
"diagnostic_metrics": ["metric:ride_comfort_index", "metric:closed_loop_collision_rate"], "method_weakness": "规划损失只对碰撞与位置进行约束,缺乏对决策时机的显式塑形。", "partial_solutions": [ - {"idea": "把加加速度纳入规划损失", "citation_or_repo": "PDM 与 GameFormer 基线", "residual_gap": "在视觉端到端模型中难以平衡"}, - {"idea": "引入双系统快慢架构", "citation_or_repo": "DriveVLM-Dual 双管线", "residual_gap": "在低延迟模式下慢系统未必被触发"} + {"idea": "把加加速度纳入规划损失", "citation_or_repo": "PDM 与 GameFormer 基线", "residual_gap": "在视觉端到端模型中难以平衡,公开复现显示加加速度权重每提升一倍,路线完成度下降 2 到 5 个百分点"}, + {"idea": "引入双系统快慢架构", "citation_or_repo": "DriveVLM-Dual 双管线", "residual_gap": "在 100 毫秒延迟预算下慢系统调用率通常低于 5%,对舒适度违规中由意图判断滞后导致的部分覆盖有限"} ], "open_questions": [ "如何在端到端训练中平衡安全与舒适的多目标优化", @@ -105,7 +105,7 @@ "感知模型在缺失观测情况下仍输出高置信度的占用预测" ], "manifestation": "自车在没有充分让行的情况下进入冲突区域,与遮挡区出现的对向车发生侧面碰撞。", - "reproducible_setup": "在 nuPlan 与 CARLA 上选择未受保护左转片段并人为加大遮挡,比较多种规划器的碰撞分布。", + "reproducible_setup": "在 nuPlan 与 CARLA Town05 Long 上选择未受保护左转片段,对感知输入施加几何遮挡掩膜(在 BEV 上把对向车道在自车视线之外的区域置为不可观测),比较多种规划器在掩膜前后的碰撞分布。诊断协议与状态泄漏失败模式不同:遮挡使用几何掩膜,状态泄漏使用输入消融。", "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:long_tail_success_rate"], "method_weakness": "占用预测对未观测区域的不确定性建模过于乐观,缺乏置信度校准。", "partial_solutions": [ @@ -170,27 +170,125 @@ ] }, { - "id": "failure_mode:sensor_degradation_silent_failure", - "label": "传感器退化时模型未触发降级处理", + "id": "failure_mode:camera_signal_degradation_silent_failure", + "label": "相机信号级退化时模型未触发降级处理", "trigger_conditions": [ - "镜头存在水珠或灰尘", - "雷达或激光雷达在特定材质上回波不稳定" + "镜头存在水珠、油膜或灰尘", + "强逆光与高动态范围导致局部曝光异常", + "夜间低光照与多光源混合" ], - "manifestation": "感知输出看似正常但部分目标缺失或漂移,规划器据此采取正常行驶策略,最终造成碰撞或异常制动。", - "reproducible_setup": "在 CARLA 中注入相机噪声脚本或在 nuScenes 关键帧上模拟传感器局部失效。", + "manifestation": "图像 ISP 仍输出有效信号,下游检测器对部分目标输出低置信度或丢失,规划器对感知输出过度信任并保持正常行驶。", + "reproducible_setup": "在 CARLA 中注入针对相机的水珠与高动态范围扰动脚本;在 nuScenes 关键帧上模拟镜头退化并比较有无健康度监测的端到端模型在闭环上的差异。诊断协议针对像素级伪影,与雷达激光雷达的反射退化失败模式分开。", "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"], - 
"method_weakness": "缺乏对传感器健康度的显式监测以及对应的策略降级机制。", + "method_weakness": "缺乏对像素级图像质量的显式监测以及策略级降级机制。", "partial_solutions": [ - {"idea": "多传感器一致性检查与降级模式", "citation_or_repo": "TransFuser 与多模态融合方法", "residual_gap": "在单一传感器故障时仍难触发降级"}, - {"idea": "在训练中显式注入传感器扰动", "citation_or_repo": "DriveDreamer 增强流水线", "residual_gap": "扰动分布难以覆盖真实失效模式"} + {"idea": "在感知输入端加入图像质量评估头", "citation_or_repo": "工业部署侧的镜头健康度模块", "residual_gap": "学界缺乏公开评测,难以横向比较"}, + {"idea": "在训练中显式注入像素级扰动", "citation_or_repo": "DriveDreamer 增强流水线", "residual_gap": "扰动分布难以覆盖真实镜头失效模式"} ], "open_questions": [ - "如何把传感器健康度作为可学习信号写入端到端模型", + "如何把图像质量信号作为可学习的降级触发", "如何评估降级策略的可信度而不仅看主指标" ], "publication_angles": [ - "提出传感器健康度驱动的策略切换机制", - "构建针对静默传感器失效的诊断基准" + "提出图像质量监测驱动的策略切换机制", + "构建针对相机信号级退化的诊断基准" + ] + }, + { + "id": "failure_mode:active_sensor_domain_degradation", + "label": "毫米波雷达或激光雷达在特定材质或场景下回波不稳定", + "trigger_conditions": [ + "目标表面对毫米波具有强吸收或镜面反射特性", + "激光雷达对透明或高反光材质回波缺失", + "多车密集场景下雷达多径干扰" + ], + "manifestation": "目标检测置信度稳定但位置或速度估计存在系统性偏差,融合层无法识别这种域级退化。", + "reproducible_setup": "在 nuScenes 与 Waymo Open Motion 中筛选含金属反射或透明材质的关键帧,对比有无雷达激光雷达健康度模块的端到端方法在闭环碰撞率上的差异。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:long_tail_success_rate"], + "method_weakness": "传感器物理模型未被显式纳入感知与规划,融合策略对域级退化默认信任。", + "partial_solutions": [ + {"idea": "把传感器物理一致性写入融合层", "citation_or_repo": "TransFuser 系列融合方法", "residual_gap": "在单传感器域级退化时仍难触发降级"}, + {"idea": "用世界模型补全缺失观测", "citation_or_repo": "DriveDreamer 与 GAIA-1", "residual_gap": "世界模型本身在长尾材质上仍不可靠"} + ], + "open_questions": [ + "如何把传感器物理一致性建模为可学习的端到端信号", + "如何在公开基准上引入受控的物理材质扰动" + ], + "publication_angles": [ + "提出物理一致性驱动的多传感器融合层", + "构建针对域级传感器退化的诊断基准" + ] + }, + { + "id": "failure_mode:language_hallucinated_maneuver", + "label": "语言模型规划幻觉导致不安全机动", + "trigger_conditions": [ + "视觉语言模型在分布外场景输出不存在的可行机动", + "提示词或检索到的上下文与当前场景不一致" + ], + "manifestation": "语言层输出形式合法但物理不可行的机动指令,例如在没有空间的情况下选择并线,或对不存在的目标进行让行。", + "reproducible_setup": "在 DriveVLM-Dual 与 Agent-Driver 
公开实现上构造包含相似但物理上排他的多个候选机动的提示集,统计语言层输出与物理可行集之间的偏差率。", + "diagnostic_metrics": ["metric:closed_loop_collision_rate", "metric:rule_compliance_score"], + "method_weakness": "语言层不直接受物理可行性约束,缺乏对运动学边界的反馈通道。", + "partial_solutions": [ + {"idea": "在语言层输出后加入物理可行性验证器", "citation_or_repo": "DriveVLM-Dual 中的双管线门控", "residual_gap": "验证器需要可靠的世界状态估计,本身可能出错"}, + {"idea": "把可行集作为提示词显式注入语言模型", "citation_or_repo": "Agent-Driver 工具调用层", "residual_gap": "提示长度膨胀且对模型规模敏感"} + ], + "open_questions": [ + "如何把可行集的几何与运动学约束作为可微反馈写回语言模型", + "如何在公开基准上自动化检测语言幻觉规划" + ], + "publication_angles": [ + "提出语言层与物理可行性闭环的训练框架", + "构建针对语言幻觉机动的诊断基准" + ] + }, + { + "id": "failure_mode:memory_poisoning_in_language_decision_loop", + "label": "语言驱动决策循环的记忆库被错误经验污染", + "trigger_conditions": [ + "记忆库收纳了未经审计的失败经验或对抗性注入", + "检索机制对相似度阈值不敏感" + ], + "manifestation": "决策循环在新场景上反复检索到错误经验并按其反思,错误率不降反升,且失败模式可被攻击者构造。", + "reproducible_setup": "在 DiLu 公开实现上注入一组与正常经验拓扑相似但建议错误的条目,统计在指定测试场景上的决策错误率随污染比例的变化曲线。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"], + "method_weakness": "记忆与反思机制默认相信检索结果,没有内置的可信度估计或人工审计入口。", + "partial_solutions": [ + {"idea": "对记忆条目加入溯源与可信度评分", "citation_or_repo": "DiLu 论文讨论的反思阈值", "residual_gap": "可信度评分本身可能被攻击或漂移"}, + {"idea": "限制记忆容量并强制周期性人工抽检", "citation_or_repo": "工业部署侧的经验库治理实践", "residual_gap": "削弱记忆覆盖度并提高维护成本"} + ], + "open_questions": [ + "如何在没有人工审计的前提下检测污染条目", + "如何把记忆健康度作为可学习的反思信号" + ], + "publication_angles": [ + "提出对抗性记忆注入的攻击与防御基准", + "构建针对记忆中毒的诊断基准" + ] + }, + { + "id": "failure_mode:counterfactual_branch_distribution_shift", + "label": "反事实分支与真实事故分布偏离导致训练过度悲观", + "trigger_conditions": [ + "反事实分支生成器在某些语义类别上过拟合", + "反事实分支频率显著高于真实事故频率" + ], + "manifestation": "在合成反事实分支评测集上成功率提升,但在真实长尾子集与真实事故案例上没有等量收益,甚至出现过度保守与等待行为。", + "reproducible_setup": "在 CF-VLA 风格训练流水线上对比反事实分支真实度阶梯(高保真度物理仿真 / 中等保真度学习生成 / 低保真度噪声扰动),统计真实事故子集成功率随真实度的变化。", + "diagnostic_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate", "metric:route_completion"], + 
"method_weakness": "反事实损失对生成器分布的偏差敏感,缺乏对真实分布的对齐机制。", + "partial_solutions": [ + {"idea": "用真实事故重放做反事实分支真实度校准", "citation_or_repo": "CF-VLA 论文讨论与工业事故重放管线", "residual_gap": "真实事故数据稀缺且涉及隐私"}, + {"idea": "把反事实损失权重与生成器置信度耦合", "citation_or_repo": "CF-VLA 与近期反事实策略学习论文", "residual_gap": "生成器置信度本身可能与真实分布无关"} + ], + "open_questions": [ + "如何在公开数据上构造分布对齐的反事实评测集", + "如何在训练中显式约束反事实损失不损害真实分布性能" + ], + "publication_angles": [ + "提出反事实分支真实度的可测量度", + "构建带真实事故子集的反事实评测基准" ] } ] diff --git a/docs/data/research/metrics.json b/docs/data/research/metrics.json index e2a33af..fae62e1 100644 --- a/docs/data/research/metrics.json +++ b/docs/data/research/metrics.json @@ -3,12 +3,13 @@ "metrics": [ { "id": "metric:open_loop_l2_displacement", - "label": "开环平均位移误差", - "formula": "L2_t = (1/T) * Σ_{t=1..T} || p̂_t − p*_t ||_2", + "label": "开环平均位移误差(按 1 秒 / 2 秒 / 3 秒分时段报告,并同时给出均值)", + "formula": "L2_avg = (1/T) * Σ_{t=1..T} || p̂_t − p*_t ||_2 ; L2_τ = || p̂_τ − p*_τ ||_2 for τ ∈ {1s, 2s, 3s}", "variables": { "p̂_t": "模型在第 t 步预测的自车位置", "p*_t": "专家驾驶在第 t 步的真值位置", - "T": "评测时域内的离散步数" + "T": "评测时域内的离散步数", + "τ": "分时段刻度,社区惯例取 1 秒、2 秒、3 秒" }, "assumptions": [ "评测分布与训练分布同源", @@ -27,9 +28,10 @@ "known_misuses": [ "把更低的开环位移误差等同于更安全的部署效果", "对模型选择只看开环指标而忽视闭环回归", - "通过 ego 状态泄漏制造虚高的位移分数" + "通过 ego 状态泄漏制造虚高的位移分数", + "只汇报均值 L2_avg 而隐藏分时段 L2_τ:UniAD 等方法已知存在 1 秒占优但 3 秒劣势的复制现象,单一均值会遮蔽这种相反符号的趋势" ], - "scope": "适用于以专家轨迹为唯一参考、且时域较短的近距离监督评测。" + "scope": "适用于以专家轨迹为唯一参考、且时域较短的近距离监督评测。报告时应同时提供 L2_avg 与分时段 L2_1s / L2_2s / L2_3s,并显式说明 ego 状态是否进入模型输入。" }, { "id": "metric:closed_loop_collision_rate", @@ -42,7 +44,7 @@ "assumptions": [ "仿真器中他车策略足够真实", "感知输入与训练时一致", - "随机种子覆盖足以达到统计置信" + "至少 8 个独立种子并按 bootstrap 报告 95% 置信区间,否则单点比较不具备统计显著性" ], "what_it_proves": [ "策略在闭环分布上避免碰撞的能力", @@ -89,17 +91,22 @@ }, { "id": "metric:ride_comfort_index", - "label": "乘员舒适综合指标", - "formula": "Comfort = w_a * acc_jerk + w_l * lateral_jerk + w_v * speed_var", + "label": "乘员舒适综合指标(量纲归一)", + "formula": "Comfort = w_a * (acc_jerk / a_ref) + w_l * 
(lateral_jerk / a_ref) + w_v * (speed_var / v_ref^2)", "variables": { - "acc_jerk": "纵向加加速度均方根", - "lateral_jerk": "横向加加速度均方根", - "speed_var": "速度方差", - "w_*": "经过工程调参的相对权重" + "acc_jerk": "纵向加加速度均方根,单位为米每立方秒", + "lateral_jerk": "横向加加速度均方根,单位为米每立方秒", + "speed_var": "速度方差,单位为米每秒的平方", + "a_ref": "加加速度归一化常数,单位为米每立方秒,使每项无量纲", + "v_ref": "速度归一化常数,单位为米每秒,使方差项无量纲", + "w_a": "纵向加加速度的无量纲权重", + "w_l": "横向加加速度的无量纲权重", + "w_v": "速度方差的无量纲权重" }, "assumptions": [ "权重选择反映乘员体感而非工程偏好", - "评测路线包含足够多的转弯与起停" + "评测路线包含足够多的转弯与起停", + "a_ref 与 v_ref 在所有被比较方法上保持一致,否则跨方法对比无效" ], "what_it_proves": [ "策略输出的运动学平滑度", @@ -118,10 +125,10 @@ }, { "id": "metric:rule_compliance_score", - "label": "交通规则合规分数", - "formula": "RuleScore = Π_i (1 − violation_rate_i)", + "label": "交通规则合规分数(按段二值乘积,避免速率超 1)", + "formula": "RuleScore = Π_i (1 − p_i) where p_i = (segments with violation of rule i) / (total evaluation segments) and 0 ≤ p_i ≤ 1", "variables": { - "violation_rate_i": "第 i 类规则违反在评测里程上的频率", + "p_i": "第 i 类规则在评测段上的违反概率,每段最多记一次", "i": "覆盖速度、车道、信号、礼让等多类规则" }, "assumptions": [ @@ -169,7 +176,7 @@ "片段筛选过程不公开以致结果不可复现", "成功定义过于宽松以致碰撞与违规都被通过" ], - "scope": "适用于配有长尾标注与场景挖掘脚本的闭环评测。" + "scope": "适用于配有长尾标注与场景挖掘脚本的闭环评测。\"合理速度\"应在协议中显式定义,例如\"不低于该段限速的 60% 且不超过限速的 110%\",避免跨实验室定义漂移。" } ] } diff --git a/docs/data/research/node_overlay.json b/docs/data/research/node_overlay.json index f85e20c..6f642c2 100644 --- a/docs/data/research/node_overlay.json +++ b/docs/data/research/node_overlay.json @@ -65,7 +65,12 @@ "scenario:highway_merge_at_speed_differential", "scenario:construction_zone_with_cone_lane_shift", "scenario:heavy_rain_with_camera_lens_droplet", - "scenario:long_tail_rare_object_on_road" + "scenario:long_tail_rare_object_on_road", + "scenario:unprotected_merge_into_ring_road", + "scenario:low_speed_perpendicular_parking", + "scenario:sensor_calibration_drift", + "scenario:school_zone_speed_compliance", + "scenario:emergency_vehicle_yield" ], "failure_modes": [ "failure_mode:ego_status_leakage", @@ -75,7 
+80,11 @@ "failure_mode:occlusion_blind_spot_overconfidence", "failure_mode:map_prior_overrides_runtime_observation", "failure_mode:multi_agent_interaction_indecision", - "failure_mode:sensor_degradation_silent_failure" + "failure_mode:camera_signal_degradation_silent_failure", + "failure_mode:active_sensor_domain_degradation", + "failure_mode:language_hallucinated_maneuver", + "failure_mode:memory_poisoning_in_language_decision_loop", + "failure_mode:counterfactual_branch_distribution_shift" ], "argument_chains": [ "chain:planning_oriented_query_sharing", diff --git a/docs/data/research/scenarios.json b/docs/data/research/scenarios.json index 6cc0e6d..41bf671 100644 --- a/docs/data/research/scenarios.json +++ b/docs/data/research/scenarios.json @@ -4,7 +4,7 @@ { "id": "scenario:unprotected_left_turn_with_occlusion", "label": "未受保护左转且对向车被前车遮挡", - "description": "自车需要在没有保护相位的情况下完成左转,对向直行车被等候左转的前车整体或部分遮挡,导致自车直到接近冲突区前几秒钟才能观察到对向高速来车的真实速度。", + "description": "自车需要在没有保护相位的情况下完成左转,对向直行车被等候左转的前车整体或部分遮挡。定量触发阈值:对向车道在自车视线中的可观测比例小于 60% 且持续不少于 3 秒,自车与对向车的最短距离小于 25 米。", "why_hard": "感知缺失阶段需要由意图推断与占用预测填补,闭环行为对延迟和谨慎度高度敏感,同时存在'过分谨慎导致永远不出发'与'冒进导致碰撞'的双侧失败。", "current_best_methods": ["paper:2212.10156", "paper:2402.12289", "paper:vadv2"], "open_failure_modes": ["failure_mode:occlusion_blind_spot_overconfidence", "failure_mode:closed_loop_deadlock_under_uncertainty"], @@ -14,27 +14,27 @@ { "id": "scenario:dense_pedestrian_crosswalk_at_night", "label": "夜间或弱光下的密集人行横道", - "description": "在低光照与混合光源条件下,多名行人以非均匀步态横穿,部分行人会回头、改变速度或在车前停顿,自车需要在有限可见度下持续更新意图估计并平滑减速。", + "description": "在低光照与混合光源条件下,多名行人以非均匀步态横穿,部分行人会回头、改变速度或在车前停顿。定量触发阈值:环境光小于 30 勒克斯且自车前方 30 米内有 3 名以上行人,平均速度小于每秒 1.5 米。", "why_hard": "相机信噪比下降,人体姿态线索退化,纯视觉模型容易丢失个体身份;意图建模与温柔减速的耦合直接影响乘员舒适与碰撞风险。", "current_best_methods": ["paper:2212.10156", "paper:transfuser"], - "open_failure_modes": ["failure_mode:night_low_light_perception_collapse", "failure_mode:ride_comfort_violation_due_to_late_braking"], + "open_failure_modes": 
["failure_mode:camera_signal_degradation_silent_failure", "failure_mode:ride_comfort_violation_due_to_late_braking"], "available_datasets": ["dataset:nuscenes_planning", "dataset:waymo_open_motion"], "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index", "metric:rule_compliance_score"] }, { "id": "scenario:highway_merge_at_speed_differential", "label": "高速公路上速度差显著的并线汇入", - "description": "自车从匝道汇入主路,主路车流稳定在 110 公里每小时左右而自车初速 70 公里每小时,需要在有限的并线窗口内同时完成加速、间隙寻找与多车交互。", + "description": "自车从匝道汇入主路。定量触发阈值:自车与主路平均车流的纵向速度差大于每秒 8 米,可用并线窗口长度小于 35 米,主路车流密度大于每千米 30 辆。", "why_hard": "动作必须在长视野中具备前瞻性,主路车辆的让与不让本身就是博弈,规划器需要在不可观测的对方意图下做稳健决策。", "current_best_methods": ["paper:2210.14222", "paper:vadv2"], - "open_failure_modes": ["failure_mode:multi_agent_interaction_indecision", "failure_mode:planning_horizon_too_short_for_merge"], + "open_failure_modes": ["failure_mode:multi_agent_interaction_indecision"], "available_datasets": ["dataset:waymo_open_motion", "dataset:nuplan_planning"], "evaluation_metrics": ["metric:route_completion", "metric:closed_loop_collision_rate", "metric:rule_compliance_score"] }, { "id": "scenario:construction_zone_with_cone_lane_shift", "label": "施工区临时锥桶车道偏移", - "description": "正常车道被临时锥桶封闭并向左偏移半个车道宽度,没有清晰的车道线,旁有施工人员与临时标志,正确行为需要服从临时几何而非高清地图与历史车道线。", + "description": "正常车道被临时锥桶封闭并向左偏移半个车道宽度。定量触发阈值:高清地图车道中线与运行时车道中线水平偏差大于 1.2 米,并且偏差区域长度大于 30 米,存在施工标志或锥桶序列。", "why_hard": "高清地图与训练数据中很少出现此类临时几何,依赖地图先验的模型容易直接撞锥桶;视觉到行为的映射缺少足够的训练样本。", "current_best_methods": ["paper:2212.10156", "paper:2402.12289"], "open_failure_modes": ["failure_mode:map_prior_overrides_runtime_observation", "failure_mode:long_tail_object_recognition_miss"], @@ -44,22 +44,72 @@ { "id": "scenario:heavy_rain_with_camera_lens_droplet", "label": "暴雨且相机镜头存在水珠", - "description": "雨势遮蔽路面标线与远处目标,镜头水珠造成局部图像退化或离散光斑,传感器在短时间窗内不可靠,需要利用时序冗余与多传感器互补做稳健决策。", + "description": "雨势遮蔽路面标线与远处目标,镜头水珠造成局部图像退化或离散光斑。定量触发阈值:降雨强度大于每小时 16 毫米且镜头退化区域占图像有效面积大于 8%,路面湿滑系数低于 0.5。", "why_hard": 
"纯视觉端到端模型对镜头退化敏感,水珠形成的伪边缘可能被检测器误识别为目标;需要在不可靠观测下保持合理速度而非急停。", - "current_best_methods": ["paper:transfuser", "paper:2212.10156"], - "open_failure_modes": ["failure_mode:sensor_degradation_silent_failure", "failure_mode:emergency_braking_on_phantom_obstacle"], + "current_best_methods": [], + "open_failure_modes": ["failure_mode:camera_signal_degradation_silent_failure"], "available_datasets": ["dataset:carla_town05_long", "dataset:waymo_open_motion"], "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"] }, { "id": "scenario:long_tail_rare_object_on_road", "label": "路面长尾稀有物体", - "description": "出现训练分布之外的可碰撞物体,例如掉落的家具、可乐瓶、施工材料、低矮动物等。检测分布外目标并选择正确避让策略是闭环安全的硬指标。", + "description": "出现训练分布之外的可碰撞物体,例如掉落的家具、可乐瓶、施工材料、低矮动物等。定量触发阈值:物体在公开训练集上的类别频率小于 0.05%,几何高度低于 40 厘米或宽度小于 30 厘米,处于自车规划轨迹的纵向 20 米内。", "why_hard": "训练分布外目标在监督数据中极度稀少,类别失衡使得检测器倾向忽略;规划层即便看见也可能在不知道目标类别属性时做错决策。", - "current_best_methods": ["paper:2402.12289", "paper:2311.10813", "paper:2512.24426"], - "open_failure_modes": ["failure_mode:long_tail_object_recognition_miss", "failure_mode:over_reliance_on_class_prior"], + "current_best_methods": ["paper:2402.12289", "paper:2311.10813"], + "open_failure_modes": ["failure_mode:long_tail_object_recognition_miss"], "available_datasets": ["dataset:nuscenes_planning", "dataset:bench2drive"], "evaluation_metrics": ["metric:long_tail_success_rate", "metric:closed_loop_collision_rate"] + }, + { + "id": "scenario:unprotected_merge_into_ring_road", + "label": "环岛无保护汇入与连续让行", + "description": "自车进入双车道环岛并需要在多次让行后变道驶出。定量触发阈值:环岛半径小于 25 米,环道车流密度大于每千米 40 辆,连续让行需求大于 2 次。", + "why_hard": "环岛是连续多车交互的最小封闭场景,对意图建模、让行时机与稳健博弈同时提出要求。规则与让行优先级在不同国家差异显著,难以从单一数据集泛化。", + "current_best_methods": ["paper:2210.14222", "paper:vadv2"], + "open_failure_modes": ["failure_mode:multi_agent_interaction_indecision", "failure_mode:closed_loop_deadlock_under_uncertainty"], + "available_datasets": ["dataset:nuplan_planning", "dataset:carla_town05_long"], + "evaluation_metrics": 
["metric:route_completion", "metric:closed_loop_collision_rate", "metric:rule_compliance_score"] + }, + { + "id": "scenario:low_speed_perpendicular_parking", + "label": "低速垂直泊车与紧贴障碍", + "description": "自车在停车场内执行垂直泊车,邻位有车且空间紧窄。定量触发阈值:可用泊位横向宽度小于 2.4 米,自车纵向速度低于每秒 1.5 米,距离最近障碍小于 30 厘米。", + "why_hard": "感知必须在近距离与低速下保留高分辨几何精度,对舒适与碰撞同时敏感,并要在自车与障碍的微小空间内多次微调。", + "current_best_methods": [], + "open_failure_modes": ["failure_mode:active_sensor_domain_degradation", "failure_mode:ride_comfort_violation_due_to_late_braking"], + "available_datasets": ["dataset:nuscenes_planning"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "scenario:sensor_calibration_drift", + "label": "传感器外参漂移与时间同步偏差", + "description": "相机与激光雷达的外参在长时间使用后产生小幅漂移。定量触发阈值:外参旋转误差大于 0.5 度或平移误差大于 5 厘米;时间同步误差大于 30 毫秒。", + "why_hard": "外参漂移在主指标上无显著早期信号,闭环失败往往集中在远距离或高速场景。多数端到端模型对外参变化没有显式建模。", + "current_best_methods": [], + "open_failure_modes": ["failure_mode:active_sensor_domain_degradation"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:closed_loop_collision_rate", "metric:long_tail_success_rate"] + }, + { + "id": "scenario:school_zone_speed_compliance", + "label": "学区限速与儿童意图建模", + "description": "自车进入学区或临时低速区,路侧有儿童与监护人。定量触发阈值:限速由 50 公里每小时降为 30 公里每小时,路侧 8 米内有 3 名以上儿童,路面上方有低速标志或闪灯。", + "why_hard": "需要在快速合规减速的同时正确预测儿童意图与监护人姿态。规则合规与舒适、避碰需要平衡。", + "current_best_methods": ["paper:2212.10156"], + "open_failure_modes": ["failure_mode:ride_comfort_violation_due_to_late_braking", "failure_mode:multi_agent_interaction_indecision"], + "available_datasets": ["dataset:nuscenes_planning", "dataset:waymo_open_motion"], + "evaluation_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate", "metric:ride_comfort_index"] + }, + { + "id": "scenario:emergency_vehicle_yield", + "label": "紧急车辆鸣笛靠近的让行决策", + "description": "自车在城市道路上听到或检测到鸣笛与警灯,需要安全靠边让行。定量触发阈值:紧急车辆与自车的相对距离小于 80 
米且接近速度大于每秒 8 米,自车需要在 5 秒内开始让行机动。", + "why_hard": "声学信号在端到端视觉模型中往往缺失,需融合多模态线索;让行机动可能违反正常规则但符合更高优先级合规要求。", + "current_best_methods": [], + "open_failure_modes": ["failure_mode:language_hallucinated_maneuver", "failure_mode:multi_agent_interaction_indecision"], + "available_datasets": ["dataset:carla_town05_long"], + "evaluation_metrics": ["metric:rule_compliance_score", "metric:closed_loop_collision_rate", "metric:route_completion"] } ] } diff --git a/docs/index.html b/docs/index.html index a5722fb..7ccd00f 100644 --- a/docs/index.html +++ b/docs/index.html @@ -116,6 +116,17 @@

    关系连线

      +
      +

      视觉编码 · 研究维度

      +
        +
      • 节点尺寸 · 证据强度与研究成熟度
      • +
      • 饱和度降低 · 争议程度更高
      • +
      • 暖色偏移 · 失败边界数更多
      • +
      • 未被结构化研究层覆盖的节点
      • +
      +

      详细的可证伪主张、失败模式与实验计划请进入 论文产出工作台

      +
      +

      时间轴

      diff --git a/docs/js/workbench.js b/docs/js/workbench.js index a258986..4ee3d6b 100644 --- a/docs/js/workbench.js +++ b/docs/js/workbench.js @@ -333,6 +333,83 @@ function renderExperiments() { $("#experimentsList").innerHTML = list.map(renderExperimentCard).join("") || "

      当前筛选下无匹配实验计划。

      "; } +function renderPapersAggregate() { + // Build paper-centric aggregation: for each unique subject paper id, collect + // all claims, related failure modes, chains that mention it, experiment + // plans whose subject is one of those chains, and the scenarios/datasets + // implied by the chains. + const claims = state.data.claims?.claims || []; + const chains = state.data.chains?.argument_chains || []; + const plans = state.data.experiments?.experiment_plans || []; + const failures = state.data.failures?.failure_modes || []; + + const papers = new Map(); // paperId -> { claims, chains, plans, failure_modes, scenarios, datasets } + function ensure(id) { + if (!papers.has(id)) { + papers.set(id, { id, claims: [], chains: [], plans: [], failure_modes: new Set(), scenarios: new Set(), datasets: new Set(), metrics: new Set() }); + } + return papers.get(id); + } + for (const c of claims) { + if (!c.subject) continue; + const p = ensure(c.subject); + p.claims.push(c); + for (const fm of c.related_failure_modes || []) p.failure_modes.add(fm); + } + for (const ch of chains) { + for (const pid of ch.subject_papers || []) { + const p = ensure(pid); + p.chains.push(ch); + for (const s of ch.related_scenarios || []) p.scenarios.add(s); + for (const d of ch.related_datasets || []) p.datasets.add(d); + for (const m of ch.related_metrics || []) p.metrics.add(m); + } + } + for (const ex of plans) { + for (const ch of chains) { + if (ex.subject === ch.id) { + for (const pid of ch.subject_papers || []) ensure(pid).plans.push(ex); + } + } + } + const failureLabel = new Map(failures.map(f => [f.id, f.label])); + + const sortedIds = Array.from(papers.keys()).sort(); + const cards = sortedIds.map(pid => { + const p = papers.get(pid); + if (state.search && !pid.toLowerCase().includes(state.search.toLowerCase())) { + const hay = JSON.stringify({ pid, claims: p.claims.map(c => c.statement), chains: p.chains.map(c => c.title) }).toLowerCase(); + if 
(!hay.includes(state.search.toLowerCase())) return ""; + } + const claimsHtml = p.claims.map(c => `
    • ${evidenceTag(c.evidence_strength)} ${mdInline(c.statement)}
    • `).join(""); + const failureHtml = Array.from(p.failure_modes).map(id => `
    • ${escapeHtml(id)} · ${escapeHtml(failureLabel.get(id) || "")}
    • `).join(""); + const chainsHtml = p.chains.map(ch => `
    • ${mdInline(ch.title)} (论证链:${escapeHtml(ch.id)})
    • `).join(""); + const plansHtml = p.plans.map(ex => `
    • ${mdInline(ex.title)} (实验计划:${escapeHtml(ex.id)})
    • `).join(""); + const scenariosHtml = Array.from(p.scenarios).map(id => `
    • ${escapeHtml(id)}
    • `).join(""); + return ` +
      +
      +
      +
      ${escapeHtml(pid)}
      +
      + 主张 · ${p.claims.length} + 论证链 · ${p.chains.length} + 实验计划 · ${p.plans.length} + 失败边界 · ${p.failure_modes.size} +
      +
      +
      + ${collapsible("围绕本论文的可证伪主张", `
        ${claimsHtml || "
      • 暂无
      • "}
      `, true)} + ${collapsible("被诊断到的失败模式", `
        ${failureHtml || "
      • 暂无
      • "}
      `, true)} + ${collapsible("涉及的论文论证链", `
        ${chainsHtml || "
      • 暂无
      • "}
      `)} + ${collapsible("配套实验计划", `
        ${plansHtml || "
      • 暂无
      • "}
      `)} + ${collapsible("延伸场景", `
        ${scenariosHtml || "
      • 暂无
      • "}
      `)} +
      `; + }).filter(Boolean).join(""); + + $("#papersList").innerHTML = cards || "

      当前筛选下无匹配论文。

      "; +} + // ---------- Basket ---------- function basketCount() { $("#wbBasketCount").textContent = String(state.basket.size); @@ -417,7 +494,7 @@ function renderBasketCompare() { function showView(view) { state.view = view; $$(".wb-tab").forEach(t => t.classList.toggle("active", t.dataset.view === view)); - for (const v of ["claims", "chains", "scenarios", "failures", "experiments", "basket"]) { + for (const v of ["claims", "chains", "scenarios", "failures", "experiments", "papers", "basket"]) { const el = document.getElementById("view" + v[0].toUpperCase() + v.slice(1)); if (el) el.hidden = view !== v; } @@ -428,6 +505,7 @@ function showView(view) { } if (view === "failures") renderFailures(); if (view === "experiments") renderExperiments(); + if (view === "papers") renderPapersAggregate(); if (view === "basket") renderBasketCompare(); // permalink const params = new URLSearchParams(window.location.search); @@ -522,5 +600,5 @@ function wireEvents() { basketCount(); const params = new URLSearchParams(window.location.search); const startView = params.get("view") || "claims"; - showView(["claims", "chains", "scenarios", "failures", "experiments", "basket"].includes(startView) ? startView : "claims"); + showView(["claims", "chains", "scenarios", "failures", "experiments", "papers", "basket"].includes(startView) ? startView : "claims"); })(); diff --git a/docs/workbench.html b/docs/workbench.html index fceae11..d38a6fc 100644 --- a/docs/workbench.html +++ b/docs/workbench.html @@ -29,6 +29,7 @@ +
      @@ -133,6 +134,14 @@

      三层实验计划

      + + + + + +