From 6d15fadc18bbebb2d871feb95cd4542760902af2 Mon Sep 17 00:00:00 2001
From: AD2000X <thecausticfinale@gmail.com>
Date: Wed, 3 Jun 2026 15:48:04 +0100
Subject: [PATCH] feat: Phase 4 PR-A eval-summary backbone

- src/phase4_summary.py: pure per-phase summarizers + inline layout-CSV
  aggregation + deterministic markdown render (no file/Drive/gradio IO)
- scripts/build_phase4_summary.py: 5 metrics JSONs + 3 layout CSVs ->
  outputs/evaluation/phase4_summary.json (gitignored) + reports/phase4_metrics.md
- tests/test_phase4_summary.py: 10 synthetic tests (full pytest 246 green)
- reports/phase4_metrics.md: generated metrics snapshot (no-drift, LF)
- docs/phase4_brief.md: committed implementation brief
- README/DEVLOG/PLAN: Phase 4 in progress; README stale "Phase 2 active" removed

reports/phase4_metrics.md is generated; outputs/evaluation/phase4_summary.json remains gitignored.
---
 DEVLOG.md                       |  26 ++++
 PLAN.md                         |  11 +-
 README.md                       |  27 ++--
 docs/phase4_brief.md            | 104 +++++++++++++
 reports/phase4_metrics.md       |  69 +++++++++
 scripts/build_phase4_summary.py |  97 ++++++++++++
 src/phase4_summary.py           | 252 ++++++++++++++++++++++++++++++++
 tests/test_phase4_summary.py    | 200 +++++++++++++++++++++++++
 8 files changed, 772 insertions(+), 14 deletions(-)
 create mode 100644 docs/phase4_brief.md
 create mode 100644 reports/phase4_metrics.md
 create mode 100644 scripts/build_phase4_summary.py
 create mode 100644 src/phase4_summary.py
 create mode 100644 tests/test_phase4_summary.py

diff --git a/DEVLOG.md b/DEVLOG.md
index e84c51d..b214114 100644
--- a/DEVLOG.md
+++ b/DEVLOG.md
@@ -181,6 +181,32 @@ Decisions outgrow this file, split them into `DECISIONS.md` (or `docs/adr/`).
 
 ---
 
+## 2026-06-03 - Phase 4 eval-summary backbone (PR-A)
+
+### Result - one summary aggregated from the per-phase artifacts; report numbers never hand-copied
+
+- **What landed:** `src/phase4_summary.py` (pure summarizers + layout aggregation + markdown
+  render, no file/Drive/gradio IO), `scripts/build_phase4_summary.py` (reads the five metrics
+  JSONs + three layout CSVs, writes `outputs/evaluation/phase4_summary.json` gitignored +
+  `reports/phase4_metrics.md` committed), `tests/test_phase4_summary.py` (10 synthetic tests).
+  See `docs/phase4_brief.md`.
+- **Phase 2 has no metrics JSON**, so the builder aggregates it inline from the staged
+  `diagnostic_pos.csv` / `diagnostic_neg.csv` / `smoke_structure.csv`, mirroring the table-level
+  matching + FP definitions in `scripts/eval_layout_iou.py` and the OK/WARN split in
+  `scripts/smoke_structure.py`. This reproduced the prior DEVLOG layout numbers **exactly** (mean
+  crop IoU 0.900; matched@0.50 0.900/0.916; matched@0.75 0.880/0.895; crop->TATR 285/286 = 0.997),
+  confirming the inline path needs no Colab re-run.
+- **No-drift gate:** `render_metrics_markdown` is pure and deterministic and the file is written
+  with LF; rebuilding leaves `reports/phase4_metrics.md` byte-identical, so the committed report
+  snippet cannot silently drift from the artifacts.
+- **Reporting choices:** retrieval reports hit@{1,5,10} + MRR@10 only (recall@k == hit@k under one
+  relevant chunk per question, `src/eval_retrieval.py`); a missing artifact degrades to
+  `{"available": false}` rather than failing.
+- **Result:** full `pytest` green (246, +10). Headline echoes: FUNSD `test_50.qa_links` F1 0.727;
+  QA `gt_markdown` answer_exact 0.675. PR-B (report) and PR-C (Gradio demo) follow.
+
+---
+
 ## 2026-06-03 - Phase 3 FUNSD relation-linking baseline (V1)
 
 ### Result - annotation-only spatial heuristic; high precision, recall is the design ceiling
diff --git a/PLAN.md b/PLAN.md
index 16c3bfa..3a539c0 100644
--- a/PLAN.md
+++ b/PLAN.md
@@ -544,7 +544,16 @@ Implementation details:
 
 **Phases 0 through 3 are complete and merged** (v1 = table-only RAG; Phase 2 = DocLayNet
 layout-crop integration; Phase 3 = FUNSD relation baseline, both merged to `main`
-2026-06-03). **Phase 4 (full demo + evaluation + report) is the next phase.**
+2026-06-03). **Phase 4 (full demo + evaluation + report) is in progress** on
+`feature/phase4-demo-eval-report`; PR-A (the eval-summary backbone) has landed.
+
+Phase 4 PR-A delivered (capstone summary backbone; see `docs/phase4_brief.md`):
+`src/phase4_summary.py` (pure per-phase summarizers + inline layout-CSV aggregation + markdown
+render), `scripts/build_phase4_summary.py` (writes `outputs/evaluation/phase4_summary.json` and
+the committed `reports/phase4_metrics.md`), `tests/test_phase4_summary.py` (10 synthetic tests).
+Report numbers are generated from the summary (never hand-copied), guarded by a no-drift gate.
+Next: PR-B (`reports/final_report.md` + `notebooks/07_final_report.ipynb`) and PR-C
+(`scripts/run_demo.py` + `notebooks/06_demo.ipynb`, key-optional Gradio demo).
 
 Phase 3 V1 delivered (annotation-only deterministic relation baseline; see
 `docs/phase3_brief.md`): `src/funsd_extraction.py` (parse + dedupe + per-answer-argmax
diff --git a/README.md b/README.md
index 64de610..a6a84fb 100644
--- a/README.md
+++ b/README.md
@@ -47,16 +47,17 @@ pytest
 
 ## Status
 
-Phases 0 through 1C are complete; the v1 release (table-only RAG) is merged to `main`.
-Delivered: the repo foundation; Phase 1A table topology (TATR grid derivation,
-spanning-cell mapping, grid validation, occupancy-aware HTML parsing); Phase 1B OCR
-content extraction (word-to-cell assignment, financial number normalization, content
-metrics); and Phase 1C table-only RAG (BM25 + dense BGE cosine + RRF retrieval, one
-chunk per table, single-provider grounded answer generation, GT-filled vs OCR-filled
-corpora scored separately).
-
-Current branch: Phase 2 (DocLayNet layout integration) is the active follow-up:
-page-level region detection -> table crop -> the existing Phase 1A/1B pipeline.
-The layout-crop MVP gate is implemented and scored on fixed DocLayNet subsets; the
-remaining close-out is the full crop->TATR structure smoke rerun after the tightened
-empty-grid validator. See [PLAN.md](PLAN.md) for the phase roadmap.
+**Phases 0 through 3 are complete and merged to `main`.** Delivered: the repo foundation;
+Phase 1A table topology (TATR grid derivation, spanning-cell mapping, grid validation,
+occupancy-aware HTML parsing); Phase 1B OCR content extraction (word-to-cell assignment,
+financial number normalization, content metrics); Phase 1C table-only RAG (BM25 + dense
+BGE cosine + RRF retrieval, one chunk per table, single-provider grounded answer
+generation, GT-filled vs OCR-filled corpora scored separately); Phase 2 DocLayNet
+layout-crop integration (page-level region detection -> table crop -> the Phase 1A/1B
+pipeline); and Phase 3 FUNSD relation-linking baseline (annotation-only deterministic
+predictor, held-out `test_50.qa_links` F1 0.727).
+
+Current phase: Phase 4 (full demo + evaluation + report) is in progress on
+`feature/phase4-demo-eval-report` — a capstone delivering one summary aggregated from the
+per-phase metrics, a key-optional Gradio demo, and a written report. See [PLAN.md](PLAN.md) for
+the phase roadmap.
diff --git a/docs/phase4_brief.md b/docs/phase4_brief.md
new file mode 100644
index 0000000..cc598ad
--- /dev/null
+++ b/docs/phase4_brief.md
@@ -0,0 +1,104 @@
+# Phase 4 — Demo + Eval Summary + Final Report (capstone)
+
+> Implementation brief for Phase 4. Committed in the repo (travels with `git pull` to Colab) so
+> the references to it in `DEVLOG.md` and the `src/phase4_summary.py` /
+> `scripts/build_phase4_summary.py` docstrings resolve. Status: PR-A (the eval-summary backbone)
+> implemented on `feature/phase4-demo-eval-report` — `src/phase4_summary.py`,
+> `scripts/build_phase4_summary.py`, `tests/test_phase4_summary.py`, and the generated
+> `reports/phase4_metrics.md`. PR-B (report) and PR-C (demo) follow.
+
+## Context
+
+Phases 0-3 are merged to `main` (FinTabNet.c table topology + OCR content + table-only RAG +
+DocLayNet layout + FUNSD relations). Phase 4 is the **capstone**: make the work presentable,
+reportable, and reproducible. It is explicitly **not new research** — it assembles the existing
+deterministic/custom metrics into one summary, a Gradio demo, and a written report.
+GriTS/Ragas/DeepEval are future work.
+
+All Drive evaluation artifacts are staged locally under `outputs/` (gitignored): metrics JSONs,
+layout CSVs, the RAG chunk corpus, QA sets, table outputs, crops/regions. FUNSD raw is at
+`data/raw/funsd/`.
+
+## Locked decisions
+
+- **Assemble, don't research.** GriTS / Ragas / DeepEval = future work, never a Phase 4 gate.
+- **Report is the product; notebooks are runners** (P1/P2): aggregation in `.py`; notebooks only
+  pull branch + run a script + display tables/figures.
+- **Demo is artifact-backed**, not live PDF -> layout -> TATR -> OCR -> RAG. The only live piece
+  is a QA box doing retrieval + answer generation over the **existing** chunk corpus.
+- **Notebook numbering 06/07** (contiguous). **Entrypoint `scripts/run_demo.py`** (runners live
+  in `scripts/`; no root `app.py` unless HF Spaces later).
+- **Demo degrades gracefully on two independent axes.** (a) *Retrieval stack:* default to
+  **BM25 retrieval-only** (pure CPU, no model); enable dense + RRF only when the embedding stack
+  is importable (a key-less reviewer may also lack a GPU). (b) *Answer generation:* gated solely
+  by `OPENROUTER_API_KEY` (disabled tab + key-missing message when absent). The demo must fully
+  launch with **neither**.
+- **Report metrics generated from the summary, never hand-copied.** The builder emits
+  `phase4_summary.json` and a paste-ready markdown table; report numbers read from the table.
+- **Commit policy:** `reports/phase4_metrics.md` is committed (generated report snippet);
+  `outputs/evaluation/phase4_summary.json` stays gitignored under `outputs/`. The no-drift gate
+  checks `reports/phase4_metrics.md` is byte-identical after a rebuild (the builder writes LF).
+- **Retrieval reported as hit@1 / hit@5 / hit@10 + MRR@10 only.** With one relevant chunk per
+  question `recall@k == hit@k` (`src/eval_retrieval.py`), so recall@k is dropped from the report.
+
+## Input artifacts (all verified present)
+
+| Source | File | Headline keys |
+|---|---|---|
+| 1A topology | `outputs/evaluation/phase1a_topology_<run-id>.json` | row/col_count_accuracy, cell_occupancy_f1, spanning_cell_detection_rate (n=300) |
+| 1B content | `outputs/evaluation/phase1b_content_<run-id>.json` | `aggregate` / `one_to_one` / `topology_matched_subset` cell metrics |
+| 1C retrieval | `outputs/evaluation/rag/phase1c_retrieval.json` | corpora x {bm25,dense,rrf} x hit@{1,5,10}, mrr@10 |
+| 1C QA | `outputs/evaluation/rag/phase1c_qa.json` | configs x {answer_exact, numeric_relaxed, citation_hit, abstain_rate} — GT vs OCR |
+| 2 layout | `outputs/layout/diagnostic_pos.csv` + `diagnostic_neg.csv` + `smoke_structure.csv` | mean crop IoU, matched@0.50/0.75, table-free FP rate, crop->TATR OK rate |
+| 3 FUNSD | `outputs/evaluation/phase3_funsd_relations.json` | `primary`="test_50.qa_links"; results[split][scope] P/R/F1 |
+
+Default deliverable run-id is `mvp_rand` (Phase 1A/1B). **Phase 2 has no JSON**; the builder
+aggregates it inline from the staged CSVs (no Colab re-run), mirroring the table-level matching +
+FP definitions printed by `scripts/eval_layout_iou.py` (and `scripts/smoke_structure.py` for the
+crop->TATR OK/WARN split). The inline aggregation reproduces the DEVLOG layout numbers exactly
+(mean crop IoU 0.900; matched@0.50 0.900/0.916; matched@0.75 0.880/0.895; crop->TATR 285/286).
+
+## Files
+
+- `src/phase4_summary.py` (new) — pure helpers, no file/Drive/gradio IO: `summarize_topology` /
+  `_content` / `_retrieval` (drops recall@k) / `_qa` / `_funsd` (headline from the JSON's own
+  `primary` pointer); `layout_metrics_from_rows(pos, neg, smoke)` (aggregation over parsed CSV
+  rows); `build_summary(parts)` (missing part -> `{"available": false}`);
+  `render_metrics_markdown(summary)` (deterministic paste-ready table). Style of
+  `src/eval_funsd.py`.
+- `scripts/build_phase4_summary.py` (new) — reads the five JSONs + three layout CSVs, calls the
+  pure helpers, writes `outputs/evaluation/phase4_summary.json` (gitignored) +
+  `reports/phase4_metrics.md` (committed, LF). Graceful on a missing artifact.
+- `scripts/run_demo.py` (new, PR-C) — Gradio app; `gradio` imported inside the script only (never
+  from `src/` or tests); BM25 retrieval default, dense/RRF only if the embedding stack imports;
+  answer generation gated by `OPENROUTER_API_KEY`. Reuses `src/retrieval.py`, `src/llm_client.py`.
+  Tabs: Overview, Table QA, Table Extraction, Layout, FUNSD Relations, Limitations.
+- `notebooks/06_demo.ipynb` (PR-C), `notebooks/07_final_report.ipynb` (PR-B) — Colab runners.
+- `reports/final_report.md` (PR-B) — methodology, metrics (generated-from-summary), GT-vs-OCR
+  separation, limitations, future work, "reproduce in this order".
+- `tests/test_phase4_summary.py` (new) — synthetic fixtures only (P3).
+- Docs: `README.md` status refresh (no stale "Phase 2 active"); `DEVLOG.md` entry; `PLAN.md` §7.
+
+## Out of scope (future work)
+GriTS; Ragas / DeepEval; full-document (non-table) chunking; chart/figure extraction;
+cross-encoder reranker / learned query routing; live PDF -> pipeline; HF Spaces deploy.
+
+## Verification / gates
+1. **Unit:** `pytest tests/test_phase4_summary.py` green, then full `pytest` green — synthetic,
+   no Drive/network, no gradio.
+2. **Summary build:** `python scripts/build_phase4_summary.py` writes the JSON + markdown; numbers
+   match the sources (FUNSD test_50.qa_links F1 0.727; QA gt_markdown answer_exact 0.675; layout
+   matched@0.50 recall 0.900).
+3. **No-drift:** re-running the builder leaves `reports/phase4_metrics.md` byte-identical.
+4. **Demo:** `scripts/run_demo.py` launches in the degraded case (no key, no embedding stack) and
+   the full case.
+5. **Report:** `reports/final_report.md` exists; README has no stale Phase 2 wording.
+
+## Build order (TDD) + PR boundaries
+- **PR-A (core, done):** tests -> `src/phase4_summary.py` -> `scripts/build_phase4_summary.py` ->
+  generated `reports/phase4_metrics.md`; + README/DEVLOG/PLAN docs.
+- **PR-B (report):** `reports/final_report.md` + `notebooks/07_final_report.ipynb`.
+- **PR-C (demo):** `scripts/run_demo.py` + `notebooks/06_demo.ipynb`.
+
+## Branch
+`feature/phase4-demo-eval-report` cut from the latest `origin/main` after `git fetch`.
diff --git a/reports/phase4_metrics.md b/reports/phase4_metrics.md
new file mode 100644
index 0000000..b8896df
--- /dev/null
+++ b/reports/phase4_metrics.md
@@ -0,0 +1,69 @@
+<!-- generated by scripts/build_phase4_summary.py - do not edit by hand -->
+
+# Phase 4 metrics summary
+
+Generated from `outputs/evaluation/phase4_summary.json`; do not edit by hand.
+
+## Table extraction (Phase 1A topology, Phase 1B content)
+| topology metric | value (n=300) |
+|---|---|
+| row count accuracy | 0.790 |
+| col count accuracy | 0.987 |
+| cell occupancy F1 | 0.977 |
+| spanning cell detection | 0.957 |
+
+| content (cell-level) | exact | numeric | non-empty F1 |
+|---|---|---|---|
+| aggregate (n=300) | 0.804 | 0.876 | 0.977 |
+| one-to-one (IoU>=0.5) | 0.761 | 0.826 | 0.906 |
+| topology-matched (n=234) | 0.819 | 0.902 | 0.988 |
+
+mean alignment IoU (one-to-one): 0.877
+
+## Layout (Phase 2 DocLayNet crop)
+| layout metric | value |
+|---|---|
+| mean crop IoU (GT-table pages) | 0.900 |
+| matched@0.50 (recall / precision) | 0.900 / 0.916 |
+| matched@0.75 (recall / precision) | 0.880 / 0.895 |
+| table-free crop FP rate | 0.065 |
+| crop -> TATR OK rate | 0.997 (285/286) |
+
+## Retrieval (Phase 1C, table chunks)
+| corpus (n=30) | method | hit@1 | hit@5 | hit@10 | MRR@10 |
+|---|---|---|---|---|---|
+| gt_linearized | bm25 | 0.933 | 1.000 | 1.000 | 0.958 |
+| gt_linearized | dense | 0.667 | 0.900 | 0.933 | 0.749 |
+| gt_linearized | rrf | 0.833 | 0.933 | 1.000 | 0.892 |
+| gt_markdown | bm25 | 0.900 | 0.933 | 0.967 | 0.917 |
+| gt_markdown | dense | 0.633 | 0.800 | 0.833 | 0.699 |
+| gt_markdown | rrf | 0.800 | 0.900 | 0.967 | 0.839 |
+| ocr_linearized | bm25 | 0.933 | 1.000 | 1.000 | 0.958 |
+| ocr_linearized | dense | 0.767 | 0.933 | 0.933 | 0.816 |
+| ocr_linearized | rrf | 0.867 | 0.967 | 1.000 | 0.910 |
+| ocr_markdown | bm25 | 0.900 | 0.933 | 0.967 | 0.917 |
+| ocr_markdown | dense | 0.667 | 0.733 | 0.867 | 0.712 |
+| ocr_markdown | rrf | 0.733 | 0.933 | 0.933 | 0.807 |
+
+## Table QA (Phase 1C, answer generation)
+| config (n=46) | answer exact | numeric relaxed | citation hit | abstain rate |
+|---|---|---|---|---|
+| gt_linearized | 0.650 | 0.875 | 0.825 | 0.000 |
+| gt_markdown | 0.675 | 0.775 | 0.800 | 0.025 |
+| ocr_linearized | 0.575 | 0.800 | 0.850 | 0.050 |
+| ocr_markdown | 0.550 | 0.700 | 0.750 | 0.050 |
+
+## FUNSD relations (Phase 3)
+headline (test_50.qa_links): P 0.946 / R 0.590 / F1 0.727
+
+| split | scope | precision | recall | f1 |
+|---|---|---|---|---|
+| all_199 | all_links | 0.925 | 0.401 | 0.560 |
+| all_199 | qa_links | 0.925 | 0.535 | 0.678 |
+| debug_20 | all_links | 0.944 | 0.293 | 0.447 |
+| debug_20 | qa_links | 0.944 | 0.363 | 0.524 |
+| test_50 | all_links | 0.946 | 0.464 | 0.623 |
+| test_50 | qa_links | 0.946 | 0.590 | 0.727 |
+| train_149 | all_links | 0.919 | 0.385 | 0.543 |
+| train_149 | qa_links | 0.919 | 0.521 | 0.665 |
+
diff --git a/scripts/build_phase4_summary.py b/scripts/build_phase4_summary.py
new file mode 100644
index 0000000..32542ab
--- /dev/null
+++ b/scripts/build_phase4_summary.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""Build the Phase 4 capstone summary from the per-phase evaluation artifacts.
+
+Reads the five metrics JSONs + the three Phase 2 layout CSVs from outputs/, aggregates them with
+the pure helpers in src/phase4_summary.py, and writes:
+  - outputs/evaluation/phase4_summary.json   (gitignored machine artifact)
+  - reports/phase4_metrics.md                (committed, paste-ready; the report reads these)
+A missing artifact degrades gracefully (its section is marked unavailable). Layout has no JSON, so
+it is aggregated inline from diagnostic_pos.csv / diagnostic_neg.csv / smoke_structure.csv. The
+markdown is written with LF newlines so the no-drift gate holds across Windows and Colab. See
+docs/phase4_brief.md.
+
+Usage:
+    python scripts/build_phase4_summary.py [--run-id mvp_rand]
+"""
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+import argparse
+import csv
+import json
+
+from src import config
+from src import phase4_summary as p4
+
+
+def _load_json(path: Path):  # parsed JSON, or None when the file is missing
+    return json.loads(path.read_text(encoding="utf-8")) if path.exists() else None
+
+
+def _load_csv(path: Path):  # rows as header-keyed dicts, or None when the file is missing
+    if not path.exists():
+        return None
+    with path.open(newline="", encoding="utf-8") as f:  # newline="" as the csv module expects
+        return list(csv.DictReader(f))
+
+
+def _layout_part(layout_dir: Path):
+    """Aggregate the three staged layout CSVs; None unless all are present."""
+    pos = _load_csv(layout_dir / "diagnostic_pos.csv")
+    neg = _load_csv(layout_dir / "diagnostic_neg.csv")
+    smoke = _load_csv(layout_dir / "smoke_structure.csv")
+    if pos is None or neg is None or smoke is None:  # all three or nothing: no partial section
+        return None
+    return p4.layout_metrics_from_rows(pos, neg, smoke)
+
+
+def main() -> None:  # CLI entry: load artifacts, build the summary, write JSON + markdown
+    ap = argparse.ArgumentParser(description="Build the Phase 4 capstone summary.")
+    ap.add_argument("--run-id", default="mvp_rand",
+                    help="run-id suffix of the Phase 1A/1B deliverable artifacts")
+    args = ap.parse_args()
+
+    ev = config.EVALUATION  # directory the per-phase metrics JSONs are read from
+    rag = ev / "rag"
+    topo = _load_json(ev / f"phase1a_topology_{args.run_id}.json")
+    content = _load_json(ev / f"phase1b_content_{args.run_id}.json")
+    retr = _load_json(rag / "phase1c_retrieval.json")
+    qa = _load_json(rag / "phase1c_qa.json")
+    funsd = _load_json(ev / "phase3_funsd_relations.json")
+
+    parts = {  # a falsy load (missing file or empty JSON) leaves that part as None
+        "topology": p4.summarize_topology(topo) if topo else None,
+        "content": p4.summarize_content(content) if content else None,
+        "retrieval": p4.summarize_retrieval(retr) if retr else None,
+        "qa": p4.summarize_qa(qa) if qa else None,
+        "layout": _layout_part(config.LAYOUT_OUTPUT),
+        "funsd": p4.summarize_funsd(funsd) if funsd else None,
+    }
+    summary = p4.build_summary(parts)  # None parts become {"available": False}
+
+    summary_path = ev / "phase4_summary.json"
+    summary_path.parent.mkdir(parents=True, exist_ok=True)
+    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+
+    md_path = config.ROOT / "reports" / "phase4_metrics.md"
+    md_path.parent.mkdir(parents=True, exist_ok=True)
+    with md_path.open("w", encoding="utf-8", newline="") as f:   # newline="": LF verbatim
+        f.write(p4.render_metrics_markdown(summary))
+
+    print("Phase 4 summary - artifact availability:")
+    for name in p4.PHASES:  # one OK/MISSING line per phase
+        print(f"  {name:<10} {'OK' if summary[name].get('available') else 'MISSING'}")
+    if summary["funsd"].get("available"):  # echo the headline only when FUNSD was loaded
+        h = summary["funsd"]["headline"]
+        print(f"\nFUNSD headline ({summary['funsd']['primary']}): "
+              f"P {h['precision']:.3f} / R {h['recall']:.3f} / F1 {h['f1']:.3f}")
+    print(f"\nsummary -> {summary_path}")
+    print(f"metrics  -> {md_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/phase4_summary.py b/src/phase4_summary.py
new file mode 100644
index 0000000..5f04afc
--- /dev/null
+++ b/src/phase4_summary.py
@@ -0,0 +1,252 @@
+"""Phase 4 capstone: aggregate the per-phase evaluation artifacts into one summary.
+
+Pure helpers only - no file IO, no Drive, no gradio. Each summarizer takes an already-loaded
+metrics dict (the per-phase evaluation JSON) or parsed CSV rows (layout) and returns a normalized
+summary dict; scripts/build_phase4_summary.py does the reading/writing. `build_summary` assembles
+the parts (a missing one becomes `{"available": False}`), and `render_metrics_markdown` turns the
+summary into the deterministic, paste-ready table committed at reports/phase4_metrics.md - the
+report prose reads those numbers, they are never hand-copied. See docs/phase4_brief.md.
+"""
+
+from __future__ import annotations
+
+from statistics import mean
+
+# --- per-phase summarizers (input: the loaded metrics JSON dict) ---
+
+_CONTENT_KEYS = ["cell_text_exact_match", "numeric_cell_relaxed_match", "non_empty_cell_content_f1"]
+_RETRIEVAL_KEYS = ["hit@1", "hit@5", "hit@10", "mrr@10"]  # recall@k == hit@k here, so dropped
+_QA_KEYS = ["answer_exact", "numeric_relaxed", "citation_hit", "abstain_rate"]
+_PRF = ("precision", "recall", "f1")  # kept for every FUNSD split/scope entry
+
+PHASES = ["topology", "content", "retrieval", "qa", "layout", "funsd"]  # summary key order
+
+
+def summarize_topology(d: dict) -> dict:  # sample count + the four topology headline metrics
+    return {
+        "n": d["num_samples"],
+        "row_count_accuracy": d["row_count_accuracy"],
+        "col_count_accuracy": d["col_count_accuracy"],
+        "cell_occupancy_f1": d["cell_occupancy_f1"],
+        "spanning_cell_detection_rate": d["spanning_cell_detection_rate"],
+    }
+
+
+def summarize_content(d: dict) -> dict:  # three views of the cell-content metrics
+    agg, o2o = d["aggregate"], d["one_to_one"]
+    sub = d["topology_matched_subset"]["metrics"]  # subset metrics sit one level deeper
+    pick = lambda m: {k: m[k] for k in _CONTENT_KEYS}  # project a dict onto _CONTENT_KEYS
+    return {
+        "n": d["num_samples"],
+        "aggregate": {**pick(agg), "alignment_coverage": agg["alignment_coverage"]},
+        "one_to_one": {**pick(o2o), "mean_alignment_iou": o2o["mean_alignment_iou"]},
+        "topology_matched_subset": {"n": d["topology_matched_subset"]["num_samples"], **pick(sub)},
+    }
+
+
+def summarize_retrieval(d: dict) -> dict:
+    corpora = {  # corpus -> method -> metrics restricted to _RETRIEVAL_KEYS
+        corpus: {method: {k: m[k] for k in _RETRIEVAL_KEYS} for method, m in methods.items()}
+        for corpus, methods in d["corpora"].items()
+    }
+    return {"n": d["num_questions"], "methods": d["methods"], "corpora": corpora}
+
+
+def summarize_qa(d: dict) -> dict:  # per config, keep only the _QA_KEYS metrics
+    configs = {name: {k: m[k] for k in _QA_KEYS} for name, m in d["configs"].items()}
+    return {"n": d["num_questions"], "configs": configs}
+
+
+def summarize_funsd(d: dict) -> dict:
+    split, scope_key = d["primary"].split(".", 1)   # "test_50.qa_links" -> "test_50", "qa_links"
+    head = d["results"][split][scope_key]  # the entry that "primary" points at
+    results = {  # split -> scope -> precision/recall/f1 only
+        sp: {sk: {k: sv[k] for k in _PRF} for sk, sv in scopes.items()}
+        for sp, scopes in d["results"].items()
+    }
+    return {"primary": d["primary"], "headline": {k: head[k] for k in _PRF}, "results": results}
+
+
+# --- layout aggregation (Phase 2 has no JSON; aggregate the staged CSV rows inline) ---
+
+
+def _i(v) -> int:  # cast a CSV cell to int
+    return int(v)
+
+
+def _f(v) -> float:  # cast a CSV cell to float
+    return float(v)
+
+
+def _truthy(v) -> bool:  # True only for the text "true", ignoring case and outer whitespace
+    return str(v).strip().lower() == "true"
+
+
+def layout_metrics_from_rows(pos_rows: list[dict], neg_rows: list[dict],
+                             smoke_rows: list[dict]) -> dict:
+    """Aggregate diagnostic_pos.csv (GT-table pages), diagnostic_neg.csv (table-free pages), and
+    smoke_structure.csv (crop -> TATR) rows. Rows are dicts of strings (csv.DictReader); cast here.
+    Mirrors the table-level matching + FP definitions printed by scripts/eval_layout_iou.py.
+    """
+    gt_total = sum(_i(r["gt_tables"]) for r in pos_rows)  # counts are summed across all rows
+    crop_total = sum(_i(r["num_crop_tables"]) for r in pos_rows)
+    m50 = sum(_i(r["matched_50"]) for r in pos_rows)
+    m75 = sum(_i(r["matched_75"]) for r in pos_rows)
+
+    def matched(m: int) -> dict:  # recall over GT tables, precision over crops; 0.0 if empty
+        return {"recall": m / gt_total if gt_total else 0.0,
+                "precision": m / crop_total if crop_total else 0.0}
+
+    n_neg = len(neg_rows)
+    primary_fp = sum(1 for r in neg_rows if _i(r["primary_tables"]) > 0)  # counts rows
+    crop_fp = sum(1 for r in neg_rows if _i(r["num_crop_tables"]) > 0)  # counts rows
+
+    n_smoke = len(smoke_rows)
+    ok = sum(1 for r in smoke_rows if _truthy(r["valid"]))  # rows whose "valid" cell is true
+
+    return {
+        "n_gt_pages": len(pos_rows),
+        "gt_tables": gt_total,
+        "crops": crop_total,
+        "mean_crop_iou": mean(_f(r["best_iou_crop"]) for r in pos_rows) if pos_rows else 0.0,
+        "matched@0.50": matched(m50),
+        "matched@0.75": matched(m75),
+        "table_free_pages": n_neg,
+        "primary_fp_rate": primary_fp / n_neg if n_neg else 0.0,
+        "crop_fp_rate": crop_fp / n_neg if n_neg else 0.0,
+        "crop_to_tatr": {"n": n_smoke, "ok": ok, "warn": n_smoke - ok,
+                         "ok_rate": ok / n_smoke if n_smoke else 0.0},
+    }
+
+
+# --- assembly + render ---
+
+
+def build_summary(parts: dict) -> dict:
+    """Assemble per-phase summary dicts. parts maps a phase name (see PHASES) to its summary dict
+    or None; present parts get `available: True`, a missing one becomes `{"available": False}`."""
+    out: dict = {}
+    for name in PHASES:  # fixed order; names in parts that are not in PHASES are ignored
+        val = parts.get(name)  # an absent key is treated the same as None
+        out[name] = {"available": True, **val} if val is not None else {"available": False}
+    return out
+
+
+_BANNER = "<!-- generated by scripts/build_phase4_summary.py - do not edit by hand -->"
+
+
+def _fmt(x: float) -> str:  # fixed three-decimal rendering for metric cells
+    return f"{x:.3f}"
+
+
+def _table(header: list[str], rows: list[list[str]]) -> list[str]:  # markdown table as lines
+    out = ["| " + " | ".join(header) + " |", "|" + "|".join(["---"] * len(header)) + "|"]
+    out += ["| " + " | ".join(r) + " |" for r in rows]  # one line per data row
+    return out
+
+
+def render_metrics_markdown(summary: dict) -> str:
+    """Deterministic markdown for reports/phase4_metrics.md. Pure: same summary -> same bytes
+    (the no-drift property). A missing phase renders as '_Not available._'."""
+    L: list[str] = [_BANNER, "", "# Phase 4 metrics summary", "",
+                    "Generated from `outputs/evaluation/phase4_summary.json`; do not edit by hand.",
+                    ""]
+
+    # Table extraction: topology table (1A), a blank line, then the content table (1B)
+    L.append("## Table extraction (Phase 1A topology, Phase 1B content)")
+    t = summary["topology"]
+    if t.get("available"):
+        L += _table(["topology metric", f"value (n={t['n']})"], [
+            ["row count accuracy", _fmt(t["row_count_accuracy"])],
+            ["col count accuracy", _fmt(t["col_count_accuracy"])],
+            ["cell occupancy F1", _fmt(t["cell_occupancy_f1"])],
+            ["spanning cell detection", _fmt(t["spanning_cell_detection_rate"])],
+        ])
+    else:
+        L.append("_Not available._")
+    L.append("")  # blank line between the topology and content tables
+    c = summary["content"]
+    if c.get("available"):
+        L += _table(["content (cell-level)", "exact", "numeric", "non-empty F1"], [
+            [f"aggregate (n={c['n']})", _fmt(c["aggregate"]["cell_text_exact_match"]),
+             _fmt(c["aggregate"]["numeric_cell_relaxed_match"]),
+             _fmt(c["aggregate"]["non_empty_cell_content_f1"])],
+            ["one-to-one (IoU>=0.5)", _fmt(c["one_to_one"]["cell_text_exact_match"]),
+             _fmt(c["one_to_one"]["numeric_cell_relaxed_match"]),
+             _fmt(c["one_to_one"]["non_empty_cell_content_f1"])],
+            [f"topology-matched (n={c['topology_matched_subset']['n']})",
+             _fmt(c["topology_matched_subset"]["cell_text_exact_match"]),
+             _fmt(c["topology_matched_subset"]["numeric_cell_relaxed_match"]),
+             _fmt(c["topology_matched_subset"]["non_empty_cell_content_f1"])],
+        ])
+        L.append("")
+        L.append(f"mean alignment IoU (one-to-one): {_fmt(c['one_to_one']['mean_alignment_iou'])}")
+    else:
+        L.append("_Not available._")
+    L.append("")
+
+    # Layout: one metric/value table built from the layout part
+    L.append("## Layout (Phase 2 DocLayNet crop)")
+    g = summary["layout"]
+    if g.get("available"):
+        cr = g["crop_to_tatr"]
+        L += _table(["layout metric", "value"], [
+            ["mean crop IoU (GT-table pages)", _fmt(g["mean_crop_iou"])],
+            ["matched@0.50 (recall / precision)",
+             f"{_fmt(g['matched@0.50']['recall'])} / {_fmt(g['matched@0.50']['precision'])}"],
+            ["matched@0.75 (recall / precision)",
+             f"{_fmt(g['matched@0.75']['recall'])} / {_fmt(g['matched@0.75']['precision'])}"],
+            ["table-free crop FP rate", _fmt(g["crop_fp_rate"])],
+            ["crop -> TATR OK rate", f"{_fmt(cr['ok_rate'])} ({cr['ok']}/{cr['n']})"],
+        ])
+    else:
+        L.append("_Not available._")
+    L.append("")
+
+    # Retrieval: one row per (corpus, method), both sorted so the row order is stable
+    L.append("## Retrieval (Phase 1C, table chunks)")
+    r = summary["retrieval"]
+    if r.get("available"):
+        rows = []
+        for corpus in sorted(r["corpora"]):
+            for method in sorted(r["corpora"][corpus]):
+                m = r["corpora"][corpus][method]
+                rows.append([corpus, method, _fmt(m["hit@1"]), _fmt(m["hit@5"]),
+                             _fmt(m["hit@10"]), _fmt(m["mrr@10"])])
+        L += _table([f"corpus (n={r['n']})", "method", "hit@1", "hit@5", "hit@10", "MRR@10"], rows)
+    else:
+        L.append("_Not available._")
+    L.append("")
+
+    # QA: one row per config, sorted by config name
+    L.append("## Table QA (Phase 1C, answer generation)")
+    q = summary["qa"]
+    if q.get("available"):
+        rows = [[cfg, _fmt(m["answer_exact"]), _fmt(m["numeric_relaxed"]),
+                 _fmt(m["citation_hit"]), _fmt(m["abstain_rate"])]
+                for cfg, m in sorted(q["configs"].items())]
+        L += _table([f"config (n={q['n']})", "answer exact", "numeric relaxed",
+                     "citation hit", "abstain rate"], rows)
+    else:
+        L.append("_Not available._")
+    L.append("")
+
+    # FUNSD: headline line first, then one row per (split, scope), both sorted
+    L.append("## FUNSD relations (Phase 3)")
+    f = summary["funsd"]
+    if f.get("available"):
+        h = f["headline"]
+        L.append(f"headline ({f['primary']}): "
+                 f"P {_fmt(h['precision'])} / R {_fmt(h['recall'])} / F1 {_fmt(h['f1'])}")
+        L.append("")
+        rows = []
+        for split in sorted(f["results"]):
+            for scope in sorted(f["results"][split]):
+                v = f["results"][split][scope]
+                rows.append([split, scope, _fmt(v["precision"]), _fmt(v["recall"]), _fmt(v["f1"])])
+        L += _table(["split", "scope", "precision", "recall", "f1"], rows)
+    else:
+        L.append("_Not available._")
+    L.append("")
+
+    return "\n".join(L) + "\n"  # LF-joined; L ends with "" so the text ends in a blank line
diff --git a/tests/test_phase4_summary.py b/tests/test_phase4_summary.py
new file mode 100644
index 0000000..f483074
--- /dev/null
+++ b/tests/test_phase4_summary.py
@@ -0,0 +1,200 @@
+"""Phase 4 capstone summary tests (CPU, synthetic).
+
+The summarizers take already-loaded metrics dicts (the per-phase evaluation JSONs) or parsed
+CSV rows (layout) and return normalized summary dicts; no file IO, no Drive, no gradio is
+imported. Fixtures are tiny inline dicts shaped like the real artifacts. Covers each summarizer,
+the layout CSV aggregation math (incl. a multi-GT-table page and the table-free FP rate),
+missing-artifact tolerance, and the deterministic markdown render (the no-drift property).
+"""
+
+from src.phase4_summary import (
+    build_summary,
+    layout_metrics_from_rows,
+    render_metrics_markdown,
+    summarize_content,
+    summarize_funsd,
+    summarize_qa,
+    summarize_retrieval,
+    summarize_topology,
+)
+
+
+# --- fixtures (shaped like the real outputs/evaluation/*.json) ---
+
+TOPO = {  # flat topology-evaluation metrics
+    "evaluation_type": "topology", "num_samples": 300,
+    "row_count_accuracy": 0.79, "col_count_accuracy": 0.987,
+    "cell_occupancy_f1": 0.977, "spanning_cell_detection_rate": 0.957,
+}
+
+CONTENT = {  # three sections; the subset nests its own num_samples + metrics
+    "num_samples": 300,
+    "aggregate": {"cell_text_exact_match": 0.804, "numeric_cell_relaxed_match": 0.876,
+                  "non_empty_cell_content_f1": 0.977, "alignment_coverage": 0.990},
+    "one_to_one": {"cell_text_exact_match": 0.761, "numeric_cell_relaxed_match": 0.825,
+                   "non_empty_cell_content_f1": 0.906, "mean_alignment_iou": 0.877},
+    "topology_matched_subset": {"num_samples": 234, "metrics": {
+        "cell_text_exact_match": 0.819, "numeric_cell_relaxed_match": 0.902,
+        "non_empty_cell_content_f1": 0.988}},
+}
+
+RETR = {  # nested corpus -> method -> "<metric>@<k>"
+    "num_questions": 30, "ks": [1, 5, 10], "methods": ["bm25", "dense", "rrf"],
+    "corpora": {"gt_markdown": {"bm25": {
+        "hit@1": 0.9, "recall@1": 0.9, "mrr@1": 0.9,
+        "hit@5": 0.93, "recall@5": 0.93, "mrr@5": 0.91,
+        "hit@10": 0.97, "recall@10": 0.97, "mrr@10": 0.92}}},
+}
+
+QA = {  # per-config answer-generation metrics, keyed by config name
+    "num_questions": 46, "top_k": 10,
+    "configs": {"gt_markdown": {
+        "num_questions": 46, "num_answerable": 40,
+        "answer_exact": 0.675, "numeric_relaxed": 0.775,
+        "citation_hit": 0.8, "abstain_rate": 0.025, "abstain_accuracy": 1.0}},
+}
+
+FUNSD = {  # nested split -> scope; "primary" names the headline cell as "<split>.<scope>"
+    "primary": "test_50.qa_links",
+    "results": {
+        "test_50": {
+            "qa_links": {"precision": 0.946, "recall": 0.590, "f1": 0.727,
+                         "tp": 494, "n_pred": 522, "n_gold": 837, "scope": "qa"},
+            "all_links": {"precision": 0.946, "recall": 0.464, "f1": 0.623}},
+        "train_149": {
+            "qa_links": {"precision": 0.919, "recall": 0.521, "f1": 0.665},
+            "all_links": {"precision": 0.919, "recall": 0.385, "f1": 0.543}}},
+}
+
+# Layout CSV rows arrive as strings (csv.DictReader), so every value below is a str on purpose.
+POS = [  # GT-table pages: one single-table page, one page with 3 GT tables and 2 crops
+    {"gt_tables": "1", "num_crop_tables": "1", "best_iou_crop": "0.90",
+     "matched_50": "1", "matched_75": "1"},
+    {"gt_tables": "3", "num_crop_tables": "2", "best_iou_crop": "0.60",
+     "matched_50": "2", "matched_75": "1"},
+]
+NEG = [  # table-free pages (gt_tables "0"); only the 2nd row has a primary table and a crop
+    {"gt_tables": "0", "primary_tables": "0", "fallback_used": "False", "num_crop_tables": "0"},
+    {"gt_tables": "0", "primary_tables": "1", "fallback_used": "False", "num_crop_tables": "1"},
+    {"gt_tables": "0", "primary_tables": "0", "fallback_used": "True", "num_crop_tables": "0"},
+    {"gt_tables": "0", "primary_tables": "0", "fallback_used": "False", "num_crop_tables": "0"},
+]
+SMOKE = [  # per-crop validity rows: three "True", one "False" carrying a failure reason
+    {"crop": "a.png", "valid": "True", "failure_reasons": ""},
+    {"crop": "b.png", "valid": "True", "failure_reasons": ""},
+    {"crop": "c.png", "valid": "False", "failure_reasons": "rows_not_monotonic"},
+    {"crop": "d.png", "valid": "True", "failure_reasons": ""},
+]
+
+
+# --- per-phase summarizers ---
+
+
+def test_summarize_topology():
+    """Topology summary exposes the sample count and the accuracy/rate fields."""
+    topo = summarize_topology(TOPO)
+    assert topo["n"] == 300 and topo["row_count_accuracy"] == 0.79
+    assert topo["spanning_cell_detection_rate"] == 0.957
+
+
+def test_summarize_content_three_sections():
+    """All three content sections survive; the subset carries its own n."""
+    content = summarize_content(CONTENT)
+    assert (content["n"], content["topology_matched_subset"]["n"]) == (300, 234)
+    assert content["aggregate"]["cell_text_exact_match"] == 0.804
+    assert content["one_to_one"]["mean_alignment_iou"] == 0.877
+    assert content["topology_matched_subset"]["non_empty_cell_content_f1"] == 0.988
+
+
+def test_summarize_retrieval_keeps_hit_mrr_drops_recall():
+    """Each corpus/method cell keeps only hit@1/5/10 and mrr@10."""
+    retrieval = summarize_retrieval(RETR)
+    bm25 = retrieval["corpora"]["gt_markdown"]["bm25"]
+    assert retrieval["n"] == 30 and bm25["mrr@10"] == 0.92
+    assert set(bm25) == {"hit@1", "hit@5", "hit@10", "mrr@10"}
+    assert not {"recall@1", "mrr@1"} & set(bm25)
+
+
+def test_summarize_qa():
+    """QA summary keeps the question count and four rates per config."""
+    qa = summarize_qa(QA)
+    gt_md = qa["configs"]["gt_markdown"]
+    assert qa["n"] == 46 and gt_md["answer_exact"] == 0.675
+    assert set(gt_md) == {"answer_exact", "numeric_relaxed", "citation_hit", "abstain_rate"}
+
+
+def test_summarize_funsd_headline_from_primary_pointer():
+    """Headline mirrors the cell named by `primary`; cells keep only P/R/F1."""
+    funsd = summarize_funsd(FUNSD)
+    assert funsd["primary"] == "test_50.qa_links"
+    assert funsd["headline"] == {"precision": 0.946, "recall": 0.590, "f1": 0.727}
+    assert set(funsd["results"]["test_50"]["qa_links"]) == {"precision", "recall", "f1"}
+    assert funsd["results"]["train_149"]["qa_links"]["f1"] == 0.665
+
+
+# --- layout aggregation (the inline CSV math) ---
+
+
+def test_layout_metrics_from_rows():
+    layout = layout_metrics_from_rows(POS, NEG, SMOKE)
+    at_50, at_75 = layout["matched@0.50"], layout["matched@0.75"]
+    # POS totals: 4 GT tables, 3 crops, 3 matched@0.50, 2 matched@0.75
+    assert (layout["gt_tables"], layout["crops"]) == (4, 3)
+    assert layout["mean_crop_iou"] == 0.75
+    assert (at_50["recall"], at_50["precision"]) == (0.75, 1.0)    # 3/4, 3/3
+    assert at_75["recall"] == 0.5                                   # 2/4
+    assert round(at_75["precision"], 4) == round(2 / 3, 4)
+    # NEG: one of the 4 table-free pages has a primary detection and a final crop
+    assert layout["table_free_pages"] == 4
+    assert layout["crop_fp_rate"] == 0.25
+    assert layout["primary_fp_rate"] == 0.25
+    # SMOKE: 3 valid rows counted as OK, 1 invalid row counted as WARN
+    assert layout["crop_to_tatr"] == {"n": 4, "ok": 3, "warn": 1, "ok_rate": 0.75}
+
+
+def test_layout_metrics_empty_is_zero_safe():
+    """Empty row lists yield 0.0 for every ratio checked here."""
+    empty = layout_metrics_from_rows([], [], [])
+    ratios = (empty["mean_crop_iou"], empty["matched@0.50"]["recall"],
+              empty["crop_fp_rate"], empty["crop_to_tatr"]["ok_rate"])
+    assert ratios == (0.0, 0.0, 0.0, 0.0)
+
+
+# --- assembly + render ---
+
+
+def _full_parts():
+    """Build all six summary parts from the inline fixtures."""
+    parts = {"topology": summarize_topology(TOPO)}
+    parts["content"] = summarize_content(CONTENT)
+    parts["retrieval"] = summarize_retrieval(RETR)
+    parts["qa"] = summarize_qa(QA)
+    parts["layout"] = layout_metrics_from_rows(POS, NEG, SMOKE)
+    parts["funsd"] = summarize_funsd(FUNSD)
+    return parts
+
+
+def test_build_summary_marks_present_and_missing():
+    """A None part becomes {"available": False}; present parts stay available."""
+    parts = {**_full_parts(), "layout": None}    # simulate a missing artifact
+    summary = build_summary(parts)
+    assert summary["layout"] == {"available": False}
+    assert summary["topology"]["available"] is True
+    assert summary["funsd"]["headline"]["f1"] == 0.727
+
+
+def test_render_markdown_deterministic_and_grounded():
+    summary = build_summary(_full_parts())
+    first, second = render_metrics_markdown(summary), render_metrics_markdown(summary)
+    assert isinstance(first, str) and first.endswith("\n")
+    assert first == second                        # no-drift: same summary, same text
+    # FUNSD headline f1 and the banner text are present; recall@k is absent
+    assert "0.727" in first and "generated by" in first
+    assert "recall@" not in first
+
+
+def test_render_markdown_tolerates_missing_part():
+    """A missing FUNSD part still renders, with a "not available" note in the text."""
+    summary = build_summary({**_full_parts(), "funsd": None})
+    report = render_metrics_markdown(summary)
+    assert "not available" in report.lower()