From 6d15fadc18bbebb2d871feb95cd4542760902af2 Mon Sep 17 00:00:00 2001 From: AD2000X Date: Wed, 3 Jun 2026 15:48:04 +0100 Subject: [PATCH] feat: Phase 4 PR-A eval-summary backbone - src/phase4_summary.py: pure per-phase summarizers + inline layout-CSV aggregation + deterministic markdown render (no file/Drive/gradio IO) - scripts/build_phase4_summary.py: 5 metrics JSONs + 3 layout CSVs -> outputs/evaluation/phase4_summary.json (gitignored) + reports/phase4_metrics.md - tests/test_phase4_summary.py: 10 synthetic tests (full pytest 246 green) - reports/phase4_metrics.md: generated metrics snapshot (no-drift, LF) - docs/phase4_brief.md: committed implementation brief - README/DEVLOG/PLAN: Phase 4 in progress; README stale "Phase 2 active" removed reports/phase4_metrics.md is generated; outputs/evaluation/phase4_summary.json remains gitignored. --- DEVLOG.md | 26 ++++ PLAN.md | 11 +- README.md | 27 ++-- docs/phase4_brief.md | 104 +++++++++++++ reports/phase4_metrics.md | 69 +++++++++ scripts/build_phase4_summary.py | 97 ++++++++++++ src/phase4_summary.py | 252 ++++++++++++++++++++++++++++++++ tests/test_phase4_summary.py | 200 +++++++++++++++++++++++++ 8 files changed, 772 insertions(+), 14 deletions(-) create mode 100644 docs/phase4_brief.md create mode 100644 reports/phase4_metrics.md create mode 100644 scripts/build_phase4_summary.py create mode 100644 src/phase4_summary.py create mode 100644 tests/test_phase4_summary.py diff --git a/DEVLOG.md b/DEVLOG.md index e84c51d..b214114 100644 --- a/DEVLOG.md +++ b/DEVLOG.md @@ -181,6 +181,32 @@ Decisions outgrow this file, split them into `DECISIONS.md` (or `docs/adr/`). 
--- +## 2026-06-03 - Phase 4 eval-summary backbone (PR-A) + +### Result - one summary aggregated from the per-phase artifacts; report numbers never hand-copied + +- **What landed:** `src/phase4_summary.py` (pure summarizers + layout aggregation + markdown + render, no file/Drive/gradio IO), `scripts/build_phase4_summary.py` (reads the five metrics + JSONs + three layout CSVs, writes `outputs/evaluation/phase4_summary.json` gitignored + + `reports/phase4_metrics.md` committed), `tests/test_phase4_summary.py` (10 synthetic tests). + See `docs/phase4_brief.md`. +- **Phase 2 has no metrics JSON**, so the builder aggregates it inline from the staged + `diagnostic_pos.csv` / `diagnostic_neg.csv` / `smoke_structure.csv`, matching the table-level + matching + FP definitions in `scripts/eval_layout_iou.py` and the OK/WARN split in + `scripts/smoke_structure.py`. This reproduced the prior DEVLOG layout numbers **exactly** (mean + crop IoU 0.900; matched@0.50 0.900/0.916; matched@0.75 0.880/0.895; crop->TATR 285/286 = 0.997), + confirming the inline path needs no Colab re-run. +- **No-drift gate:** `render_metrics_markdown` is pure and deterministic and the file is written + with LF; rebuilding leaves `reports/phase4_metrics.md` byte-identical, so the committed report + snippet cannot silently drift from the artifacts. +- **Reporting choices:** retrieval reports hit@{1,5,10} + MRR@10 only (recall@k == hit@k under one + relevant chunk per question, `src/eval_retrieval.py`); a missing artifact degrades to + `{"available": false}` rather than failing. +- **Result:** full `pytest` green (246, +10). Headline echoes: FUNSD `test_50.qa_links` F1 0.727; + QA `gt_markdown` answer_exact 0.675. PR-B (report) and PR-C (Gradio demo) follow. 
+ +--- + ## 2026-06-03 - Phase 3 FUNSD relation-linking baseline (V1) ### Result - annotation-only spatial heuristic; high precision, recall is the design ceiling diff --git a/PLAN.md b/PLAN.md index 16c3bfa..3a539c0 100644 --- a/PLAN.md +++ b/PLAN.md @@ -544,7 +544,16 @@ Implementation details: **Phases 0 through 3 are complete and merged** (v1 = table-only RAG; Phase 2 = DocLayNet layout-crop integration; Phase 3 = FUNSD relation baseline, both merged to `main` -2026-06-03). **Phase 4 (full demo + evaluation + report) is the next phase.** +2026-06-03). **Phase 4 (full demo + evaluation + report) is in progress** on +`feature/phase4-demo-eval-report`; PR-A (the eval-summary backbone) has landed. + +Phase 4 PR-A delivered (capstone summary backbone; see `docs/phase4_brief.md`): +`src/phase4_summary.py` (pure per-phase summarizers + inline layout-CSV aggregation + markdown +render), `scripts/build_phase4_summary.py` (writes `outputs/evaluation/phase4_summary.json` and +the committed `reports/phase4_metrics.md`), `tests/test_phase4_summary.py` (10 synthetic tests). +Report numbers are generated from the summary (never hand-copied), guarded by a no-drift gate. +Next: PR-B (`reports/final_report.md` + `notebooks/07_final_report.ipynb`) and PR-C +(`scripts/run_demo.py` + `notebooks/06_demo.ipynb`, key-optional Gradio demo). Phase 3 V1 delivered (annotation-only deterministic relation baseline; see `docs/phase3_brief.md`): `src/funsd_extraction.py` (parse + dedupe + per-answer-argmax diff --git a/README.md b/README.md index 64de610..a6a84fb 100644 --- a/README.md +++ b/README.md @@ -47,16 +47,17 @@ pytest ## Status -Phases 0 through 1C are complete; the v1 release (table-only RAG) is merged to `main`. 
-Delivered: the repo foundation; Phase 1A table topology (TATR grid derivation, -spanning-cell mapping, grid validation, occupancy-aware HTML parsing); Phase 1B OCR -content extraction (word-to-cell assignment, financial number normalization, content -metrics); and Phase 1C table-only RAG (BM25 + dense BGE cosine + RRF retrieval, one -chunk per table, single-provider grounded answer generation, GT-filled vs OCR-filled -corpora scored separately). - -Current branch: Phase 2 (DocLayNet layout integration) is the active follow-up: -page-level region detection -> table crop -> the existing Phase 1A/1B pipeline. -The layout-crop MVP gate is implemented and scored on fixed DocLayNet subsets; the -remaining close-out is the full crop->TATR structure smoke rerun after the tightened -empty-grid validator. See [PLAN.md](PLAN.md) for the phase roadmap. +**Phases 0 through 3 are complete and merged to `main`.** Delivered: the repo foundation; +Phase 1A table topology (TATR grid derivation, spanning-cell mapping, grid validation, +occupancy-aware HTML parsing); Phase 1B OCR content extraction (word-to-cell assignment, +financial number normalization, content metrics); Phase 1C table-only RAG (BM25 + dense +BGE cosine + RRF retrieval, one chunk per table, single-provider grounded answer +generation, GT-filled vs OCR-filled corpora scored separately); Phase 2 DocLayNet +layout-crop integration (page-level region detection -> table crop -> the Phase 1A/1B +pipeline); and Phase 3 FUNSD relation-linking baseline (annotation-only deterministic +predictor, held-out `test_50.qa_links` F1 0.727). + +Current phase: Phase 4 (full demo + evaluation + report) is in progress on +`feature/phase4-demo-eval-report` — a capstone that aggregates the per-phase metrics into +one summary, a key-optional Gradio demo, and a written report. See [PLAN.md](PLAN.md) for +the phase roadmap. 
diff --git a/docs/phase4_brief.md b/docs/phase4_brief.md new file mode 100644 index 0000000..cc598ad --- /dev/null +++ b/docs/phase4_brief.md @@ -0,0 +1,104 @@ +# Phase 4 — Demo + Eval Summary + Final Report (capstone) + +> Implementation brief for Phase 4. Committed in the repo (travels with `git pull` to Colab) so +> the references to it in `DEVLOG.md` and the `src/phase4_summary.py` / +> `scripts/build_phase4_summary.py` docstrings resolve. Status: PR-A (the eval-summary backbone) +> implemented on `feature/phase4-demo-eval-report` — `src/phase4_summary.py`, +> `scripts/build_phase4_summary.py`, `tests/test_phase4_summary.py`, and the generated +> `reports/phase4_metrics.md`. PR-B (report) and PR-C (demo) follow. + +## Context + +Phases 0-3 are merged to `main` (FinTabNet.c table topology + OCR content + table-only RAG + +DocLayNet layout + FUNSD relations). Phase 4 is the **capstone**: make the work presentable, +reportable, and reproducible. It is explicitly **not new research** — it assembles the existing +deterministic/custom metrics into one summary, a Gradio demo, and a written report. +GriTS/Ragas/DeepEval are future work. + +All Drive evaluation artifacts are staged locally under `outputs/` (gitignored): metrics JSONs, +layout CSVs, the RAG chunk corpus, QA sets, table outputs, crops/regions. FUNSD raw is at +`data/raw/funsd/`. + +## Locked decisions + +- **Assemble, don't research.** GriTS / Ragas / DeepEval = future work, never a Phase 4 gate. +- **Report is the product; notebooks are runners** (P1/P2): aggregation in `.py`; notebooks only + pull branch + run a script + display tables/figures. +- **Demo is artifact-backed**, not live PDF -> layout -> TATR -> OCR -> RAG. The only live piece + is a QA box doing retrieval + answer generation over the **existing** chunk corpus. +- **Notebook numbering 06/07** (contiguous). **Entrypoint `scripts/run_demo.py`** (runners live + in `scripts/`; no root `app.py` unless HF Spaces later). 
+- **Demo degrades gracefully on two independent axes.** (a) *Retrieval stack:* default to + **BM25 retrieval-only** (pure CPU, no model); enable dense + RRF only when the embedding stack + is importable (a key-less reviewer may also lack a GPU). (b) *Answer generation:* gated solely + by `OPENROUTER_API_KEY` (disabled tab + key-missing message when absent). The demo must fully + launch with **neither**. +- **Report metrics generated from the summary, never hand-copied.** The builder emits + `phase4_summary.json` and a paste-ready markdown table; report numbers read from the table. +- **Commit policy:** `reports/phase4_metrics.md` is committed (generated report snippet); + `outputs/evaluation/phase4_summary.json` stays gitignored under `outputs/`. The no-drift gate + checks `reports/phase4_metrics.md` is byte-identical after a rebuild (the builder writes LF). +- **Retrieval reported as hit@1 / hit@5 / hit@10 + MRR@10 only.** With one relevant chunk per + question `recall@k == hit@k` (`src/eval_retrieval.py`), so recall@k is dropped from the report. 
+ +## Input artifacts (all verified present) + +| Source | File | Headline keys | +|---|---|---| +| 1A topology | `outputs/evaluation/phase1a_topology_{run_id}.json` | row/col_count_accuracy, cell_occupancy_f1, spanning_cell_detection_rate (n=300) | +| 1B content | `outputs/evaluation/phase1b_content_{run_id}.json` | `aggregate` / `one_to_one` / `topology_matched_subset` cell metrics | +| 1C retrieval | `outputs/evaluation/rag/phase1c_retrieval.json` | corpora x {bm25,dense,rrf} x hit@{1,5,10}, mrr@10 | +| 1C QA | `outputs/evaluation/rag/phase1c_qa.json` | configs x {answer_exact, numeric_relaxed, citation_hit, abstain_rate} — GT vs OCR | +| 2 layout | `outputs/layout/diagnostic_pos.csv` + `diagnostic_neg.csv` + `smoke_structure.csv` | mean crop IoU, matched@0.50/0.75, table-free FP rate, crop->TATR OK rate | +| 3 FUNSD | `outputs/evaluation/phase3_funsd_relations.json` | `primary`="test_50.qa_links"; results[split][scope] P/R/F1 | + +Default deliverable run-id is `mvp_rand` (Phase 1A/1B). **Phase 2 has no JSON**; the builder +aggregates it inline from the staged CSVs (no Colab re-run), matching the table-level matching + +FP definitions printed by `scripts/eval_layout_iou.py` (and `scripts/smoke_structure.py` for the +crop->TATR OK/WARN split). The inline aggregation reproduces the DEVLOG layout numbers exactly +(mean crop IoU 0.900; matched@0.50 0.900/0.916; matched@0.75 0.880/0.895; crop->TATR 285/286). + +## Files + +- `src/phase4_summary.py` (new) — pure helpers, no file/Drive/gradio IO: `summarize_topology` / + `_content` / `_retrieval` (drops recall@k) / `_qa` / `_funsd` (headline from the JSON's own + `primary` pointer); `layout_metrics_from_rows(pos, neg, smoke)` (aggregation over parsed CSV + rows); `build_summary(parts)` (missing part -> `{"available": false}`); + `render_metrics_markdown(summary)` (deterministic paste-ready table). Style of + `src/eval_funsd.py`. 
+- `scripts/build_phase4_summary.py` (new) — reads the five JSONs + three layout CSVs, calls the + pure helpers, writes `outputs/evaluation/phase4_summary.json` (gitignored) + + `reports/phase4_metrics.md` (committed, LF). Graceful on a missing artifact. +- `scripts/run_demo.py` (new, PR-C) — Gradio app; `gradio` imported inside the script only (never + from `src/` or tests); BM25 retrieval default, dense/RRF only if the embedding stack imports; + answer generation gated by `OPENROUTER_API_KEY`. Reuses `src/retrieval.py`, `src/llm_client.py`. + Tabs: Overview, Table QA, Table Extraction, Layout, FUNSD Relations, Limitations. +- `notebooks/06_demo.ipynb` (PR-C), `notebooks/07_final_report.ipynb` (PR-B) — Colab runners. +- `reports/final_report.md` (PR-B) — methodology, metrics (generated-from-summary), GT-vs-OCR + separation, limitations, future work, "reproduce in this order". +- `tests/test_phase4_summary.py` (new) — synthetic fixtures only (P3). +- Docs: `README.md` status refresh (no stale "Phase 2 active"); `DEVLOG.md` entry; `PLAN.md` §7. + +## Out of scope (future work) +GriTS; Ragas / DeepEval; full-document (non-table) chunking; chart/figure extraction; +cross-encoder reranker / learned query routing; live PDF -> pipeline; HF Spaces deploy. + +## Verification / gates +1. **Unit:** `pytest tests/test_phase4_summary.py` green, then full `pytest` green — synthetic, + no Drive/network, no gradio. +2. **Summary build:** `python scripts/build_phase4_summary.py` writes the JSON + markdown; numbers + match the sources (FUNSD `test_50.qa_links` F1 0.727; QA gt_markdown answer_exact 0.675; layout + matched@0.50 recall 0.900). +3. **No-drift:** re-running the builder leaves `reports/phase4_metrics.md` byte-identical. +4. **Demo:** `scripts/run_demo.py` launches in the degraded case (no key, no embedding stack) and + the full case. +5. **Report:** `reports/final_report.md` exists; README has no stale Phase 2 wording. 
+ +## Build order (TDD) + PR boundaries +- **PR-A (core, done):** tests -> `src/phase4_summary.py` -> `scripts/build_phase4_summary.py` -> + generated `reports/phase4_metrics.md`; + README/DEVLOG/PLAN docs. +- **PR-B (report):** `reports/final_report.md` + `notebooks/07_final_report.ipynb`. +- **PR-C (demo):** `scripts/run_demo.py` + `notebooks/06_demo.ipynb`. + +## Branch +`feature/phase4-demo-eval-report` cut from the latest `origin/main` after `git fetch`. diff --git a/reports/phase4_metrics.md b/reports/phase4_metrics.md new file mode 100644 index 0000000..b8896df --- /dev/null +++ b/reports/phase4_metrics.md @@ -0,0 +1,69 @@ + + +# Phase 4 metrics summary + +Generated from `outputs/evaluation/phase4_summary.json`; do not edit by hand. + +## Table extraction (Phase 1A topology, Phase 1B content) +| topology metric | value (n=300) | +|---|---| +| row count accuracy | 0.790 | +| col count accuracy | 0.987 | +| cell occupancy F1 | 0.977 | +| spanning cell detection | 0.957 | + +| content (cell-level) | exact | numeric | non-empty F1 | +|---|---|---|---| +| aggregate (n=300) | 0.804 | 0.876 | 0.977 | +| one-to-one (IoU>=0.5) | 0.761 | 0.826 | 0.906 | +| topology-matched (n=234) | 0.819 | 0.902 | 0.988 | + +mean alignment IoU (one-to-one): 0.877 + +## Layout (Phase 2 DocLayNet crop) +| layout metric | value | +|---|---| +| mean crop IoU (GT-table pages) | 0.900 | +| matched@0.50 (recall / precision) | 0.900 / 0.916 | +| matched@0.75 (recall / precision) | 0.880 / 0.895 | +| table-free crop FP rate | 0.065 | +| crop -> TATR OK rate | 0.997 (285/286) | + +## Retrieval (Phase 1C, table chunks) +| corpus (n=30) | method | hit@1 | hit@5 | hit@10 | MRR@10 | +|---|---|---|---|---|---| +| gt_linearized | bm25 | 0.933 | 1.000 | 1.000 | 0.958 | +| gt_linearized | dense | 0.667 | 0.900 | 0.933 | 0.749 | +| gt_linearized | rrf | 0.833 | 0.933 | 1.000 | 0.892 | +| gt_markdown | bm25 | 0.900 | 0.933 | 0.967 | 0.917 | +| gt_markdown | dense | 0.633 | 0.800 | 0.833 | 0.699 | +| 
gt_markdown | rrf | 0.800 | 0.900 | 0.967 | 0.839 | +| ocr_linearized | bm25 | 0.933 | 1.000 | 1.000 | 0.958 | +| ocr_linearized | dense | 0.767 | 0.933 | 0.933 | 0.816 | +| ocr_linearized | rrf | 0.867 | 0.967 | 1.000 | 0.910 | +| ocr_markdown | bm25 | 0.900 | 0.933 | 0.967 | 0.917 | +| ocr_markdown | dense | 0.667 | 0.733 | 0.867 | 0.712 | +| ocr_markdown | rrf | 0.733 | 0.933 | 0.933 | 0.807 | + +## Table QA (Phase 1C, answer generation) +| config (n=46) | answer exact | numeric relaxed | citation hit | abstain rate | +|---|---|---|---|---| +| gt_linearized | 0.650 | 0.875 | 0.825 | 0.000 | +| gt_markdown | 0.675 | 0.775 | 0.800 | 0.025 | +| ocr_linearized | 0.575 | 0.800 | 0.850 | 0.050 | +| ocr_markdown | 0.550 | 0.700 | 0.750 | 0.050 | + +## FUNSD relations (Phase 3) +headline (test_50.qa_links): P 0.946 / R 0.590 / F1 0.727 + +| split | scope | precision | recall | f1 | +|---|---|---|---|---| +| all_199 | all_links | 0.925 | 0.401 | 0.560 | +| all_199 | qa_links | 0.925 | 0.535 | 0.678 | +| debug_20 | all_links | 0.944 | 0.293 | 0.447 | +| debug_20 | qa_links | 0.944 | 0.363 | 0.524 | +| test_50 | all_links | 0.946 | 0.464 | 0.623 | +| test_50 | qa_links | 0.946 | 0.590 | 0.727 | +| train_149 | all_links | 0.919 | 0.385 | 0.543 | +| train_149 | qa_links | 0.919 | 0.521 | 0.665 | + diff --git a/scripts/build_phase4_summary.py b/scripts/build_phase4_summary.py new file mode 100644 index 0000000..32542ab --- /dev/null +++ b/scripts/build_phase4_summary.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +"""Build the Phase 4 capstone summary from the per-phase evaluation artifacts. + +Reads the five metrics JSONs + the three Phase 2 layout CSVs from outputs/, aggregates them with +the pure helpers in src/phase4_summary.py, and writes: + - outputs/evaluation/phase4_summary.json (gitignored machine artifact) + - reports/phase4_metrics.md (committed, paste-ready; the report reads these) +A missing artifact degrades gracefully (its section is marked unavailable). 
Layout has no JSON, so +it is aggregated inline from diagnostic_pos.csv / diagnostic_neg.csv / smoke_structure.csv. The +markdown is written with LF newlines so the no-drift gate holds across Windows and Colab. See +docs/phase4_brief.md. + +Usage: + python scripts/build_phase4_summary.py [--run-id mvp_rand] +""" +from __future__ import annotations + +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +import argparse +import csv +import json + +from src import config +from src import phase4_summary as p4 + + +def _load_json(path: Path): + return json.loads(path.read_text(encoding="utf-8")) if path.exists() else None + + +def _load_csv(path: Path): + if not path.exists(): + return None + with path.open(newline="", encoding="utf-8") as f: + return list(csv.DictReader(f)) + + +def _layout_part(layout_dir: Path): + """Aggregate the three staged layout CSVs; None unless all are present.""" + pos = _load_csv(layout_dir / "diagnostic_pos.csv") + neg = _load_csv(layout_dir / "diagnostic_neg.csv") + smoke = _load_csv(layout_dir / "smoke_structure.csv") + if pos is None or neg is None or smoke is None: + return None + return p4.layout_metrics_from_rows(pos, neg, smoke) + + +def main() -> None: + ap = argparse.ArgumentParser(description="Build the Phase 4 capstone summary.") + ap.add_argument("--run-id", default="mvp_rand", + help="run-id suffix of the Phase 1A/1B deliverable artifacts") + args = ap.parse_args() + + ev = config.EVALUATION + rag = ev / "rag" + topo = _load_json(ev / f"phase1a_topology_{args.run_id}.json") + content = _load_json(ev / f"phase1b_content_{args.run_id}.json") + retr = _load_json(rag / "phase1c_retrieval.json") + qa = _load_json(rag / "phase1c_qa.json") + funsd = _load_json(ev / "phase3_funsd_relations.json") + + parts = { + "topology": p4.summarize_topology(topo) if topo else None, + "content": p4.summarize_content(content) if content else None, + "retrieval": p4.summarize_retrieval(retr) if 
retr else None, + "qa": p4.summarize_qa(qa) if qa else None, + "layout": _layout_part(config.LAYOUT_OUTPUT), + "funsd": p4.summarize_funsd(funsd) if funsd else None, + } + summary = p4.build_summary(parts) + + summary_path = ev / "phase4_summary.json" + summary_path.parent.mkdir(parents=True, exist_ok=True) + summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8") + + md_path = config.ROOT / "reports" / "phase4_metrics.md" + md_path.parent.mkdir(parents=True, exist_ok=True) + with md_path.open("w", encoding="utf-8", newline="") as f: # newline="": LF verbatim + f.write(p4.render_metrics_markdown(summary)) + + print("Phase 4 summary - artifact availability:") + for name in p4.PHASES: + print(f" {name:<10} {'OK' if summary[name].get('available') else 'MISSING'}") + if summary["funsd"].get("available"): + h = summary["funsd"]["headline"] + print(f"\nFUNSD headline ({summary['funsd']['primary']}): " + f"P {h['precision']:.3f} / R {h['recall']:.3f} / F1 {h['f1']:.3f}") + print(f"\nsummary -> {summary_path}") + print(f"metrics -> {md_path}") + + +if __name__ == "__main__": + main() diff --git a/src/phase4_summary.py b/src/phase4_summary.py new file mode 100644 index 0000000..5f04afc --- /dev/null +++ b/src/phase4_summary.py @@ -0,0 +1,252 @@ +"""Phase 4 capstone: aggregate the per-phase evaluation artifacts into one summary. + +Pure helpers only - no file IO, no Drive, no gradio. Each summarizer takes an already-loaded +metrics dict (the per-phase evaluation JSON) or parsed CSV rows (layout) and returns a normalized +summary dict; scripts/build_phase4_summary.py does the reading/writing. `build_summary` assembles +the parts (a missing one becomes `{"available": False}`), and `render_metrics_markdown` turns the +summary into the deterministic, paste-ready table committed at reports/phase4_metrics.md - the +report prose reads those numbers, they are never hand-copied. See docs/phase4_brief.md. 
+""" + +from __future__ import annotations + +from statistics import mean + +# --- per-phase summarizers (input: the loaded metrics JSON dict) --- + +_CONTENT_KEYS = ["cell_text_exact_match", "numeric_cell_relaxed_match", "non_empty_cell_content_f1"] +_RETRIEVAL_KEYS = ["hit@1", "hit@5", "hit@10", "mrr@10"] # recall@k == hit@k here, so dropped +_QA_KEYS = ["answer_exact", "numeric_relaxed", "citation_hit", "abstain_rate"] +_PRF = ("precision", "recall", "f1") + +PHASES = ["topology", "content", "retrieval", "qa", "layout", "funsd"] + + +def summarize_topology(d: dict) -> dict: + return { + "n": d["num_samples"], + "row_count_accuracy": d["row_count_accuracy"], + "col_count_accuracy": d["col_count_accuracy"], + "cell_occupancy_f1": d["cell_occupancy_f1"], + "spanning_cell_detection_rate": d["spanning_cell_detection_rate"], + } + + +def summarize_content(d: dict) -> dict: + agg, o2o = d["aggregate"], d["one_to_one"] + sub = d["topology_matched_subset"]["metrics"] + pick = lambda m: {k: m[k] for k in _CONTENT_KEYS} + return { + "n": d["num_samples"], + "aggregate": {**pick(agg), "alignment_coverage": agg["alignment_coverage"]}, + "one_to_one": {**pick(o2o), "mean_alignment_iou": o2o["mean_alignment_iou"]}, + "topology_matched_subset": {"n": d["topology_matched_subset"]["num_samples"], **pick(sub)}, + } + + +def summarize_retrieval(d: dict) -> dict: + corpora = { + corpus: {method: {k: m[k] for k in _RETRIEVAL_KEYS} for method, m in methods.items()} + for corpus, methods in d["corpora"].items() + } + return {"n": d["num_questions"], "methods": d["methods"], "corpora": corpora} + + +def summarize_qa(d: dict) -> dict: + configs = {name: {k: m[k] for k in _QA_KEYS} for name, m in d["configs"].items()} + return {"n": d["num_questions"], "configs": configs} + + +def summarize_funsd(d: dict) -> dict: + split, scope_key = d["primary"].split(".", 1) # "test_50.qa_links" -> "test_50", "qa_links" + head = d["results"][split][scope_key] + results = { + sp: {sk: {k: sv[k] for k in 
_PRF} for sk, sv in scopes.items()} + for sp, scopes in d["results"].items() + } + return {"primary": d["primary"], "headline": {k: head[k] for k in _PRF}, "results": results} + + +# --- layout aggregation (Phase 2 has no JSON; aggregate the staged CSV rows inline) --- + + +def _i(v) -> int: + return int(v) + + +def _f(v) -> float: + return float(v) + + +def _truthy(v) -> bool: + return str(v).strip().lower() == "true" + + +def layout_metrics_from_rows(pos_rows: list[dict], neg_rows: list[dict], + smoke_rows: list[dict]) -> dict: + """Aggregate diagnostic_pos.csv (GT-table pages), diagnostic_neg.csv (table-free pages), and + smoke_structure.csv (crop -> TATR) rows. Rows are dicts of strings (csv.DictReader); cast here. + Mirrors the table-level matching + FP definitions printed by scripts/eval_layout_iou.py. + """ + gt_total = sum(_i(r["gt_tables"]) for r in pos_rows) + crop_total = sum(_i(r["num_crop_tables"]) for r in pos_rows) + m50 = sum(_i(r["matched_50"]) for r in pos_rows) + m75 = sum(_i(r["matched_75"]) for r in pos_rows) + + def matched(m: int) -> dict: + return {"recall": m / gt_total if gt_total else 0.0, + "precision": m / crop_total if crop_total else 0.0} + + n_neg = len(neg_rows) + primary_fp = sum(1 for r in neg_rows if _i(r["primary_tables"]) > 0) + crop_fp = sum(1 for r in neg_rows if _i(r["num_crop_tables"]) > 0) + + n_smoke = len(smoke_rows) + ok = sum(1 for r in smoke_rows if _truthy(r["valid"])) + + return { + "n_gt_pages": len(pos_rows), + "gt_tables": gt_total, + "crops": crop_total, + "mean_crop_iou": mean(_f(r["best_iou_crop"]) for r in pos_rows) if pos_rows else 0.0, + "matched@0.50": matched(m50), + "matched@0.75": matched(m75), + "table_free_pages": n_neg, + "primary_fp_rate": primary_fp / n_neg if n_neg else 0.0, + "crop_fp_rate": crop_fp / n_neg if n_neg else 0.0, + "crop_to_tatr": {"n": n_smoke, "ok": ok, "warn": n_smoke - ok, + "ok_rate": ok / n_smoke if n_smoke else 0.0}, + } + + +# --- assembly + render --- + + +def 
build_summary(parts: dict) -> dict: + """Assemble per-phase summary dicts. parts maps a phase name (see PHASES) to its summary dict + or None; present parts get `available: True`, a missing one becomes `{"available": False}`.""" + out: dict = {} + for name in PHASES: + val = parts.get(name) + out[name] = {"available": True, **val} if val is not None else {"available": False} + return out + + +_BANNER = "" + + +def _fmt(x: float) -> str: + return f"{x:.3f}" + + +def _table(header: list[str], rows: list[list[str]]) -> list[str]: + out = ["| " + " | ".join(header) + " |", "|" + "|".join(["---"] * len(header)) + "|"] + out += ["| " + " | ".join(r) + " |" for r in rows] + return out + + +def render_metrics_markdown(summary: dict) -> str: + """Deterministic markdown for reports/phase4_metrics.md. Pure: same summary -> same bytes + (the no-drift property). A missing phase renders as '_Not available._'.""" + L: list[str] = [_BANNER, "", "# Phase 4 metrics summary", "", + "Generated from `outputs/evaluation/phase4_summary.json`; do not edit by hand.", + ""] + + # Table extraction + L.append("## Table extraction (Phase 1A topology, Phase 1B content)") + t = summary["topology"] + if t.get("available"): + L += _table(["topology metric", f"value (n={t['n']})"], [ + ["row count accuracy", _fmt(t["row_count_accuracy"])], + ["col count accuracy", _fmt(t["col_count_accuracy"])], + ["cell occupancy F1", _fmt(t["cell_occupancy_f1"])], + ["spanning cell detection", _fmt(t["spanning_cell_detection_rate"])], + ]) + else: + L.append("_Not available._") + L.append("") + c = summary["content"] + if c.get("available"): + L += _table(["content (cell-level)", "exact", "numeric", "non-empty F1"], [ + [f"aggregate (n={c['n']})", _fmt(c["aggregate"]["cell_text_exact_match"]), + _fmt(c["aggregate"]["numeric_cell_relaxed_match"]), + _fmt(c["aggregate"]["non_empty_cell_content_f1"])], + ["one-to-one (IoU>=0.5)", _fmt(c["one_to_one"]["cell_text_exact_match"]), + 
_fmt(c["one_to_one"]["numeric_cell_relaxed_match"]), + _fmt(c["one_to_one"]["non_empty_cell_content_f1"])], + [f"topology-matched (n={c['topology_matched_subset']['n']})", + _fmt(c["topology_matched_subset"]["cell_text_exact_match"]), + _fmt(c["topology_matched_subset"]["numeric_cell_relaxed_match"]), + _fmt(c["topology_matched_subset"]["non_empty_cell_content_f1"])], + ]) + L.append("") + L.append(f"mean alignment IoU (one-to-one): {_fmt(c['one_to_one']['mean_alignment_iou'])}") + else: + L.append("_Not available._") + L.append("") + + # Layout + L.append("## Layout (Phase 2 DocLayNet crop)") + g = summary["layout"] + if g.get("available"): + cr = g["crop_to_tatr"] + L += _table(["layout metric", "value"], [ + ["mean crop IoU (GT-table pages)", _fmt(g["mean_crop_iou"])], + ["matched@0.50 (recall / precision)", + f"{_fmt(g['matched@0.50']['recall'])} / {_fmt(g['matched@0.50']['precision'])}"], + ["matched@0.75 (recall / precision)", + f"{_fmt(g['matched@0.75']['recall'])} / {_fmt(g['matched@0.75']['precision'])}"], + ["table-free crop FP rate", _fmt(g["crop_fp_rate"])], + ["crop -> TATR OK rate", f"{_fmt(cr['ok_rate'])} ({cr['ok']}/{cr['n']})"], + ]) + else: + L.append("_Not available._") + L.append("") + + # Retrieval + L.append("## Retrieval (Phase 1C, table chunks)") + r = summary["retrieval"] + if r.get("available"): + rows = [] + for corpus in sorted(r["corpora"]): + for method in sorted(r["corpora"][corpus]): + m = r["corpora"][corpus][method] + rows.append([corpus, method, _fmt(m["hit@1"]), _fmt(m["hit@5"]), + _fmt(m["hit@10"]), _fmt(m["mrr@10"])]) + L += _table([f"corpus (n={r['n']})", "method", "hit@1", "hit@5", "hit@10", "MRR@10"], rows) + else: + L.append("_Not available._") + L.append("") + + # QA + L.append("## Table QA (Phase 1C, answer generation)") + q = summary["qa"] + if q.get("available"): + rows = [[cfg, _fmt(m["answer_exact"]), _fmt(m["numeric_relaxed"]), + _fmt(m["citation_hit"]), _fmt(m["abstain_rate"])] + for cfg, m in 
sorted(q["configs"].items())] + L += _table([f"config (n={q['n']})", "answer exact", "numeric relaxed", + "citation hit", "abstain rate"], rows) + else: + L.append("_Not available._") + L.append("") + + # FUNSD + L.append("## FUNSD relations (Phase 3)") + f = summary["funsd"] + if f.get("available"): + h = f["headline"] + L.append(f"headline ({f['primary']}): " + f"P {_fmt(h['precision'])} / R {_fmt(h['recall'])} / F1 {_fmt(h['f1'])}") + L.append("") + rows = [] + for split in sorted(f["results"]): + for scope in sorted(f["results"][split]): + v = f["results"][split][scope] + rows.append([split, scope, _fmt(v["precision"]), _fmt(v["recall"]), _fmt(v["f1"])]) + L += _table(["split", "scope", "precision", "recall", "f1"], rows) + else: + L.append("_Not available._") + L.append("") + + return "\n".join(L) + "\n" diff --git a/tests/test_phase4_summary.py b/tests/test_phase4_summary.py new file mode 100644 index 0000000..f483074 --- /dev/null +++ b/tests/test_phase4_summary.py @@ -0,0 +1,200 @@ +"""Phase 4 capstone summary tests (CPU, synthetic) - Phase 4. + +The summarizers take already-loaded metrics dicts (the per-phase evaluation JSONs) or parsed +CSV rows (layout) and return normalized summary dicts; no file IO, no Drive, no gradio is +imported. Fixtures are tiny inline dicts shaped like the real artifacts. Covers each summarizer, +the layout CSV aggregation math (incl. a multi-GT-table page and the table-free FP rate), +missing-artifact tolerance, and the deterministic markdown render (the no-drift property). 
+""" + +from src.phase4_summary import ( + build_summary, + layout_metrics_from_rows, + render_metrics_markdown, + summarize_content, + summarize_funsd, + summarize_qa, + summarize_retrieval, + summarize_topology, +) + + +# --- fixtures (shaped like the real outputs/evaluation/*.json) --- + +TOPO = { + "evaluation_type": "topology", "num_samples": 300, + "row_count_accuracy": 0.79, "col_count_accuracy": 0.987, + "cell_occupancy_f1": 0.977, "spanning_cell_detection_rate": 0.957, +} + +CONTENT = { + "num_samples": 300, + "aggregate": {"cell_text_exact_match": 0.804, "numeric_cell_relaxed_match": 0.876, + "non_empty_cell_content_f1": 0.977, "alignment_coverage": 0.990}, + "one_to_one": {"cell_text_exact_match": 0.761, "numeric_cell_relaxed_match": 0.825, + "non_empty_cell_content_f1": 0.906, "mean_alignment_iou": 0.877}, + "topology_matched_subset": {"num_samples": 234, "metrics": { + "cell_text_exact_match": 0.819, "numeric_cell_relaxed_match": 0.902, + "non_empty_cell_content_f1": 0.988}}, +} + +RETR = { + "num_questions": 30, "ks": [1, 5, 10], "methods": ["bm25", "dense", "rrf"], + "corpora": {"gt_markdown": {"bm25": { + "hit@1": 0.9, "recall@1": 0.9, "mrr@1": 0.9, + "hit@5": 0.93, "recall@5": 0.93, "mrr@5": 0.91, + "hit@10": 0.97, "recall@10": 0.97, "mrr@10": 0.92}}}, +} + +QA = { + "num_questions": 46, "top_k": 10, + "configs": {"gt_markdown": { + "num_questions": 46, "num_answerable": 40, + "answer_exact": 0.675, "numeric_relaxed": 0.775, + "citation_hit": 0.8, "abstain_rate": 0.025, "abstain_accuracy": 1.0}}, +} + +FUNSD = { + "primary": "test_50.qa_links", + "results": { + "test_50": { + "qa_links": {"precision": 0.946, "recall": 0.590, "f1": 0.727, + "tp": 494, "n_pred": 522, "n_gold": 837, "scope": "qa"}, + "all_links": {"precision": 0.946, "recall": 0.464, "f1": 0.623}}, + "train_149": { + "qa_links": {"precision": 0.919, "recall": 0.521, "f1": 0.665}, + "all_links": {"precision": 0.919, "recall": 0.385, "f1": 0.543}}}, +} + +# layout CSV rows arrive as 
# strings (csv.DictReader); the aggregator must cast.

# GT-table pages, one tuple per CSV row:
#   (gt_tables, num_crop_tables, best_iou_crop, matched_50, matched_75)
POS = [
    {"gt_tables": gt, "num_crop_tables": crops, "best_iou_crop": iou,
     "matched_50": hit_50, "matched_75": hit_75}
    for gt, crops, iou, hit_50, hit_75 in (
        ("1", "1", "0.90", "1", "1"),
        ("3", "2", "0.60", "2", "1"),
    )
]

# Table-free pages, one tuple per CSV row:
#   (primary_tables, fallback_used, num_crop_tables); gt_tables is always "0"
NEG = [
    {"gt_tables": "0", "primary_tables": primary, "fallback_used": fallback,
     "num_crop_tables": crops}
    for primary, fallback, crops in (
        ("0", "False", "0"),
        ("1", "False", "1"),
        ("0", "True", "0"),
        ("0", "False", "0"),
    )
]

# Structure smoke-test rows, one tuple per crop: (crop, valid, failure_reasons)
SMOKE = [
    {"crop": crop, "valid": valid, "failure_reasons": reasons}
    for crop, valid, reasons in (
        ("a.png", "True", ""),
        ("b.png", "True", ""),
        ("c.png", "False", "rows_not_monotonic"),
        ("d.png", "True", ""),
    )
]


# --- per-phase summarizers ---


def test_summarize_topology():
    """The topology summary reports ``n`` plus the accuracy figures from the fixture."""
    summary = summarize_topology(TOPO)
    expected = {
        "n": 300,
        "row_count_accuracy": 0.79,
        "spanning_cell_detection_rate": 0.957,
    }
    for key, value in expected.items():
        assert summary[key] == value


def test_summarize_content_three_sections():
    """All three content sections are present; the matched subset carries its own ``n``."""
    summary = summarize_content(CONTENT)
    assert summary["n"] == 300
    assert summary["aggregate"]["cell_text_exact_match"] == 0.804
    assert summary["one_to_one"]["mean_alignment_iou"] == 0.877
    subset = summary["topology_matched_subset"]
    assert subset["n"] == 234
    assert subset["non_empty_cell_content_f1"] == 0.988


def test_summarize_retrieval_keeps_hit_mrr_drops_recall():
    """Only hit@{1,5,10} and mrr@10 are kept for each corpus/method cell."""
    summary = summarize_retrieval(RETR)
    assert summary["n"] == 30
    bm25 = summary["corpora"]["gt_markdown"]["bm25"]
    assert set(bm25) == {"hit@1", "hit@5", "hit@10", "mrr@10"}
    assert "recall@1" not in bm25
    assert "mrr@1" not in bm25
    assert bm25["mrr@10"] == 0.92


def test_summarize_qa():
    """Each QA config is reduced to the four reported rates."""
    summary = summarize_qa(QA)
    assert summary["n"] == 46
    gt_cfg = summary["configs"]["gt_markdown"]
    reported = {"answer_exact", "numeric_relaxed", "citation_hit", "abstain_rate"}
    assert set(gt_cfg) == reported
    assert gt_cfg["answer_exact"] == 0.675


def test_summarize_funsd_headline_from_primary_pointer():
    """The headline is resolved through the ``primary`` pointer (``test_50.qa_links``)."""
    summary = summarize_funsd(FUNSD)
    assert summary["primary"] == "test_50.qa_links"
    assert summary["headline"] == {"precision": 0.946, "recall": 0.590, "f1": 0.727}
    results = summary["results"]
    # only p/r/f1 survive per split/scope; the tp/n_pred/n_gold counts are dropped
    assert set(results["test_50"]["qa_links"]) == {"precision", "recall", "f1"}
    assert results["train_149"]["qa_links"]["f1"] == 0.665


# --- layout aggregation (the inline CSV math) ---


def test_layout_metrics_from_rows():
    """Aggregate the string-valued CSV rows into table-level layout metrics."""
    metrics = layout_metrics_from_rows(POS, NEG, SMOKE)

    # totals over POS: 4 GT tables, 3 crops, 3 matched at IoU 0.50, 2 matched at IoU 0.75
    assert metrics["gt_tables"] == 4
    assert metrics["crops"] == 3
    assert metrics["mean_crop_iou"] == 0.75

    at_50 = metrics["matched@0.50"]
    assert at_50["recall"] == 0.75  # 3/4
    assert at_50["precision"] == 1.0  # 3/3

    at_75 = metrics["matched@0.75"]
    assert at_75["recall"] == 0.5  # 2/4
    assert round(at_75["precision"], 4) == round(2 / 3, 4)

    # table-free FP: of 4 pages, one has a final crop and one a primary detection
    assert metrics["table_free_pages"] == 4
    assert metrics["crop_fp_rate"] == 0.25
    assert metrics["primary_fp_rate"] == 0.25

    # crop -> TATR: 3 OK / 1 WARN
    assert metrics["crop_to_tatr"] == {"n": 4, "ok": 3, "warn": 1, "ok_rate": 0.75}


def test_layout_metrics_empty_is_zero_safe():
    """With no rows at all, every ratio comes back as 0.0."""
    metrics = layout_metrics_from_rows([], [], [])
    assert metrics["mean_crop_iou"] == 0.0
    assert metrics["matched@0.50"]["recall"] == 0.0
    assert metrics["crop_fp_rate"] == 0.0
    assert metrics["crop_to_tatr"]["ok_rate"] == 0.0


# --- assembly + render ---


def _full_parts():
    """Return one summarized part per phase, built from the synthetic fixtures."""
    parts = {}
    parts["topology"] = summarize_topology(TOPO)
    parts["content"] = summarize_content(CONTENT)
    parts["retrieval"] = summarize_retrieval(RETR)
    parts["qa"] = summarize_qa(QA)
    parts["layout"] = layout_metrics_from_rows(POS, NEG, SMOKE)
    parts["funsd"] = summarize_funsd(FUNSD)
    return parts


def test_build_summary_marks_present_and_missing():
    """A ``None`` part becomes ``{"available": False}``; present parts are flagged available."""
    # simulate a missing artifact by blanking out the layout part
    parts = {**_full_parts(), "layout": None}
    summary = build_summary(parts)
    assert summary["layout"] == {"available": False}
    assert summary["topology"]["available"] is True
    assert summary["funsd"]["headline"]["f1"] == 0.727


def test_render_markdown_deterministic_and_grounded():
    """Rendering is repeatable and surfaces the fixture numbers without recall@k."""
    summary = build_summary(_full_parts())
    rendered = render_metrics_markdown(summary)
    assert isinstance(rendered, str)
    assert rendered.endswith("\n")
    # no-drift: pure + deterministic, so a second render matches the first
    assert rendered == render_metrics_markdown(summary)
    assert "0.727" in rendered  # FUNSD headline f1 surfaced
    assert "generated by" in rendered  # static banner, no timestamp
    assert "recall@" not in rendered  # recall@k dropped from the report


def test_render_markdown_tolerates_missing_part():
    """A missing FUNSD part renders a "not available" marker."""
    parts = {**_full_parts(), "funsd": None}
    rendered = render_metrics_markdown(build_summary(parts))
    assert "not available" in rendered.lower()