diff --git a/notebooks/06_demo.ipynb b/notebooks/06_demo.ipynb new file mode 100644 index 0000000..1cd4d2f --- /dev/null +++ b/notebooks/06_demo.ipynb @@ -0,0 +1,152 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Phase 4 - Demo (Colab runner)\n", + "\n", + "Runner only: mount Drive, pull the Phase 4 branch, install gradio, regenerate the summary, then\n", + "launch the artifact-backed Gradio demo. Logic lives in `scripts/run_demo.py` and `src/`, not in\n", + "this notebook (P1/P2).\n", + "\n", + "The demo degrades gracefully: it launches with **no** `OPENROUTER_API_KEY` (metrics, artifact\n", + "views, and BM25 retrieval all work; the answer-generation tab shows disabled). Dense/RRF retrieval\n", + "light up only if the embedding stack is installed. Set the key in the optional cell below to enable\n", + "grounded answer generation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Boot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Mount Drive so config.OUTPUT_ROOT (the staged artifacts + chunks) is available.\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2. 
Get the code onto the VM and pin the Phase 4 branch.\n", + "# Repin BRANCH to 'main' once the Phase 4 PRs are merged.\n", + "import os\n", + "\n", + "REPO = '/content/FinDocStructRAG'\n", + "BRANCH = 'feature/phase4-demo' # PR-C; flip to 'main' after merge\n", + "\n", + "if not os.path.isdir(f'{REPO}/.git'):\n", + " !git clone --quiet https://github.com/AD2000X/FinDocStructRAG.git {REPO}\n", + "\n", + "!cd {REPO} && git fetch origin --quiet\n", + "!cd {REPO} && git checkout {BRANCH} && git pull --ff-only origin {BRANCH}\n", + "!cd {REPO} && echo branch: $(git rev-parse --abbrev-ref HEAD) HEAD: $(git log --oneline -1)\n", + "%cd /content/FinDocStructRAG" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install demo deps\n", + "\n", + "gradio is a demo-only dependency (not in `requirements-core.txt`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install -q gradio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (Optional) enable answer generation\n", + "\n", + "Leave this cell as-is to run retrieval-only (no key needed). To enable grounded answer generation,\n", + "store `OPENROUTER_API_KEY` in Colab Secrets and uncomment the two lines." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "# from google.colab import userdata\n", + "# os.environ['OPENROUTER_API_KEY'] = userdata.get('OPENROUTER_API_KEY')\n", + "print('answer generation:', 'enabled' if os.getenv('OPENROUTER_API_KEY') else 'disabled (retrieval-only)')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the summary\n", + "\n", + "So the Overview tab and the embedded metrics table are fresh." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python scripts/build_phase4_summary.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Launch the demo\n", + "\n", + "This cell stays running and prints a public `share` URL. Open it to use the tabs: Overview,\n", + "Table QA, Table Extraction, Layout, FUNSD Relations, Limitations. Stop the cell to shut the app\n", + "down." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python scripts/run_demo.py" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/scripts/run_demo.py b/scripts/run_demo.py new file mode 100644 index 0000000..2e58cab --- /dev/null +++ b/scripts/run_demo.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +"""Phase 4 demo: artifact-backed Gradio app for the FinDocStructRAG capstone. + +Serves the already-produced evaluation artifacts (metrics, table outputs, layout crops, FUNSD +results) and does live BM25 retrieval + (optional) grounded answer generation over the existing +table chunks. Nothing runs a live PDF pipeline. The app degrades gracefully on two axes: + +- Retrieval stack: BM25 is the always-on CPU default; dense + RRF light up only when the + embedding stack (sentence-transformers) is importable. +- Answer generation: enabled only when OPENROUTER_API_KEY is set; otherwise the Table QA tab + still shows retrieval and the answer box explains the key is missing. + +gradio (and the dense/torch stack) are imported lazily inside the functions that need them, never +at module top, from src/, or from tests, so pytest and core stay gradio-free. See +docs/phase4_brief.md. 
# Usage:
#     python scripts/run_demo.py
from __future__ import annotations

import importlib.util
import json
import os
import sys
from collections import Counter
from pathlib import Path

# Put the repository root (the parent of this script's directory) on sys.path so that `src`
# resolves when the file is run as a script. This must execute before the `src` imports below.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src import config
from src import table_serialize
from src.retrieval import BM25Index, rrf_fuse
from src.llm_client import build_openrouter_complete, generate_answer

CORPORA = ["gt_linearized", "gt_markdown", "ocr_linearized", "ocr_markdown"]
TOP_K = 5

# Both capability flags are evaluated once, at import time.
HAS_KEY = bool(os.getenv("OPENROUTER_API_KEY"))
# find_spec checks importability without actually importing the embedding stack.
DENSE_AVAILABLE = importlib.util.find_spec("sentence_transformers") is not None
RETRIEVAL_METHODS = ["bm25"] + (["dense", "rrf"] if DENSE_AVAILABLE else [])

# Per-corpus caches, filled lazily by the loaders below.
_CHUNKS: dict[str, list[dict]] = {}
_CHUNK_INDEX: dict[str, dict] = {}
_BM25: dict = {}
_DENSE: dict = {}
_EMBEDDER = None


# --- artifact loading (cached) ---


def _load_json(path: Path):
    """Return the parsed JSON content of *path*, or ``None`` when the path does not exist."""
    return json.loads(path.read_text(encoding="utf-8")) if path.exists() else None


def load_chunks(corpus: str) -> list[dict]:
    """Return the chunk rows of ``config.CHUNKS / "<corpus>.jsonl"`` (cached per corpus).

    A missing file yields an empty list. Blank lines are skipped. A malformed line raises
    ``json.JSONDecodeError`` and nothing is cached for that corpus.
    """
    if corpus not in _CHUNKS:
        path = config.CHUNKS / f"{corpus}.jsonl"
        rows: list[dict] = []
        if path.exists():
            # Iterate the file object rather than str.splitlines(): splitlines() also breaks on
            # U+2028 / U+2029 / U+0085, which may appear unescaped inside a JSON string and would
            # cut a record in half. File iteration only splits on \n, \r and \r\n.
            with path.open(encoding="utf-8") as fh:
                rows = [json.loads(line) for line in fh if line.strip()]
        _CHUNKS[corpus] = rows
    return _CHUNKS[corpus]


def _chunk_by_id(corpus: str) -> dict:
    """Return a ``chunk_id -> chunk`` mapping for *corpus* (built once, then cached).

    Callers must treat the returned dict as read-only because it is shared between calls.
    Raises ``KeyError`` if a chunk row has no ``"chunk_id"``.
    """
    if corpus not in _CHUNK_INDEX:
        _CHUNK_INDEX[corpus] = {c["chunk_id"]: c for c in load_chunks(corpus)}
    return _CHUNK_INDEX[corpus]


def get_bm25(corpus: str):
    """Return the cached ``BM25Index`` for *corpus*, or ``None`` when the corpus has no chunks."""
    if corpus not in _BM25:
        chunks = load_chunks(corpus)
        _BM25[corpus] = BM25Index(chunks) if chunks else None
    return _BM25[corpus]


def _get_embedder():
    """Build the embedder on first use and reuse it afterwards."""
    global _EMBEDDER
    if _EMBEDDER is None:
        # Imported lazily so the dense stack is only loaded when dense retrieval is requested.
        from src.dense_retrieval import build_bge_embedder
        _EMBEDDER = build_bge_embedder()
    return _EMBEDDER


def get_dense(corpus: str):
    """Return the cached ``DenseIndex`` for *corpus*, or ``None`` when the corpus has no chunks."""
    if corpus not in _DENSE:
        # Lazy import, same reason as in _get_embedder.
        from src.dense_retrieval import DenseIndex
        chunks = load_chunks(corpus)
        _DENSE[corpus] = DenseIndex(chunks, _get_embedder()) if chunks else None
    return _DENSE[corpus]


# --- Table QA ---
def retrieve_ids(corpus: str, query: str, method: str, top_k: int = TOP_K):
    """Return ``(chunk_ids, note)`` for *query* against *corpus*.

    ``method`` is ``"bm25"``, ``"dense"``, or anything else, which is treated as RRF fusion of
    BM25 and dense results. ``note`` is empty on the normal path. It carries a short message
    when the corpus has no chunks, or when dense/RRF failed and BM25 results are returned instead.
    """
    bm = get_bm25(corpus)
    if bm is None:
        return [], f"No chunks for corpus '{corpus}'."
    if method == "bm25":
        return bm.search(query, top_k), ""
    try:
        dn = get_dense(corpus)
        if method == "dense":
            return dn.search(query, top_k), ""
        # Each ranker contributes twice as many candidates as requested before fusion.
        fused = rrf_fuse([bm.search(query, top_k * 2), dn.search(query, top_k * 2)], top_k=top_k)
        return fused, ""
    except Exception as e:
        # Deliberate best-effort: any failure in the optional dense stack degrades to BM25
        # instead of breaking the tab; the exception type is surfaced in the note.
        return bm.search(query, top_k), f"(dense/RRF unavailable: {type(e).__name__}; showing BM25)"


def _render_chunks(corpus: str, chunk_ids: list[str]) -> str:
    """Render the given chunk ids as a ranked markdown list with each chunk's text fenced."""
    by_id = _chunk_by_id(corpus)
    out = []
    for rank, cid in enumerate(chunk_ids, 1):
        # Unknown ids still get an entry, with placeholder metadata and empty text.
        c = by_id.get(cid, {})
        out.append(f"**{rank}. `{cid}`** (source={c.get('text_source', '?')}, "
                   f"{c.get('serialization', '?')})\n\n```\n{c.get('text', '')}\n```")
    return "\n\n".join(out) if out else "_No results._"


def qa_retrieve(corpus: str, query: str, method: str) -> str:
    """Callback for the "Retrieve" button: markdown header plus the rendered top chunks."""
    if not (query or "").strip():
        return "_Enter a question._"
    ids, note = retrieve_ids(corpus, query, method)
    return f"**Corpus:** `{corpus}` **Method:** {method} {note}\n\n" + _render_chunks(corpus, ids)


def qa_answer(corpus: str, query: str, method: str) -> str:
    """Callback for the "Generate answer" button.

    Returns an explanatory message when no API key was present at import time or when the
    question is blank. Otherwise it retrieves evidence, calls ``generate_answer`` and formats the
    answer, its status and its citations. Errors from generation are reported in the returned
    markdown rather than raised.
    """
    if not HAS_KEY:
        return ("_Answer generation disabled: set `OPENROUTER_API_KEY` to enable. "
                "Retrieval above still works._")
    if not (query or "").strip():
        return "_Enter a question._"
    ids, _ = retrieve_ids(corpus, query, method)
    by_id = _chunk_by_id(corpus)
    # Only ids that resolve to a loaded chunk are passed on as evidence.
    evidence = [by_id[c] for c in ids if c in by_id]
    try:
        ans = generate_answer(query, evidence, complete=build_openrouter_complete())
    except Exception as e:
        return f"_Answer generation error: {type(e).__name__}: {e}_"
    cites = ", ".join(f"`{c}`" for c in ans.citations) or "(none)"
    status = "abstained" if ans.abstained else "answered"
    return f"**Answer ({status}):** {ans.answer or '(empty)'}\n\n**Citations:** {cites}"


# --- Table Extraction ---


# (directory, heading) pairs, rendered in this order by table_view.
_TABLE_SOURCES = [
    (config.TABLES_GT_FILLED, "GT-filled (reference, P4)"),
    (config.TABLES_OCR_FILLED, "OCR-filled (real extraction)"),
    (config.TABLES_TATR_PREDICTED, "TATR-predicted (structure)"),
]


def list_samples() -> list[str]:
    """Return the sorted stems of the ``*.json`` files in the GT-filled table directory."""
    d = config.TABLES_GT_FILLED
    return sorted(p.stem for p in d.glob("*.json")) if d.exists() else []


def table_view(sample_id: str) -> str:
    """Render one sample's table from every source in ``_TABLE_SOURCES`` as markdown sections."""
    if not sample_id:
        return "_Pick a sample._"
    parts = []
    for d, label in _TABLE_SOURCES:
        tbl = _load_json(d / f"{sample_id}.json")
        if tbl is None:
            parts.append(f"### {label}\n_Not available._")
        else:
            parts.append(f"### {label} ({tbl.get('num_rows', '?')}x{tbl.get('num_cols', '?')})\n\n"
                         + (table_serialize.serialize_markdown(tbl) or "_empty_"))
    return "\n\n".join(parts)


# --- Layout ---


def _summary_section(summary, name: str) -> dict:
    """Return the dict stored under *name* in *summary*, or ``{}`` if it is absent or not a dict."""
    section = summary.get(name) if isinstance(summary, dict) else None
    return section if isinstance(section, dict) else {}


def list_layout_pages() -> list[str]:
    """Return the sorted stems of the ``*.json`` files under ``LAYOUT_OUTPUT / "regions"``."""
    d = config.LAYOUT_OUTPUT / "regions"
    return sorted(p.stem for p in d.glob("*.json")) if d.exists() else []


def layout_summary_md() -> str:
    """Return a one-line markdown summary of the aggregate layout metrics.

    ``main`` calls this while the UI is being built, so it never raises on a missing or
    incomplete summary; it returns an explanatory message instead.
    """
    g = _summary_section(_load_json(config.EVALUATION / "phase4_summary.json"), "layout")
    if not g.get("available"):
        return "_Phase 2 layout summary not available (run build_phase4_summary)._"
    try:
        cr = g["crop_to_tatr"]
        return (f"**Phase 2 layout (aggregate):** mean crop IoU {g['mean_crop_iou']:.3f}; "
                f"matched@0.50 recall {g['matched@0.50']['recall']:.3f}; "
                f"table-free FP rate {g['crop_fp_rate']:.3f}; "
                f"crop->TATR OK {cr['ok']}/{cr['n']}.")
    except (KeyError, TypeError, ValueError):
        # A key is missing or a metric is not numeric: degrade instead of aborting the launch.
        return "_Phase 2 layout summary is incomplete (re-run build_phase4_summary)._"


def layout_view(page_id: str):
    """Return ``(crop_paths, markdown)`` for one page: its table crops and a region summary."""
    if not page_id:
        return [], "_Pick a page._"
    crops = sorted(str(p) for p in (config.LAYOUT_OUTPUT / "crops").glob(f"{page_id}_table_*.png"))
    regions = _load_json(config.LAYOUT_OUTPUT / "regions" / f"{page_id}.json") or []
    # Labels are stringified so a region without a label cannot break the sorted listing below.
    counts = Counter(str(r.get("label", "?")) for r in regions)
    tables = [r for r in regions if r.get("label") == "table"]
    lines = [f"**Page `{page_id}`** - {len(regions)} regions, {len(crops)} table crop(s).",
             "region counts: " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items()))]
    for i, t in enumerate(tables):
        box = [round(x, 1) for x in t.get("box", [])]
        lines.append(f"- table {i}: score {t.get('score', '?')}, box {box}")
    return crops, "\n".join(lines)


# --- FUNSD + Overview + Limitations ---


def funsd_view() -> str:
    """Render the FUNSD relation results as a markdown table, or a message if unavailable."""
    d = _load_json(config.EVALUATION / "phase3_funsd_relations.json")
    if d is None:
        return "_FUNSD results not available._"
    try:
        lines = [f"**Held-out headline:** `{d['primary']}`.", "",
                 "| split | scope | precision | recall | f1 |", "|---|---|---|---|---|"]
        for split, scopes in d["results"].items():
            for scope, m in scopes.items():
                lines.append(
                    f"| {split} | {scope} | {m['precision']:.3f} | {m['recall']:.3f} | {m['f1']:.3f} |")
    except (AttributeError, KeyError, TypeError, ValueError):
        # The file exists but does not have the expected primary/results structure.
        return "_FUNSD results file has an unexpected format._"
    lines += ["", "_Annotation-only deterministic baseline; recall is the design ceiling. "
              "Qualitative error overlays live in `notebooks/05_phase3_funsd_relations.ipynb`._"]
    return "\n".join(lines)


def overview_view() -> str:
    """Render the Overview tab: artifact availability, FUNSD headline, and the metrics report."""
    summary = _load_json(config.EVALUATION / "phase4_summary.json")
    parts = ["## Capstone overview", ""]
    if summary and isinstance(summary, dict):
        # Only dict-valued sections carry an "available" flag; other top-level entries are
        # skipped instead of raising AttributeError on `.get`.
        parts.append("**Artifact availability:** " + ", ".join(
            f"{name}={'OK' if part.get('available') else 'MISSING'}"
            for name, part in summary.items() if isinstance(part, dict)))
        funsd = _summary_section(summary, "funsd")
        if funsd.get("available"):
            try:
                h = funsd["headline"]
                parts.append(f"\n**FUNSD headline ({funsd['primary']}):** "
                             f"P {h['precision']:.3f} / R {h['recall']:.3f} / F1 {h['f1']:.3f}")
            except (KeyError, TypeError, ValueError):
                parts.append("\n_FUNSD headline unavailable (summary is incomplete)._")
    else:
        parts.append("_Run `python scripts/build_phase4_summary.py` to generate the summary._")
    metrics_md = config.ROOT / "reports" / "phase4_metrics.md"
    if metrics_md.exists():
        parts += ["", "---", "", metrics_md.read_text(encoding="utf-8")]
    return "\n".join(parts)


LIMITATIONS_MD = """## Limitations (honest scope)

- **Subset evaluation**, not whole-dataset benchmarks.
- **GT-filled vs OCR-filled are kept separate (P4):** GT-filled is a QA-validation reference,
  never reported as an extraction output; OCR-filled is the real extraction.
- **FUNSD V1 is annotation-only** over GT entities (no entity detection, single link per answer,
  geometry-only) - recall is the design ceiling.
- **RAG is table-only**; full-document text, charts, and figures are out of scope
  (caption-level / future work).
- **This demo is artifact-backed** - it serves produced outputs and does live retrieval / answer
  generation over the existing chunks; it does not run a live PDF -> pipeline.
- **GriTS / Ragas / DeepEval** are future work, not used as gates here.
"""


def main() -> None:
    """Build the Gradio Blocks UI (one tab per artifact family) and launch it."""
    # Imported here, not at module top, so importing this module does not require gradio.
    import gradio as gr

    samples = list_samples()
    pages = list_layout_pages()
    answer_gen = "enabled" if HAS_KEY else "disabled (no OPENROUTER_API_KEY)"

    with gr.Blocks(title="FinDocStructRAG capstone demo") as demo:
        gr.Markdown(f"# FinDocStructRAG - capstone demo\n"
                    f"Artifact-backed. Retrieval: {', '.join(RETRIEVAL_METHODS)}. "
                    f"Answer generation: {answer_gen}.")

        with gr.Tab("Overview"):
            ov = gr.Markdown()
            # Rendered on page load so the tab reflects the summary file as it is at that time.
            demo.load(overview_view, None, ov)

        with gr.Tab("Table QA"):
            with gr.Row():
                corpus = gr.Dropdown(CORPORA, value="gt_linearized", label="corpus (GT vs OCR)")
                method = gr.Dropdown(RETRIEVAL_METHODS, value="bm25", label="retrieval method")
            question = gr.Textbox(label="question",
                                  placeholder="e.g. What was the discount rate in 2014?")
            with gr.Row():
                btn_r = gr.Button("Retrieve")
                # The button stays clickable without a key: qa_answer then explains what to set.
                btn_a = gr.Button("Generate answer" + ("" if HAS_KEY else " (disabled)"))
            results = gr.Markdown()
            answer = gr.Markdown()
            btn_r.click(qa_retrieve, [corpus, question, method], results)
            btn_a.click(qa_answer, [corpus, question, method], answer)

        with gr.Tab("Table Extraction"):
            if samples:
                sample = gr.Dropdown(samples, value=samples[0], label="sample table")
                tbl_out = gr.Markdown()
                sample.change(table_view, sample, tbl_out)
                demo.load(table_view, sample, tbl_out)
            else:
                gr.Markdown("_No table outputs found under outputs/tables/._")

        with gr.Tab("Layout"):
            # Evaluated once, while the UI is built.
            gr.Markdown(layout_summary_md())
            if pages:
                page = gr.Dropdown(pages, value=pages[0], label="DocLayNet page")
                gallery = gr.Gallery(label="table crops")
                region_md = gr.Markdown()
                page.change(layout_view, page, [gallery, region_md])
                demo.load(layout_view, page, [gallery, region_md])
            else:
                gr.Markdown("_No layout crops found under outputs/layout/._")

        with gr.Tab("FUNSD Relations"):
            funsd_md = gr.Markdown()
            demo.load(funsd_view, None, funsd_md)

        with gr.Tab("Limitations"):
            gr.Markdown(LIMITATIONS_MD)

    demo.launch(share=config.IN_COLAB)


if __name__ == "__main__":
    main()