Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 102 additions & 6 deletions notebooks/04_phase2_layout.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -144,15 +144,15 @@
{
"cell_type": "code",
"id": "47c2a078",
"source": "!python scripts/run_layout_batch.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3 --table-threshold 0.5",
"source": "!python scripts/run_layout_batch.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "3bc88c2c",
"source": "# Manifest preview + spot-check one crop\nimport pandas as pd\nfrom pathlib import Path\nfrom src import config\nfrom IPython.display import display, Image as IPImage\n\nmanifest_path = config.LAYOUT_OUTPUT / \"manifest.csv\"\ndf = pd.read_csv(manifest_path)\nprint(df[[\"page_id\", \"status\", \"gt_tables\", \"num_regions\", \"num_tables\", \"num_cropped\", \"fallback_used\"]].to_string(index=False))\nprint(f\"\\nprocessed={df['status'].eq('processed').sum()} failed={df['status'].eq('failed').sum()}\")\nprint(f\"gt_table pages={df['gt_tables'].gt(0).sum()} total crops={df['num_cropped'].sum()} fallback pages={df['fallback_used'].sum()}\")\n\n# Display first crop found\ncrops_dir = config.LAYOUT_OUTPUT / \"crops\"\nfirst_crop = next(crops_dir.glob(\"*.png\"), None)\nif first_crop:\n print(f\"\\nspot-check: {first_crop.name}\")\n display(IPImage(str(first_crop), width=600))\nelse:\n print(\"no crops written\")",
"source": "# Manifest preview + spot-check one crop\nimport pandas as pd\nfrom pathlib import Path\nfrom src import config\nfrom IPython.display import display, Image as IPImage\n\nmanifest_path = config.LAYOUT_OUTPUT / \"manifest.csv\"\ndf = pd.read_csv(manifest_path)\nprint(df[[\"page_id\", \"status\", \"gt_tables\", \"num_regions\", \"num_tables\", \"num_cropped\", \"fallback_used\"]].to_string(index=False))\nprint(f\"\\nprocessed={df['status'].eq('processed').sum()} failed={df['status'].eq('failed').sum()}\")\nprint(f\"gt_table pages={df['gt_tables'].gt(0).sum()} total crops={df['num_cropped'].sum()} fallback pages={df['fallback_used'].sum()}\")\n\n# Only show crops from this run's pages (avoids stale artifact confusion)\ncrops_dir = config.LAYOUT_OUTPUT / \"crops\"\nrun_pages = set(df[\"page_id\"])\nrun_crops = sorted(\n f for f in crops_dir.glob(\"*.png\")\n if f.stem.rsplit(\"_table_\", 1)[0] in run_pages\n)\nfirst_crop = run_crops[0] if run_crops else None\nif first_crop:\n print(f\"\\nspot-check: {first_crop.name}\")\n display(IPImage(str(first_crop), width=600))\nelse:\n print(\"no crops written for this run\")",
"metadata": {},
"execution_count": null,
"outputs": []
Expand All @@ -166,15 +166,15 @@
{
"cell_type": "code",
"id": "3b8b19a3",
"source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3 --table-threshold 0.5",
"source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "4b348fa6",
"source": "# Diagnostic CSV preview (sorted by best_iou_final desc)\nimport pandas as pd\nfrom src import config\n\ndf = pd.read_csv(config.LAYOUT_OUTPUT / \"diagnostic.csv\")\ncols = [\"page_id\", \"gt_tables\", \"primary_tables\", \"primary_max_score\",\n \"fallback_used\", \"best_iou_primary\", \"best_iou_fallback\", \"best_iou_final\"]\nprint(df[cols].sort_values(\"best_iou_final\", ascending=False).to_string(index=False))",
"source": "# Diagnostic CSV preview (sorted by best_iou_crop desc)\nimport pandas as pd\nfrom src import config\n\n# --require-table-gt writes diagnostic_pos.csv; fall back to diagnostic.csv\n_diag = config.LAYOUT_OUTPUT / \"diagnostic_pos.csv\"\nif not _diag.exists():\n _diag = config.LAYOUT_OUTPUT / \"diagnostic.csv\"\ndf = pd.read_csv(_diag)\ncols = [\"page_id\", \"gt_tables\", \"num_crop_tables\", \"matched_50\", \"matched_75\",\n \"primary_max_score\", \"fallback_used\",\n \"best_iou_primary\", \"best_iou_crop\"]\nprint(df[cols].sort_values(\"best_iou_crop\", ascending=False).to_string(index=False))\nhas_gt = df[df[\"gt_tables\"] > 0]\nprint(f\"\\ngt_total={has_gt['gt_tables'].sum()} crops={has_gt['num_crop_tables'].sum()}\"\n f\" matched@0.5={has_gt['matched_50'].sum()} matched@0.75={has_gt['matched_75'].sum()}\"\n f\" mean_crop_iou={has_gt['best_iou_crop'].mean():.3f}\")",
"metadata": {},
"execution_count": null,
"outputs": []
Expand All @@ -188,15 +188,15 @@
{
"cell_type": "code",
"id": "95b7eab2",
"source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --exclude-table-gt --primary-threshold 0.3 --table-threshold 0.5",
"source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --exclude-table-gt --primary-threshold 0.3",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"id": "3adb928e",
"source": "## Step 5c - Retune: confirm threshold=0.30 improvement\n\nQ3 simulation says lowering `table_threshold` from 0.50 → 0.30 should raise\nmean IoU 0.823 → 0.925 (+10%). This is because 3 of 4 fallback triggers at\nthresh=0.50 were false-negatives: primary scored 0.34–0.45 but had IoU ~0.963.\nAt thresh=0.30 only val_000378 (true miss) goes to TATR fallback.\n\nRun the same positive diagnostic with `--table-threshold 0.30` to measure the\nactual (not simulated) improvement. The column `best_iou_final` at thresh=0.30\nshould be higher than at thresh=0.50 for the 3 reclaimed pages.",
"source": "## Step 5c - Retune: confirm threshold=0.30 improvement\n\nQ3 simulation (new fallback rule: fires only when `primary_tables >= 1`) predicts\nlowering `table_threshold` from 0.50 → 0.30 reduces fallback pages from 3 → 1\nand raises `mean_iou_crop_sim` (0.963 reclaimed for the 2 pages where primary\nhad a low-score but high-IoU box).\n\nWatch the two key numbers in the output:\n- `mean best_iou_crop` at thresh=0.30 vs thresh=0.50\n- `Fallback used` count (should drop from 3 to 1)\n\nThe 2 reclaimed pages are val_000238 (primary score 0.44, IoU 0.954) and\nval_001347 (primary max 0.45, IoU 0.949). val_004383 (score 0.335) stays in\nfallback territory at both thresholds.",
"metadata": {}
},
{
Expand All @@ -206,6 +206,102 @@
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"id": "8e75e001",
"source": "## Step 5d - Spot-check val_005241: dedup collapse (primary IoU 0.944 → crop IoU 0.610)\n\n`val_005241` has **2 GT tables** that sit close together. Primary found 2 boxes that\noverlap each other above `dedup_iou=0.5`, so dedup kept only the higher-scoring one.\nThat surviving box aligns with GT table 2 (IoU ~0.61), not GT table 1 (IoU ~0.94).\nThe box that would have given 0.94 was dropped as a \"duplicate.\"\n\nThis cell re-runs the primary detector directly (no dedup) and shows per-box IoU against\neach GT so the collapse is visible. Also displays any crops saved by Step 4.\n\n**Run Step 4 first** so the regions JSON and crops exist at `config.LAYOUT_OUTPUT`.",
"metadata": {}
},
{
"cell_type": "code",
"id": "baefa041",
"source": "import json\nfrom PIL import Image\nfrom datasets import load_dataset\nfrom IPython.display import display, Image as IPImage\nfrom src import config\nfrom src.bbox_utils import iou, xywh_to_xyxy\nfrom src.layout_detector import build_layout_detector\nfrom src.layout_parsing import TABLE_LABEL\n\nPAGE_ID = \"val_005241\"\nORIG_IDX = 5241\n\n# GT boxes\nds_val = load_dataset(\"docling-project/DocLayNet-v1.1\", split=\"val\")\nex = ds_val[ORIG_IDX]\nbboxes = ex.get(\"bboxes\", ex.get(\"bbox\", []))\ncats = ex.get(\"category_id\", [])\ngt_boxes = [xywh_to_xyxy(tuple(b)) for cat, b in zip(cats, bboxes) if cat == 9]\nprint(f\"GT tables ({len(gt_boxes)}):\")\nfor i, b in enumerate(gt_boxes):\n print(f\" GT[{i}]: {[round(c) for c in b]}\")\n\n# Primary detector (no dedup)\ntry:\n layout_det\nexcept NameError:\n layout_det = build_layout_detector(config.LAYOUT_MODEL, threshold=0.3)\n\nimg = ex[\"image\"].convert(\"RGB\")\nprimary_tables = [r for r in layout_det(img) if r.label == TABLE_LABEL]\nprint(f\"\\nPrimary-alone table regions ({len(primary_tables)}) — before dedup:\")\nfor i, r in enumerate(primary_tables):\n ious_vs_gt = [round(iou(r.box, g), 4) for g in gt_boxes]\n ious_vs_primary = [round(iou(r.box, r2.box), 4) for j, r2 in enumerate(primary_tables) if j != i]\n print(f\" P[{i}] score={r.score:.4f} box={[round(c) for c in r.box]}\")\n print(f\" IoU/GT={ious_vs_gt} IoU/otherPrimary={ious_vs_primary}\")\n\n# Final regions from batch JSON\nregions_path = config.LAYOUT_OUTPUT / \"regions\" / f\"{PAGE_ID}.json\"\nif regions_path.exists():\n regions = json.loads(regions_path.read_text())\n final_tables = [r for r in regions if r[\"label\"] == \"table\"]\n print(f\"\\nFinal table regions from batch JSON ({len(final_tables)}) — after dedup:\")\n for r in final_tables:\n box = tuple(r[\"box\"])\n ious_vs_gt = [round(iou(box, g), 4) for g in gt_boxes]\n print(f\" score={r['score']:.4f} source={r['source']} box={[round(c) for c in box]}\")\n print(f\" 
IoU/GT={ious_vs_gt}\")\nelse:\n print(f\"\\n[warn] {regions_path} not found — run Step 4 first\")\n\n# Crops\ncrops_dir = config.LAYOUT_OUTPUT / \"crops\"\ncrop_files = sorted(crops_dir.glob(f\"{PAGE_ID}_table_*.png\"))\nprint(f\"\\nCrops saved by Step 4 ({len(crop_files)}):\")\nfor crop_path in crop_files:\n print(f\" {crop_path.name}\")\n display(IPImage(str(crop_path), width=600))\nif not crop_files:\n print(\" none — re-run Step 4 (default thresh=0.3 will now save them)\")",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"id": "a7552431",
"source": "## Step 5e - dedup sensitivity: test dedup-iou=0.70\n\n`val_005241`: P[0] score=0.484 IoU=0.610, P[1] score=0.342 IoU=0.944.\nTheir mutual IoU is ~0.600. At `dedup_iou=0.50`, NMS collapses them (keeps P[0] by score);\nat `dedup_iou=0.70`, both survive and both GTs get covered.\n\nExpected effect: `best_iou_crop` for val_005241 rises toward 0.944;\nthe trade-off is potentially one extra crop (two overlapping boxes for the same region).\nWatch `mean best_iou_crop` across all 20 pages — it should improve if val_005241 is\nthe main outlier and other pages are unaffected by the looser dedup.",
"metadata": {}
},
{
"cell_type": "code",
"id": "389a86df",
"source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3 --dedup-iou 0.7",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"id": "d7d14b1a",
"source": "## Step 6 - MVP evaluation (seed=42, n=200)\n\nFinal calibration: `table_threshold=0.30`, `dedup_iou=0.70` (both now defaults).\nThis is the gate run before recording Phase 2 layout detection as complete.\n\n**Pass criteria (approximate):**\n- `mean best_iou_crop` ≥ 0.88 on positive set\n- FP crop rate ≤ 15% on negative set\n- `failure` count = 0 or negligible\n\nRun 6a (batch), then 6b (positive IoU), then 6c (false-positive).\nRe-run the Step 4 manifest preview cell after 6a to see n=200 stats.",
"metadata": {}
},
{
"cell_type": "code",
"id": "f497cccb",
"source": "# Step 6a - MVP batch (seed=42, n=200)\n!python scripts/run_layout_batch.py --seed 42 --n 200 --require-table-gt --primary-threshold 0.3",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "56f9c060",
"source": "# Step 6b - positive IoU diagnostic (seed=42, n=200)\n!python scripts/eval_layout_iou.py --seed 42 --n 200 --require-table-gt --primary-threshold 0.3",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "5292729f",
"source": "# Step 6c - false-positive diagnostic (seed=42, n=200)\n!python scripts/eval_layout_iou.py --seed 42 --n 200 --exclude-table-gt --primary-threshold 0.3",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"id": "976edd57",
"source": "## Step 7 - End-to-end: Phase 2 crop → TATR structure recognition\n\nConfirms the Phase 2 crops are compatible with the Phase 1 TATR structure model.\nFor each selected crop: runs inference → `normalize_tatr_prediction` → `validate_grid_geometry`.\nPrints `rows`, `cols`, `cells`, `valid`, and failure reasons per crop. Writes `smoke_structure.csv`.\n\n**Band dedup:** 1-D NMS over overlapping row/col bands is now the default in `normalize_tatr_prediction`; the former `--dedup-bands` flag no longer exists, so the Step 7 run below already includes it.\n**Step 7c (historical A/B):** keeps the before/after numbers — the no-dedup baseline (raw TATR output quality on DocLayNet crops) versus the dedup result — for reference only; it runs nothing.\n\n`--n 50 --seed 42` samples 50 crops from the Step 6a batch (286 available).",
"metadata": {}
},
{
"cell_type": "code",
"id": "79cf79c2",
"source": "!python scripts/smoke_structure.py --n 50 --seed 42",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "32258829",
"source": "# Step 7b - smoke_structure.csv: inspect failure reasons for WARN crops\nimport pandas as pd\nfrom src import config\n\ndf = pd.read_csv(config.LAYOUT_OUTPUT / \"smoke_structure.csv\")\nprint(f\"OK: {df['valid'].sum()} WARN: {(~df['valid']).sum()} total: {len(df)}\")\nprint()\nwarn = df[~df[\"valid\"]].copy()\nif len(warn):\n print(\"WARN crops:\")\n print(warn[[\"crop\", \"rows\", \"cols\", \"cells\", \"failure_reasons\"]].to_string(index=False))\n print()\n print(\"Failure reason counts:\")\n all_reasons = [r.strip() for reasons in warn[\"failure_reasons\"].dropna() for r in reasons.split(\";\") if r.strip()]\n from collections import Counter\n for reason, count in Counter(all_reasons).most_common():\n print(f\" {count:3d} {reason}\")\nelse:\n print(\"All crops valid.\")",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "4efa70e7",
"source": "# Step 7c - historical A/B reference only; dedup is now the default in normalize_tatr_prediction\n# Before dedup: 37 OK / 13 WARN (Step 7b)\n# After dedup: 50 OK / 0 WARN (this run, seed=42, n=50)\n# !python scripts/smoke_structure.py --n 50 --seed 42 --dedup-bands # flag no longer exists",
"metadata": {},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"id": "4faede6e",
"source": "# Step 7d - full crop smoke: all 286 crops, dedup now default\n# Result: 285 OK / 1 WARN (val_000670_table_1: rows=0 cols=4 -> no row boxes detected)\n# WARN rate 0.35%, well under <=5% gate. Phase 2 crop -> structure handoff: PASSED.\n!python scripts/smoke_structure.py --n 286 --seed 42",
"metadata": {},
"execution_count": null,
"outputs": []
}
],
"metadata": {
Expand Down
Loading
Loading