AD2000X · AD2000X · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/notebooks/04_phase2_layout.ipynb b/notebooks/04_phase2_layout.ipynb
@@ -144,15 +144,15 @@
   {
    "cell_type": "code",
    "id": "47c2a078",
-   "source": "!python scripts/run_layout_batch.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3 --table-threshold 0.5",
+   "source": "!python scripts/run_layout_batch.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
    "id": "3bc88c2c",
-   "source": "# Manifest preview + spot-check one crop\nimport pandas as pd\nfrom pathlib import Path\nfrom src import config\nfrom IPython.display import display, Image as IPImage\n\nmanifest_path = config.LAYOUT_OUTPUT / \"manifest.csv\"\ndf = pd.read_csv(manifest_path)\nprint(df[[\"page_id\", \"status\", \"gt_tables\", \"num_regions\", \"num_tables\", \"num_cropped\", \"fallback_used\"]].to_string(index=False))\nprint(f\"\\nprocessed={df['status'].eq('processed').sum()}  failed={df['status'].eq('failed').sum()}\")\nprint(f\"gt_table pages={df['gt_tables'].gt(0).sum()}  total crops={df['num_cropped'].sum()}  fallback pages={df['fallback_used'].sum()}\")\n\n# Display first crop found\ncrops_dir = config.LAYOUT_OUTPUT / \"crops\"\nfirst_crop = next(crops_dir.glob(\"*.png\"), None)\nif first_crop:\n    print(f\"\\nspot-check: {first_crop.name}\")\n    display(IPImage(str(first_crop), width=600))\nelse:\n    print(\"no crops written\")",
+   "source": "# Manifest preview + spot-check one crop\nimport pandas as pd\nfrom pathlib import Path\nfrom src import config\nfrom IPython.display import display, Image as IPImage\n\nmanifest_path = config.LAYOUT_OUTPUT / \"manifest.csv\"\ndf = pd.read_csv(manifest_path)\nprint(df[[\"page_id\", \"status\", \"gt_tables\", \"num_regions\", \"num_tables\", \"num_cropped\", \"fallback_used\"]].to_string(index=False))\nprint(f\"\\nprocessed={df['status'].eq('processed').sum()}  failed={df['status'].eq('failed').sum()}\")\nprint(f\"gt_table pages={df['gt_tables'].gt(0).sum()}  total crops={df['num_cropped'].sum()}  fallback pages={df['fallback_used'].sum()}\")\n\n# Only show crops from this run's pages (avoids stale artifact confusion)\ncrops_dir = config.LAYOUT_OUTPUT / \"crops\"\nrun_pages = set(df[\"page_id\"])\nrun_crops = sorted(\n    f for f in crops_dir.glob(\"*.png\")\n    if f.stem.rsplit(\"_table_\", 1)[0] in run_pages\n)\nfirst_crop = run_crops[0] if run_crops else None\nif first_crop:\n    print(f\"\\nspot-check: {first_crop.name}\")\n    display(IPImage(str(first_crop), width=600))\nelse:\n    print(\"no crops written for this run\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
@@ -166,15 +166,15 @@
   {
    "cell_type": "code",
    "id": "3b8b19a3",
-   "source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3 --table-threshold 0.5",
+   "source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "code",
    "id": "4b348fa6",
-   "source": "# Diagnostic CSV preview (sorted by best_iou_final desc)\nimport pandas as pd\nfrom src import config\n\ndf = pd.read_csv(config.LAYOUT_OUTPUT / \"diagnostic.csv\")\ncols = [\"page_id\", \"gt_tables\", \"primary_tables\", \"primary_max_score\",\n        \"fallback_used\", \"best_iou_primary\", \"best_iou_fallback\", \"best_iou_final\"]\nprint(df[cols].sort_values(\"best_iou_final\", ascending=False).to_string(index=False))",
+   "source": "# Diagnostic CSV preview (sorted by best_iou_crop desc)\nimport pandas as pd\nfrom src import config\n\n# --require-table-gt writes diagnostic_pos.csv; fall back to diagnostic.csv\n_diag = config.LAYOUT_OUTPUT / \"diagnostic_pos.csv\"\nif not _diag.exists():\n    _diag = config.LAYOUT_OUTPUT / \"diagnostic.csv\"\ndf = pd.read_csv(_diag)\ncols = [\"page_id\", \"gt_tables\", \"num_crop_tables\", \"matched_50\", \"matched_75\",\n        \"primary_max_score\", \"fallback_used\",\n        \"best_iou_primary\", \"best_iou_crop\"]\nprint(df[cols].sort_values(\"best_iou_crop\", ascending=False).to_string(index=False))\nhas_gt = df[df[\"gt_tables\"] > 0]\nprint(f\"\\ngt_total={has_gt['gt_tables'].sum()}  crops={has_gt['num_crop_tables'].sum()}\"\n      f\"  matched@0.5={has_gt['matched_50'].sum()}  matched@0.75={has_gt['matched_75'].sum()}\"\n      f\"  mean_crop_iou={has_gt['best_iou_crop'].mean():.3f}\")",
    "metadata": {},
    "execution_count": null,
    "outputs": []
@@ -188,15 +188,15 @@
   {
    "cell_type": "code",
    "id": "95b7eab2",
-   "source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --exclude-table-gt --primary-threshold 0.3 --table-threshold 0.5",
+   "source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --exclude-table-gt --primary-threshold 0.3",
    "metadata": {},
    "execution_count": null,
    "outputs": []
   },
   {
    "cell_type": "markdown",
    "id": "3adb928e",
-   "source": "## Step 5c - Retune: confirm threshold=0.30 improvement\n\nQ3 simulation says lowering `table_threshold` from 0.50 → 0.30 should raise\nmean IoU 0.823 → 0.925 (+10%). This is because 3 of 4 fallback triggers at\nthresh=0.50 were false-negatives: primary scored 0.34–0.45 but had IoU ~0.963.\nAt thresh=0.30 only val_000378 (true miss) goes to TATR fallback.\n\nRun the same positive diagnostic with `--table-threshold 0.30` to measure the\nactual (not simulated) improvement. The column `best_iou_final` at thresh=0.30\nshould be higher than at thresh=0.50 for the 3 reclaimed pages.",
+   "source": "## Step 5c - Retune: confirm threshold=0.30 improvement\n\nQ3 simulation (new fallback rule: fires only when `primary_tables >= 1`) predicts\nlowering `table_threshold` from 0.50 → 0.30 reduces fallback pages from 3 → 1\nand raises `mean_iou_crop_sim` (0.963 reclaimed for the 2 pages where primary\nhad a low-score but high-IoU box).\n\nWatch the two key numbers in the output:\n- `mean best_iou_crop` at thresh=0.30 vs thresh=0.50\n- `Fallback used` count (should drop from 3 to 1)\n\nThe 2 reclaimed pages are val_000238 (primary score 0.44, IoU 0.954) and\nval_001347 (primary max 0.45, IoU 0.949). val_004383 (score 0.335) stays in\nfallback territory at both thresholds.",
    "metadata": {}
   },
   {
@@ -206,6 +206,102 @@
    "metadata": {},
    "execution_count": null,
    "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e75e001",
+   "source": "## Step 5d - Spot-check val_005241: dedup collapse (primary IoU 0.944 → crop IoU 0.610)\n\n`val_005241` has **2 GT tables** that sit close together. Primary found 2 boxes that\noverlap each other above `dedup_iou=0.5`, so dedup kept only the higher-scoring one.\nThat surviving box aligns with GT table 2 (IoU ~0.61), not GT table 1 (IoU ~0.94).\nThe box that would have given 0.94 was dropped as a \"duplicate.\"\n\nThe code cell below re-runs the primary detector directly (no dedup) and shows per-box IoU against\neach GT so the collapse is visible. It also prints the final (post-dedup) table regions from the\nStep 4 regions JSON and displays any crops saved by Step 4.\n\n**Run Step 4 first** so the regions JSON and crops exist at `config.LAYOUT_OUTPUT`.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "baefa041",
+   "source": "import json\nfrom PIL import Image\nfrom datasets import load_dataset\nfrom IPython.display import display, Image as IPImage\nfrom src import config\nfrom src.bbox_utils import iou, xywh_to_xyxy\nfrom src.layout_detector import build_layout_detector\nfrom src.layout_parsing import TABLE_LABEL\n\nPAGE_ID = \"val_005241\"\nORIG_IDX = 5241\n\n# GT boxes\nds_val = load_dataset(\"docling-project/DocLayNet-v1.1\", split=\"val\")\nex = ds_val[ORIG_IDX]\nbboxes = ex.get(\"bboxes\", ex.get(\"bbox\", []))\ncats = ex.get(\"category_id\", [])\ngt_boxes = [xywh_to_xyxy(tuple(b)) for cat, b in zip(cats, bboxes) if cat == 9]\nprint(f\"GT tables ({len(gt_boxes)}):\")\nfor i, b in enumerate(gt_boxes):\n    print(f\"  GT[{i}]: {[round(c) for c in b]}\")\n\n# Primary detector (no dedup)\ntry:\n    layout_det\nexcept NameError:\n    layout_det = build_layout_detector(config.LAYOUT_MODEL, threshold=0.3)\n\nimg = ex[\"image\"].convert(\"RGB\")\nprimary_tables = [r for r in layout_det(img) if r.label == TABLE_LABEL]\nprint(f\"\\nPrimary-alone table regions ({len(primary_tables)}) — before dedup:\")\nfor i, r in enumerate(primary_tables):\n    ious_vs_gt = [round(iou(r.box, g), 4) for g in gt_boxes]\n    ious_vs_primary = [round(iou(r.box, r2.box), 4) for j, r2 in enumerate(primary_tables) if j != i]\n    print(f\"  P[{i}] score={r.score:.4f}  box={[round(c) for c in r.box]}\")\n    print(f\"       IoU/GT={ious_vs_gt}  IoU/otherPrimary={ious_vs_primary}\")\n\n# Final regions from batch JSON\nregions_path = config.LAYOUT_OUTPUT / \"regions\" / f\"{PAGE_ID}.json\"\nif regions_path.exists():\n    regions = json.loads(regions_path.read_text())\n    final_tables = [r for r in regions if r[\"label\"] == \"table\"]\n    print(f\"\\nFinal table regions from batch JSON ({len(final_tables)}) — after dedup:\")\n    for r in final_tables:\n        box = tuple(r[\"box\"])\n        ious_vs_gt = [round(iou(box, g), 4) for g in gt_boxes]\n        print(f\"  score={r['score']:.4f}  
source={r['source']}  box={[round(c) for c in box]}\")\n        print(f\"       IoU/GT={ious_vs_gt}\")\nelse:\n    print(f\"\\n[warn] {regions_path} not found — run Step 4 first\")\n\n# Crops\ncrops_dir = config.LAYOUT_OUTPUT / \"crops\"\ncrop_files = sorted(crops_dir.glob(f\"{PAGE_ID}_table_*.png\"))\nprint(f\"\\nCrops saved by Step 4 ({len(crop_files)}):\")\nfor crop_path in crop_files:\n    print(f\"  {crop_path.name}\")\n    display(IPImage(str(crop_path), width=600))\nif not crop_files:\n    print(\"  none — re-run Step 4 (default thresh=0.3 will now save them)\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7552431",
+   "source": "## Step 5e - dedup sensitivity: test dedup-iou=0.70\n\n`val_005241` primary boxes (IoU vs. the GT table each box aligns with, see Step 5d):\nP[0] score=0.484, IoU=0.610; P[1] score=0.342, IoU=0.944.\nTheir mutual IoU is ~0.600. At `dedup_iou=0.50`, NMS collapses them (keeps P[0] by score);\nat `dedup_iou=0.70`, both survive and both GTs get covered.\n\nExpected effect: `best_iou_crop` for val_005241 rises toward 0.944;\nthe trade-off is potentially one extra crop (two overlapping boxes for the same region).\nWatch `mean best_iou_crop` across all 20 pages — it should improve if val_005241 is\nthe main outlier and other pages are unaffected by the looser dedup.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "389a86df",
+   "source": "!python scripts/eval_layout_iou.py --seed 7 --n 20 --require-table-gt --primary-threshold 0.3 --dedup-iou 0.7",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d7d14b1a",
+   "source": "## Step 6 - MVP evaluation (seed=42, n=200)\n\nFinal calibration: `table_threshold=0.30`, `dedup_iou=0.70` (both now defaults).\nThis is the gate run before recording Phase 2 layout detection as complete.\n\n**Pass criteria (approximate):**\n- `mean best_iou_crop` ≥ 0.88 on positive set\n- FP crop rate ≤ 15% on negative set\n- `failure` count = 0 or negligible\n\nRun 6a (batch), then 6b (positive IoU), then 6c (false-positive).\nRe-run the Step 4 manifest preview cell after 6a to see n=200 stats.",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "f497cccb",
+   "source": "# Step 6a - MVP batch (seed=42, n=200)\n!python scripts/run_layout_batch.py --seed 42 --n 200 --require-table-gt --primary-threshold 0.3",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "56f9c060",
+   "source": "# Step 6b - positive IoU diagnostic (seed=42, n=200)\n!python scripts/eval_layout_iou.py --seed 42 --n 200 --require-table-gt --primary-threshold 0.3",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "5292729f",
+   "source": "# Step 6c - false-positive diagnostic (seed=42, n=200)\n!python scripts/eval_layout_iou.py --seed 42 --n 200 --exclude-table-gt --primary-threshold 0.3",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "976edd57",
+   "source": "## Step 7 - End-to-end: Phase 2 crop → TATR structure recognition\n\nConfirms the Phase 2 crops are compatible with the Phase 1 TATR structure model.\nFor each selected crop: runs inference → `normalize_tatr_prediction` → `validate_grid_geometry`.\nPrints `rows`, `cols`, `cells`, `valid`, and failure reasons per crop. Writes `smoke_structure.csv`.\n\n**Step 7 (baseline):** originally run with no band dedup, to measure raw TATR output quality on DocLayNet crops.\n**Step 7c (dedup):** originally run with `--dedup-bands`, which applied 1-D NMS to overlapping row/col bands before normalize.\n\nBand dedup is now the default in `normalize_tatr_prediction` and the `--dedup-bands` flag no longer exists, so\nre-running the Step 7 command below already includes dedup; Step 7c only keeps the historical before/after counts.\n\n`--n 50 --seed 42` samples 50 crops from the Step 6a batch (286 available).",
+   "metadata": {}
+  },
+  {
+   "cell_type": "code",
+   "id": "79cf79c2",
+   "source": "!python scripts/smoke_structure.py --n 50 --seed 42",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "32258829",
+   "source": "# Step 7b - smoke_structure.csv: inspect failure reasons for WARN crops\nimport pandas as pd\nfrom src import config\n\ndf = pd.read_csv(config.LAYOUT_OUTPUT / \"smoke_structure.csv\")\nprint(f\"OK: {df['valid'].sum()}  WARN: {(~df['valid']).sum()}  total: {len(df)}\")\nprint()\nwarn = df[~df[\"valid\"]].copy()\nif len(warn):\n    print(\"WARN crops:\")\n    print(warn[[\"crop\", \"rows\", \"cols\", \"cells\", \"failure_reasons\"]].to_string(index=False))\n    print()\n    print(\"Failure reason counts:\")\n    all_reasons = [r.strip() for reasons in warn[\"failure_reasons\"].dropna() for r in reasons.split(\";\") if r.strip()]\n    from collections import Counter\n    for reason, count in Counter(all_reasons).most_common():\n        print(f\"  {count:3d}  {reason}\")\nelse:\n    print(\"All crops valid.\")",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "4efa70e7",
+   "source": "# Step 7c - historical A/B reference only; dedup is now the default in normalize_tatr_prediction\n# Before dedup: 37 OK / 13 WARN (Step 7b)\n# After dedup:  50 OK /  0 WARN (historical --dedup-bands run, seed=42, n=50)\n# !python scripts/smoke_structure.py --n 50 --seed 42 --dedup-bands  # flag no longer exists",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
+  },
+  {
+   "cell_type": "code",
+   "id": "4faede6e",
+   "source": "# Step 7d - full crop smoke: all 286 crops, dedup now default\n# Result: 285 OK / 1 WARN (val_000670_table_1: rows=0 cols=4 -> no row boxes detected)\n# WARN rate 0.35%, well under <=5% gate. Phase 2 crop -> structure handoff: PASSED.\n!python scripts/smoke_structure.py --n 286 --seed 42",
+   "metadata": {},
+   "execution_count": null,
+   "outputs": []
   }
  ],
  "metadata": {