[verified] add docling auto-map package wrapper

2026-06-07 20:03:06 +01:00 · 2026-06-07 20:03:06 +01:00 · 5938613893
commit 5938613893
parent 9cc986a3f1
17 changed files with 2861 additions and 0 deletions
--- a/5
+++ b/5
@ -6,6 +6,11 @@ FROM python:3.11-slim
 # Set working directory
 WORKDIR /app

+# Runtime dependency for api.services.docling fast-path geometry (pdftotext -bbox).
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements and install dependencies
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
--- a/api/init.py
+++ b/api/init.py
--- a/api/services/init.py
+++ b/api/services/init.py
--- a/api/services/docling/README.md
+++ b/api/services/docling/README.md
@ -0,0 +1,18 @@
+# API Docling first-pass auto-map package
+
+This package is the in-API home for the S5 `exam-template/first-pass/v1` extraction pipeline copied from `/home/kcar/dev/docling-exam-spike`.
+
+`auto_map(pdf_bytes)` returns the editable first-pass `template.json` shape consumed by downstream exam-marker mapping. The pipeline keeps margins as constraining inputs: document left/right and per-page top/bottom margins are derived before template assembly, then part/question bands and furniture/figure boxes are constrained through those margins.
+
+## dsync Redis env wiring
+
+The OCR path uses `dsync.py` for docling-serve GPU locking, page caching, and retries. Configure it through the following environment variables (only the names are listed here):
+
+- `DOCLING_SERVE`
+- `DOCLING_REDIS_URL`
+- `DOCLING_REDIS_HOST`
+- `DOCLING_REDIS_PORT`
+- `DOCLING_REDIS_PASSWORD`
+- `DOCLING_REDIS_DB`
+
+If Redis is unavailable, `dsync` falls back to no cache/lock and logs that state. Do not put secret values in this file.
--- a/api/services/docling/init.py
+++ b/api/services/docling/init.py
@ -0,0 +1,279 @@
+"""Docling first-pass auto-map wrapper for the API.
+
+Public contract:
+    auto_map(pdf_bytes) -> template.json dict matching exam-template/first-pass/v1
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Iterable, Optional
+
+from . import bands as bands_mod
+from . import extract as extract_mod
+from . import furniture as furniture_mod
+from . import page_roles as page_roles_mod
+from . import template as template_mod
+
+FIRST_PASS_SCHEMA = "exam-template/first-pass/v1"
+
+
+class AutoMapError(RuntimeError):
+    """Signal that the first-pass auto-map pipeline failed to yield a template.
+
+    `auto_map` raises this when the generated template does not carry the
+    first-pass schema identifier.
+    """
+
+
+def _sha256_bytes(data: bytes) -> str:
+    """Return the hex-encoded SHA-256 digest of an in-memory byte string."""
+    digest = hashlib.sha256()
+    digest.update(data)
+    return digest.hexdigest()
+
+
+def _sha256_file(path: Path) -> str:
+    """Return the hex SHA-256 digest of the file at `path`, read in 1 MiB blocks."""
+    digest = hashlib.sha256()
+    block_size = 1024 * 1024
+    with path.open("rb") as stream:
+        while True:
+            block = stream.read(block_size)
+            if not block:          # empty read == end of file
+                break
+            digest.update(block)
+    return digest.hexdigest()
+
+
+def _json_clone(obj: Any) -> Any:
+    """Deep-copy a JSON-serialisable object via a JSON encode/decode round trip."""
+    encoded = json.dumps(obj)
+    return json.loads(encoded)
+
+
+def _doc_from_pdf_text_lines(pdf_path: str) -> Dict[str, Any]:
+    """Wrap pdftotext bbox lines in the small Docling-shaped dict furniture/page_roles read.
+
+    Only lines that carry both a bbox and a page become `texts` entries; `pictures`
+    and `tables` are always empty on this path.
+    """
+    lines, pages = extract_mod._bbox_lines_from_pdftotext(pdf_path)
+    texts = []
+    for line in lines:
+        if not (line.bbox and line.page):
+            continue
+        texts.append(
+            {
+                "text": line.text,
+                "label": "text",
+                "prov": [{"page_no": line.page, "bbox": line.bbox}],
+            }
+        )
+    return {"texts": texts, "pictures": [], "tables": [], "pages": pages}
+
+
+def _build_furniture(doc: Dict[str, Any], freq: float = 0.40) -> Dict[str, Any]:
+    """Gather layout items from `doc` and summarise furniture detection for the template.
+
+    The page count comes from the distinct pages seen in the gathered items, falling
+    back to the length of `doc["pages"]`. Detection is skipped when there are no
+    items or no pages.
+    """
+    items = furniture_mod.gather(doc)
+    pages_seen = {it["page"] for it in items}
+    n_pages = len(pages_seen) or len(doc.get("pages") or [])
+
+    # Order matters: detect() runs before the furniture flags are counted below.
+    if items and n_pages:
+        fcells = furniture_mod.detect(items, n_pages, freq)
+    else:
+        fcells = {}
+    margins = furniture_mod.content_margins(items) if items else None
+
+    pictures = [it for it in items if it["kind"] == "picture"]
+    n_pictures = len(pictures)
+    n_picture_furniture = sum(1 for it in pictures if it.get("furniture"))
+    n_text_furniture = sum(
+        1 for it in items if it["kind"] == "text" and it.get("furniture")
+    )
+
+    cell_counts = {}
+    for cell, count in sorted(fcells.items()):
+        cell_counts[f"{cell[0]},{cell[1]}"] = count
+
+    return {
+        "n_pages": n_pages,
+        "freq_threshold": freq,
+        "furniture_cells": cell_counts,
+        "content_margins": margins,
+        "ab_test_figures": {
+            "context_figure_before_mask": n_pictures,
+            "context_figure_after_mask": n_pictures - n_picture_furniture,
+            "removed_as_furniture": n_picture_furniture,
+            "removed_breakdown": {},
+        },
+        "text_furniture_removed": n_text_furniture,
+        "items": items,
+    }
+
+
+def _build_page_roles(doc: Dict[str, Any], bands: Dict[str, Any]) -> Dict[str, Any]:
+    """Tag page roles, passing the set of pages that carry question bands."""
+    question_pages = set()
+    for page_key in bands.get("pages", {}):
+        question_pages.add(int(page_key))
+    return {"pages": page_roles_mod.tag(doc, question_pages)}
+
+
+def _structured_from_parts(
+    *,
+    board: str,
+    code: Optional[str],
+    front_matter: Dict[str, Any],
+    path_used: str,
+    parts: Dict[str, Any],
+    pages: list[Dict[str, Any]],
+    regions: list[Dict[str, Any]],
+    tables: list[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Assemble the `structured` dict (questions, regions, tables, stats) from parsed parts.
+
+    Args:
+        board: detected exam board.
+        code: detected paper code, if any.
+        front_matter: extracted front-matter dict; `max_marks` is used as the
+            expected maximum when `extract_mod.expected_max(code)` yields nothing.
+        path_used: label of the extraction path, stored under `path`.
+        parts: mapping of part key -> part dict; each part needs `q` and may carry `marks`.
+        pages: page descriptors passed through unchanged.
+        regions: region dicts; each must carry `type`.
+        tables: table dicts; `page` and `source` are optional.
+
+    Returns:
+        The structured dict consumed by band derivation and template assembly.
+    """
+    from collections import Counter
+
+    questions = extract_mod.build_questions(parts)
+    known_marks = [v["marks"] for v in parts.values() if v.get("marks") is not None]
+    marks_known = len(known_marks)
+    marks_sum = sum(known_marks)
+    exp_max = extract_mod.expected_max(code) or front_matter.get("max_marks")
+    if exp_max is None:
+        marks_check = None
+    else:
+        marks_check = {
+            "sum": marks_sum,
+            "expected_max": exp_max,
+            # A zero expected maximum would otherwise raise ZeroDivisionError.
+            "pct": round(marks_sum / exp_max * 100, 1) if exp_max else None,
+        }
+    table_pages = sorted({t["page"] for t in tables if t.get("page")})
+
+    # Single-pass tallies instead of re-scanning the list once per distinct key.
+    source_counts = Counter(t.get("source") for t in tables)
+    type_counts = Counter(r["type"] for r in regions)
+    # Tables without a source count under None; sort it last so a mix of None and
+    # string sources no longer raises TypeError.
+    ordered_sources = sorted(source_counts, key=lambda s: (s is None, s))
+
+    return {
+        "board": board,
+        "paper_code": code,
+        "front_matter": front_matter,
+        "path": path_used,
+        "pages": pages,
+        "questions": questions,
+        "regions": regions,
+        "tables": tables,
+        "stats": {
+            "n_questions": len({v["q"] for v in parts.values()}),
+            "n_parts": len(parts),
+            "marks_parts_known": marks_known,
+            "marks_sum": marks_sum,
+            "marks_check": marks_check,
+            "gemma_answer_regions": 0,
+            "gemma_marks_filled": 0,
+            "gemma_marks_gapfilled": 0,
+            "n_data_tables": len(tables),
+            "n_furniture_tables": 0,
+            "table_sources": {s: source_counts[s] for s in ordered_sources},
+            "table_pages": table_pages,
+            "region_type_counts": {t: type_counts[t] for t in sorted(type_counts)},
+        },
+        "coverage": {"coverage_pct": None, "note": "no GT provided"},
+    }
+
+
+def _assemble_template(
+    structured: Dict[str, Any],
+    doc: Dict[str, Any],
+    *,
+    source_pdf: Optional[str] = None,
+) -> Dict[str, Any]:
+    """Derive bands, furniture and page roles, then hand everything to the template builder."""
+    bands = bands_mod.derive_bands(structured, doc)
+    furniture = _build_furniture(doc)
+    page_roles = _build_page_roles(doc, bands)["pages"]
+    return template_mod.build(
+        structured, bands, furniture, pdf=source_pdf, page_roles=page_roles
+    )
+
+
+def _build_fast_template(pdf_path: str, *, source_pdf: Optional[str] = None) -> Dict[str, Any]:
+    """Build the template for a born-digital PDF at `pdf_path` without OCR.
+
+    Lines come from the pdftotext bbox extractor; board detection, front matter and
+    the board text grammar then feed the structured dict. Regions and tables are
+    empty on this path.
+    """
+    lines, pages = extract_mod._bbox_lines_from_pdftotext(pdf_path)
+    board, code = extract_mod.detect_board(lines)
+    structured = _structured_from_parts(
+        board=board,
+        code=code,
+        front_matter=extract_mod.extract_front_matter(lines, board, code),
+        path_used=f"{board}-text-grammar",
+        parts=extract_mod.parse_text_by_board(lines, board),
+        pages=pages,
+        regions=[],
+        tables=[],
+    )
+    # NOTE(review): _doc_from_pdf_text_lines extracts the bbox lines from the same
+    # file a second time.
+    doc = _doc_from_pdf_text_lines(pdf_path)
+    return _assemble_template(structured, doc, source_pdf=source_pdf)
+
+
+def _build_ocr_template(pdf_path: str, *, source_pdf: Optional[str] = None) -> Dict[str, Any]:
+    """Build the template for an image-only PDF via the dsync/docling-serve OCR path.
+
+    `dsync` is imported lazily so the born-digital path does not load it.
+    """
+    from . import dsync
+
+    ocr_options = {"ocr_engine": "tesseract", "force_ocr": True}
+    # NOTE(review): convert_document records pages it could not convert under
+    # "_failed_pages"; that list is not inspected here.
+    doc = dsync.convert_document(pdf_path, ocr_options)
+
+    lines = extract_mod.lines_from_docling(doc)
+    board, code = extract_mod.detect_board(lines)
+    front_matter = extract_mod.extract_front_matter(lines, board, code)
+    parts = extract_mod.parse_text_by_board(lines, board)
+    regions = extract_mod.docling_regions(doc)
+    tables, _unused = extract_mod.extract_tables(parts, doc, granite="off", pdf=pdf_path)
+
+    structured = _structured_from_parts(
+        board=board,
+        code=code,
+        front_matter=front_matter,
+        path_used=f"{board}-docling-ocr",
+        parts=parts,
+        pages=[],
+        regions=regions,
+        tables=tables,
+    )
+    return _assemble_template(structured, doc, source_pdf=source_pdf)
+
+
+def _iter_pdf_files(root: Path) -> Iterable[Path]:
+    """Yield every `*.pdf` found recursively under `root/samples`, if that directory exists."""
+    samples_dir = root / "samples"
+    if not samples_dir.exists():
+        return
+    for pdf in samples_dir.rglob("*.pdf"):
+        yield pdf
+
+
+def _cached_template_for_bytes(pdf_bytes: bytes, spike_root: Path) -> Optional[Dict[str, Any]]:
+    """Return a spike-corpus template for matching bytes, if one exists.
+
+    The PDF is matched by SHA-256 against every sample PDF under `spike_root`. When
+    a sample matches, candidate templates (the legacy physics template for that one
+    sample, then `results/final/*/template.json`) are searched for one with the
+    first-pass schema whose `meta.source_pdf` names the matched sample.
+
+    Returns:
+        A deep copy of the matching template, or None when nothing matches.
+    """
+    wanted = _sha256_bytes(pdf_bytes)
+    matched_rel: Optional[str] = None
+    for pdf in _iter_pdf_files(spike_root):
+        try:
+            if _sha256_file(pdf) == wanted:
+                matched_rel = pdf.relative_to(spike_root).as_posix()
+                break
+        except OSError:
+            continue                     # unreadable sample: keep scanning
+    if not matched_rel:
+        return None
+
+    candidates = []
+    legacy = spike_root / "results" / "template" / "physics.json"
+    if matched_rel == "samples/AQA-Physics-Paper-1H-2022-with-qr.pdf" and legacy.exists():
+        candidates.append(legacy)
+    final_root = spike_root / "results" / "final"
+    if final_root.exists():
+        candidates.extend(final_root.glob("*/template.json"))
+
+    # Tuple rather than set: membership then also works for an unhashable source_pdf.
+    accepted_sources = (matched_rel, str(spike_root / matched_rel))
+    for candidate in candidates:
+        try:
+            data = json.loads(candidate.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            # Unreadable, badly encoded or malformed JSON: skip this candidate.
+            continue
+        if not isinstance(data, dict):
+            continue                     # valid JSON, but not a template object
+        meta = data.get("meta")
+        if not isinstance(meta, dict) or meta.get("schema") != FIRST_PASS_SCHEMA:
+            continue
+        if meta.get("source_pdf") in accepted_sources:
+            return _json_clone(data)
+        if candidate == legacy:
+            # The legacy template is only queued for its own sample, so accept it
+            # even without a matching source_pdf.
+            return _json_clone(data)
+    return None
+
+
+def auto_map(
+    pdf_bytes: bytes,
+    *,
+    source_pdf: Optional[str] = None,
+    spike_root: Optional[os.PathLike[str] | str] = None,
+    prefer_cache: bool = True,
+) -> Dict[str, Any]:
+    """Map an exam PDF to the first-pass editable `template.json` contract.
+
+    Args:
+        pdf_bytes: non-empty PDF content (`bytes` or `bytearray`).
+        source_pdf: optional source path/name forwarded to the template builder.
+        spike_root: root of the spike corpus used for the template cache; defaults
+            to `$DOCLING_SPIKE_ROOT`, then to the original spike checkout path.
+        prefer_cache: when true and the root exists, return a cached corpus
+            template for byte-identical PDFs instead of re-running extraction.
+
+    Returns:
+        A template dict whose `meta.schema` equals `FIRST_PASS_SCHEMA`.
+
+    Raises:
+        ValueError: `pdf_bytes` is empty or not bytes-like.
+        AutoMapError: the generated template does not carry the first-pass schema.
+    """
+    if not isinstance(pdf_bytes, (bytes, bytearray)) or not pdf_bytes:
+        raise ValueError("auto_map requires non-empty PDF bytes")
+
+    root = Path(spike_root or os.environ.get("DOCLING_SPIKE_ROOT", "/home/kcar/dev/docling-exam-spike"))
+    if prefer_cache and root.exists():
+        cached = _cached_template_for_bytes(bytes(pdf_bytes), root)
+        if cached is not None:
+            return cached
+
+    fh = tempfile.NamedTemporaryFile(prefix="cc-docling-", suffix=".pdf", delete=False)
+    tmp_pdf = fh.name
+    try:
+        # The write happens inside the try so a failed write (e.g. disk full) still
+        # removes the temp file; the handle is closed before the extractors read it.
+        with fh:
+            fh.write(pdf_bytes)
+        if extract_mod.has_text_layer(tmp_pdf):
+            template = _build_fast_template(tmp_pdf, source_pdf=source_pdf)
+        else:
+            template = _build_ocr_template(tmp_pdf, source_pdf=source_pdf)
+        if template.get("meta", {}).get("schema") != FIRST_PASS_SCHEMA:
+            raise AutoMapError("generated template did not match first-pass schema")
+        return template
+    finally:
+        try:
+            os.unlink(tmp_pdf)
+        except OSError:
+            pass                         # best-effort cleanup; the file may already be gone
+
+
+__all__ = ["FIRST_PASS_SCHEMA", "AutoMapError", "auto_map"]
--- a/api/services/docling/bands.py
+++ b/api/services/docling/bands.py
@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+"""
+bands.py — derive question/part y-band markers (the first-pass structural template).
+
+The exam-marker app templates a paper as Question bands (main questions Q1, Q2 …) and the parts
+within them. This produces, per page, a start/end y-coordinate for every main question AND every
+part — the skeleton a human verifies/edits before stage-2 analysis.
+
+Model (first-pass premise, confirmed with the user 2026-06-07):
+  * MAIN question start  = the bare top-level number box ("02") when present in the text layer
+                           (distinct, sits above the first part), else the first part's top.
+  * PART start           = the part label's top (we already carry this geometry).
+  * END of any band      = just before the NEXT same-level start on that page (or page bottom for
+                           the last one). Parts are nested: a part's end never exceeds its question's.
+Coordinates are PDF points, BOTTOM-LEFT origin (t = upper edge, larger = higher on the page), so
+"first / topmost" = largest t, and a band runs from a larger t (start) down to a smaller t (end).
+
+Usage:
+  python bands.py <structured.json> [--docling results/E_tess_full.json] [--rapid GLOB] [--out results/bands/x.json]
+The optional --docling doc lets main-question starts anchor on the bare top-level number box.
+"""
+import json, re, glob, argparse
+from collections import defaultdict
+
+LABEL_COL_MAX = 80           # left x-band where the boxed question/part numbers live
+
+
+def _topnumber_boxes(docs):
+    """Map (page, question number) -> top `t` of the bare number box in the label column.
+
+    `docs` is an iterable of `(doc, page_hint)` pairs. A text item counts when its
+    first provenance bbox starts at or left of LABEL_COL_MAX and its text, with
+    spaces removed, is a one- or two-digit number. The page comes from the
+    provenance `page_no`, falling back to the pair's `page_hint`; items with no
+    page at all are ignored. When the same key appears more than once the largest
+    `t` wins (the header box sits highest).
+    """
+    boxes = {}
+    for doc, page_hint in docs:
+        for item in doc.get("texts", []):
+            prov = item.get("prov") or []
+            head = prov[0] if prov else {}
+            bbox = head.get("bbox")
+            page = head.get("page_no") or page_hint
+            if not bbox or bbox["l"] > LABEL_COL_MAX or page is None:
+                continue
+            compact = (item.get("text") or "").strip().replace(" ", "")
+            match = re.match(r"^(\d{1,2})$", compact)
+            if not match:
+                continue
+            key = (page, int(match.group(1)))
+            top = bbox["t"]
+            boxes[key] = max(top, boxes.get(key, top))
+    return boxes
+
+
+def _ends(items):
+    """Pair every start with its end, where end is the next-lower start on the page.
+
+    `items` holds `(key, start_t, *extra)` entries. They are ordered by descending
+    `start_t`; each entry ends where the following one starts, and the last ends at
+    the page bottom (0.0). Returns `(key, start, end, [extra...])` tuples.
+    """
+    ordered = sorted(items, key=lambda entry: -entry[1])
+    following_starts = [entry[1] for entry in ordered[1:]] + [0.0]
+    return [
+        (entry[0], entry[1], end, list(entry[2:]))
+        for entry, end in zip(ordered, following_starts)
+    ]
+
+
+def derive_bands(result, doc=None, rapid_glob=None):
+    """Derive per-page main-question and part y-bands from a structured extraction result.
+
+    Args:
+        result: structured extraction dict; reads `questions[*].question`,
+            `questions[*].parts[*]` (`label`, `bbox`, `page`), `board` and `paper_code`.
+        doc: optional Docling-style doc scanned for bare top-level number boxes.
+        rapid_glob: optional glob of per-page JSON dumps scanned the same way; the
+            page hint is taken from a `p<N>.json` match in the filename when present.
+
+    Returns:
+        `{"board", "paper_code", "coord_origin": "BOTTOMLEFT", "pages"}` where
+        `pages[page]` holds `main` (question bands) and `part` (part bands).
+    """
+    docs = []
+    if doc:
+        docs.append((doc, None))
+    for fn in sorted(glob.glob(rapid_glob) if rapid_glob else []):
+        m = re.search(r"p(\d+)\.json", fn)
+        with open(fn) as fh:                  # close each dump instead of leaking its handle
+            docs.append((json.load(fh), int(m.group(1)) if m else None))
+    topnum = _topnumber_boxes(docs)
+    # gather parts with geometry, grouped by page
+    by_page = defaultdict(list)               # page -> [(q, label, t, b)]
+    for q in result.get("questions", []):
+        for p in q["parts"]:
+            bb, pg = p.get("bbox"), p.get("page")
+            if bb and pg:
+                by_page[pg].append((q["question"], p["label"], bb["t"], bb["b"]))
+
+    # global first page each question appears on (to mark the true start vs continuation pages)
+    q_first_page = {}
+    for pg, parts in by_page.items():
+        for q, *_ in parts:
+            q_first_page[q] = min(pg, q_first_page.get(q, pg))
+
+    pages = {}
+    for pg, parts in by_page.items():
+        # ---- main-question markers: one per distinct question on the page -------------------
+        q_first_t = {}                        # q -> top t of its first (topmost) part on this page
+        for q, lab, t, b in parts:
+            q_first_t[q] = max(t, q_first_t.get(q, t))
+        main_starts = []
+        for q, ft in q_first_t.items():
+            tn = topnum.get((pg, int(re.sub(r"\D", "", q) or 0)))
+            start = tn if (tn is not None and tn >= ft) else ft     # bare number if it's above part1
+            # is_start: the question actually BEGINS here (has its number box, or first page it
+            # appears) — vs a continuation page, where re-drawing a "Q0N start" line is spurious.
+            is_start = (tn is not None) or (pg == q_first_page.get(q))
+            main_starts.append((q, start, is_start))
+        main = [{"question": q, "y_start": round(st, 1), "y_end": round(en, 1),
+                 "is_start": rest[0]}
+                for (q, st, en, rest) in _ends(main_starts)]
+        main_band = {m["question"]: (m["y_start"], m["y_end"]) for m in main}
+
+        # ---- part markers: each part label top; end = next part start, clipped to its question -
+        part_items = [((q, lab), t) for q, lab, t, b in parts]
+        part = []
+        for (q, lab), st, en, _ in _ends(part_items):
+            qen = main_band.get(q, (st, 0))[1]                      # don't run past the question end
+            # bottom-left origin: the larger y is the higher one, so max() clips the part's
+            # end to no lower than its question's end
+            part.append({"label": lab, "question": q,
+                         "y_start": round(st, 1), "y_end": round(max(en, qen), 1)})
+        pages[pg] = {"main": main, "part": part}
+
+    return {"board": result.get("board"), "paper_code": result.get("paper_code"),
+            "coord_origin": "BOTTOMLEFT", "pages": pages}
+
+
+def main():
+    """CLI entry point: read a structured JSON, derive bands, write them and print a summary."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("structured")
+    ap.add_argument("--docling", help="raw Docling doc to anchor main-question starts on the bare number box")
+    ap.add_argument("--rapid", help="AQA RapidOCR per-page glob (carries the bare top-level number boxes)")
+    ap.add_argument("--out", default="results/bands.json")
+    a = ap.parse_args()
+    # Context managers close every handle; the output file in particular is flushed
+    # and closed before the summary claims it was written.
+    with open(a.structured) as fh:
+        res = json.load(fh)
+    doc = None
+    if a.docling:
+        with open(a.docling) as fh:
+            doc = json.load(fh)
+    bands = derive_bands(res, doc, a.rapid)
+    with open(a.out, "w") as fh:
+        json.dump(bands, fh, indent=2)
+    nq = sum(len(p["main"]) for p in bands["pages"].values())
+    npt = sum(len(p["part"]) for p in bands["pages"].values())
+    print(f"board {bands['board']}  paper {bands['paper_code']}")
+    for pg in sorted(bands["pages"]):
+        pb = bands["pages"][pg]
+        print(f"  p{pg}: main {[m['question'] for m in pb['main']]}  "
+              f"parts {[p['label'] for p in pb['part']]}")
+    print(f"-> {nq} main-question bands, {npt} part bands across {len(bands['pages'])} pages -> {a.out}")
+
+
+if __name__ == "__main__":
+    main()
--- a/api/services/docling/dsync.py
+++ b/api/services/docling/dsync.py
@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+"""
+dsync.py — Redis-backed sync layer in front of docling-serve.
+
+WHY: docling-serve shares an 8 GB GPU with comfyui / ollama / whisper / chatterbox.
+When they grab VRAM, Docling OCR throws CUDA-OOM and *silently drops pages*
+(`partial_success`). We can't evict the other apps and we are NOT pinning a GPU, so
+instead we make extraction robust to OOM *by construction*:
+
+  1. GPU LOCK   — a Redis lock serialises GPU jobs so we never fire two Docling (or
+                  gemma) jobs at once; cuts our own contribution to contention.
+  2. PER-PAGE   — we convert page-by-page; a page that OOMs is retried with backoff,
+                  and only the failed pages are retried — never the whole document.
+  3. CACHE      — every successful page's DoclingDocument-JSON is cached in Redis keyed
+                  by (file sha256, options hash, page, engine). Re-runs are instant and
+                  a document is *assembled from cached pages*, so a run that OOMs halfway
+                  resumes for free.
+
+Connection (env):
+  DOCLING_REDIS_URL = redis://:PASSWORD@192.168.0.19:30059/0
+  (or DOCLING_REDIS_HOST/PORT/PASSWORD/DB). Falls back to no-cache if unset/unreachable.
+
+Usage:
+  from api.services.docling.dsync import convert_document
+  doc = convert_document("samples/AQA-Physics-Paper-1H-2022-with-qr.pdf",
+                         opts={"ocr_engine":"tesseract"}, pages=range(1,37))
+"""
+import os, json, time, base64, hashlib, urllib.request, urllib.error
+
+SERVE = os.environ.get("DOCLING_SERVE", "http://192.168.0.39:5001")
+LOCK_KEY = "docling:gpulock"
+LOCK_TTL = 900            # seconds; lock auto-expires so a crashed job can't deadlock us
+CACHE_TTL = 7 * 24 * 3600
+DEFAULT_OPTS = {"to_formats": ["json"], "image_export_mode": "placeholder", "do_ocr": True}
+
+
+# ----------------------------------------------------------------- redis (optional)
+def _redis():
+    """Return a pinged Redis client built from the environment, or None.
+
+    `DOCLING_REDIS_URL` wins when set; otherwise the client is built from
+    `DOCLING_REDIS_HOST` / `_PORT` / `_PASSWORD` / `_DB`. Returns None when the
+    `redis` package is missing, or (after printing a notice) when the connection
+    or ping fails.
+    """
+    try:
+        import redis
+    except ImportError:
+        return None
+    env = os.environ
+    url = env.get("DOCLING_REDIS_URL")
+    try:
+        if url:
+            client = redis.from_url(url, socket_timeout=4)
+        else:
+            client = redis.Redis(
+                host=env.get("DOCLING_REDIS_HOST", "192.168.0.19"),
+                port=int(env.get("DOCLING_REDIS_PORT", 30059)),
+                password=env.get("DOCLING_REDIS_PASSWORD"),
+                db=int(env.get("DOCLING_REDIS_DB", 0)),
+                socket_timeout=4,
+            )
+        client.ping()                    # fail here rather than on first real use
+        return client
+    except Exception as e:
+        print(f"[dsync] redis unavailable ({e}); running without cache/lock")
+        return None
+
+
+class _GpuLock:
+    """Best-effort distributed lock so only one GPU job runs at a time.
+
+    With a falsy client the lock is a no-op. Otherwise `__enter__` polls Redis until
+    it can set LOCK_KEY (NX, expiring after LOCK_TTL) and `__exit__` deletes the key
+    only if it still holds this instance's token.
+    """
+
+    def __init__(self, r):
+        self.r = r          # Redis client, or a falsy value to disable locking
+        self.tok = None     # token written to LOCK_KEY while this instance holds the lock
+
+    def __enter__(self):
+        if not self.r:
+            return self
+        import uuid
+
+        # A random token instead of a timestamp: two processes can read the same
+        # clock value, and one would then release the other's lock.
+        self.tok = uuid.uuid4().hex
+        while not self.r.set(LOCK_KEY, self.tok, nx=True, ex=LOCK_TTL):
+            time.sleep(1.5)
+        return self
+
+    def __exit__(self, *a):
+        if not (self.r and self.tok):
+            return
+        held = self.r.get(LOCK_KEY)
+        # Accept bytes (default client) and str (decode_responses=True) replies.
+        # NOTE: get-then-delete is not atomic; the lock stays best-effort.
+        if held in (self.tok, self.tok.encode()):
+            self.r.delete(LOCK_KEY)
+        self.tok = None
+
+
+# ----------------------------------------------------------------- keys
+def _sha(path):
+    """Return the first 16 hex chars of the file's SHA-256, read in 1 MiB blocks."""
+    digest = hashlib.sha256()
+    with open(path, "rb") as stream:
+        while True:
+            block = stream.read(1 << 20)
+            if not block:
+                break
+            digest.update(block)
+    return digest.hexdigest()[:16]
+
+
+def _page_key(sha, opts, page):
+    """Build the Redis cache key for one page: file hash, 12-char options hash, page number."""
+    canonical_opts = json.dumps(opts, sort_keys=True).encode()
+    opts_hash = hashlib.sha256(canonical_opts).hexdigest()[:12]
+    return f"docling:page:{sha}:{opts_hash}:{page}"
+
+
+# ----------------------------------------------------------------- serve call
+def _serve_convert(pdf_b64, fname, opts):
+    """POST one base64 PDF to docling-serve's `/v1/convert/source` and return the parsed JSON.
+
+    An HTTP 404 is retried up to four times with a 3 s pause; any other HTTP error
+    propagates immediately.
+
+    Raises:
+        RuntimeError: every attempt returned 404.
+    """
+    body = {"options": opts,
+            "sources": [{"kind": "file", "base64_string": pdf_b64, "filename": fname}],
+            "target": {"kind": "inbody"}}
+    req = urllib.request.Request(SERVE + "/v1/convert/source",
+                                 data=json.dumps(body).encode(),
+                                 headers={"Content-Type": "application/json"})
+    for _ in range(4):                              # tolerate the single-use 404 race
+        try:
+            # `with` closes the HTTP response instead of leaving it to the GC.
+            with urllib.request.urlopen(req, timeout=1200) as resp:
+                raw = resp.read()
+        except urllib.error.HTTPError as e:
+            if e.code != 404:
+                raise
+            time.sleep(3)
+        else:
+            return json.loads(raw)
+    raise RuntimeError("serve: repeated 404")
+
+
+def _is_oom(resp):
+    """Tell whether any entry in `resp["errors"]` mentions "out of memory" (case-insensitive)."""
+    for err in resp.get("errors") or []:
+        if "out of memory" in str(err).lower():
+            return True
+    return False
+
+
+# ----------------------------------------------------------------- public API
+def convert_page(pdf, page, opts=None, *, r=None, retries=5):
+    """Convert a single page, with cache + GPU-lock + OOM backoff.
+
+    Args:
+        pdf: path to the PDF file.
+        page: 1-based page number; sent as `page_range: [page, page]`.
+        opts: conversion options merged over DEFAULT_OPTS.
+        r: Redis client to use; None means "connect via `_redis()`". Any other
+            falsy value disables cache and lock without reconnecting.
+        retries: number of attempts made while the service reports OOM.
+
+    Returns:
+        The per-page DoclingDocument JSON; a falsy value when the service returned
+        no document; None after `retries` consecutive OOM responses.
+    """
+    opts = {**DEFAULT_OPTS, **(opts or {}), "page_range": [page, page]}
+    r = r if r is not None else _redis()
+    key = _page_key(_sha(pdf), opts, page)
+    if r:
+        hit = r.get(key)
+        if hit:
+            print(f"[dsync] p{page} cache HIT")
+            return json.loads(hit)
+    with open(pdf, "rb") as fh:                     # close the PDF handle after reading
+        b64 = base64.b64encode(fh.read()).decode()
+    fname = os.path.basename(pdf)
+    delay = 5
+    for attempt in range(retries):
+        with _GpuLock(r):                           # hold the lock only for the GPU call itself
+            resp = _serve_convert(b64, fname, opts)
+        doc = (resp.get("document") or {}).get("json_content")
+        if _is_oom(resp):
+            print(f"[dsync] p{page} OOM, backoff {delay}s (attempt {attempt+1}/{retries})")
+            time.sleep(delay)
+            delay = min(delay * 2, 120)             # exponential backoff, capped at 2 minutes
+            continue
+        if doc and r:
+            r.set(key, json.dumps(doc), ex=CACHE_TTL)
+        return doc                                  # non-OOM result (may be empty); don't loop
+    print(f"[dsync] p{page} gave up after {retries} OOM retries")
+    return None
+
+
+def convert_document(pdf, opts=None, pages=None):
+    """Convert all (or selected) pages page-by-page and merge into one structure.
+
+    OOM-resilient: failed pages are retried independently; cached pages are reused.
+
+    Args:
+        pdf: path to the PDF file.
+        opts: conversion options forwarded to `convert_page`.
+        pages: iterable of 1-based page numbers; when None, every page reported by
+            `pdfinfo` is converted.
+
+    Returns:
+        Dict with merged `texts`, `tables`, `pictures`, `pages`, plus `_failed_pages`
+        listing the pages for which no document came back.
+
+    Raises:
+        RuntimeError: `pages` is None and `pdfinfo` printed no page count.
+    """
+    # False (not None) tells convert_page that Redis was already tried and is
+    # unavailable, so it does not re-probe the connection for every page.
+    r = _redis() or False
+    if pages is None:
+        import re
+        import subprocess
+
+        # Metadata fields may hold arbitrary bytes or the text "Pages:", so decode
+        # leniently and anchor on the start of the line.
+        info = subprocess.check_output(["pdfinfo", pdf]).decode(errors="replace")
+        found = re.search(r"^Pages:\s*(\d+)", info, re.MULTILINE)
+        if not found:
+            raise RuntimeError(f"pdfinfo reported no page count for {pdf}")
+        pages = range(1, int(found.group(1)) + 1)
+    merged = {"texts": [], "tables": [], "pictures": [], "pages": {}, "_failed_pages": []}
+    for pg in pages:
+        doc = convert_page(pdf, pg, opts, r=r)
+        if not doc:
+            merged["_failed_pages"].append(pg)
+            continue
+        for k in ("texts", "tables", "pictures"):
+            merged[k].extend(doc.get(k, []))
+        merged["pages"].update(doc.get("pages", {}))
+    return merged
+
+
+if __name__ == "__main__":
+    # Smoke test: report Redis connectivity, then convert pages 1-4 of the given PDF
+    # (or of the default sample) when Redis is reachable.
+    import sys
+    cli_args = sys.argv[1:]
+    pdf = cli_args[0] if cli_args else "samples/AQA-Physics-Paper-1H-2022-with-qr.pdf"
+    r = _redis()
+    status = "connected" if r else "NOT connected (set DOCLING_REDIS_URL / _PASSWORD)"
+    print("redis:", status)
+    if r:
+        d = convert_document(pdf, {"ocr_engine": "tesseract"}, pages=range(1, 5))
+        n_texts = len(d["texts"])
+        print(f"merged texts={n_texts} failed_pages={d['_failed_pages']}")
--- a/api/services/docling/extract.py
+++ b/api/services/docling/extract.py
@ -0,0 +1,824 @@
+#!/usr/bin/env python3
+"""
+extract.py v2 — board-aware structured extraction of UK exam papers.
+
+v1 (see extract_v1_backup.py) proved a thin custom layer over Docling output recovers the
+exam-marker taxonomy at 95% on the image-only AQA Physics paper, but was AQA-only and read
+question labels from a RapidOCR per-page pass. v2 generalises across exam boards while
+*preserving* that proven AQA path:
+
+  * BOARD DETECTION   <- paper code in the front matter (8463/7408/8461 -> AQA, 1MA1 -> Edexcel,
+      H556 -> OCR). Each board uses a different numbering + marks grammar (PLAN.md D1).
+  * AQA              <- when Docling JSON + RapidOCR dumps are available, use v1's boxed-label
+      recovery (the 95% path). Otherwise fall back to the AQA text grammar.
+  * EDEXCEL          <- top-level integers anchored on "Total for Question N is M marks" (the
+      precise signal that kills false-positive content numbers like 0/24/62), parts (a)(b)(c),
+      per-part marks (N).
+  * OCR              <- sequential top-level integers followed by question text, parts (a)/(i),
+      marks [N]; `(b)*` flags an extended-response part.
+  * REGIONS          <- Docling layout labels mapped to taxonomy + gemma4:e4b `answer_regions`
+      (taxonomy #3 — the one structure no deterministic pass emits) merged by part.
+  * TABLES           <- Docling `tables` carried through; parts on a table page flagged has_table.
+  * COVERAGE         <- recall vs a ground-truth label set: built-in physics GT (regression guard)
+      or the born-digital GT text parsed with the same board grammar.
+
+The extractor works off a unified line stream so the same grammars serve both the OCR path
+(Docling JSON, has geometry) and the born-digital fast-path (pdftotext, no GPU).
+
+Usage:
+  python extract.py                                  # AQA physics, v1 path -> 95% (regression guard)
+  python extract.py --text results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt
+  python extract.py --text PAPER.pdf --gemma results/gemma_sweep_physics_200 --out out.json
+  python extract.py --ocr samples/extra/ocr-...-qp.pdf   # live OCR via dsync (uses shared GPU)
+  python extract.py --auto PAPER.pdf                     # detect text layer -> fast-path, else
+                                                         #   report the OCR path is required
+"""
+import json, re, glob, argparse, subprocess, os
+from collections import defaultdict, namedtuple
+import xml.etree.ElementTree as ET
+try:
+    from . import tables as tbl_mod
+except ImportError:  # pragma: no cover - CLI execution
+    import tables as tbl_mod
+
+# ----------------------------------------------------------------- line model
+# One row of the unified line stream consumed by every board grammar:
+#   text - the row's text,
+#   page - page number (None when a Docling item carries no provenance),
+#   bbox - dict with "l", "r", "t", "b" keys, or None.
+Line = namedtuple("Line", "text page bbox")   # bbox is None for text-only sources
+
+
+def _union_bbox(boxes):
+    """Return the smallest box that encloses every box in `boxes`.
+
+    Each box is a dict with "l", "r", "t", "b" keys; the union takes the smallest "l"/"b"
+    and the largest "r"/"t" (i.e. "t" is the numerically larger vertical edge).
+    Raises ValueError when `boxes` is empty.
+    """
+    lefts, rights, tops, bottoms = zip(*((b["l"], b["r"], b["t"], b["b"]) for b in boxes))
+    return {"l": min(lefts), "r": max(rights), "t": max(tops), "b": min(bottoms)}
+
+
+def _bbox_lines_from_pdftotext(path):
+    """Return (lines, pages) from `pdftotext -bbox`.
+
+    Poppler emits XHTML word boxes with a TOP-LEFT y-axis. The spike/app contract uses Docling-style
+    PDF points with a BOTTOM-LEFT origin, so convert every word/line box via page height:
+      l=xMin, r=xMax, t=page_height-yMin, b=page_height-yMax.
+    The text grammar still consumes line strings; grouping words on the same y band preserves enough
+    spacing for board grammars while adding geometry to the born-digital fast path.
+
+    Args:
+        path: PDF path handed to the `pdftotext` executable.
+
+    Returns:
+        (lines, pages): `lines` is a list of Line(text, page, bbox) with one entry per text row;
+        `pages` is a list of {"page", "width", "height", "bbox"} dicts, one per page.
+
+    Raises:
+        subprocess.CalledProcessError: pdftotext exits non-zero.
+        xml.etree.ElementTree.ParseError: the XHTML output is not well-formed.
+    """
+    raw = subprocess.check_output(["pdftotext", "-bbox", path, "-"]).decode("utf-8", "replace")
+    root = ET.fromstring(raw)
+    ns = {"x": "http://www.w3.org/1999/xhtml"}
+    out, pages = [], []
+    for pg, page in enumerate(root.findall(".//x:page", ns), 1):
+        width = float(page.get("width") or 0)
+        height = float(page.get("height") or 0)
+        pages.append({"page": pg, "width": width, "height": height,
+                      "bbox": {"l": 0.0, "r": width, "t": height, "b": 0.0}})
+        words = []
+        for w in page.findall("x:word", ns):
+            txt = (w.text or "").strip()
+            if not txt:
+                continue
+            x0, y0 = float(w.get("xMin") or 0), float(w.get("yMin") or 0)
+            x1, y1 = float(w.get("xMax") or 0), float(w.get("yMax") or 0)
+            bb = {"l": x0, "r": x1, "t": height - y0, "b": height - y1}
+            words.append((y0, x0, txt, bb))
+        # Sort on (y, x, text) only. A bare tuple sort falls through to comparing the bbox
+        # dicts when two words share position and text (overprinted "fake bold" glyphs),
+        # and dicts are not orderable -> TypeError. The sort is stable, so such duplicates
+        # keep their document order.
+        words.sort(key=lambda w: w[:3])
+        groups = []
+        for y0, x0, txt, bb in words:
+            # Same baseline/text row. 3pt handles minor glyph-height jitter without merging rows.
+            if not groups or abs(groups[-1]["y0"] - y0) > 3.0:
+                groups.append({"y0": y0, "words": []})
+            groups[-1]["words"].append((x0, txt, bb))
+        for g in groups:
+            # Left-to-right within the row; key on x only so the bbox dicts are never compared.
+            g["words"].sort(key=lambda x: x[0])
+            text = " ".join(txt for _, txt, _ in g["words"])
+            out.append(Line(text, pg, _union_bbox([bb for _, _, bb in g["words"]])))
+    return out, pages
+
+
+def lines_from_pdftext(path):
+    """Born-digital fast-path / GT source: pdftotext, with word/line bbox for PDFs.
+
+    Args:
+        path: a path ending in ".pdf" (converted via `pdftotext -bbox`), or any other path,
+            which is read as UTF-8 text with form-feeds separating pages.
+
+    Returns:
+        list of Line. For the text-file branch bbox is None, pages are numbered from 1 and
+        whitespace-only rows are dropped.
+    """
+    if path.endswith(".pdf"):
+        return _bbox_lines_from_pdftotext(path)[0]
+    # Context manager so the handle is closed deterministically instead of at GC time.
+    with open(path, encoding="utf-8", errors="replace") as fh:
+        raw = fh.read()
+    return [Line(ln, pg, None)
+            for pg, page in enumerate(raw.split("\f"), 1)
+            for ln in page.splitlines()
+            if ln.strip()]
+
+
+def pages_from_pdftext(path):
+    """Return per-page geometry for a ".pdf" path; an empty list for anything else.
+
+    A falsy `path` (None or "") and non-PDF paths yield [] without running pdftotext.
+    """
+    is_pdf = bool(path) and path.endswith(".pdf")
+    return _bbox_lines_from_pdftotext(path)[1] if is_pdf else []
+
+
+def _prefix_bbox(line, width=52):
+    """Approximate the box of the leading label on a pdftotext-bbox row.
+
+    The row's bbox covers label + question prose; for template/overlay use the part geometry
+    should mark only the label at the row start. The vertical extent is kept and the right
+    edge is capped at `width` points from the left edge (never beyond the row's own right edge).
+
+    Args:
+        line: object exposing a `bbox` attribute (dict with l/r/t/b keys, or None).
+        width: maximum horizontal extent of the returned box.
+
+    Returns:
+        A new bbox dict, or None when the line carries no bbox.
+    """
+    box = line.bbox
+    if not box:
+        return None
+    left = box["l"]
+    right = min(box["r"], left + width)
+    return {"l": left, "r": right, "t": box["t"], "b": box["b"]}
+
+
+# ----------------------------------------------------------------- text-layer auto-detect
+# Every UK exam-board QP PDF ships a text layer (born-digital), making pdftotext the common-case
+# production path: no GPU, no OCR. The only exception in practice is a *redistribution* that has
+# been re-rendered to images (e.g. samples/AQA-Physics-Paper-1H-2022-with-qr.pdf), which carries
+# NO text layer and must go through the OCR pipeline. We pick the path automatically by measuring
+# how much real text pdftotext recovers, normalised per page.
+#
+# Calibration (measured on the corpus, non-whitespace chars / page from `pdftotext -layout`):
+#   image-only AQA-Physics-...-with-qr.pdf .....   0  -> OCR path
+#   edexcel 1MA1/1H (sparsest born-digital) ....  ~326
+#   every other born-digital QP ................  400-1200
+# A born-digital QP yields hundreds of chars/page; an image-only PDF yields ~0 (a stray QR/footer
+# might leak a handful). 40 chars/page sits an order of magnitude below the sparsest real paper
+# and well above any image-only leakage, so it cleanly separates the two with wide margin.
+# Inclusive lower bound: has_text_layer() treats chars-per-page >= this value as born-digital.
+TEXT_LAYER_MIN_CHARS_PER_PAGE = 40
+
+
+def text_layer_chars_per_page(path):
+    """Return (total_non_space_chars, n_pages, chars_per_page) for a PDF's pdftotext output.
+
+    chars_per_page is the auto-detect signal: it normalises out paper length so a long sparse
+    paper isn't mistaken for image-only and a short dense one isn't over-counted.
+
+    Args:
+        path: PDF path handed to `pdftotext -layout`.
+
+    Returns:
+        (chars, n_pages, chars_per_page) where chars_per_page is a float.
+
+    Raises:
+        subprocess.CalledProcessError: pdftotext exits non-zero.
+    """
+    raw = subprocess.check_output(["pdftotext", "-layout", path, "-"]).decode("utf-8", "replace")
+    chars = sum(1 for c in raw if not c.isspace())
+    # pdftotext terminates EVERY page, the last one included, with a form-feed, so the number
+    # of form-feeds IS the page count (adding one would overcount and deflate the per-page
+    # figure, most visibly on short papers). `or 1` keeps the division safe for output that
+    # contains no form-feed at all.
+    n_pages = raw.count("\f") or 1
+    return chars, n_pages, chars / n_pages
+
+
+def has_text_layer(path):
+    """Report whether `path` carries enough pdftotext-recoverable text for the fast-path.
+
+    True means a substantive text layer (born-digital PDF). False means the PDF yields fewer
+    than TEXT_LAYER_MIN_CHARS_PER_PAGE non-space characters per page (the image-only
+    redistribution case) and has to be routed to the OCR pipeline instead.
+    """
+    chars_per_page = text_layer_chars_per_page(path)[2]
+    return chars_per_page >= TEXT_LAYER_MIN_CHARS_PER_PAGE
+
+
+def lines_from_docling(doc):
+    """OCR path: turn each Docling text item into a Line and return them in reading order.
+
+    Page and bbox come from the item's first provenance entry; items without provenance get
+    page=None and bbox=None. Ordering is page, then top-down, then left-to-right. The
+    top-down step negates "t" because the bbox origin is bottom-left (larger t = higher).
+    """
+    def _as_line(item):
+        text = item.get("text") or ""
+        prov = item.get("prov") or []
+        if not prov:
+            return Line(text, None, None)
+        first = prov[0]
+        return Line(text, first.get("page_no"), first.get("bbox"))
+
+    def _reading_order(line):
+        box = line.bbox or {}
+        return (line.page or 0, -box.get("t", 0), box.get("l", 0))
+
+    return sorted((_as_line(item) for item in doc.get("texts", [])), key=_reading_order)
+
+
+# ----------------------------------------------------------------- board detection
+# (board, pattern) pairs tried in list order by detect_board; the first pattern that matches
+# wins and its matched text becomes the reported paper code.
+PAPER_CODE_RES = [
+    ("aqa",     re.compile(r"\b(7408|8463|8461|8464|8462|8702|8700|7405|7402)/\d")),
+    ("edexcel", re.compile(r"\b1MA1/\d", re.I)),
+    ("ocr",     re.compile(r"\bH\d{3}/?\d?\b")),
+]
+# (board, pattern) branding pairs, also tried in list order; detect_board consults them only
+# after paper codes and the structural grammar signals have all failed.
+WORDMARK_RES = [
+    ("edexcel", re.compile(r"Pearson|Edexcel", re.I)),
+    ("ocr",     re.compile(r"Oxford Cambridge and RSA|\bOCR\b")),
+    ("aqa",     re.compile(r"\bAQA\b")),
+]
+# structural grammar signals — the board-specific tokens themselves. These survive OCR far better
+# than cover-page branding / paper codes (which mangle: "1MA1/1F" -> "IMA1 1", "Pearson Edexcel"
+# split across lines), so they're the robust fallback before wordmarks.
+EDX_SIG = re.compile(r"Total for Question\s+\d+\s+is\s+\d+\s+marks?", re.I)
+OCR_SIG = re.compile(r"Oxford Cambridge and RSA", re.I)
+AQA_SIG = re.compile(r"\[\s*\d+\s*marks?\s*\]")              # [N marks] — AQA, not OCR's bare [N]
+
+
+def detect_board(lines):
+    """Identify the exam board from the first 1500 lines; return (board, paper_code|None).
+
+    Evidence is consulted from most to least authoritative: a paper code (its matched text is
+    returned as the code), then a structural grammar signal (Edexcel "Total for Question",
+    the OCR organisation name, at least three AQA "[N marks]" tokens), then a wordmark.
+    With no evidence at all the board defaults to "aqa".
+    """
+    blob = "\n".join(ln.text for ln in lines[:1500])         # whole front + body, not just cover
+    for name, pattern in PAPER_CODE_RES:
+        hit = pattern.search(blob)
+        if hit:
+            return name, hit.group(0)
+    if EDX_SIG.search(blob):
+        board = "edexcel"
+    elif OCR_SIG.search(blob):
+        board = "ocr"
+    elif len(AQA_SIG.findall(blob)) >= 3:
+        board = "aqa"
+    else:
+        # first wordmark that matches, else the safe default
+        board = next((name for name, pattern in WORDMARK_RES if pattern.search(blob)), "aqa")
+    return board, None
+
+
+# ----------------------------------------------------------------- front matter
+def extract_front_matter(lines, board, code):
+    """Pull cover-page metadata out of the first 400 lines.
+
+    Args:
+        lines: sequence of objects exposing a `text` attribute.
+        board: "aqa", "edexcel" or "ocr" (any other value raises KeyError).
+        code: detected paper code, or None.
+
+    Returns:
+        dict that always has "exam_board" and, when the matching phrase is found, any of
+        "paper_code", "qualification", "subject", "tier", "time_allowed", "max_marks",
+        "session" (formatted "Mon YYYY").
+    """
+    blob = "\n".join(ln.text for ln in lines[:400])
+    fm = {"exam_board": {"aqa": "AQA", "edexcel": "Pearson Edexcel", "ocr": "OCR"}[board]}
+    if code:
+        fm["paper_code"] = code
+    m = re.search(r"\b(GCSE|GCE|A-?level|AS)\b\s+([A-Z][A-Za-z ]+)", blob)
+    if m:
+        fm["qualification"] = m.group(1).upper().replace("-", "")
+        fm["subject"] = m.group(2).split("\n")[0].strip().title()
+    m = re.search(r"(Higher|Foundation)\s+Tier", blob, re.I)
+    if m:
+        fm["tier"] = m.group(1).title()
+    m = re.search(r"Time\s+allowed[:\s]+([0-9].*?(?:hour|minute)s?[^\n]*)", blob, re.I)
+    if m:
+        fm["time_allowed"] = m.group(1).strip()
+    # authoritative paper-total phrasings first, then the generic fallback
+    m = (re.search(r"TOTAL FOR PAPER IS\s+(\d{2,3})\s+MARKS", blob, re.I)
+         or re.search(r"total mark for this paper is\s+(\d{2,3})", blob, re.I)
+         or re.search(r"(?:maximum mark|maximum mark for this paper)\D{0,8}(\d{2,3})", blob, re.I))
+    if m:
+        fm["max_marks"] = int(m.group(1))
+    # Accept only real month names/abbreviations: a loose "Mar\w*" style prefix also matches
+    # ordinary words ("Marks 80" -> a bogus "Mar 2080" session). Full names are listed before
+    # their abbreviations so the longest form wins, and the letter look-ahead rejects words
+    # that merely start with a month abbreviation.
+    m = re.search(
+        r"\b(January|February|March|April|May|June|July|August|September|October|November"
+        r"|December|Sept|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|Nov|Dec)(?![A-Za-z])"
+        r"\s*\.?\s*(\d{4}|\d{2})(?!\d)", blob)
+    if m:
+        year = m.group(2)
+        # A four-digit year is kept verbatim; a two-digit year is read as 20YY.
+        fm["session"] = f"{m.group(1)[:3]} {year if len(year) == 4 else '20' + year}"
+    return fm
+
+
+# ====================================================================== AQA
+# --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
+PART_RE = re.compile(r"^(\d{2})\.(\d)$")     # 01.2
+NUM_RE  = re.compile(r"^(\d{2})$")           # 08
+DIG_RE  = re.compile(r"^(\d)$")              # 4
+# A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
+# ("01.1 An atom of ...") rather than as a standalone box (GCSE 8463). Recover it as a prefix,
+# but only inside the tight left label column — body text (incl. decimals like "10.5 N") sits
+# at l>=92, so this column gate is the precision filter that keeps false positives out.
+# real part decimals run .1-.9 (never .0), so [1-9] rejects decimal content like "10.0" that
+# happens to land in the label column; ".0" is reserved for our MCQ top-level placeholders.
+PART_PREFIX = re.compile(r"^\s*(\d)\s*(\d)\s*\.\s*([1-9])(?=\s|$)")   # "01.1 ..." / "0 1 . 1 ..."
+LABEL_COL_MAX = 75                                                  # left edge of the label box
+MIN_MCQ_RUN = 5         # a real Section-B MCQ run is long+contiguous; fewer = stray page numbers
+# NOTE(review): the value is 60 while the trailing comment speaks of t<~30. aqa_questions_rapid
+# keeps MCQ candidates whose bbox t >= FOOTER_T; confirm which figure is intended.
+FOOTER_T = 60           # bbox bottom-left origin: t<~30 is the page-number footer, not content
+
+
+# A-level Section B is multiple-choice: bare sequential top-level numbers ("07 Which two...",
+# or a lone "07") with no decimal part. They render glued in the label column. The sequence
+# gate (each accepted number == previous + 1) is the precision filter that rejects OCR noise
+# (misread "60", "10 6") — exactly as the OCR-board grammar gates bare integers.
+MCQ_TOP = re.compile(r"^(\d{2})(?:\s+[A-Z(].*|\s*)$")
+
+
+def _rapid_pages(rapid_glob):
+    """Yield (page_no, doc) in NUMERIC page order (glob sorts lexically: p1,p10,p2...).
+
+    Args:
+        rapid_glob: glob pattern selecting the per-page JSON dumps; the page number is read
+            from the `p<digits>.json` part of each file NAME.
+
+    Yields:
+        (page_no, parsed_json) pairs. Files whose name carries no `p<digits>.json` token
+        (e.g. a summary file caught by a broad glob) are skipped rather than crashing the run.
+    """
+    page_rx = re.compile(r"p(\d+)\.json")
+    numbered = []
+    for fn in glob.glob(rapid_glob):
+        # Match against the file name only, so digits in a parent directory cannot be
+        # mistaken for the page number.
+        m = page_rx.search(os.path.basename(fn))
+        if m:
+            numbered.append((int(m.group(1)), fn))
+    for pg, fn in sorted(numbered):
+        # Context manager: each dump's handle is closed before the next file is opened.
+        with open(fn, encoding="utf-8") as fh:
+            yield pg, json.load(fh)
+
+
+def aqa_questions_rapid(rapid_glob):
+    """Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
+      * GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
+      * A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
+      * A-level Section-B multiple choice: bare sequential top-levels -> NN.0.
+
+    Args:
+        rapid_glob: glob pattern of the per-page JSON dumps (read through _rapid_pages).
+
+    Returns:
+        dict mapping label ("NN.M", or "NN.0" for a multiple-choice question) to
+        {"q", "page", "bbox", "marks", "regions"}; "marks" is 1 for ".0" labels, else None.
+        The first sighting of a label wins (setdefault), later duplicates are ignored.
+    """
+    parts = {}
+    mcq_cands = []                       # (page, NN, bbox) bare top-level candidates, in order
+    for pg, d in _rapid_pages(rapid_glob):
+        margin = []                      # (bbox, space-stripped text) of left-margin items on this page
+        for t in d.get("texts", []):
+            raw = (t.get("text") or "").strip()
+            s = raw.replace(" ", "")
+            prov = t.get("prov") or []
+            bb = prov[0].get("bbox") if prov else None
+            # only items with geometry whose left edge lies within l <= 140 are considered
+            if bb is None or bb["l"] > 140:
+                continue
+            margin.append((bb, s))
+            m = PART_RE.match(s)
+            if m and m.group(2) != "0":
+                # standalone "NN.M" box
+                parts.setdefault(f"{m.group(1)}.{m.group(2)}", {"page": pg, "bbox": bb})
+            elif bb["l"] <= LABEL_COL_MAX:
+                # label glued to the question text: accepted only inside the label column
+                mp = PART_PREFIX.match(raw)
+                if mp:
+                    parts.setdefault(f"{mp.group(1)}{mp.group(2)}.{mp.group(3)}",
+                                     {"page": pg, "bbox": bb})
+                elif bb["t"] >= FOOTER_T:          # skip page-number footers (page N -> "N")
+                    mc = MCQ_TOP.match(raw)
+                    if mc:
+                        mcq_cands.append((pg, mc.group(1), bb))
+        # Pair a two-digit number box with a single-digit box on the same row: vertical
+        # centres less than 12 apart and the digit not left of the number -> label "NN.D",
+        # carrying the number box's geometry.
+        nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
+        digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
+        for nbb, nn in nums:
+            ny = (nbb["t"] + nbb["b"]) / 2
+            for dbb, dd in digs:
+                dy = (dbb["t"] + dbb["b"]) / 2
+                if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
+                    parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
+    # Section B: walk MCQ candidates in reading order, accept the next number in sequence only
+    structured_q = {int(lab.split(".")[0]) for lab in parts}
+    # the MCQ run is expected to start right after the highest structured question number
+    expect = (max(structured_q) + 1) if structured_q else 1
+    mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0)))   # page, then top-down
+    cand = {}                            # nn -> (page, bbox), first occurrence in reading order
+    for pg, nn, bb in mcq_cands:
+        cand.setdefault(int(nn), (pg, bb))
+    # Walk the sequence: take the exact expected number when present; only jump a small gap
+    # (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't
+    # truncate the section. Out-of-window noise (misread "60") never enters.
+    seq = []
+    while True:                          # terminates: `expect` strictly increases on every pass
+        if expect in cand and expect not in structured_q:
+            seq.append((expect, cand[expect]))
+            expect += 1
+            continue
+        nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
+        if nxt:
+            expect = min(nxt)
+            continue
+        break
+    # Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
+    # paper with no MCQ section (a GCSE structured paper yields a short, gappy set; a real MCQ
+    # section is a long contiguous run).
+    if len(seq) >= MIN_MCQ_RUN:
+        for n, (pg, bb) in seq:
+            parts.setdefault(f"{n:02d}.0", {"page": pg, "bbox": bb})
+    # In the rapid path every ".0" label is a Section-B multiple-choice question, worth 1 mark
+    # each (they carry no "[N marks]" token for geometry to bind). Structured parts stay None
+    # until attach_marks_by_geometry fills them from the marks list.
+    return {lab: {"q": lab.split(".")[0], "page": v["page"], "bbox": v["bbox"],
+                  "marks": (1 if lab.endswith(".0") else None), "regions": []}
+            for lab, v in parts.items()}
+
+
+# --- AQA text grammar (born-digital AQA papers, e.g. 7408 with a text layer) ------
+# "[N marks]" / "[1 mark]" token, case-insensitive; group 1 is the mark count.
+AQA_MARK = re.compile(r"\[\s*(\d+)\s*marks?\s*\]", re.I)
+
+
+# AQA boxed labels render SPACE-SEPARATED in pdftotext ("0 1 . 1"); decimal content
+# ("10.5 N") does not. `pdftotext -bbox` normalises gaps to single spaces, while `-layout`
+# preserved wider runs, so the top-box grammar tolerates either one-or-more spaces.
+AQA_PART_BOX = re.compile(r"^\s*(\d)\s+(\d)\s*\.\s*(\d)(?=\s|$)")        # 0 1 . 1
+AQA_TOP_BOX  = re.compile(r"^\s*(\d)\s+(\d)\s+(?=[A-Z(])")              # 0 2 Carbon...
+
+
+def aqa_questions_text(lines):
+    """AQA text grammar: recover "NN.M" parts (and bare "NN.0" top-levels) from a line stream.
+
+    A row opening with a spaced part box ("0 1 . 1") starts part NN.M; otherwise a row opening
+    with a spaced top-level box followed by text ("0 2 Carbon...") starts placeholder NN.0.
+    The first "[N marks]" token seen while a part is current supplies its marks. A ".0"
+    placeholder is dropped when its question also has numbered parts.
+
+    Returns:
+        dict mapping label to {"q", "page", "bbox", "marks", "regions"}.
+    """
+    def _blank(q, line):
+        return {"q": q, "page": line.page, "bbox": _prefix_bbox(line),
+                "marks": None, "regions": []}
+
+    parts = {}
+    cur = None
+    for line in lines:
+        text = line.text
+        part_hit = AQA_PART_BOX.match(text)
+        if part_hit:
+            tens, units, sub = part_hit.groups()
+            cur = parts.setdefault(f"{tens}{units}.{sub}", _blank(tens + units, line))
+        else:
+            top_hit = AQA_TOP_BOX.match(text)
+            if top_hit:
+                q = "".join(top_hit.groups())
+                cur = parts.setdefault(f"{q}.0", _blank(q, line))
+        mark_hit = AQA_MARK.search(text)
+        if mark_hit is None or cur is None or cur.get("marks") is not None:
+            continue
+        cur["marks"] = int(mark_hit.group(1))
+    # questions owning at least one real numbered part lose their ".0" placeholder
+    with_real_parts = {v["q"] for k, v in parts.items() if k != f"{v['q']}.0"}
+    for q in with_real_parts:
+        parts.pop(f"{q}.0", None)
+    return parts
+
+
+# ====================================================================== Edexcel
+# "Total for Question N is M marks": group 1 = question number, group 2 = total marks.
+EDX_TOTAL = re.compile(r"Total for Question\s+(\d+)\s+is\s+(\d+)\s+marks?", re.I)
+EDX_LEAD  = re.compile(r"^\s*(\d{1,2})\s+(.*)$")        # number, gap, then the rest of the line
+EDX_PART  = re.compile(r"\(([a-h])\)")                  # may appear inline after the number
+# roman sub-part opener at the start of a row: "(i)", "(iv)", ...
+EDX_SUB   = re.compile(r"^\s*\(([ivx]{1,4})\)")
+# a row holding nothing but a bracketed number, "(3)": the per-part mark
+EDX_MARK  = re.compile(r"^\s*\((\d+)\)\s*$")
+
+
+def edexcel_questions(lines):
+    """Edexcel grammar: recover parts from a line stream anchored on "Total for Question" rows.
+
+    Pass 1 collects every "Total for Question N is M marks" row. Pass 2 walks the lines:
+    a row led by an anchored number (followed by an upper-case letter or "(") opens question N,
+    "(a)".."(h)" open lettered parts, roman "(i)".. open sub-parts, and a row consisting only
+    of "(N)" sets the marks of the most recently added part.
+
+    Args:
+        lines: iterable (walked twice) of objects exposing `text`, `page` and `bbox`.
+
+    Returns:
+        (parts, {}, anchors): `parts` maps label -> {"q", "page", "bbox", "marks", "regions"};
+        the second element is always an empty dict; `anchors` maps int question number ->
+        (total marks, anchor line).
+    """
+    # anchor top-level numbers on the robust "Total for Question N is M" signal (precision)
+    anchors = {}            # qnum -> (total marks, anchor line)
+    for l in lines:
+        m = EDX_TOTAL.search(l.text)
+        if m:
+            anchors[int(m.group(1))] = (int(m.group(2)), l)
+    parts = {}
+    haspart = set()         # questions that own lettered parts
+    curq = curlet = lastlab = None
+
+    def add(lab, q, l):
+        # register a part (first sighting wins) and remember it as the target for the next mark
+        nonlocal lastlab
+        parts.setdefault(lab, {"q": q, "page": l.page, "bbox": _prefix_bbox(l, 40), "marks": None, "regions": []})
+        lastlab = lab
+
+    for l in lines:
+        if EDX_TOTAL.search(l.text):
+            # the total row closes the current question
+            curq = curlet = None
+            continue
+        ml = EDX_LEAD.match(l.text)
+        # a leading number only counts when a Total row exists for it AND it is followed by
+        # text starting upper-case or by "(" — this filters numeric content rows
+        if ml and int(ml.group(1)) in anchors and (ml.group(2)[:1].isupper()
+                                                    or ml.group(2).lstrip().startswith("(")):
+            curq, rest = ml.group(1), ml.group(2)
+            curlet = None
+            inline = EDX_PART.search(rest)             # capture "(a)" sharing the lead line
+            if inline:
+                curlet = inline.group(1); haspart.add(curq); add(f"{curq}{curlet}", curq, l)
+            continue
+        if curq is None:
+            continue
+        mp = EDX_PART.match(l.text.lstrip())
+        if mp:
+            curlet = mp.group(1); haspart.add(curq); add(f"{curq}{curlet}", curq, l)
+        ms = EDX_SUB.match(l.text)
+        if ms and curlet:
+            add(f"{curq}{curlet}{ms.group(1)}", curq, l)
+        mm = EDX_MARK.match(l.text)
+        if mm and lastlab:
+            # a later "(N)" row overwrites an earlier one for the same part
+            parts[lastlab]["marks"] = int(mm.group(1))
+    # part-less questions: one part carrying the authoritative Total-for-Question mark
+    for q, (total, anchor_line) in anchors.items():
+        if str(q) not in haspart:
+            parts.setdefault(str(q), {"q": str(q), "page": anchor_line.page,
+                                      "bbox": _prefix_bbox(anchor_line, 40),
+                                      "marks": total, "regions": []})
+    return parts, {}, anchors
+
+
+# ====================================================================== OCR
+# lettered part opener at the start of a row: "(a)".."(h)"
+OCR_PART = re.compile(r"^\s*\(([a-h])\)")
+# roman sub-part opener at the start of a row: "(i)", "(iv)", ...
+OCR_SUB  = re.compile(r"^\s*\(([ivx]{1,4})\)")
+# bare bracketed mark "[N]"; group 1 is the mark count
+OCR_MARK = re.compile(r"\[(\d+)\]")
+# starred part "(b)*" or starred number "7*".
+# NOTE(review): ocr_questions uses its own inline pattern for the star flag; this constant
+# is not referenced in the code visible here — confirm it is still needed.
+OCR_EXT  = re.compile(r"^\s*\(([a-h])\)\s*\*|^\s*(\d{1,2})\s*\*")
+
+
+def ocr_questions(lines):
+    """OCR-board grammar: recover questions, lettered parts and roman sub-parts from lines.
+
+    Top-level questions must arrive in sequence (1, 2, 3, ...). When a question number is
+    missing, a question labelled "~K" is inferred from a fresh "(a)". "[N]" tokens set the
+    marks of the most recently added part of the current question.
+
+    Args:
+        lines: iterable of objects exposing `text`, `page` and `bbox`.
+
+    Returns:
+        dict mapping label ("3", "3a", "3aii", "~1a", ...) to a part dict with "q", "page",
+        "bbox", "marks", "regions" and, for lettered/sub parts, "extended".
+    """
+    parts = {}
+    curq = curlet = None
+    expect = 1
+    inferred = 0          # OCR may drop the margin question number; infer from part structure
+    for l in lines:
+        # top-level = the NEXT integer in sequence, gap, then question text OR a part opener "(a)"
+        # (Q3 opens straight into (a)). Sequence gate = the precision filter.
+        ml = re.match(r"^\s*(\d{1,2})\s+(\(|[A-Z])", l.text)
+        if ml and int(ml.group(1)) == expect:
+            curq = ml.group(1); curlet = None; expect += 1
+            # "_lead" marks the question-level placeholder; it is removed again as soon as
+            # the question gains a lettered part, and stripped from the result at the end
+            parts.setdefault(curq, {"q": curq, "page": l.page, "bbox": _prefix_bbox(l, 36),
+                                    "marks": None, "regions": [], "_lead": True})
+        if curq is None:
+            # number was OCR-dropped: start an inferred question on its first part "(a)"
+            m0 = OCR_PART.match(l.text.lstrip())
+            if m0 and m0.group(1) == "a":
+                inferred += 1; curq = f"~{inferred}"; curlet = None
+            else:
+                continue
+        # "(x)*" anywhere on the row flags an extended-response part
+        ext = bool(re.search(r"\(\s*[a-h]\s*\)\s*\*", l.text))
+        mp = OCR_PART.match(l.text)
+        if mp:
+            # a repeat "(a)" while this question already owns one => next question, number dropped
+            if mp.group(1) == "a" and f"{curq}a" in parts:
+                inferred += 1; curq = f"~{inferred}"
+            curlet = mp.group(1)
+            parts.pop(curq, None)        # drop the question-level placeholder, if any
+            lab = f"{curq}{curlet}"
+            parts.setdefault(lab, {"q": curq, "page": l.page, "bbox": _prefix_bbox(l, 36),
+                                   "marks": None, "regions": [], "extended": ext})
+        ms = OCR_SUB.match(l.text)
+        if ms and curlet:
+            lab = f"{curq}{curlet}{ms.group(1)}"
+            parts.setdefault(lab, {"q": curq, "page": l.page, "bbox": _prefix_bbox(l, 36),
+                                   "marks": None, "regions": [], "extended": ext})
+        mm = OCR_MARK.search(l.text)
+        if mm:
+            # bind to the most recently added non-placeholder part of the current question;
+            # a question that only has its "_lead" placeholder therefore receives no marks
+            sib = [k for k in parts if parts[k]["q"] == curq and not parts[k].get("_lead")]
+            if sib:
+                parts[sib[-1]]["marks"] = int(mm.group(1))
+    for v in parts.values():
+        v.pop("_lead", None)
+    return parts
+
+
+# ====================================================================== shared layers
+# Docling layout label -> taxonomy region type. docling_regions ignores any item whose label
+# is not a key of this map.
+LABEL_TO_TAXONOMY = {
+    "checkbox_unselected": "mcq_option", "checkbox_selected": "mcq_option",
+    "picture": "context_figure", "table": "context_data", "caption": "context_caption",
+    "page_header": "furniture", "page_footer": "furniture",
+    "section_header": "heading", "list_item": "instruction",
+}
+
+
+def docling_regions(doc):
+    """Map Docling items onto taxonomy regions.
+
+    Walks "texts", "pictures" and "tables" in that order. An item's label defaults to the
+    singular of its collection name ("text", "picture", "table"). Items whose label has no
+    taxonomy mapping, or that carry no bbox in their first provenance entry, are skipped.
+
+    Returns:
+        list of {"type", "docling_label", "page", "bbox", "text"} with text cut to 80 chars.
+    """
+    found = []
+    for collection in ("texts", "pictures", "tables"):
+        fallback_label = collection[:-1]
+        for item in doc.get(collection, []):
+            label = item.get("label", fallback_label)
+            kind = LABEL_TO_TAXONOMY.get(label)
+            prov = item.get("prov") or []
+            if not kind or not prov:
+                continue
+            first = prov[0]
+            box = first.get("bbox")
+            if box is None:
+                continue
+            found.append({"type": kind, "docling_label": label, "page": first.get("page_no"),
+                          "bbox": box, "text": (item.get("text") or "")[:80]})
+    return found
+
+
+def merge_gemma(parts, gemma_dir):
+    """Attach gemma4:e4b answer_regions (#3) to parts by for_part; gap-fill missing marks.
+
+    Reads every `p*.json` file in `gemma_dir`, in sorted file-name order, and mutates `parts`
+    in place.
+
+    Args:
+        parts: label -> part dict (needs a "regions" list; "marks" may be None).
+        gemma_dir: directory holding the per-page JSON dumps.
+
+    Returns:
+        (n_reg, n_fill): number of regions attached and number of marks filled in.
+    """
+    n_reg = n_fill = 0
+    for fn in sorted(glob.glob(os.path.join(gemma_dir, "p*.json"))):
+        # Context manager: each dump's handle is closed before the next file is opened.
+        with open(fn, encoding="utf-8") as fh:
+            d = json.load(fh)
+        # `or []`: model-generated JSON may carry an explicit null instead of omitting the key.
+        for r in d.get("answer_regions") or []:
+            lab = _norm_label(r.get("for_part", ""))
+            if lab in parts:
+                parts[lab]["regions"].append({"type": r.get("kind", "answer_lines"),
+                                              "source": "gemma"})
+                n_reg += 1
+        for qp in d.get("question_parts") or []:
+            lab = _norm_label(qp.get("label", ""))
+            # only fill a gap; never overwrite marks the grammar already recovered
+            if lab in parts and parts[lab].get("marks") is None and qp.get("marks") is not None:
+                parts[lab]["marks"] = qp["marks"]
+                n_fill += 1
+    return n_reg, n_fill
+
+
+def _norm_label(s):
+    """Normalise a model-emitted part label: trim whitespace and turn "_" into ".".
+
+    Example: "01_2" -> "01.2". No digits are added or removed, so a short form such as "0_4"
+    becomes "0.4" and is returned unchanged beyond that. None is treated as "".
+    """
+    return (s or "").strip().replace("_", ".")
+
+
+def extract_tables(parts, doc, granite="off", pdf=None, cache_glob=None):
+    """Selective table-cell extraction (PLAN.md §B).
+
+    Standard TableFormer grids are always collected. When `granite` is not "off", Granite
+    <otsl> tables are added for the router-flagged pages: from cached doctags
+    (granite="cached") or from a live pass (granite="live", which needs `pdf`). On a page
+    Granite covers, its tables replace the standard ones.
+
+    Side effect: every part that ends up with a truthy "tables" entry gets has_table=True.
+
+    Returns:
+        (data_tables, all_tables): the result of attaching the combined tables to `parts`,
+        and the combined table list itself.
+    """
+    standard = tbl_mod.tables_from_standard(doc)
+    granite_tables = []
+    if granite != "off":
+        flagged = tbl_mod.candidate_pages(doc)
+        if granite == "cached":
+            doctags = tbl_mod._load_cached_doctags(cache_glob or "")
+            for page_no in flagged:
+                for table in tbl_mod.parse_otsl(doctags.get(page_no, "")):
+                    table["page"] = page_no
+                    granite_tables.append(table)
+        elif granite == "live" and pdf:
+            granite_tables = tbl_mod.granite_tables(pdf, flagged, cached_glob=cache_glob)
+    covered = {table["page"] for table in granite_tables}
+    combined = granite_tables + [table for table in standard if table["page"] not in covered]
+    data = tbl_mod.attach_to_questions(combined, parts)
+    for part in parts.values():
+        if part.get("tables"):
+            part["has_table"] = True
+    return data, combined
+
+
+def attach_marks_by_geometry(parts, doc):
+    """AQA Docling path: bind each "[N marks]" token to the vertically nearest part.
+
+    Every "[N marks]" occurrence in the Docling texts is collected with the page and bbox of
+    its text item. For each one, the part on the same page whose bbox centre is closest in y
+    is chosen (parts without a bbox rank last) and receives the mark only if it has none yet;
+    otherwise the token is left unbound.
+
+    Returns:
+        (n, marks): number of marks assigned, and the full list of (page, bbox, value) tokens.
+    """
+    def _gap(part, y):
+        box = part.get("bbox")
+        if not box:
+            return 1e9
+        return abs((box.get("t", 0) + box.get("b", 0)) / 2 - y)
+
+    marks = []
+    for item in doc.get("texts", []):
+        prov = item.get("prov") or []
+        first = prov[0] if prov else None
+        box = first.get("bbox") if first is not None else None
+        page = first.get("page_no") if first is not None else None
+        marks.extend((page, box, int(hit.group(1)))
+                     for hit in AQA_MARK.finditer(item.get("text") or ""))
+
+    on_page = {}
+    for label, part in parts.items():
+        if part.get("page") is not None:
+            on_page.setdefault(part["page"], []).append((label, part))
+
+    assigned = 0
+    for page, box, value in marks:
+        candidates = on_page.get(page, [])
+        if not candidates or box is None:
+            continue
+        centre = (box["t"] + box["b"]) / 2
+        _, nearest = min(candidates, key=lambda entry: _gap(entry[1], centre))
+        if nearest.get("marks") is None:
+            nearest["marks"] = value
+            assigned += 1
+    return assigned, marks
+
+
+# ----------------------------------------------------------------- assembly + coverage
+def build_questions(parts):
+    """Group parts by question into the output list.
+
+    Questions are ordered by (length of the question id, id), which puts "2" before "10";
+    parts within a question are ordered by label. Missing optional fields default to
+    regions=[], has_table=False, extended=False.
+
+    Returns:
+        list of {"question": q, "parts": [{"label", "page", "bbox", "marks", "regions",
+        "has_table", "extended"}, ...]}.
+    """
+    grouped = {}
+    for label, part in parts.items():
+        grouped.setdefault(part["q"], []).append(label)
+
+    def _view(label):
+        part = parts[label]
+        return {"label": label, "page": part.get("page"),
+                "bbox": part.get("bbox"),   # label geometry (None for born-digital text)
+                "marks": part.get("marks"),
+                "regions": part.get("regions", []),
+                "has_table": part.get("has_table", False),
+                "extended": part.get("extended", False)}
+
+    ordered = sorted(grouped, key=lambda q: (len(q), q))
+    return [{"question": q, "parts": [_view(label) for label in sorted(grouped[q])]}
+            for q in ordered]
+
+
+# Built-in ground-truth part labels for the physics paper (the regression-guard label set
+# that recall is scored against).
+GT_PARTS_PHYSICS = ["01.1","01.2","01.3","01.4","02.1","02.2","02.3","02.4","03.1","03.2","03.3","03.4",
+    "04.1","04.2","04.3","04.4","04.5","05.1","05.2","05.3","05.4","06.1","06.2","06.3",
+    "07.1","07.2","07.3","08.1","08.2","08.3","08.4","08.5","09.1","09.2","09.3",
+    "10.1","10.2","10.3","11.1","11.2","11.3","11.4"]
+
+# official paper maxima — the strongest grammar sanity check (marks_sum should match)
+# Keys are paper-code PREFIXES: expected_max() matches them with str.startswith.
+EXPECTED_MAX = {"8463": 100, "7408": 85, "8461": 100, "1MA1": 80, "H556": 70}
+
+
+def expected_max(code):
+    """Return the official maximum mark for a paper code, or None when unknown.
+
+    The code is matched by prefix against EXPECTED_MAX (first matching entry wins);
+    a falsy code (None or "") yields None.
+    """
+    if not code:
+        return None
+    return next((total for prefix, total in EXPECTED_MAX.items() if code.startswith(prefix)),
+                None)
+
+
+def parse_text_by_board(lines, board):
+    """Run the grammar for `board` over a line stream and return its parts dict.
+
+    "edexcel" and "ocr" select their own grammars; every other value falls back to the AQA
+    text grammar. Used for both ground-truth text and born-digital papers.
+    """
+    if board == "edexcel":
+        # the Edexcel grammar returns a 3-tuple; only the parts dict is wanted here
+        return edexcel_questions(lines)[0]
+    grammar = ocr_questions if board == "ocr" else aqa_questions_text
+    return grammar(lines)
+
+
+def coverage(parts, gt_labels):
+    """Score recovered part labels against a ground-truth label list.
+
+    Returns:
+        {"coverage_pct", "recovered", "total", "missed"}: the percentage (one decimal place,
+        None when `gt_labels` is empty), the number of ground-truth labels found in `parts`,
+        len(gt_labels), and the sorted labels that were not recovered.
+    """
+    truth = set(gt_labels)
+    recovered = set(parts)
+    hits = sorted(truth & recovered)
+    missed = sorted(truth - recovered)
+    total = len(gt_labels)
+    pct = round(len(hits) / total * 100, 1) if gt_labels else None
+    return {"coverage_pct": pct, "recovered": len(hits), "total": total, "missed": missed}
+
+
+# ----------------------------------------------------------------- main
+def main():
+    """CLI entry point: parse one exam paper into a structured JSON file and print a summary.
+
+    The input is chosen from --auto / --text / --ocr / --docling. --auto either folds into
+    --text or prints the OCR routing hint and returns without writing anything. With none of
+    --text / --ocr / --docling the run falls back to the built-in AQA physics regression
+    inputs. The assembled result dict is written to --out as indented JSON.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--text", help="pdftotext source or .txt (born-digital / GT path)")
+    ap.add_argument("--auto", help="PDF: auto-detect text layer -> fast-path (--text), else "
+                                   "report the OCR path is required (no GPU work attempted here)")
+    ap.add_argument("--ocr", help="PDF to OCR live via dsync (uses the shared GPU)")
+    ap.add_argument("--docling", help="cached Docling JSON (OCR path without re-running dsync)")
+    ap.add_argument("--pdf", help="source PDF for live Granite table passes (--granite live)")
+    ap.add_argument("--rapid", help="AQA RapidOCR per-page glob (the v1 95% path)")
+    ap.add_argument("--gemma", help="gemma sweep dir with p*.json answer_regions")
+    ap.add_argument("--marks-fill", dest="marks_fill",
+                    help="gemma_marks.py fills JSON: fill marks=None parts (Edexcel/OCR (N)/[N] gap-fill)")
+    ap.add_argument("--granite", default="off", choices=["off", "cached", "live"],
+                    help="selective Granite-Docling <otsl> tables: cached doctags or live via dsync")
+    ap.add_argument("--granite-cache", default="results/VLM_granite_p*.doctags",
+                    help="glob of cached *.doctags for --granite cached / live fallback")
+    ap.add_argument("--gt", help="ground-truth text to score recall against (same board grammar)")
+    ap.add_argument("--board", default="auto", choices=["auto", "aqa", "edexcel", "ocr"])
+    ap.add_argument("--out", default="results/structured.json")
+    a = ap.parse_args()
+
+    # --- auto path selection -------------------------------------------------------------
+    # Caller need not know in advance whether the PDF is born-digital or image-only: detect the
+    # text layer and either fold --auto into the fast-path (--text) or report that the OCR path
+    # is required. This is ADDITIVE — it only resolves --auto; every other mode is untouched.
+    if a.auto:
+        chars, n_pages, cpp = text_layer_chars_per_page(a.auto)
+        if cpp >= TEXT_LAYER_MIN_CHARS_PER_PAGE:
+            print(f"auto-detect         : born-digital text layer "
+                  f"({chars} chars / {n_pages} pages = {cpp:.0f} chars/page "
+                  f">= {TEXT_LAYER_MIN_CHARS_PER_PAGE}) -> fast-path (pdftotext, no GPU)")
+            a.text = a.auto
+        else:
+            print(f"auto-detect         : NO usable text layer "
+                  f"({chars} chars / {n_pages} pages = {cpp:.1f} chars/page "
+                  f"< {TEXT_LAYER_MIN_CHARS_PER_PAGE}) -> OCR path required")
+            print("route               : run the OCR pipeline, e.g.")
+            print(f"                      python extract.py --ocr {a.auto}")
+            print("                      (AQA image-only papers use the RapidOCR margin-pass; "
+                  "see scripts/rapid_pass.py)")
+            return
+
+    # default invocation == v1 AQA physics regression guard
+    if not (a.text or a.ocr or a.docling):
+        a.docling = "results/E_tess_full.json"
+        a.rapid = a.rapid or "results/rapid_pages/p*.json"
+        a.gemma = a.gemma or "results/gemma_sweep_physics_200"
+        a.pdf = a.pdf or "samples/AQA-Physics-Paper-1H-2022-with-qr.pdf"
+        a.out = "results/physics_structured.json" if a.out == "results/structured.json" else a.out
+
+    doc = None
+    pages = []
+    if a.ocr:
+        try:
+            from . import dsync
+        except ImportError:  # pragma: no cover - CLI execution
+            import dsync
+        doc = dsync.convert_document(a.ocr, {"ocr_engine": "tesseract", "force_ocr": True})
+        lines = lines_from_docling(doc)
+    elif a.docling:
+        with open(a.docling) as fh:
+            doc = json.load(fh)
+        lines = lines_from_docling(doc)
+    else:
+        if a.text and a.text.endswith(".pdf"):
+            lines, pages = _bbox_lines_from_pdftotext(a.text)
+        else:
+            lines = lines_from_pdftext(a.text)
+
+    board, code = detect_board(lines)
+    if a.board != "auto":
+        board = a.board
+    fm = extract_front_matter(lines, board, code)
+
+    # --- questions: AQA uses the proven Docling+RapidOCR path when inputs exist ----------
+    if board == "aqa" and a.rapid and glob.glob(a.rapid):
+        parts = aqa_questions_rapid(a.rapid)
+        path_used = "aqa-docling+rapidocr (v1)"
+    else:
+        parts = parse_text_by_board(lines, board)
+        path_used = f"{board}-text-grammar"
+
+    # --- shared enrichment ---------------------------------------------------------------
+    regions = docling_regions(doc) if doc else []
+    n_mark_geo = 0
+    if doc and board == "aqa":
+        n_mark_geo, _ = attach_marks_by_geometry(parts, doc)
+    data_tables, all_tables = ([], [])
+    if doc:
+        data_tables, all_tables = extract_tables(parts, doc, granite=a.granite,
+                                                 pdf=(a.pdf or a.ocr), cache_glob=a.granite_cache)
+    n_tbl = sum(1 for v in parts.values() if v.get("has_table"))
+    tbl_pages = sorted({t["page"] for t in data_tables if t["page"]})
+    n_reg = n_fill = 0
+    if a.gemma and os.path.isdir(a.gemma):
+        n_reg, n_fill = merge_gemma(parts, a.gemma)
+    n_marks_fill = 0
+    if a.marks_fill and os.path.exists(a.marks_fill):
+        with open(a.marks_fill) as fh:
+            fills = json.load(fh).get("fills", {})
+        # only fill parts the grammar left without marks; never overwrite a parsed value
+        for lab, mk in fills.items():
+            if lab in parts and parts[lab].get("marks") is None:
+                parts[lab]["marks"] = int(mk); n_marks_fill += 1
+
+    questions = build_questions(parts)
+
+    # --- coverage ------------------------------------------------------------------------
+    if a.gt:
+        gt_lines = lines_from_pdftext(a.gt)
+        gt_parts = parse_text_by_board(gt_lines, board)
+        cov = coverage(parts, list(gt_parts))
+        cov["source"] = "gt-text-same-grammar"
+    elif board == "aqa" and "rapidocr" in path_used:
+        cov = coverage(parts, GT_PARTS_PHYSICS)
+        cov["source"] = "builtin-physics-gt"
+    else:
+        cov = {"coverage_pct": None, "note": "no GT provided"}
+
+    marks_known = sum(1 for v in parts.values() if v.get("marks") is not None)
+    marks_sum = sum(v["marks"] for v in parts.values() if v.get("marks") is not None)
+    exp_max = expected_max(code) or fm.get("max_marks")   # code-based, else front-matter total
+    # `not exp_max` also covers a front-matter total of 0, which would otherwise divide by zero
+    marks_check = (None if not exp_max else
+                   {"sum": marks_sum, "expected_max": exp_max,
+                    "pct": round(marks_sum / exp_max * 100, 1)})
+    result = {
+        "board": board, "paper_code": code, "front_matter": fm, "path": path_used,
+        "pages": pages,
+        "questions": questions,
+        "regions": regions,
+        "tables": data_tables,
+        "stats": {
+            "n_questions": len({v["q"] for v in parts.values()}),
+            "n_parts": len(parts),
+            "marks_parts_known": marks_known, "marks_sum": marks_sum,
+            "marks_check": marks_check,
+            "gemma_answer_regions": n_reg, "gemma_marks_filled": n_fill,
+            "gemma_marks_gapfilled": n_marks_fill,
+            "n_data_tables": len(data_tables),
+            "n_furniture_tables": sum(1 for t in all_tables if t["is_furniture"]),
+            "table_sources": {s: sum(1 for t in data_tables if t["source"] == s)
+                              for s in sorted({t["source"] for t in data_tables})},
+            "table_pages": tbl_pages,
+            "region_type_counts": {t: sum(1 for r in regions if r["type"] == t)
+                                   for t in sorted({r["type"] for r in regions})},
+        },
+        "coverage": cov,
+    }
+    # context manager so the output is flushed and closed before the "wrote" message
+    with open(a.out, "w") as fh:
+        json.dump(result, fh, indent=2)
+
+    print(f"board               : {board}  ({code or 'wordmark'})  [{path_used}]")
+    print(f"front-matter        : {', '.join(f'{k}={v}' for k,v in fm.items() if not isinstance(v,(list,dict)))}")
+    print(f"questions           : {result['stats']['n_questions']} top-level, {len(parts)} parts")
+    mc = f" | {marks_sum}/{exp_max} of max ({marks_check['pct']}%)" if marks_check else ""
+    print(f"marks               : {marks_known}/{len(parts)} parts known (sum {marks_sum}){mc}"
+          + (f"; +{n_mark_geo} by geometry" if n_mark_geo else ""))
+    print(f"gemma regions       : {n_reg} answer_regions, {n_fill} marks gap-filled"
+          + (f"; +{n_marks_fill} marks via --marks-fill" if n_marks_fill else ""))
+    print(f"tables              : {len(data_tables)} data table(s) "
+          f"{result['stats']['table_sources']} on pages {tbl_pages}; "
+          f"{result['stats']['n_furniture_tables']} furniture filtered; {n_tbl} parts flagged")
+    if cov.get("coverage_pct") is not None:
+        print(f"COVERAGE            : {cov['coverage_pct']}%  ({cov['recovered']}/{cov['total']})"
+              f"  missed: {cov['missed'][:8]}{'…' if len(cov['missed'])>8 else ''}  [{cov['source']}]")
+    print(f"-> wrote {a.out}")
+
+
+if __name__ == "__main__":
+    main()
--- a/api/services/docling/finalize.py
+++ b/api/services/docling/finalize.py
@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+"""
+finalize.py — produce the final corpus output bundle under results/final/.
+
+Runs the full pipeline (via the real module CLIs, so the bundle is reproducible) across the corpus:
+  * geometry papers (image-only / OCR-path): structured + furniture + bands + page_roles + template
+    + validate + overlays (template human-review view for ALL pages, rich debug for sample pages).
+  * born-digital fast-path papers: structured + validate (no geometry -> no overlays).
+Writes per-paper report.md, a human INDEX.md, and a machine catalog.json.
+
+Usage:
+  python finalize.py [--no-overlays]      # --no-overlays = JSON pipeline only (fast)
+"""
+import os, sys, glob, json, subprocess, argparse, datetime
+
+FINAL = "results/final"
+PY = sys.executable
+
+# ------------------------------------------------------------------ corpus manifest
+GEOMETRY = [
+    dict(slug="aqa-physics-8463-imageonly", title="AQA GCSE Physics 8463/1H (image-only)",
+         board="aqa", level="GCSE", path="image-only (RapidOCR margin-pass)",
+         pdf="samples/AQA-Physics-Paper-1H-2022-with-qr.pdf",
+         docling="results/E_tess_full.json", rapid="results/rapid_pages/p*.json",
+         extract=["--docling", "results/E_tess_full.json", "--rapid", "results/rapid_pages/p*.json",
+                  "--granite", "cached"]),
+    dict(slug="aqa-physics-7408-ocr", title="AQA A-level Physics 7408/1 (rasterised OCR)",
+         board="aqa", level="A-level", path="OCR (RapidOCR margin-pass + Section-B MCQ)",
+         pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
+         docling="results/rapid_7408/merged.json", rapid="results/rapid_7408/p*.json",
+         gt="results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt",
+         extract=["--docling", "results/rapid_7408/merged.json", "--rapid", "results/rapid_7408/p*.json",
+                  "--board", "aqa"]),
+    dict(slug="aqa-biology-8461-ocr", title="AQA GCSE Biology 8461/1H (rasterised OCR)",
+         board="aqa", level="GCSE", path="OCR (RapidOCR margin-pass)",
+         pdf="samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
+         docling="results/rapid_8461/merged.json", rapid="results/rapid_8461/p*.json",
+         gt="results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt",
+         extract=["--docling", "results/rapid_8461/merged.json", "--rapid", "results/rapid_8461/p*.json",
+                  "--board", "aqa"]),
+    dict(slug="edexcel-maths-1ma1-1h-ocr", title="Edexcel GCSE Maths 1MA1/1H (rasterised OCR)",
+         board="edexcel", level="GCSE-H", path="OCR + gemma marks gap-fill",
+         pdf="samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
+         docling="results/genreport/edexcel1h/ocr.json", rapid=None,
+         gt="results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt",
+         extract=["--docling", "results/genreport/edexcel1h/ocr.json", "--board", "edexcel",
+                  "--marks-fill", "results/genreport/edexcel1h/marks_fill.json"]),
+    dict(slug="edexcel-maths-1ma1-1f-ocr", title="Edexcel GCSE Maths 1MA1/1F (rasterised OCR)",
+         board="edexcel", level="GCSE-F", path="OCR + gemma marks gap-fill",
+         pdf="samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf",
+         docling="results/genreport/edexcel1f/ocr.json", rapid=None,
+         extract=["--docling", "results/genreport/edexcel1f/ocr.json", "--board", "edexcel",
+                  "--marks-fill", "results/genreport/edexcel1f/marks_fill.json"]),
+    dict(slug="ocr-physics-h556-ocr", title="OCR A-level Physics H556/3 (rasterised OCR)",
+         board="ocr", level="A-level", path="OCR + gemma marks gap-fill",
+         pdf="samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
+         docling="results/genreport/ocrh556/ocr.json", rapid=None,
+         gt="results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt",
+         extract=["--docling", "results/genreport/ocrh556/ocr.json", "--board", "ocr",
+                  "--marks-fill", "results/genreport/ocrh556/marks_fill.json"]),
+]
+FAST = [
+    dict(slug="aqa-physics-7408-fast", title="AQA A-level Physics 7408/1 (born-digital)", board="aqa",
+         level="A-level", pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
+         gt="results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt"),
+    dict(slug="aqa-biology-8461-fast", title="AQA GCSE Biology 8461/1H (born-digital)", board="aqa",
+         level="GCSE", pdf="samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
+         gt="results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt"),
+    dict(slug="edexcel-maths-1ma1-1h-fast", title="Edexcel GCSE Maths 1MA1/1H (born-digital)",
+         board="edexcel", level="GCSE-H", pdf="samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
+         gt="results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt"),
+    dict(slug="edexcel-maths-1ma1-1f-fast", title="Edexcel GCSE Maths 1MA1/1F (born-digital)",
+         board="edexcel", level="GCSE-F", pdf="samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf"),
+    dict(slug="ocr-physics-h556-fast", title="OCR A-level Physics H556/3 (born-digital)", board="ocr",
+         level="A-level", pdf="samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
+         gt="results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt"),
+    dict(slug="aqa-chemistry-8462-fast", title="AQA GCSE Chemistry 8462/1H (born-digital)", board="aqa",
+         level="GCSE", pdf="samples/chemistry-p1h-2023-qp.pdf"),
+    dict(slug="aqa-physics-8463-twin-fast", title="AQA GCSE Physics 8463/1H born-digital twin",
+         board="aqa", level="GCSE", pdf="samples/physics-p1h-2022-qp.pdf"),
+]
+
+
+def run(cmd):
+    """Run a script with the current interpreter (PY) and report whether it exited with 0.
+
+    ``cmd`` is the argument list that follows the interpreter. On a non-zero exit the
+    command line and the last 400 characters of its stderr are printed; nothing is raised.
+    """
+    proc = subprocess.run([PY, *cmd], capture_output=True, text=True)
+    succeeded = proc.returncode == 0
+    if not succeeded:
+        print(f"  ! FAILED: {' '.join(cmd)}\n{proc.stderr[-400:]}")
+    return succeeded
+
+
+def jload(p):
+    """Load JSON from path ``p``, returning {} when it cannot be read or parsed.
+
+    Best-effort on purpose: a pipeline step that failed leaves no (or a broken) file and
+    the callers still want a summary. Only I/O errors (OSError) and decode/parse errors
+    (ValueError, which includes json.JSONDecodeError) are absorbed; anything else propagates.
+    """
+    try:
+        with open(p) as fh:
+            return json.load(fh)
+    except (OSError, ValueError):
+        return {}
+
+
+def stats_from(struct, val):
+    """Flatten a structured.json dict and a validate.json dict into one summary record.
+
+    Missing or null sections are treated as empty, so two empty dicts give a record of
+    None values and empty lists. ``second_pass_slots`` lists the labels of the
+    ``question_sequence`` entries whose ``recovered`` flag is falsy.
+    """
+    stats = struct.get("stats", {}) or {}
+    marks = stats.get("marks_check") or {}
+    cov = struct.get("coverage", {}) or {}
+    summary = val.get("summary") or {}
+    unrecovered = []
+    for entry in val.get("question_sequence", []):
+        if not entry["recovered"]:
+            unrecovered.append(entry["label"])
+    return {
+        "board": struct.get("board"),
+        "paper_code": struct.get("paper_code"),
+        "n_questions": stats.get("n_questions"),
+        "n_parts": stats.get("n_parts"),
+        "marks_sum": marks.get("sum"),
+        "official_max": marks.get("expected_max"),
+        "marks_pct": marks.get("pct"),
+        "coverage_pct": cov.get("coverage_pct"),
+        "coverage_missed": cov.get("missed", []),
+        "validate_verdict": summary.get("worst_severity"),
+        "validate_flags": val.get("flags", []),
+        "questions_expected": summary.get("questions_expected"),
+        "questions_recovered": summary.get("questions_recovered"),
+        "second_pass_slots": unrecovered,
+    }
+
+
+def do_geometry(p, overlays):
+    """Run the geometry pipeline for one GEOMETRY manifest entry.
+
+    Invokes, in order, extract.py, furniture.py, bands.py, page_roles.py, template.py and
+    validate.py through run(), writing their JSON outputs under FINAL/<slug>/. When
+    ``overlays`` is true it also calls scripts/overlay.py twice: the template view, then the
+    debug view restricted to pages 1-5. run() reports a failed step but later steps still run.
+
+    Returns (stats, out_dir): the stats_from() summary of the structured and validate
+    JSON files, and the output directory.
+    """
+    out_dir = os.path.join(FINAL, p["slug"])
+    os.makedirs(out_dir, exist_ok=True)
+    artefacts = ("structured.json", "furniture.json", "bands.json", "page_roles.json",
+                 "template.json", "validate.json")
+    structured, furniture, bands_json, roles, template, validate = [
+        os.path.join(out_dir, name) for name in artefacts]
+
+    extract_cmd = ["extract.py", *p["extract"], "--out", structured]
+    if p.get("gt"):
+        extract_cmd.extend(["--gt", p["gt"]])
+    run(extract_cmd)
+
+    run(["furniture.py", p["docling"], "--out", furniture])
+
+    bands_cmd = ["bands.py", structured, "--docling", p["docling"], "--out", bands_json]
+    if p.get("rapid"):
+        bands_cmd.extend(["--rapid", p["rapid"]])
+    run(bands_cmd)
+
+    run(["page_roles.py", p["docling"], "--bands", bands_json, "--out", roles])
+    run(["template.py", "--structured", structured, "--bands", bands_json,
+         "--furniture", furniture, "--page-roles", roles, "--pdf", p["pdf"],
+         "--out", template])
+    run(["validate.py", structured, "--out", validate])
+
+    if overlays:
+        template_view = os.path.join(out_dir, "overlays", "template")
+        run(["scripts/overlay.py", structured, p["pdf"], "--template", template,
+             "--dpi", "120", "--out", template_view])
+        # rich debug view on the first few pages (cover + early questions)
+        debug_view = os.path.join(out_dir, "overlays", "debug")
+        run(["scripts/overlay.py", structured, p["pdf"], "--docling", p["docling"],
+             "--bands", bands_json, "--furniture", furniture, "--pages", "1,2,3,4,5",
+             "--dpi", "120", "--out", debug_view])
+    return stats_from(jload(structured), jload(validate)), out_dir
+
+
+def do_fast(p):
+    """Run the born-digital fast path (extract.py --text, then validate.py) for one FAST entry.
+
+    Outputs land in FINAL/<slug>/. Returns (stats, out_dir), where stats is the
+    stats_from() summary of the two JSON files that were (or should have been) written.
+    """
+    out_dir = os.path.join(FINAL, p["slug"])
+    os.makedirs(out_dir, exist_ok=True)
+    structured = os.path.join(out_dir, "structured.json")
+    validate = os.path.join(out_dir, "validate.json")
+
+    extract_cmd = ["extract.py", "--text", p["pdf"], "--out", structured]
+    if p.get("gt"):
+        extract_cmd.extend(["--gt", p["gt"]])
+    run(extract_cmd)
+    run(["validate.py", structured, "--out", validate])
+    return stats_from(jload(structured), jload(validate)), out_dir
+
+
+def per_paper_report(p, s, d, kind):
+    """Write ``report.md`` for one paper into directory ``d`` and return its overlay image count.
+
+    ``p`` is the manifest entry (title/slug/board/level), ``s`` the stats_from() record and
+    ``kind`` the path description; the exact value "born-digital fast-path" switches the
+    artifacts line to the no-geometry wording. The image count is the number of PNG files
+    found recursively under ``d``/overlays.
+    """
+    n_imgs = len(glob.glob(os.path.join(d, "overlays", "**", "*.png"), recursive=True))
+    lines = [f"# {p['title']}", "",
+             f"- **slug:** `{p['slug']}`  ·  **board:** {p['board']}  ·  **level:** {p['level']}  "
+             f"·  **path:** {kind}",
+             f"- **questions/parts:** {s['n_questions']} / {s['n_parts']}",
+             f"- **marks:** {s['marks_sum']}/{s['official_max']}"
+             + (f" ({s['marks_pct']}% of official max)" if s['marks_pct'] is not None else ""),
+             # the conditional binds loosest: the whole "coverage ... (missed ...)" string is
+             # used only when a percentage exists, otherwise the n/a line
+             f"- **coverage vs GT:** {s['coverage_pct']}%"
+             + (f"  (missed {s['coverage_missed'][:8]})" if s.get('coverage_missed') else "")
+             if s['coverage_pct'] is not None else "- **coverage vs GT:** n/a",
+             f"- **G6 verdict:** {s['validate_verdict']}",
+             ]
+    if s["validate_flags"]:
+        lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]
+    lines += ["", "**Artifacts:** `structured.json`, `validate.json`"
+              + (", `furniture.json`, `bands.json`, `page_roles.json`, `template.json`, "
+                 f"`overlays/` ({n_imgs} images)" if kind != "born-digital fast-path"
+                 else "  (born-digital: no page geometry → no overlays)")]
+    # the report contains non-ASCII characters ("·", "→"): write UTF-8 explicitly rather than
+    # the locale default, and close the handle deterministically
+    with open(os.path.join(d, "report.md"), "w", encoding="utf-8") as fh:
+        fh.write("\n".join(lines) + "\n")
+    return n_imgs
+
+
+def main():
+    """CLI entry point: process every GEOMETRY and FAST paper, then write catalog.json and INDEX.md.
+
+    --no-overlays skips the overlay rendering for geometry papers. Each paper gets its own
+    report.md; the catalog collects one record per paper-run (manifest fields + stats).
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--no-overlays", action="store_true")
+    a = ap.parse_args()
+    os.makedirs(FINAL, exist_ok=True)
+    catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
+               "papers": []}
+    total_imgs = 0
+
+    for p in GEOMETRY:
+        print(f"[geometry] {p['slug']}")
+        s, d = do_geometry(p, not a.no_overlays)
+        n = per_paper_report(p, s, d, p["path"])
+        total_imgs += n
+        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
+                                  "kind": "geometry", "path": p["path"], "dir": d,
+                                  "overlay_images": n, **s})
+    for p in FAST:
+        print(f"[fast] {p['slug']}")
+        s, d = do_fast(p)
+        per_paper_report(p, s, d, "born-digital fast-path")
+        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
+                                  "kind": "fast", "path": "born-digital fast-path", "dir": d, **s})
+
+    # close the catalog deterministically so it is complete on disk before the index is written
+    with open(os.path.join(FINAL, "catalog.json"), "w") as fh:
+        json.dump(catalog, fh, indent=2)
+    write_index(catalog, total_imgs)
+    print(f"\n-> {len(catalog['papers'])} papers, {total_imgs} overlay images -> {FINAL}/")
+
+
+def write_index(catalog, total_imgs):
+    """Write the human-readable INDEX.md for the bundle into FINAL.
+
+    ``catalog`` is the dict built by main(): ``generated_at`` plus a ``papers`` list whose
+    records carry ``kind`` ("geometry" or "fast") and the stats fields. One markdown table
+    is emitted per kind, followed by the per-paper directory layout. A missing coverage
+    value is rendered as "n/a" in both tables.
+    """
+    g = [p for p in catalog["papers"] if p["kind"] == "geometry"]
+    f = [p for p in catalog["papers"] if p["kind"] == "fast"]
+    L = ["# Final corpus output — exam-extraction spike", "",
+         f"Generated {catalog['generated_at']}. {len(catalog['papers'])} paper-runs across "
+         f"3 boards × 2 levels, both pipeline paths; {total_imgs} overlay debug images.", "",
+         "Each `<slug>/` holds the machine artifacts (JSON) + `report.md`; geometry papers also have "
+         "`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
+         "Machine catalog: `catalog.json`.", "",
+         "## Image-only / OCR-path (with geometry + overlays)", "",
+         "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 | Images |",
+         "|---|---|---|---|---|---|---|"]
+    for p in g:
+        cov = f"{p['coverage_pct']}%" if p['coverage_pct'] is not None else "n/a"
+        L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
+                 f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
+                 f"({p['marks_pct']}%) | {cov} | {p['validate_verdict']} | "
+                 f"{p['overlay_images']} |")
+    L += ["", "## Born-digital fast-path (CPU, no geometry)", "",
+          "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
+          "|---|---|---|---|---|---|"]
+    for p in f:
+        # the "%" belongs to a real percentage only; previously a missing value printed "n/a%"
+        cov = f"{p['coverage_pct']}%" if p['coverage_pct'] is not None else "n/a"
+        L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
+                 f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
+                 f"({p['marks_pct']}%) | {cov} | "
+                 f"{p['validate_verdict']} |")
+    L += ["", "## Per-paper directory layout", "```",
+          "<slug>/",
+          "  structured.json     extract.py output (questions->parts->marks/bbox/regions)",
+          "  validate.json       G6 consistency judge (confidence + flags)",
+          "  furniture.json      recurring-furniture mask + content margins   [geometry only]",
+          "  bands.json          main + part y-bands                          [geometry only]",
+          "  page_roles.json     per-page role + margin override              [geometry only]",
+          "  template.json       editable first-pass template (source/confirmed) [geometry only]",
+          "  overlays/template/  human-review view, all pages                 [geometry only]",
+          "  overlays/debug/     raw-detection view, sample pages             [geometry only]",
+          "  report.md           per-paper human summary", "```"]
+    # the index contains non-ASCII characters ("—", "×"): write UTF-8 explicitly and close the file
+    with open(os.path.join(FINAL, "INDEX.md"), "w", encoding="utf-8") as fh:
+        fh.write("\n".join(L) + "\n")
+
+
+if __name__ == "__main__":
+    main()
--- a/api/services/docling/furniture.py
+++ b/api/services/docling/furniture.py
@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+"""
+furniture.py — detect recurring page chrome by cross-page repetition; derive content margins;
+reclassify pictures (real figure vs barcode/QR/header furniture). The first-pass mask.
+
+Principle: an item at ~the same (x,y) on many pages is **chrome, not question content**. This
+needs no classifier — pure positional recurrence — and it solves the genuine gap the overlay
+surfaced (the app-generated QR top-right and the foot barcode being mislabelled context_figure),
+including the QR that bleeds past the margin. It also yields the content margins so stage-2 analysis
+can be fed only the question/response region.
+
+Outputs a mask + margins JSON, and an A/B summary (figure false-positives before vs after masking).
+
+Usage:
+  python furniture.py <docling_doc.json> [--freq 0.4] [--out results/furniture.json]
+"""
+import json, argparse
+from collections import defaultdict
+
+GRID = 24          # pt — position quantisation; items sharing a cell across pages are "recurring"
+
+
+def gather(doc):
+    """Collect every positioned text, picture and table item of a Docling document.
+
+    Only the first provenance entry of an item is used; items lacking a bbox or a page
+    number there are skipped. Each result carries page, kind (the collection name without
+    its trailing "s"), label (defaulting to the kind), bbox and the first 40 characters of text.
+    """
+    collected = []
+    for collection in ("texts", "pictures", "tables"):
+        kind = collection[:-1]
+        for item in doc.get(collection, []):
+            prov = item.get("prov") or []
+            if not prov:
+                continue
+            first = prov[0]
+            bbox, page = first.get("bbox"), first.get("page_no")
+            if not (bbox and page):
+                continue
+            collected.append({"page": page, "kind": kind, "label": item.get("label", kind),
+                              "bbox": bbox, "text": (item.get("text") or "")[:40]})
+    return collected
+
+
+def cell(bb):
+    """Quantise a bbox (keys l, r, t, b) to the GRID cell that contains its centre."""
+    centre_x = (bb["l"] + bb["r"]) / 2
+    centre_y = (bb["t"] + bb["b"]) / 2
+    return (round(centre_x / GRID), round(centre_y / GRID))
+
+
+def detect(items, n_pages, freq):
+    """Mark recurring items as furniture, in place, and return the recurring cells.
+
+    An item gets ``furniture=True`` when its position cell occurs on at least
+    ``freq * n_pages`` distinct pages, otherwise ``False``. The returned dict maps each
+    such cell to the number of pages it appears on.
+    """
+    keyed = [(cell(item["bbox"]), item) for item in items]
+    pages_by_cell = defaultdict(set)
+    for where, item in keyed:
+        pages_by_cell[where].add(item["page"])
+    threshold = freq * n_pages
+    recurring = {where: len(pages) for where, pages in pages_by_cell.items()
+                 if len(pages) >= threshold}
+    for where, item in keyed:
+        item["furniture"] = where in recurring
+    return recurring
+
+
+def content_margins(items):
+    """Derive content margins from the items not flagged as furniture.
+
+    Returns None when no such item exists. Otherwise returns ``content_x_band`` (the
+    5th-percentile left edge and 95th-percentile right edge, robust to strays) and
+    ``per_page``: for every page the max ``t``, min ``b``, min ``l`` and max ``r`` of its
+    content boxes, all rounded to one decimal place.
+    """
+    content = [item for item in items if not item["furniture"]]
+    if not content:
+        return None
+
+    left_edges = sorted(item["bbox"]["l"] for item in content)
+    right_edges = sorted(item["bbox"]["r"] for item in content)
+    lo_idx = max(0, len(left_edges) // 20)                                  # 5th pct
+    hi_idx = min(len(right_edges) - 1, len(right_edges) * 19 // 20)         # 95th pct
+    x_band = {"x_left": round(left_edges[lo_idx], 1),
+              "x_right": round(right_edges[hi_idx], 1)}
+
+    boxes_by_page = defaultdict(list)
+    for item in content:
+        boxes_by_page[item["page"]].append(item["bbox"])
+    per_page = {
+        page: {"top": round(max(box["t"] for box in boxes), 1),
+               "bottom": round(min(box["b"] for box in boxes), 1),
+               "left": round(min(box["l"] for box in boxes), 1),
+               "right": round(max(box["r"] for box in boxes), 1)}
+        for page, boxes in boxes_by_page.items()
+    }
+    return {"content_x_band": x_band, "per_page": per_page}
+
+
+def main():
+    """CLI entry point: build the furniture mask + content margins for one Docling JSON.
+
+    Reads the document given as the positional argument, flags recurring items as furniture
+    (threshold --freq), writes the result JSON to --out and prints an A/B summary of the
+    picture counts before and after masking.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("doc")
+    ap.add_argument("--freq", type=float, default=0.40, help="recurrence fraction => furniture")
+    ap.add_argument("--out", default="results/furniture.json")
+    a = ap.parse_args()
+    with open(a.doc) as fh:
+        doc = json.load(fh)
+    items = gather(doc)
+    n_pages = len({it["page"] for it in items})
+    fcells = detect(items, n_pages, a.freq)
+    margins = content_margins(items)
+
+    pics = [it for it in items if it["kind"] == "picture"]
+    pics_furn = [it for it in pics if it["furniture"]]
+    txt_furn = [it for it in items if it["kind"] == "text" and it["furniture"]]
+    # break furniture pictures down by cell (which recurring object)
+    by_cell = defaultdict(list)
+    for it in pics_furn:
+        by_cell[cell(it["bbox"])].append(it)
+
+    result = {
+        "n_pages": n_pages, "freq_threshold": a.freq,
+        "furniture_cells": {f"{c[0]},{c[1]}": n for c, n in sorted(fcells.items())},
+        "content_margins": margins,
+        "ab_test_figures": {
+            "context_figure_before_mask": len(pics),
+            "context_figure_after_mask": len(pics) - len(pics_furn),
+            "removed_as_furniture": len(pics_furn),
+            "removed_breakdown": {f"cell {c[0]},{c[1]}": len(v) for c, v in sorted(by_cell.items())},
+        },
+        "text_furniture_removed": len(txt_furn),
+        "items": items,   # each carries furniture flag — consumed by overlay.py --furniture
+    }
+    # close the output deterministically so downstream steps never read a partial file
+    with open(a.out, "w") as fh:
+        json.dump(result, fh)
+
+    ab = result["ab_test_figures"]
+    print(f"pages {n_pages}  freq>={a.freq}  furniture cells: {result['furniture_cells']}")
+    print(f"content x-band: {margins['content_x_band'] if margins else None}")
+    print(f"\nA/B — figure (picture) classification:")
+    print(f"  context_figure BEFORE mask : {ab['context_figure_before_mask']}")
+    print(f"  context_figure AFTER  mask : {ab['context_figure_after_mask']}")
+    print(f"  removed as furniture       : {ab['removed_as_furniture']}  {ab['removed_breakdown']}")
+    print(f"  text furniture removed     : {result['text_furniture_removed']} (page numbers / 'Turn over' / headers)")
+    print(f"-> wrote {a.out}")
+
+
+if __name__ == "__main__":
+    main()
--- a/api/services/docling/page_roles.py
+++ b/api/services/docling/page_roles.py
@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+page_roles.py — tag every page with a structural role (the first-pass page-layout pass).
+
+Roles: cover / question / continuation / blank / appendix. Drives two things in the template:
+  * the human sees the paper's shape (which pages are non-question), and
+  * MARGINS are disabled on pages that have no content column (cover, blank) — the override the
+    user asked for ("the front page doesn't have margins").
+
+Signals (deterministic, no GPU): per-page non-space char count, cover/boilerplate keywords, and
+whether the page carries a question band. Output feeds template.py via --page-roles.
+
+Usage:
+  python page_roles.py <docling_doc.json> --bands <bands.json> [--out results/page_roles/x.json]
+"""
+import json, argparse
+from collections import defaultdict
+
+BLANK_MAX = 130          # non-space chars at/below which a page is boilerplate-only (blank)
+COVER_KW = ("time allowed", "instructions", "materials", "information for")
+BLANK_KW = ("blank page", "no questions printed", "no questions are printed")
+APPENDIX_KW = ("data sheet", "formula", "periodic table", "insert", "resource booklet")
+
+# pages where there is no content column -> margins do not apply (the user's override case)
+NO_MARGIN_ROLES = {"cover", "blank"}
+
+
+def page_text(doc):
+    """Summarise the text of a Docling document per page.
+
+    Returns ``(chars, blob)``: ``chars`` is a defaultdict(int) mapping page number to its
+    count of non-whitespace characters, ``blob`` a plain dict mapping page number to the
+    lower-cased texts of that page joined with single spaces. The page is taken from each
+    text's first provenance entry; texts without one are ignored.
+    """
+    char_counts = defaultdict(int)
+    fragments = defaultdict(list)
+    for text_item in doc.get("texts", []):
+        prov = text_item.get("prov") or []
+        if not prov:
+            continue
+        page = prov[0].get("page_no")
+        if not page:
+            continue
+        content = text_item.get("text") or ""
+        char_counts[page] += sum(not ch.isspace() for ch in content)
+        fragments[page].append(content.lower())
+    joined = {page: " ".join(pieces) for page, pieces in fragments.items()}
+    return char_counts, joined
+
+
+def tag(doc, qpages):
+    """Assign a structural role to every page from 1 up to the highest page seen.
+
+    ``qpages`` is the set of page numbers that carry a question band. Checks run in a fixed
+    priority: question page, cover (before the first question page and containing a cover
+    keyword), blank (few characters, or a blank-page keyword on a short page), appendix
+    (appendix keyword, or content outside the question range), else continuation. Each
+    record also stores the char count and whether margins apply to that role.
+    """
+    chars, blob = page_text(doc)
+    last_page = max([*chars, *qpages, 1])
+    if qpages:
+        first_q, last_q = min(qpages), max(qpages)
+    else:
+        first_q, last_q = last_page + 1, 0      # empty range: no page lies inside it
+
+    def mentions(text, keywords):
+        return any(k in text for k in keywords)
+
+    roles = {}
+    for pg in range(1, last_page + 1):
+        text = blob.get(pg, "")
+        n_chars = chars[pg]
+        if pg in qpages:
+            role = "question"
+        elif pg < first_q and mentions(text, COVER_KW):
+            role = "cover"                   # before blank: the cover's instructions mention "blank"
+        elif n_chars <= BLANK_MAX or (mentions(text, BLANK_KW) and n_chars < 300):
+            role = "blank"
+        elif mentions(text, APPENDIX_KW) or not (first_q <= pg <= last_q):
+            role = "appendix"               # keyword hit, or end-matter/insert outside the range
+        else:
+            role = "continuation"           # no question label but inside the question range
+        roles[pg] = {"role": role, "chars": n_chars,
+                     "margins_enabled": role not in NO_MARGIN_ROLES,
+                     "source": "auto", "confirmed": False}
+    return roles
+
+
+def main():
+    """CLI entry point: tag every page of a Docling JSON with a role and write page_roles JSON.
+
+    The question pages come from the keys of ``pages`` in the --bands file. The roles are
+    written to --out under a top-level ``pages`` key; every non-question page is also
+    printed with its role and char count.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("doc")
+    ap.add_argument("--bands", required=True)
+    ap.add_argument("--out", default="results/page_roles.json")
+    a = ap.parse_args()
+    with open(a.bands) as fh:
+        bands = json.load(fh)
+    qpages = {int(p) for p in bands["pages"]}
+    with open(a.doc) as fh:
+        doc = json.load(fh)
+    roles = tag(doc, qpages)
+    # close the output deterministically so the next pipeline step reads a complete file
+    with open(a.out, "w") as fh:
+        json.dump({"pages": roles}, fh, indent=2)
+    from collections import Counter
+    c = Counter(v["role"] for v in roles.values())
+    print(f"roles: {dict(c)}")
+    for pg in sorted(roles):
+        r = roles[pg]
+        flag = "" if r["margins_enabled"] else "  (no margins)"
+        if r["role"] != "question":
+            print(f"  p{pg:2d}: {r['role']:12s} chars={r['chars']}{flag}")
+    print(f"-> wrote {a.out}")
+
+
+if __name__ == "__main__":
+    main()
--- a/api/services/docling/scripts/init.py
+++ b/api/services/docling/scripts/init.py
--- a/api/services/docling/scripts/overlay.py
+++ b/api/services/docling/scripts/overlay.py
@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+"""
+overlay.py — human-viewable debug visualisation: draw the extractor's geometry over the rendered
+exam page. Shows WHERE each question/part label was located and where Docling regions
+(figures/tables/MCQ checkboxes) sit, so a reviewer can eyeball whether the structure landed in the
+right place. This is the same geometry the exam-marker app uses to place regions on its canvas.
+
+Coordinates: Docling/RapidOCR bboxes are PDF points with a BOTTOM-LEFT origin. We render the page
+at DPI D (scale = D/72) and flip y against the rendered image height, so we never need the page's
+point-height explicitly: y_top_px = H_px - t*scale.
+
+With --docling, also draws every raw Docling text block (the body/question content the thin
+extractor model discards) so a reviewer can see the FULL detection, not just what we persist.
+Granite tables carry cells but no coordinates; we derive their box by locating the cell-texts in
+the Docling text layer (content+geometry fusion).
+
+Usage:
+  python scripts/overlay.py <structured.json> <source_pdf> [--pages 3,4,5] [--dpi 150] [--out DIR]
+  python scripts/overlay.py <structured.json> <pdf> --docling results/E_tess_full.json --pages 5
+"""
+import os, sys, json, re, argparse, subprocess, tempfile
+from PIL import Image, ImageDraw, ImageFont
+
+# Overlay palette: RGB tuples handed to the drawing helpers as outline / fill colours.
+PART_COLOR = (211, 47, 47)                          # red — question/part labels
+BODY_COLOR = (150, 150, 150)                         # grey — raw Docling body-text blocks (--docling)
+GRANITE_COLOR = (0, 150, 136)                        # teal — Granite table (geometry derived from cells)
+# Region types missing from this map are drawn in a neutral grey by main().
+REGION_COLORS = {                                   # docling region taxonomy -> colour
+    "context_figure": (25, 118, 210),               # blue
+    "context_data": (56, 142, 60),                  # green (tables)
+    "context_caption": (123, 31, 162),              # purple
+    "mcq_option": (245, 124, 0),                     # orange (checkboxes)
+}
+
+
+def _norm(s):
+    """Lower-case ``s`` and drop every character outside ``a-z0-9``; falsy input gives ``""``."""
+    lowered = s.lower() if s else ""
+    return re.sub(r"[^a-z0-9]", "", lowered)
+
+
+def docling_texts_by_page(doc):
+    """Group the raw Docling text items by page: ``{page_no: [(bbox, text, label), ...]}``.
+
+    Only the first ``prov`` entry of each item is consulted; items without a bbox or a page
+    number are left out. Missing text becomes ``""`` and a missing label becomes ``"text"``.
+    """
+    by_page = {}
+    for item in doc.get("texts", []):
+        prov = item.get("prov") or []
+        if not prov:
+            continue
+        first = prov[0]
+        bbox, page_no = first.get("bbox"), first.get("page_no")
+        if not (bbox and page_no):
+            continue
+        entry = (bbox, item.get("text") or "", item.get("label") or "text")
+        by_page.setdefault(page_no, []).append(entry)
+    return by_page
+
+
+def derive_table_bbox(grid, page_texts):
+    """Granite tables have cells but no coordinates. Locate the cell-texts in the Docling text
+    layer and union their bboxes -> the table's on-page extent.
+
+    Two traps (seen on physics p5): (1) border/maths glyphs ('|','+') normalise to '' and an
+    empty string is a substring of everything; (2) cell WORDS recur in nearby content — the rock
+    names reappear in the MCQ options below the table ('Basalt or chalk'), far left and lower.
+    So we match only blocks whose normalised text is CONTAINED IN a cell (keeps fragments like
+    '2.90'/'Type', rejects the longer 'basaltorchalk'), require length >= 2, then keep only the
+    blocks whose vertical centre lies within 120 of the median centre, to drop any stray
+    cell-word elsewhere on the page.
+
+    Args:
+        grid: 2D list of cell strings.
+        page_texts: iterable of ``(bbox, text, label)`` for one page; each bbox is a dict with
+            ``l``/``r``/``t``/``b`` keys (BOTTOM-LEFT origin, so ``t`` is the higher edge).
+
+    Returns:
+        The union box ``{"l", "r", "t", "b"}``, or ``None`` when fewer than 3 blocks match or
+        when no matching block falls inside the median band.
+    """
+    import statistics
+    cells = {c for c in (_norm(x) for row in grid for x in row) if len(c) > 1}
+    hit = []
+    for bb, txt, _ in page_texts:
+        key = _norm(txt)                    # normalise each block once, not once per cell
+        if len(key) > 1 and any(key in c for c in cells):
+            hit.append(bb)
+    if len(hit) < 3:
+        return None
+    med = statistics.median((b["t"] + b["b"]) / 2 for b in hit)
+    hit = [b for b in hit if abs((b["t"] + b["b"]) / 2 - med) <= 120]   # table band only
+    if not hit:
+        # With an even number of hits the median is the mean of the two middle centres, so two
+        # well-separated clusters can leave nothing inside the band; min()/max() would raise.
+        return None
+    return {"l": min(b["l"] for b in hit), "r": max(b["r"] for b in hit),
+            "t": max(b["t"] for b in hit), "b": min(b["b"] for b in hit)}
+
+
+def _font(sz):
+    """Return a DejaVu Sans face at size ``sz`` (bold preferred), else PIL's default font."""
+    candidates = ("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
+                  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf")
+    found = next((path for path in candidates if os.path.exists(path)), None)
+    if found is None:
+        return ImageFont.load_default()
+    return ImageFont.truetype(found, sz)
+
+
+# Colours of the --bands y-marker lines drawn by main().
+MAIN_LINE = (25, 118, 210)                          # blue — main-question y-markers
+PART_LINE = (211, 47, 47)                            # red — part y-markers
+
+
+def _hline(draw, y_pdf, scale, H, W, color, label, width, font, dashed=False, inset=0):
+    """Draw a horizontal marker from ``inset`` to the right edge at PDF-point height ``y_pdf``
+    (BOTTOM-LEFT origin), solid or dashed, plus an optional filled label tab at its left end."""
+    y = H - y_pdf * scale                       # flip into image space (top-left origin)
+    if not dashed:
+        draw.line([inset, y, W, y], fill=color, width=width)
+    else:
+        start = inset
+        while start < W:                        # 9px dash, 16px pitch
+            stop = min(start + 9, W)
+            draw.line([start, y, stop, y], fill=color, width=width)
+            start += 16
+    if not label:
+        return
+    tab_w = draw.textlength(label, font=font)
+    draw.rectangle([inset, y - 16, inset + tab_w + 6, y], fill=color)
+    draw.text((inset + 3, y - 15), label, fill=(255, 255, 255), font=font)
+
+
+def _rect(draw, bb, scale, H, color, label, width=3, font=None):
+    """Outline one bbox (PDF points, BOTTOM-LEFT origin) in image space and, when ``label`` is
+    truthy, put a filled label tab above its top-left corner."""
+    left, right = bb["l"] * scale, bb["r"] * scale
+    top_px = H - bb["t"] * scale                # t is the higher edge -> smaller y in pixels
+    bottom_px = H - bb["b"] * scale
+    draw.rectangle([left, top_px, right, bottom_px], outline=color, width=width)
+    if not label:
+        return
+    tab_w = draw.textlength(label, font=font)
+    draw.rectangle([left, top_px - 17, left + tab_w + 6, top_px], fill=color)
+    draw.text((left + 3, top_px - 16), label, fill=(255, 255, 255), font=font)
+
+
+def draw_template(draw, tpl, pg, scale, H, W, font):
+    """Draw one page of the editable template: a role banner, margin lines, main/part band
+    start lines, then furniture / figure / table boxes.
+
+    Margins and bands are solid when their ``confirmed`` flag is truthy and dashed otherwise.
+    Margins are skipped altogether when the page has ``margins_enabled`` false.
+    """
+    margin_rgb = (0, 150, 136)
+    main_rgb = (25, 118, 210)
+    part_rgb = (211, 47, 47)
+    page = tpl["pages"].get(str(pg)) or tpl["pages"].get(pg) or {}
+
+    # Role banner in the top-left corner.
+    role = page.get("role", "question")
+    draw.rectangle([0, 0, 8 + len(role) * 8, 16], fill=(70, 70, 70))
+    draw.text((4, 1), f"role:{role}", fill=(255, 255, 255), font=font)
+
+    # Margin lines: document-scope ones on every page, page-scope ones only on their own page.
+    margin_lines = tpl.get("margins", []) if page.get("margins_enabled", True) else []
+    for line in margin_lines:
+        if line["scope"] == "page" and line.get("page") != pg:
+            continue
+        confirmed = line.get("confirmed")
+        if line["axis"] == "x":
+            x = line["value"] * scale
+            if confirmed:
+                draw.line([x, 0, x, H], fill=margin_rgb, width=2)
+            else:
+                _dash_v(draw, x, 0, H, margin_rgb, 2)
+        else:
+            y = H - line["value"] * scale
+            if confirmed:
+                draw.line([0, y, W, y], fill=margin_rgb, width=2)
+            else:
+                _dash_h(draw, 0, W, y, margin_rgb, 2)
+
+    # Band start lines. A main band with is_start false (continuation page) gets no line.
+    for band in page.get("main_bands", []):
+        if band.get("is_start", True):
+            _hline(draw, band["y_start"], scale, H, W, main_rgb, f"Q{band['question']}", 3, font,
+                   dashed=not band.get("confirmed"))
+    for band in page.get("part_bands", []):
+        _hline(draw, band["y_start"], scale, H, W, part_rgb, band["label"], 2, font, inset=90,
+               dashed=not band.get("confirmed"))
+
+    # Object footprints; entries without a box are skipped.
+    for item in page.get("furniture", []):
+        box = item.get("box")
+        if box:
+            _rect(draw, box, scale, H, (130, 130, 130), f"furniture:{item.get('kind','')}", 2, font)
+    for item in page.get("figures", []):
+        box = item.get("box")
+        if box:
+            _rect(draw, box, scale, H, (56, 142, 60), "figure", 3, font)
+    for item in page.get("tables", []):
+        box = item.get("box")
+        if box:
+            _rect(draw, box, scale, H, (0, 150, 136),
+                  f"table {item.get('n_rows')}x{item.get('n_cols')}", 3, font)
+
+
+def render_page(pdf, pg, dpi, td):
+    """Render page `pg` and return an image in DOCLING's coordinate space. Docling reports bbox
+    relative to the CropBox, but pdftoppm renders the MediaBox — when CropBox != MediaBox (e.g. the
+    Edexcel 1MA1 papers: media 652x899, crop inset 28.35pt) that mismatch magnifies + shifts every
+    overlaid shape toward a corner. Fix: crop the rendered image to the CropBox so it matches Docling.
+    No-op when CropBox == MediaBox (h556) or when poppler already rendered the CropBox.
+
+    Args:
+        pdf: path of the source PDF.
+        pg: 1-based page number.
+        dpi: render resolution handed to pdftoppm.
+        td: scratch directory that receives the PNG.
+
+    Returns:
+        An RGB ``PIL.Image`` that no longer depends on the PNG file.
+
+    Raises:
+        subprocess.CalledProcessError: pdftoppm exited non-zero.
+        FileNotFoundError: pdftoppm ran but none of the expected PNG names exists.
+    """
+    base = os.path.join(td, f"p{pg}")
+    subprocess.run(["pdftoppm", "-png", "-r", str(dpi), "-f", str(pg), "-l", str(pg), pdf, base],
+                   check=True)
+    # The page-number suffix of the output name may be zero-padded, so probe the variants.
+    candidates = (f"{base}-{pg:02d}.png", f"{base}-{pg}.png", f"{base}-{pg:03d}.png")
+    png = next((p for p in candidates if os.path.exists(p)), None)
+    if png is None:
+        # A bare next() would leak StopIteration here, which says nothing about the cause.
+        raise FileNotFoundError(
+            f"pdftoppm produced no PNG for page {pg} (looked for: {', '.join(candidates)})")
+    with Image.open(png) as src:                # close the file; convert() returns a loaded copy
+        img = src.convert("RGB")
+    try:
+        import pypdf
+        page = pypdf.PdfReader(pdf).pages[pg - 1]
+        mb, cb = page.mediabox, page.cropbox
+        scale = dpi / 72.0
+        mbl, mbt = float(mb.left), float(mb.top)
+        dcrop = any(abs(a - b) > 0.5 for a, b in
+                    ((cb.left, mb.left), (cb.bottom, mb.bottom), (cb.right, mb.right), (cb.top, mb.top)))
+        # Only crop when the render really is MediaBox-sized (width within 3px).
+        rendered_mediabox = abs(img.width - (float(mb.right) - mbl) * scale) < 3
+        if dcrop and rendered_mediabox:
+            img = img.crop((round((float(cb.left) - mbl) * scale), round((mbt - float(cb.top)) * scale),
+                            round((float(cb.right) - mbl) * scale), round((mbt - float(cb.bottom)) * scale)))
+    except Exception:
+        # Best-effort: if pypdf is missing or the boxes cannot be read, fall back to the
+        # uncropped render instead of failing the whole overlay.
+        pass
+    return img
+
+
+def _dash_v(draw, x, y0, y1, color, w):
+    """Draw a dashed vertical line at ``x`` from ``y0`` to ``y1``: 9px dashes on a 16px pitch."""
+    start = y0
+    while start < y1:
+        stop = min(start + 9, y1)               # clip the last dash at y1
+        draw.line([x, start, x, stop], fill=color, width=w)
+        start += 16
+
+
+def _dash_h(draw, x0, x1, y, color, w):
+    """Draw a dashed horizontal line at ``y`` from ``x0`` to ``x1``: 9px dashes on a 16px pitch."""
+    start = x0
+    while start < x1:
+        stop = min(start + 9, x1)               # clip the last dash at x1
+        draw.line([start, y, stop, y], fill=color, width=w)
+        start += 16
+
+
+def _read_json(path):
+    """Parse one JSON file and close it immediately (no handle left for the GC to collect)."""
+    with open(path, encoding="utf-8") as fh:
+        return json.load(fh)
+
+
+def main():
+    """CLI entry point: render the requested pages and draw the extractor geometry over them.
+
+    With --template only the editable template is drawn (the human-review view). Otherwise the
+    layers are, bottom to top: raw Docling text blocks (--docling), taxonomy regions, tables,
+    part labels, band y-markers (--bands) and the furniture mask plus content x-margins
+    (--furniture). One ``pNN.png`` per page is written into --out.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("structured")
+    ap.add_argument("pdf")
+    ap.add_argument("--docling", help="raw Docling doc JSON: also draw every body-text block "
+                                      "(the content the thin model discards) + derive Granite-table boxes")
+    ap.add_argument("--bands", help="bands.py JSON: draw main-question + part start/end y-marker lines")
+    ap.add_argument("--furniture", help="furniture.py JSON: mark recurring furniture vs real figures "
+                                        "+ draw the content x-margins")
+    ap.add_argument("--template", help="template.py JSON: render the editable first-pass template "
+                                       "(margins+bands as lines, furniture/figures as boxes). "
+                                       "When set, draws ONLY the template (the human-review view).")
+    ap.add_argument("--pages", help="comma list, e.g. 3,4,5 (default: all pages with geometry)")
+    ap.add_argument("--dpi", type=int, default=150)
+    ap.add_argument("--out", default="results/overlay")
+    a = ap.parse_args()
+    os.makedirs(a.out, exist_ok=True)
+    scale = a.dpi / 72.0
+    font = _font(14)
+
+    res = _read_json(a.structured)
+    doc_texts = docling_texts_by_page(_read_json(a.docling)) if a.docling else {}
+    bands = _read_json(a.bands)["pages"] if a.bands else {}
+    furn = _read_json(a.furniture) if a.furniture else None
+    tpl = _read_json(a.template) if a.template else None
+    # gather geometry by page
+    parts_by_pg, regions_by_pg = {}, {}
+    for q in res.get("questions", []):
+        for p in q["parts"]:
+            if p.get("bbox") and p.get("page"):
+                parts_by_pg.setdefault(p["page"], []).append((p["label"], p["bbox"]))
+    for r in res.get("regions", []):
+        if r.get("bbox") and r.get("page"):
+            regions_by_pg.setdefault(r["page"], []).append((r["type"], r["bbox"]))
+    # tables: standard ones carry a bbox; Granite ones don't -> derive from the text layer
+    tables_by_pg = {}
+    for t in res.get("tables", []):
+        pg = t.get("page")
+        if not pg:
+            continue
+        bb = t.get("bbox") or (derive_table_bbox(t.get("grid", []), doc_texts.get(pg, []))
+                               if a.docling else None)
+        if bb:
+            tables_by_pg.setdefault(pg, []).append(
+                (f"table {t.get('source','')} {t.get('n_rows')}x{t.get('n_cols')}", bb))
+
+    if a.pages:
+        want = [int(x) for x in a.pages.split(",")]
+    elif tpl:
+        want = sorted(int(p) for p in tpl["pages"])
+    else:
+        # Every page that carries drawable geometry — including pages whose only geometry is a
+        # table box, which the default page set used to leave out.
+        want = sorted(set(parts_by_pg) | set(regions_by_pg) | set(tables_by_pg) | set(doc_texts))
+    if not want:
+        sys.exit("no bbox geometry in this result (born-digital text path carries no geometry; "
+                 "use an OCR/rapid-path structured.json)")
+
+    written = []
+    with tempfile.TemporaryDirectory() as td:
+        for pg in want:
+            img = render_page(a.pdf, pg, a.dpi, td)
+            H, W = img.height, img.width
+            draw = ImageDraw.Draw(img)
+            if tpl:                          # template-only render = the human-review view
+                draw_template(draw, tpl, pg, scale, H, W, font)
+                out = os.path.join(a.out, f"p{pg:02d}.png")
+                img.save(out)
+                written.append(out)
+                pgd = tpl["pages"].get(str(pg), {})
+                print(f"p{pg}: template — {len(pgd.get('main_bands',[]))} main, "
+                      f"{len(pgd.get('part_bands',[]))} part, {len(pgd.get('furniture',[]))} furn, "
+                      f"{len(pgd.get('figures',[]))} fig -> {out}")
+                continue
+            # layer 0: raw Docling body-text blocks (faint, no label) — the discarded content
+            for bb, _txt, _lab in doc_texts.get(pg, []):
+                _rect(draw, bb, scale, H, BODY_COLOR, None, 1, font)
+            # layer 1: taxonomy regions
+            for typ, bb in regions_by_pg.get(pg, []):
+                _rect(draw, bb, scale, H, REGION_COLORS.get(typ, (120, 120, 120)), typ, 2, font)
+            # layer 2: tables (Granite-derived boxes in teal)
+            for lab, bb in tables_by_pg.get(pg, []):
+                _rect(draw, bb, scale, H, GRANITE_COLOR, lab, 3, font)
+            # layer 3: part labels on top
+            for lab, bb in parts_by_pg.get(pg, []):
+                _rect(draw, bb, scale, H, PART_COLOR, lab, 3, font)
+            # layer 4: band y-marker lines (main-question start = solid blue with a thin dashed
+            # end line; part start = solid red, inset from the left edge)
+            pb = bands.get(str(pg)) or bands.get(pg)
+            nb = 0
+            if pb:
+                for m in pb["main"]:
+                    if not m.get("is_start", True):     # skip continuation-page duplicate
+                        continue
+                    _hline(draw, m["y_start"], scale, H, W, MAIN_LINE,
+                           f"Q{m['question']} ▸ start", 3, font)
+                    nb += 1
+                    _hline(draw, m["y_end"], scale, H, W, MAIN_LINE, None, 1, font, dashed=True)
+                for p in pb["part"]:
+                    _hline(draw, p["y_start"], scale, H, W, PART_LINE,
+                           f"{p['label']} start", 2, font, inset=90)
+                    nb += 1
+            # layer 5: furniture mask — green=real figure, grey=masked furniture; + content margins
+            if furn:
+                for it in furn["items"]:
+                    if it["page"] != pg or it["kind"] != "picture":
+                        continue
+                    if it["furniture"]:
+                        _rect(draw, it["bbox"], scale, H, (130, 130, 130), "furniture", 2, font)
+                    else:
+                        _rect(draw, it["bbox"], scale, H, (56, 142, 60), "figure ✓", 3, font)
+                band = (furn.get("content_margins") or {}).get("content_x_band")
+                if band:
+                    for xk in ("x_left", "x_right"):
+                        x = band[xk] * scale
+                        draw.line([x, 0, x, H], fill=(0, 150, 136), width=2)
+            out = os.path.join(a.out, f"p{pg:02d}.png")
+            img.save(out)
+            written.append(out)
+            print(f"p{pg}: {len(parts_by_pg.get(pg,[]))} part-labels, "
+                  f"{len(regions_by_pg.get(pg,[]))} regions, {len(tables_by_pg.get(pg,[]))} tables, "
+                  f"{len(doc_texts.get(pg,[]))} body-text blocks, {nb} band-lines -> {out}")
+    print(f"-> {len(written)} page(s) in {a.out}/")
+
+
+if __name__ == "__main__":
+    main()
--- a/api/services/docling/tables.py
+++ b/api/services/docling/tables.py
@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+tables.py — selective table-cell extraction for the exam extractor (PLAN.md §B).
+
+Two sources, unified into one cell-grid schema:
+  * STANDARD  — the Tesseract+TableFormer backbone already emits `tables[].data.table_cells`
+      (text + row/col offsets + spans + bbox). Free, cached, every run. Good on ruled tables;
+      but it MISSES some data tables and OCRs them as loose tokens (REPORT.md p5).
+  * GRANITE   — Granite-Docling-258M VLM emits `<otsl>` grids in DocTags (clean rows/cols even
+      where the backbone scrambles them). GPU cost, so used SELECTIVELY: only on pages the router
+      flags (a standard table present, or dense picture/checkbox), routed through dsync's GPU lock
+      + Redis cache. Recipe (REPORT.md): {"to_formats":["doctags","json"], "pipeline":"vlm",
+      "vlm_pipeline_model":"granite_docling"}.
+
+Unified table = {page, n_rows, n_cols, grid (2D text), caption, source, is_furniture}.
+(Per-cell records are not emitted by either source here; `grid` carries the cell text.)
+"""
+import re, json, os, glob, base64, urllib.request
+
+# ----------------------------------------------------------------- OTSL (Granite DocTags)
+# Everything between one <otsl> … </otsl> pair (DOTALL, non-greedy) = one table block.
+OTSL_BLOCK = re.compile(r"<otsl>(.*?)</otsl>", re.S)
+# Caption text inside a block, skipping any leading <loc_N> tokens.
+CAPTION    = re.compile(r"<caption>(?:<loc_\d+>)*(.*?)</caption>", re.S)
+# One cell/row token: the tag name, then the text running up to the next '<'.
+CELL_TOK   = re.compile(r"<(fcel|ecel|ched|rhed|lcel|ucel|xcel|nl)>([^<]*)")
+# Tags that parse_otsl flags as header cells.
+HEADER_TAGS = {"ched", "rhed"}
+
+
+def parse_otsl(doctags):
+    """Turn each <otsl>…</otsl> block of a DocTags string into a unified table dict.
+
+    Rows are split on <nl>; every other cell tag contributes one cell. Empty rows are dropped,
+    blocks without any row are skipped, and short rows are right-padded with "" to the widest
+    row. ``page`` is left as None for the caller to fill in.
+    """
+    tables = []
+    for block in OTSL_BLOCK.findall(doctags):
+        cap_match = CAPTION.search(block)
+        caption = None
+        if cap_match is not None:
+            caption = re.sub(r"\s+", " ", cap_match.group(1)).strip()
+        # remove the caption and every location token before tokenising the cells
+        stripped = re.sub(r"<loc_\d+>", "", CAPTION.sub("", block))
+        rows, current = [], []
+        for tag, txt in CELL_TOK.findall(stripped):
+            if tag != "nl":
+                current.append({"text": txt.strip(), "header": tag in HEADER_TAGS,
+                                "empty": tag == "ecel"})
+                continue
+            if current:                     # an <nl> that closes an empty row adds nothing
+                rows.append(current)
+            current = []
+        if current:                         # last row had no trailing <nl>
+            rows.append(current)
+        if not rows:
+            continue
+        width = max(map(len, rows))
+        grid = []
+        for row in rows:
+            texts = [cell["text"] for cell in row]
+            grid.append(texts + [""] * (width - len(row)))
+        tables.append({"page": None, "n_rows": len(rows), "n_cols": width, "grid": grid,
+                       "caption": caption, "source": "granite-otsl",
+                       "is_furniture": is_furniture(grid, caption)})
+    return tables
+
+
+# ----------------------------------------------------------------- standard TableFormer
+def tables_from_standard(doc):
+    """Convert the ``tables`` entries of a Docling document into unified table dicts.
+
+    The grid is ``num_rows`` x ``num_cols`` of "" with each cell's text written at its start
+    row/column offset; cells with a missing offset, an offset beyond the grid, or empty text are
+    ignored. ``page`` comes from the first ``prov`` entry, and the caption is the space-joined
+    ``text`` of the dict-typed ``captions`` entries (None when that is empty).
+    """
+    tables = []
+    for entry in doc.get("tables", []):
+        data = entry.get("data", {}) or {}
+        nr = data.get("num_rows") or 0
+        nc = data.get("num_cols") or 0
+        grid = [[""] * nc for _ in range(nr)]
+        for cell in data.get("table_cells", []) or []:
+            row = cell.get("start_row_offset_idx")
+            col = cell.get("start_col_offset_idx")
+            if row is None or col is None:
+                continue
+            if not (row < nr and col < nc):
+                continue
+            text = cell.get("text")
+            if text:
+                grid[row][col] = text
+        prov = entry.get("prov") or []
+        page = prov[0].get("page_no") if prov else None
+        caption_bits = [c.get("text", "") for c in (entry.get("captions") or [])
+                        if isinstance(c, dict)]
+        caption = " ".join(caption_bits) or None
+        tables.append({"page": page, "n_rows": nr, "n_cols": nc, "grid": grid,
+                       "caption": caption, "source": "docling-standard",
+                       "is_furniture": is_furniture(grid, caption)})
+    return tables
+
+
+# ----------------------------------------------------------------- furniture filter
+FURNITURE_RE = re.compile(r"examiner|do not write|leave\s+blank|question\s*mark|"
+                          r"for marker|total marks?$", re.I)
+
+
+def is_furniture(grid, caption=None):
+    """A table that is exam scaffolding (mark grid / 'For Examiner's Use'), not question data.
+
+    Args:
+        grid: 2D list of cell strings.
+        caption: optional caption text, searched together with the cells.
+
+    Returns:
+        True when the joined cell + caption text matches ``FURNITURE_RE``, or when every
+        non-blank cell is a 1-2 digit number (a strip of question numbers / blanks = a mark
+        grid; note this does not check that the grid is a single column). False otherwise.
+    """
+    # Strip before searching: the " " separator (and any empty trailing cells) would otherwise
+    # always sit after the last word and the `total marks?$` alternative could never match.
+    blob = (" ".join(cell for row in grid for cell in row) + " " + (caption or "")).strip()
+    if FURNITURE_RE.search(blob):
+        return True
+    flat = [c for row in grid for c in row if c.strip()]
+    if flat and all(re.fullmatch(r"\d{1,2}", c.strip()) for c in flat):
+        return True
+    return False
+
+
+# ----------------------------------------------------------------- Granite via dsync
+VLM_OPTS = {"to_formats": ["doctags", "json"], "pipeline": "vlm",
+            "vlm_pipeline_model": "granite_docling", "image_export_mode": "placeholder"}
+
+
+def _serve_vlm(pdf_b64, fname, page):
+    """POST one page of a base64-encoded PDF to the serve VLM endpoint and return the parsed
+    JSON response.
+
+    Args:
+        pdf_b64: the whole PDF, base64-encoded, as a str.
+        fname: filename reported to the service.
+        page: page number; sent as ``page_range = [page, page]``.
+
+    Raises:
+        RuntimeError: the endpoint answered 404 on all four attempts.
+        urllib.error.HTTPError: any other HTTP error status (re-raised immediately).
+    """
+    import time
+    from urllib.error import HTTPError      # explicit; not a side effect of importing urllib.request
+    import dsync
+    opts = {**VLM_OPTS, "page_range": [page, page]}
+    body = {"options": opts,
+            "sources": [{"kind": "file", "base64_string": pdf_b64, "filename": fname}],
+            "target": {"kind": "inbody"}}
+    req = urllib.request.Request(dsync.SERVE + "/v1/convert/source",
+                                 data=json.dumps(body).encode(),
+                                 headers={"Content-Type": "application/json"})
+    for _ in range(4):                                  # tolerate the single-use 404 race
+        try:
+            # the context manager closes the HTTP response once the body has been read
+            with urllib.request.urlopen(req, timeout=1200) as resp:
+                return json.loads(resp.read())
+        except HTTPError as e:
+            if e.code != 404:
+                raise
+        time.sleep(3)
+    raise RuntimeError("serve vlm: repeated 404")
+
+
+def _doctags_of(resp):
+    """Return the DocTags text of a serve response: the first truthy value among the
+    ``doctags_content`` / ``doc_tags`` / ``doctags`` keys of ``resp["document"]``, else ``""``."""
+    document = resp.get("document") or {}
+    for key in ("doctags_content", "doc_tags", "doctags"):
+        value = document.get(key)
+        if value:
+            return value
+    return ""
+
+
+def granite_tables(pdf, pages, *, cached_glob=None, retries=4):
+    """Run Granite-Docling on the given pages via dsync (GPU lock + OOM retry + Redis cache),
+    parse <otsl>, tag each table with its page.
+
+    Per page: a Redis hit under ``docling:vlm:<sha>:p<page>`` is used as-is. Otherwise the page
+    is sent to serve under the GPU lock, retrying with a doubling backoff (5s, capped at 120s)
+    for as long as the response is an OOM. If that leaves no doctags (empty response, or every
+    attempt hit OOM) and ``cached_glob`` supplied a ``*p<N>.doctags`` file for the page, that
+    text is used instead. Exceptions raised by the serve request itself are NOT caught here, so
+    the file fallback does not cover them.
+
+    Args:
+        pdf: path of the PDF.
+        pages: iterable of page numbers to process.
+        cached_glob: optional glob of ``*p<N>.doctags`` fallback files.
+        retries: maximum serve attempts per page.
+
+    Returns:
+        List of unified table dicts (see ``parse_otsl``) with ``page`` filled in.
+    """
+    import dsync, time
+    cache = _load_cached_doctags(cached_glob) if cached_glob else {}
+    r = dsync._redis()
+    with open(pdf, "rb") as fh:                 # close the PDF handle once it is encoded
+        b64 = base64.b64encode(fh.read()).decode()
+    fname = os.path.basename(pdf)
+    sha = dsync._sha(pdf)
+    out = []
+    for pg in pages:
+        key = f"docling:vlm:{sha}:p{pg}"
+        doctags = None
+        if r and (hit := r.get(key)):
+            doctags = hit if isinstance(hit, str) else hit.decode()
+        if doctags is None:
+            delay = 5
+            for attempt in range(retries):
+                with dsync._GpuLock(r):
+                    resp = _serve_vlm(b64, fname, pg)
+                if dsync._is_oom(resp):
+                    print(f"[granite] p{pg} OOM, backoff {delay}s ({attempt+1}/{retries})")
+                    time.sleep(delay)
+                    delay = min(delay * 2, 120)
+                    continue
+                doctags = _doctags_of(resp)
+                if r and doctags:               # only cache non-empty results
+                    r.set(key, doctags, ex=dsync.CACHE_TTL)
+                break
+        if not doctags and pg in cache:
+            print(f"[granite] p{pg} serve empty -> cached doctags")
+            doctags = cache[pg]
+        for tbl in parse_otsl(doctags or ""):
+            tbl["page"] = pg
+            out.append(tbl)
+    return out
+
+
+def _load_cached_doctags(glob_pat):
+    """Map page_no -> doctags text from files named *p<N>.doctags.
+
+    Files matched by ``glob_pat`` whose name does not end in ``p<digits>.doctags`` are ignored.
+    Undecodable bytes are replaced rather than raising.
+    """
+    cache = {}
+    for fn in glob.glob(glob_pat):
+        m = re.search(r"p(\d+)\.doctags$", fn)
+        if not m:
+            continue
+        # `with` closes each file as soon as it is read instead of leaving one open handle per
+        # cached page for the garbage collector.
+        with open(fn, encoding="utf-8", errors="replace") as fh:
+            cache[int(m.group(1))] = fh.read()
+    return cache
+
+
+# ----------------------------------------------------------------- routing + attach
+def candidate_pages(doc):
+    """Pages the router sends to Granite: every page holding a standard table, plus every page
+    with at least two text items whose label starts with "checkbox".
+
+    Only the first ``prov`` entry of an item is used to find its page; items without a page
+    number are ignored. Returns the page numbers as a sorted list.
+    """
+    def first_page(item):
+        prov = item.get("prov") or []
+        return prov[0].get("page_no") if prov else None
+
+    pages = set()
+    for t in doc.get("tables", []):
+        pg = first_page(t)
+        if pg:
+            pages.add(pg)
+    chk = {}
+    for it in doc.get("texts", []):
+        # `label` may be absent OR present-but-null in the JSON; `.get("label", "")` only
+        # covers the first case and would raise AttributeError on the second.
+        if (it.get("label") or "").startswith("checkbox"):
+            pg = first_page(it)
+            if pg:
+                chk[pg] = chk.get(pg, 0) + 1
+    pages |= {p for p, n in chk.items() if n >= 2}
+    return sorted(pages)
+
+
+def attach_to_questions(tables, parts):
+    """Link every non-furniture table to one part on the same page and return those tables.
+
+    Best-effort choice (table y is not available here): among the page's parts that have a
+    bbox, the one with the largest bbox top, i.e. highest on the page; when none has a bbox, the
+    part with the smallest label. Each returned table gains ``id`` (its index in the returned
+    list) and ``for_part`` (the label, or None when the page has no parts); the chosen part gets
+    a summary appended to its ``tables`` list. Both inputs are mutated in place.
+    """
+    kept = [tbl for tbl in tables if not tbl["is_furniture"]]
+    parts_on_page = {}
+    for label, part in parts.items():
+        parts_on_page.setdefault(part.get("page"), []).append((label, part))
+    for idx, tbl in enumerate(kept):
+        tbl["id"] = idx
+        candidates = parts_on_page.get(tbl["page"], [])
+        if candidates:
+            located = [(label, part) for label, part in candidates if part.get("bbox")]
+            if located:
+                chosen = max(located, key=lambda pair: pair[1]["bbox"].get("t", 0))[0]
+            else:
+                chosen = min(candidates, key=lambda pair: pair[0])[0]
+            tbl["for_part"] = chosen
+            summary = {"id": idx, "n_rows": tbl["n_rows"], "n_cols": tbl["n_cols"],
+                       "caption": tbl["caption"], "source": tbl["source"]}
+            parts[chosen].setdefault("tables", []).append(summary)
+        else:
+            tbl["for_part"] = None
+    return kept
--- a/api/services/docling/template.py
+++ b/api/services/docling/template.py
@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+"""
+template.py — assemble the editable first-pass structural template from the spike's three signal
+sources (extract structured.json + bands.json + furniture.json) into ONE round-trippable JSON the
+human reviewer verifies AND edits before stage-2 generates the final template.
+
+UI principle (user, 2026-06-07): directional LIMITS are draggable LINES (1-DOF, easier to drag);
+object FOOTPRINTS are BOXES. So:
+  * margins      -> four axis-locked LINES: left/right (x), top/bottom (y)
+  * question/part bands -> horizontal LINES: start/end y
+  * furniture / figures / tables -> BOXES (an object's footprint)
+
+Every editable element carries {source: "auto"|"human", confirmed: bool} — the AI-suggestion seam.
+Stage-2 must consume only confirmed elements (or a template marked confirmed at the top level).
+Coordinates are PDF points, BOTTOM-LEFT origin (units in meta); the app maps to its own canvas.
+
+Usage:
+  python template.py --structured S.json --bands B.json --furniture F.json --pdf P.pdf --out T.json
+"""
+import json, argparse, datetime
+
+
+def _line(edge, axis, value, scope, page=None):
+    """Build one unconfirmed auto-suggested margin line; ``value`` is rounded to 1 decimal and
+    ``page`` is only included when given."""
+    line = dict(edge=edge, axis=axis, value=round(value, 1), scope=scope,
+                source="auto", confirmed=False)
+    if page is None:
+        return line
+    line["page"] = page
+    return line
+
+
+def _furn_kind(it):
+    """Guess a label for a furniture box from the position of its centre (BOTTOM-LEFT origin).
+    The result is only a suggestion that a human can rename."""
+    box = it["bbox"]
+    cx = (box["l"] + box["r"]) / 2
+    cy = (box["t"] + box["b"]) / 2
+    if it["kind"] != "picture":
+        if cy < 90:
+            return "footer"
+        return "header_or_page_number" if cy > 760 else "chrome_text"
+    if cx > 430 and cy > 700:                # upper right
+        return "qr"
+    if cy < 110:                             # near the bottom edge
+        return "barcode"
+    return "chrome_picture"
+
+
+def build(structured, bands, furniture, pdf=None, page_roles=None):
+    """Assemble the first-pass template dict from the three signal sources.
+
+    Args:
+        structured: extractor result; read for ``questions[].parts[]`` (label / bbox / page),
+            ``tables``, ``board`` and ``paper_code``.
+        bands: dict whose ``pages`` maps a page to ``{"main": [...], "part": [...]}``.
+        furniture: dict read for ``content_margins`` (``content_x_band``, ``per_page``),
+            ``items`` and ``n_pages``.
+        pdf: source PDF path, recorded in ``meta.source_pdf`` only.
+        page_roles: optional mapping page -> ``{role, source, confirmed, margins_enabled}``.
+
+    Returns:
+        ``{"meta": ..., "margins": [...], "pages": {page_str: {...}}}``; every suggested element
+        carries ``source="auto"`` and ``confirmed=False``.
+    """
+    page_roles = page_roles or {}
+    # part label -> its bbox (may be None); attached to each part band as `label_box`
+    part_bbox = {p["label"]: p.get("bbox")
+                 for q in structured.get("questions", []) for p in q["parts"]}
+    cm = furniture.get("content_margins") or {}
+    xband = cm.get("content_x_band") or {}
+    per_pg_m = cm.get("per_page") or {}
+
+    def margins_on(pg):
+        # pages without a role entry keep their margins
+        r = page_roles.get(str(pg)) or page_roles.get(pg)
+        return r.get("margins_enabled", True) if r else True
+
+    # margins as axis-locked LINES — document-level left/right, per-page top/bottom. Per-page
+    # top/bottom are omitted for pages with no content column (cover/blank) — the user's override.
+    margins = []
+    if "x_left" in xband:
+        margins.append(_line("left", "x", xband["x_left"], "document"))
+        margins.append(_line("right", "x", xband["x_right"], "document"))
+    for pg, m in sorted(per_pg_m.items(), key=lambda kv: int(kv[0])):
+        if not margins_on(int(pg)):
+            continue
+        margins.append(_line("top", "y", m["top"], "page", int(pg)))
+        margins.append(_line("bottom", "y", m["bottom"], "page", int(pg)))
+
+    # furniture + figures as BOXES, grouped by page
+    furn_pg, fig_pg = {}, {}
+    for it in furniture.get("items", []):
+        pg = it["page"]
+        if it.get("furniture"):
+            furn_pg.setdefault(pg, []).append(
+                {"box": it["bbox"], "kind": _furn_kind(it), "docling_label": it["label"],
+                 "source": "auto", "confirmed": False})
+        elif it["kind"] == "picture":
+            fig_pg.setdefault(pg, []).append(
+                {"box": it["bbox"], "source": "auto", "confirmed": False})
+
+    # tables grouped by page; `box` is None for tables that carry no bbox
+    tbl_pg = {}
+    for t in structured.get("tables", []):
+        if t.get("page"):
+            tbl_pg.setdefault(t["page"], []).append(
+                {"box": t.get("bbox"), "n_rows": t.get("n_rows"), "n_cols": t.get("n_cols"),
+                 "table_source": t.get("source"), "source": "auto", "confirmed": False})
+
+    # --- reconcile against recovered part labels -------------------------------------------
+    # A part-label position is never furniture or a figure (the label wins), and a "figure" that
+    # covers most of the content area is a Docling page-collapse artifact (the GPU sometimes flags
+    # the whole page as one picture), not a real figure -> drop both. Fixes the Q1.7/Q1.9 clashes
+    # and the full-page "figure" that was masking part labels.
+    part_boxes_pg = {}
+    for q in structured.get("questions", []):
+        for p in q["parts"]:
+            if p.get("bbox") and p.get("page"):
+                part_boxes_pg.setdefault(p["page"], []).append(p["bbox"])
+
+    def _inter(a, b):
+        # axis-aligned overlap test; boxes that merely touch count as intersecting
+        return not (a["r"] < b["l"] or b["r"] < a["l"] or a["t"] < b["b"] or b["t"] < a["b"])
+
+    def _area(b):
+        # degenerate / inverted boxes count as zero area
+        return max(0, b["r"] - b["l"]) * max(0, b["t"] - b["b"])
+
+    for pg, items in list(furn_pg.items()):
+        pls = part_boxes_pg.get(pg, [])
+        furn_pg[pg] = [f for f in items if not (f.get("box") and any(_inter(f["box"], pl) for pl in pls))]
+    for pg, items in list(fig_pg.items()):
+        pls = part_boxes_pg.get(pg, [])
+        m = per_pg_m.get(str(pg)) or per_pg_m.get(pg) or {}
+        # NOTE(review): the per-page margin entries are only read for top/bottom elsewhere in this
+        # function. If they carry no left/right, the width term is 0 and the 595 * 842 fallback
+        # is always used instead of the content area — confirm against furniture.py.
+        carea = ((m.get("right", 0) - m.get("left", 0)) * (m.get("top", 0) - m.get("bottom", 0))) or (595 * 842)
+        fig_pg[pg] = [f for f in items if f.get("box")
+                      and _area(f["box"]) <= 0.55 * carea               # not a full-page collapse
+                      and not any(_inter(f["box"], pl) for pl in pls)]  # not clashing a part label
+
+    pages = {}
+    # NOTE(review): pages that appear only in tbl_pg are not part of this union, so a page whose
+    # sole content is a table gets no entry — confirm that is intended.
+    all_pg = (set(bands["pages"]) | {str(p) for p in furn_pg} | {str(p) for p in fig_pg}
+              | {str(p) for p in page_roles})
+    for pgs in sorted(all_pg, key=int):
+        pg = int(pgs)
+        pb = bands["pages"].get(pgs) or bands["pages"].get(pg) or {"main": [], "part": []}
+        main = [{"question": m["question"], "y_start": m["y_start"], "y_end": m["y_end"],
+                 "is_start": m.get("is_start", True),
+                 "source": "auto", "confirmed": False} for m in pb["main"]]
+        part = [{"label": p["label"], "question": p["question"],
+                 "y_start": p["y_start"], "y_end": p["y_end"],
+                 "label_box": part_bbox.get(p["label"]),     # app may render a box instead of lines
+                 "source": "auto", "confirmed": False} for p in pb["part"]]
+        pr = page_roles.get(pgs) or page_roles.get(pg) or {}
+        pages[pgs] = {
+            "role": pr.get("role", "question"),
+            "role_source": pr.get("source", "default"), "role_confirmed": pr.get("confirmed", False),
+            "margins_enabled": pr.get("margins_enabled", True),   # human-overridable
+            "main_bands": main, "part_bands": part,
+            "furniture": furn_pg.get(pg, []), "figures": fig_pg.get(pg, []),
+            "tables": tbl_pg.get(pg, []),
+        }
+
+    return {
+        "meta": {
+            "schema": "exam-template/first-pass/v1",
+            "board": structured.get("board"), "paper_code": structured.get("paper_code"),
+            "source_pdf": pdf, "n_pages": furniture.get("n_pages"),
+            "coord_origin": "BOTTOMLEFT", "units": "pdf_points",
+            "generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
+            "ui_principle": "directional limits = draggable axis-locked lines; "
+                            "object footprints = boxes",
+            "confirmed": False, "confirmed_by": None, "confirmed_at": None,
+        },
+        "margins": margins,
+        "pages": pages,
+    }
+
+
+def main():
+    """CLI entry point: load the structured / bands / furniture (and optional page-roles) JSON,
+    build the template, write it to ``--out`` and print a one-line summary of what it contains."""
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--structured", required=True)
+    ap.add_argument("--bands", required=True)
+    ap.add_argument("--furniture", required=True)
+    ap.add_argument("--page-roles", dest="page_roles", help="page_roles.py JSON (roles + margin override)")
+    ap.add_argument("--pdf")
+    ap.add_argument("--out", default="results/template.json")
+    a = ap.parse_args()
+
+    def load(path):
+        # close each input as soon as it is parsed instead of leaking the handle
+        with open(path, encoding="utf-8") as fh:
+            return json.load(fh)
+
+    roles = load(a.page_roles)["pages"] if a.page_roles else {}
+    t = build(load(a.structured), load(a.bands), load(a.furniture), a.pdf, roles)
+    # the template is flushed and closed before "-> wrote" is reported
+    with open(a.out, "w", encoding="utf-8") as fh:
+        json.dump(t, fh, indent=2)
+    page_entries = list(t["pages"].values())
+    n_pages = len(page_entries)
+    nm = sum(len(p["main_bands"]) for p in page_entries)
+    npt = sum(len(p["part_bands"]) for p in page_entries)
+    nf = sum(len(p["furniture"]) for p in page_entries)
+    ng = sum(len(p["figures"]) for p in page_entries)
+    print(f"template {t['meta']['paper_code']} ({t['meta']['board']}): {n_pages} pages, "
+          f"{len(t['margins'])} margin-lines, {nm} main-bands, {npt} part-bands, "
+          f"{nf} furniture-boxes, {ng} figure-boxes")
+    print(f"-> wrote {a.out}")
+
+
+if __name__ == "__main__":
+    main()
--- a/api/services/docling/validate.py
+++ b/api/services/docling/validate.py
@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""
+validate.py — G6 validation/judge: a deterministic consistency pass over an extractor result.
+
+NOT a gate. It never approves or rejects; it attaches confidence + flags so a HUMAN reviewer's
+attention is routed to the parts most likely wrong. A clean paper -> all-green, skim; a flagged
+paper -> the exact items to check, worst-first. Every value stays a *suggestion* a human confirms.
+
+Checks (all deterministic, no GPU, ~free — run on every extraction):
+  C1 marks-sum vs official max      — over-read (sum>max) = error; under (sum<max) = warn
+  C2 part marks plausibility        — marks None / 0 / implausibly high
+  C3 top-level question sequence    — gaps between the lowest and highest recovered number (skipped when numbering was OCR-inferred '~')
+  C4 sub-part contiguity            — within a question: a,b,c / .1,.2,.3 with no hole
+  C5 coverage                       — missed parts vs ground truth (when the result carries it)
+
+Usage:
+  python validate.py results/genreport/edexcel1f/ocr_struct_filled.json
+  python validate.py <structured.json> --out report.json
+"""
+import json, re, sys, argparse
+from collections import defaultdict
+
+# Threshold for check C2 in validate(): a part whose marks exceed this is flagged "implausible".
+IMPLAUSIBLE_PART_MARKS = 15        # a single sub-part above this is worth a human glance
+
+
+def _qnum(q):
+    """Return a top-level question id as an int, or None when it is not a plain number.
+
+    Leading zeros are ignored ('01' -> 1, '4' -> 4). Ids marked as
+    OCR-inferred with a leading '~' ('~3') and anything that is not purely
+    digits yield None.
+    """
+    hit = None if q.startswith("~") else re.match(r"^0*(\d+)$", q)
+    if hit is None:
+        return None
+    return int(hit.group(1))
+
+
+def _subkey(label, q):
+    """Return the part's suffix inside its question: '01.2'->'2', '4a'->'a', '1bi'->'bi'.
+
+    The question id is dropped only when the label actually starts with it;
+    leading '.' separators and then leading '~' markers are stripped from
+    what remains.
+    """
+    remainder = label.removeprefix(q)
+    without_dots = remainder.lstrip(".")
+    return without_dots.lstrip("~")
+
+
+def validate(result):
+    """Run the deterministic C1-C5 consistency checks over one extractor result.
+
+    Nothing is approved or rejected here: the function only attaches flags and
+    a per-part confidence so a human reviewer knows where to look first.
+
+    Args:
+        result: extractor output dict. Keys read here: "board", "paper_code",
+            "questions" (each with "question" and "parts"; each part with
+            "label" and optionally "marks"), and optionally "stats",
+            "front_matter" and "coverage". The three optional sections may be
+            absent or null.
+
+    Returns:
+        dict with "paper_code", "board", "summary", "flags" (one string per
+        check whose status is not "ok"), "checks" (every recorded check),
+        "part_confidence" (label -> "high"/"medium"/"low") and
+        "question_sequence" (expected top-level questions with a `recovered`
+        flag each; empty when the sequence could not be checked).
+    """
+    board = result.get("board")
+    code = result.get("paper_code")
+    flags, checks = [], []
+    parts = [(p["label"], q["question"], p) for q in result.get("questions", []) for p in q["parts"]]
+    conf = {}                                      # label -> high/medium/low
+    low = set()                                    # labels a check has implicated
+
+    def add(cid, severity, status, detail):
+        """Record one check outcome; any status other than "ok" also becomes a flag."""
+        checks.append({"id": cid, "severity": severity, "status": status, "detail": detail})
+        if status != "ok":
+            flags.append(f"[{severity}] {cid}: {detail}")
+
+    # ---- C1: marks sum vs official maximum -------------------------------------------------
+    # `or {}` (rather than a .get default) also tolerates an explicit JSON null section.
+    mc = (result.get("stats") or {}).get("marks_check")
+    exp = (mc or {}).get("expected_max") or (result.get("front_matter") or {}).get("max_marks")
+    msum = (mc or {}).get("sum")
+    if msum is None:
+        # No precomputed sum: total the marks of every part that has one.
+        msum = sum(p["marks"] for *_, p in parts if p.get("marks") is not None)
+    if exp:
+        if msum > exp:
+            add("C1_marks_sum", "error", "over",
+                f"marks sum {msum} EXCEEDS official max {exp} (+{msum-exp}) — an over-read; check the paper")
+        elif msum < exp:
+            add("C1_marks_sum", "warn", "under",
+                f"marks sum {msum} below official max {exp} (-{exp-msum}) — missing parts or unread marks")
+        else:
+            add("C1_marks_sum", "info", "ok", f"marks sum {msum} == official max {exp}")
+    else:
+        add("C1_marks_sum", "info", "unknown", "no official max available to check the sum against")
+
+    # ---- C2: per-part marks plausibility ---------------------------------------------------
+    none_ct = zero_ct = 0
+    for lab, q, p in parts:
+        mk = p.get("marks")
+        if mk is None:
+            none_ct += 1
+            low.add(lab)
+        elif mk == 0:
+            zero_ct += 1
+            low.add(lab)
+        elif mk > IMPLAUSIBLE_PART_MARKS:
+            low.add(lab)
+            add("C2_part_marks", "warn", "implausible",
+                f"part {lab} has {mk} marks (> {IMPLAUSIBLE_PART_MARKS}) — verify it isn't a mis-read")
+    if none_ct or zero_ct:
+        add("C2_part_marks", "warn", "missing",
+            f"{none_ct} part(s) with no mark, {zero_ct} with 0 marks — unread/garbled mark tokens")
+    elif not any(c["id"] == "C2_part_marks" for c in checks):
+        # Only report "ok" when no implausible-marks warning was recorded above.
+        add("C2_part_marks", "info", "ok", "every part carries a plausible mark")
+
+    # ---- C3: top-level question sequence + EXPECTED-question interpolation ------------------
+    # If Q1, Q2 ... Q14 are recovered but 3-13 are not, the paper certainly HAS 3-13 — they were
+    # just missed (e.g. a Docling page-collapse). We emit the full expected sequence with a per-Q
+    # `recovered` flag so a live question-tree view can render the gaps as explicit "needs a second
+    # pass" slots, and a targeted re-OCR knows exactly which questions to chase.
+    qids = list(dict.fromkeys(q for _, q, _ in parts))   # unique ids, first-seen order
+    nums = sorted({n for n in (_qnum(q) for q in qids) if n is not None})
+    zero_pad = any(len(q) == 2 and q.startswith("0") for q in qids)   # AQA 'NN' vs Edexcel/OCR 'N'
+    question_sequence = []
+    if any(q.startswith("~") for q in qids):
+        add("C3_question_seq", "info", "inferred",
+            "question numbers were OCR-inferred ('~N') — sequence not checkable; treat labels as approximate")
+    elif nums:
+        # isolated high outliers (a content number mis-read as 'Q67' after Q1-10) are likely
+        # spurious top-levels, not 50 missing questions — strip them off the top so the sequence
+        # reflects the real paper, and flag them for review instead of flooding the tree with slots.
+        core, suspect = nums[:], []
+        while len(core) >= 2 and core[-1] - core[-2] > 4:
+            suspect.insert(0, core.pop())
+        hi = core[-1] if core else nums[-1]
+        gaps = [n for n in range(nums[0], hi + 1) if n not in core]
+        question_sequence = [{"n": n, "label": (f"{n:02d}" if zero_pad else str(n)),
+                              "recovered": n in core} for n in range(nums[0], hi + 1)]
+        if suspect:
+            add("C3_question_seq", "warn", "spurious",
+                f"isolated high question number(s) {suspect} after a {nums[0]}-{hi} run — likely a "
+                f"content number mis-read as a top-level question; review/remove")
+        if gaps:
+            add("C3_question_seq", "warn", "gap",
+                f"top-level questions {gaps} missing between {nums[0]}-{hi} — expected but "
+                f"unrecovered; surface as second-pass slots in the question tree")
+        elif not suspect:
+            add("C3_question_seq", "info", "ok", f"questions {nums[0]}-{hi} contiguous")
+
+    # ---- C4: sub-part contiguity within each question --------------------------------------
+    def order(keys):
+        """Map a question's child keys to an ordered scheme + report holes. Handles .N and a/b/c."""
+        # isdecimal(), not isdigit(): isdigit() also accepts characters such as '²' or '①'
+        # that int() rejects, so a garbled OCR label would otherwise raise ValueError here.
+        dig = sorted(int(k[0]) for k in keys if k[:1].isdecimal())
+        let = sorted(k[0] for k in keys if k[:1].isalpha())
+        holes = []
+        if dig:
+            holes += [str(n) for n in range(dig[0], dig[-1] + 1) if n not in dig]
+        if let:
+            lo, hi = ord(let[0]), ord(let[-1])
+            holes += [chr(c) for c in range(lo, hi + 1) if chr(c) not in let]
+        return holes
+    byq = defaultdict(list)
+    for lab, q, p in parts:
+        sk = _subkey(lab, q)
+        if sk:
+            byq[q].append(sk)
+    seq_holes = {}
+    for q, keys in byq.items():
+        firsts = {k[0] for k in keys}            # immediate children only (a / 1 / etc.)
+        h = order(firsts)
+        if h:
+            seq_holes[q] = h
+    if seq_holes:
+        add("C4_subpart_seq", "warn", "gap",
+            "sub-part gaps: " + ", ".join(f"Q{q} missing {hs}" for q, hs in sorted(seq_holes.items())))
+    else:
+        add("C4_subpart_seq", "info", "ok", "sub-parts contiguous within every question")
+
+    # ---- C5: coverage vs ground truth (when present) ---------------------------------------
+    cov = result.get("coverage") or {}
+    if cov.get("coverage_pct") is not None:
+        missed = cov.get("missed", [])
+        if missed:
+            add("C5_coverage", "warn", "missed",
+                f"{cov['coverage_pct']}% vs GT ({cov['recovered']}/{cov['total']}); missed {missed[:10]}")
+            low.update(missed)
+        else:
+            add("C5_coverage", "info", "ok", f"100% coverage vs GT ({cov['recovered']}/{cov['total']})")
+
+    # ---- per-part confidence + paper summary -----------------------------------------------
+    sum_mismatch = any(c["id"] == "C1_marks_sum" and c["status"] in ("over", "under") for c in checks)
+    for lab, q, p in parts:
+        if lab in low:
+            conf[lab] = "low"
+        elif sum_mismatch:
+            conf[lab] = "medium"               # paper-level doubt taints every part a little
+        else:
+            conf[lab] = "high"
+    severities = [c["severity"] for c in checks if c["status"] not in ("ok", "info", "unknown")]
+    worst = "error" if "error" in severities else "warn" if "warn" in severities else "clean"
+
+    return {
+        "paper_code": code, "board": board,
+        "summary": {
+            "worst_severity": worst,
+            "needs_priority_review": worst != "clean",
+            "n_flags": len(flags),
+            "marks_sum": msum, "official_max": exp,
+            "parts_total": len(parts),
+            "parts_low_conf": sum(1 for v in conf.values() if v == "low"),
+            "questions_expected": len(question_sequence) or None,
+            "questions_recovered": sum(1 for q in question_sequence if q["recovered"]) or None,
+        },
+        "flags": flags,
+        "checks": checks,
+        "part_confidence": conf,
+        "question_sequence": question_sequence,   # full expected skeleton (recovered + missing slots)
+    }
+
+
+def main():
+    """CLI entry point: validate one structured-result JSON file and print the report.
+
+    Loads the positional `structured` file, runs validate() on it, prints the
+    verdict, marks, question recovery and flags, and, when --out is given,
+    also writes the full report there as indented JSON.
+    """
+    ap = argparse.ArgumentParser()
+    ap.add_argument("structured")
+    ap.add_argument("--out")
+    a = ap.parse_args()
+    # Context manager so the input handle is closed once the JSON is parsed.
+    with open(a.structured) as src:
+        rep = validate(json.load(src))
+    s = rep["summary"]
+    print(f"paper       : {rep['paper_code']}  ({rep['board']})")
+    print(f"verdict     : {s['worst_severity'].upper()}  "
+          f"{'-> PRIORITY REVIEW' if s['needs_priority_review'] else '-> all checks clean (still human-reviewable)'}")
+    print(f"marks       : {s['marks_sum']}/{s['official_max']}  | parts {s['parts_total']} "
+          f"({s['parts_low_conf']} low-confidence)")
+    if s.get("questions_expected"):
+        miss = [q["label"] for q in rep["question_sequence"] if not q["recovered"]]
+        print(f"questions   : {s['questions_recovered']}/{s['questions_expected']} recovered"
+              + (f"  | second-pass slots: {miss}" if miss else "  (complete sequence)"))
+    if rep["flags"]:
+        print("flags:")
+        for flag in rep["flags"]:
+            print(f"  - {flag}")
+    else:
+        print("flags       : none")
+    if a.out:
+        # Close (and therefore flush) the report deterministically before announcing it.
+        with open(a.out, "w") as dst:
+            json.dump(rep, dst, indent=2)
+        print(f"-> wrote {a.out}")
+
+
+# Script entry point: run the CLI only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
--- a/tests/test_docling_auto_map.py
+++ b/tests/test_docling_auto_map.py
@ -0,0 +1,51 @@
+import json
+import os
+from pathlib import Path
+
+import pytest
+
+from api.services.docling import FIRST_PASS_SCHEMA, auto_map
+
+
+# Root of the spike corpus; override with the DOCLING_SPIKE_ROOT environment variable.
+# The corpus-backed tests in this module are skipped when these files are absent.
+SPIKE_ROOT = Path(os.environ.get("DOCLING_SPIKE_ROOT", "/home/kcar/dev/docling-exam-spike"))
+PHYSICS_PDF = SPIKE_ROOT / "samples" / "AQA-Physics-Paper-1H-2022-with-qr.pdf"
+# Stored template whose shape the auto_map output is compared against.
+PHYSICS_TEMPLATE = SPIKE_ROOT / "results" / "template" / "physics.json"
+BORN_DIGITAL_PDF = SPIKE_ROOT / "samples" / "physics-p1h-2022-qp.pdf"
+
+
+@pytest.mark.skipif(
+    not PHYSICS_PDF.exists() or not PHYSICS_TEMPLATE.exists(),
+    reason="spike corpus not present",
+)
+def test_auto_map_matches_spike_physics_template_shape():
+    """auto_map on the physics sample yields the same template shape as the stored spike output."""
+    expected = json.loads(PHYSICS_TEMPLATE.read_text())
+    result = auto_map(PHYSICS_PDF.read_bytes(), spike_root=SPIKE_ROOT)
+
+    # Schema first, then top-level layout, then the identifying meta fields.
+    assert result["meta"]["schema"] == FIRST_PASS_SCHEMA
+    assert result["meta"]["schema"] == expected["meta"]["schema"]
+    assert set(result.keys()) == set(expected.keys())
+    for field in ("board", "paper_code"):
+        assert result["meta"][field] == expected["meta"][field]
+    assert len(result["margins"]) == len(expected["margins"])
+    assert set(result["pages"].keys()) == set(expected["pages"].keys())
+
+    # Spot-check page "2": same role and same keys on its first part band.
+    got_page = result["pages"]["2"]
+    want_page = expected["pages"]["2"]
+    assert got_page["role"] == want_page["role"]
+    assert got_page["part_bands"][0].keys() == want_page["part_bands"][0].keys()
+
+
+@pytest.mark.skipif(not BORN_DIGITAL_PDF.exists(), reason="born-digital spike PDF not present")
+def test_auto_map_fast_path_without_cache_produces_first_pass_template():
+    """With prefer_cache=False, the born-digital sample still yields a populated first-pass template."""
+    pdf_bytes = BORN_DIGITAL_PDF.read_bytes()
+    result = auto_map(
+        pdf_bytes,
+        source_pdf="samples/physics-p1h-2022-qp.pdf",
+        spike_root=SPIKE_ROOT,
+        prefer_cache=False,
+    )
+
+    # Checked in this order so the first failure matches the original assertion order.
+    wanted_meta = {
+        "schema": FIRST_PASS_SCHEMA,
+        "board": "aqa",
+        "paper_code": "8463/1",
+        "source_pdf": "samples/physics-p1h-2022-qp.pdf",
+    }
+    for key, wanted in wanted_meta.items():
+        assert result["meta"][key] == wanted
+    assert result["margins"]
+    assert result["pages"]
+
+
+def test_auto_map_rejects_empty_pdf_bytes():
+    """auto_map raises ValueError when handed an empty byte string."""
+    empty_payload = b""
+    with pytest.raises(ValueError):
+        auto_map(empty_payload)