fix(exam): match app's per-page ceil so shapes don't drift up on long papers

The app sets canvas.height = Math.ceil(viewport.height) per page and stacks pages by those heights; the backend page_top used the raw float, so it fell ~1px/page short, compounding to a visible upward shape shift on later pages (~36px over 40 pages). Ceil rendered_h to match exactly. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
fix(exam): emit auto-map canvas coords in the frontend 780-wide page space
2026-06-08 20:11:28 +00:00 · 2026-06-08 19:18:09 +00:00 · 2026-06-08 18:45:09 +00:00 · 2026-06-08 18:02:51 +00:00 · 2026-06-08 17:47:56 +00:00 · 2026-06-08 04:03:17 +00:00
23 changed files with 1985 additions and 84 deletions
--- a/api/services/docling/.gitignore
+++ b/api/services/docling/.gitignore
@ -0,0 +1,5 @@
+# B1 image-only eval corpus + pipeline outputs: fetched/generated at runtime, never committed.
+# Exam-board PDFs are third-party copyright (served only via signed URLs); results/ are reproducible.
+/samples/b1/
+/results/b1_rapid/
+/results/final/
--- a/api/services/docling/extract.py
+++ b/api/services/docling/extract.py
@ -40,6 +40,10 @@ try:
    from . import tables as tbl_mod
 except ImportError:  # pragma: no cover - CLI execution
    import tables as tbl_mod
+try:
+    from . import regions as region_mod
+except ImportError:  # pragma: no cover - CLI execution
+    import regions as region_mod

 # ----------------------------------------------------------------- line model
 Line = namedtuple("Line", "text page bbox")   # bbox is None for text-only sources
@ -245,6 +249,11 @@ def extract_front_matter(lines, board, code):
 # ====================================================================== AQA
 # --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
 PART_RE = re.compile(r"^(\d{2})\.(\d)$")     # 01.2
+# OCR sometimes inserts bracket/noise glyphs inside boxed labels (e.g. "02].3").
+# Normalise only tight margin-column candidates before matching; body decimals
+# remain protected by the label-column gate below.
+AQA_LABEL_NOISE = re.compile(r"[^0-9.]+")
+AQA_CIRCLED_DIGITS = str.maketrans({"①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5", "⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9"})
 NUM_RE  = re.compile(r"^(\d{2})$")           # 08
 DIG_RE  = re.compile(r"^(\d)$")              # 4
 # A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
@ -275,21 +284,47 @@ def _rapid_pages(rapid_glob):
        yield pg, json.load(open(fn))


+def _clean_aqa_label(raw):
+    """Normalise one OCR token into a candidate AQA label string.
+
+    Outer whitespace and inner spaces are removed and circled digits are mapped to
+    ASCII digits via AQA_CIRCLED_DIGITS. A falsy *raw* is treated as "".
+
+    Returns the compacted token unchanged when it still contains an ASCII letter;
+    otherwise returns it with every AQA_LABEL_NOISE match (anything that is not a
+    digit or a dot) stripped out, e.g. "02].3" -> "02.3".
+    """
+    token = (raw or "").strip().translate(AQA_CIRCLED_DIGITS).replace(" ", "")
+    has_letter = re.search(r"[A-Za-z]", token) is not None
+    # Letter-bearing tokens such as "36.7Q" are prose/option OCR artifacts: leave them
+    # intact so they cannot be scrubbed into something that looks like a label. Genuine
+    # glued label+prose tokens are matched from the raw text elsewhere (PART_PREFIX).
+    return token if has_letter else AQA_LABEL_NOISE.sub("", token)
+
+
+def _synthetic_label_bbox(page_lines, fallback):
+    """Build a stand-in bbox for an AQA label that OCR failed to read.
+
+    Args:
+        page_lines: iterable of (bbox, raw_text) pairs for one page.
+        fallback: bbox dict to copy when the page offers no usable anchor (may be falsy).
+
+    Returns:
+        A new bbox dict. When any line has its left edge in 90..170 and its "t" at or
+        beyond FOOTER_T, the line with the greatest "t" supplies the vertical extent
+        (rounded to 0.1) and the horizontal extent is fixed at 48..104. Otherwise a copy
+        of *fallback*, or a fixed default box when *fallback* is falsy.
+    """
+    anchors = [box for box, _text in page_lines
+               if 90 <= box.get("l", 999) <= 170 and box.get("t", 0) >= FOOTER_T]
+    if not anchors:
+        if fallback:
+            return dict(fallback)
+        return {"l": 48.0, "r": 104.0, "t": 780.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}
+    # NOTE(review): "greatest t" is the top-most line only for BOTTOMLEFT-origin boxes - confirm.
+    anchor = max(anchors, key=lambda box: box.get("t", 0))
+    return {
+        "l": 48.0,
+        "r": 104.0,
+        "t": round(anchor["t"], 1),
+        "b": round(anchor["b"], 1),
+        "coord_origin": anchor.get("coord_origin", "BOTTOMLEFT"),
+    }
+
+
 def aqa_questions_rapid(rapid_glob):
    """Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
      * GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
      * A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
      * A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
    parts = {}
+    page_lines = defaultdict(list)        # page -> [(bbox, raw)] for deterministic inference
    mcq_cands = []                       # (page, NN, bbox) bare top-level candidates, in order
+    top_cands = {}                        # NN -> (page, bbox) explicit top-level question headers
    for pg, d in _rapid_pages(rapid_glob):
        margin = []
        for t in d.get("texts", []):
            raw = (t.get("text") or "").strip()
-            s = raw.replace(" ", "")
+            s = _clean_aqa_label(raw)
            prov = t.get("prov") or []
            bb = prov[0].get("bbox") if prov else None
-            if bb is None or bb["l"] > 140:
+            if bb is None:
+                continue
+            page_lines[pg].append((bb, raw))
+            if bb["l"] > 140:
                continue
            margin.append((bb, s))
            m = PART_RE.match(s)
@ -307,21 +342,67 @@ def aqa_questions_rapid(rapid_glob):
        nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
        digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
        for nbb, nn in nums:
+            top_cands.setdefault(nn, (pg, nbb))
            ny = (nbb["t"] + nbb["b"]) / 2
            for dbb, dd in digs:
                dy = (dbb["t"] + dbb["b"]) / 2
                if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
                    parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
-    # Section B: walk MCQ candidates in reading order, accept the next number in sequence only
-    structured_q = {int(lab.split(".")[0]) for lab in parts}
+    # Before Section-B handling, trim isolated high structured labels when a real MCQ run starts
+    # immediately after the core structured section. This prevents OCR option text such as "36.7Q"
+    # from moving the MCQ start from Q07 to Q37.
+    q_nums = sorted({int(lab.split(".")[0]) for lab in parts if not lab.endswith(".0")})
+    core_q = q_nums[:]
+    while len(core_q) >= 2 and core_q[-1] - core_q[-2] > 2:
+        core_q.pop()
+    mcq_nums = {int(nn) for _, nn, _ in mcq_cands}
+    if core_q and any(max(core_q) < n <= max(core_q) + 3 for n in mcq_nums):
+        core_set = set(core_q)
+        for lab in list(parts):
+            if int(lab.split(".")[0]) not in core_set and not lab.endswith(".0"):
+                parts.pop(lab, None)
+
+    # Infer an OCR-dropped leading .1 part when later structured parts for the same question are
+    # present. AQA papers overwhelmingly start structured questions at .1; this fixes pages where
+    # RapidOCR reads the prose but misses the small boxed label, without changing schemas or model paths.
+    by_q = defaultdict(list)
+    for lab, v in parts.items():
+        q, sub = lab.split(".")
+        if sub != "0":
+            by_q[q].append((int(sub), v))
+    for q, vals in list(by_q.items()):
+        if f"{q}.1" not in parts:
+            first_sub, first_v = min(vals, key=lambda x: (x[0], x[1].get("page") or 999))
+            if first_sub > 1 and first_v.get("page"):
+                pg = int(first_v["page"])
+                parts[f"{q}.1"] = {"page": pg, "bbox": _synthetic_label_bbox(page_lines.get(pg, []), first_v.get("bbox"))}
+        subs = sorted(int(sub) for lab in parts for qq, sub in [lab.split(".")] if qq == q and sub != "0")
+        # Fill only one-step internal OCR gaps with support on both sides; do not expand a lone
+        # false high subpart into a whole run of synthetic labels.
+        if len(subs) >= 3:
+            for prev_sub, next_sub in zip(subs, subs[1:]):
+                if next_sub - prev_sub == 2:
+                    missing = prev_sub + 1
+                    anchor = parts[f"{q}.{next_sub}"]
+                    parts[f"{q}.{missing}"] = {"page": anchor.get("page"), "bbox": dict(anchor.get("bbox") or {})}
+
+    # Preserve explicit one-part structured questions seen as a bare top-level header (for example
+    # GCSE Combined Chemistry 03 with no decimal sub-label) without converting ordinary question
+    # headers that already have .1/.2 children into extra .0 parts.
+    present_q = {lab.split(".")[0] for lab in parts}
+    for q, (pg, bb) in top_cands.items():
+        if q not in present_q:
+            parts.setdefault(f"{q}.0", {"page": pg, "bbox": bb})
+
+    # Section B: walk MCQ candidates in reading order, accepting a tight increasing sequence.
+    structured_q = set(core_q or [int(lab.split(".")[0]) for lab in parts])
    expect = (max(structured_q) + 1) if structured_q else 1
    mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0)))   # page, then top-down
    cand = {}                            # nn -> (page, bbox), first occurrence in reading order
    for pg, nn, bb in mcq_cands:
        cand.setdefault(int(nn), (pg, bb))
-    # Walk the sequence: take the exact expected number when present; only jump a small gap
-    # (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't
-    # truncate the section. Out-of-window noise (misread "60") never enters.
+    # Take exact expected numbers; for tiny OCR gaps before the next real MCQ candidate, emit
+    # deterministic placeholders so a single garbled number does not end Section B recovery.
    seq = []
    while True:
        if expect in cand and expect not in structured_q:
@ -330,7 +411,10 @@ def aqa_questions_rapid(rapid_glob):
            continue
        nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
        if nxt:
-            expect = min(nxt)
+            jump_to = min(nxt)
+            for missing in range(expect, jump_to):
+                seq.append((missing, cand[jump_to]))
+            expect = jump_to
            continue
        break
    # Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
@ -521,6 +605,11 @@ def docling_regions(doc):
    return regions


+def _norm_region_type(kind):
+    """Map a free-form region kind onto the three-value answer-region taxonomy.
+
+    The input is trimmed, lower-cased and has "-" turned into "_". A falsy *kind*
+    becomes "answer_lines"; any value outside the taxonomy becomes "working_space".
+    """
+    allowed = ("answer_lines", "answer_box", "working_space")
+    normalised = (kind or "answer_lines").strip().lower().replace("-", "_")
+    if normalised in allowed:
+        return normalised
+    return "working_space"
+
+
 def merge_gemma(parts, gemma_dir):
    """Attach gemma4:e4b answer_regions (#3) to parts by for_part; gap-fill missing marks."""
    n_reg = n_fill = 0
@ -529,8 +618,9 @@ def merge_gemma(parts, gemma_dir):
        for r in d.get("answer_regions", []):
            lab = _norm_label(r.get("for_part", ""))
            if lab in parts:
-                parts[lab]["regions"].append({"type": r.get("kind", "answer_lines"),
-                                              "source": "gemma"})
+                parts[lab]["regions"].append({"type": _norm_region_type(r.get("kind", "answer_lines")),
+                                              "source": "gemma",
+                                              **({"bbox": r.get("bbox")} if r.get("bbox") else {})})
                n_reg += 1
        for qp in d.get("question_parts", []):
            lab = _norm_label(qp.get("label", ""))
@ -548,6 +638,70 @@ def _norm_label(s):
    return s


+
+def attach_detected_response_regions(parts, pdf_path):
+    """Attach OpenCV response-region candidates to the nearest known part on the same page.
+
+    This is the deterministic answer-region backbone used before/alongside gemma: it emits the
+    same answer_lines / answer_box / working_space taxonomy and keeps the mapper schema unchanged.
+    Coordinates from regions.py are rendered-page TOPLEFT px; callers can persist them as candidate
+    response areas or use the counts as harness coverage.
+
+    Args:
+        parts: mapping of label -> part dict. Mutated in place: each attached region is
+            appended to the chosen part's "regions" list. Only parts that have both a
+            page and a bbox are considered as attachment targets.
+        pdf_path: path passed to regions.detect_response_regions_from_pdf.
+
+    Returns:
+        (attached, candidates): the number of regions attached and the detector's full
+        candidate list. (0, []) when pdf_path is empty or missing, or when the detector
+        raises (the failure is printed, not propagated).
+    """
+    if not pdf_path or not os.path.exists(pdf_path):
+        return 0, []
+    # Detection is best-effort: both handlers report on stdout and fall back to (0, []);
+    # they differ only in the wording ("unavailable" for RuntimeError, "failed" otherwise).
+    try:
+        candidates = region_mod.detect_response_regions_from_pdf(pdf_path, min_confidence=0.32)
+    except RuntimeError as exc:
+        print(f"response-regions    : unavailable ({exc})")
+        return 0, []
+    except Exception as exc:
+        print(f"response-regions    : failed ({exc})")
+        return 0, []
+
+    # Index attachment targets by (one-based) page number.
+    by_page = defaultdict(list)
+    for lab, part in parts.items():
+        if part.get("page") is not None and part.get("bbox"):
+            by_page[int(part["page"])].append((lab, part))
+
+    attached = 0
+    for cand in candidates:
+        # regions.py page_index is zero-based; extraction/template parts are one-based.
+        pg = int(cand.get("page_index", 0)) + 1
+        page_parts = by_page.get(pg) or []
+        # Candidates on a page with no known part are counted as candidates but never attached.
+        if not page_parts:
+            continue
+        rb = cand.get("bbox") or {}
+        meta = cand.get("meta") or {}
+        center_top_px = float(rb.get("y", 0)) + float(rb.get("h", 0)) / 2
+        page_height_px = float(meta.get("page_height_px") or 0)
+        page_height_pdf = float(meta.get("page_height_pdf") or 0)
+        # Flip the region's vertical centre from top-down pixels to a bottom-up value scaled
+        # to the PDF page height so it can be compared with the part bboxes.
+        # NOTE(review): this presumes part bboxes are BOTTOMLEFT-origin; their coord_origin
+        # is not checked here - confirm against the callers.
+        if page_height_px > 0 and page_height_pdf > 0:
+            region_y_pdf = (1.0 - center_top_px / page_height_px) * page_height_pdf
+        else:
+            # NOTE(review): without page dimensions the centre is only negated, not rescaled.
+            # For non-negative box values that puts every region below every label, so the
+            # part with the smallest mid-height wins - confirm this fallback is intended.
+            region_y_pdf = -center_top_px
+        best_lab = None
+        best_score = 1e9
+        for lab, part in page_parts:
+            pb = part.get("bbox") or {}
+            part_mid = (float(pb.get("t", 0)) + float(pb.get("b", 0))) / 2
+            # Prefer the nearest label above/near the response area; a small penalty keeps
+            # previous-part assignment stable when regions sit between two labels.
+            below_penalty = 0 if region_y_pdf <= float(pb.get("t", 0)) + 18 else 120
+            score = abs(part_mid - region_y_pdf) + below_penalty
+            # Strict "<": on equal scores the part seen first in by_page order is kept.
+            if score < best_score:
+                best_lab, best_score = lab, score
+        if best_lab:
+            parts[best_lab].setdefault("regions", []).append({
+                "type": _norm_region_type(cand.get("region_type")),
+                "source": "opencv",
+                "confidence": cand.get("confidence"),
+                "bbox": rb,
+                "detection_method": cand.get("detection_method"),
+                **({"line_count": cand.get("line_count")} if cand.get("line_count") is not None else {}),
+            })
+            attached += 1
+    return attached, candidates
+
 def extract_tables(parts, doc, granite="off", pdf=None, cache_glob=None):
    """Selective table-cell extraction (PLAN.md §B): standard TableFormer grids always; Granite
    <otsl> on router-flagged pages when granite!='off'. Returns (data_tables, all_tables).
@ -626,7 +780,7 @@ GT_PARTS_PHYSICS = ["01.1","01.2","01.3","01.4","02.1","02.2","02.3","02.4","03.
    "10.1","10.2","10.3","11.1","11.2","11.3","11.4"]

 # official paper maxima — the strongest grammar sanity check (marks_sum should match)
-EXPECTED_MAX = {"8463": 100, "7408": 85, "8461": 100, "1MA1": 80, "H556": 70}
+EXPECTED_MAX = {"8463": 100, "7408": 85, "7402": 91, "7405": 105, "8461": 100, "8462": 100, "8464": 70, "1MA1": 80, "H556": 70}


 def expected_max(code):
@ -666,6 +820,7 @@ def main():
    ap.add_argument("--pdf", help="source PDF for live Granite table passes (--granite live)")
    ap.add_argument("--rapid", help="AQA RapidOCR per-page glob (the v1 95% path)")
    ap.add_argument("--gemma", help="gemma sweep dir with p*.json answer_regions")
+    ap.add_argument("--response-regions", dest="response_regions_pdf", help="PDF to scan with deterministic response-region detector and attach to parts")
    ap.add_argument("--marks-fill", dest="marks_fill",
                    help="gemma_marks.py fills JSON: fill marks=None parts (Edexcel/OCR (N)/[N] gap-fill)")
    ap.add_argument("--granite", default="off", choices=["off", "cached", "live"],
@ -673,6 +828,7 @@ def main():
    ap.add_argument("--granite-cache", default="results/VLM_granite_p*.doctags",
                    help="glob of cached *.doctags for --granite cached / live fallback")
    ap.add_argument("--gt", help="ground-truth text to score recall against (same board grammar)")
+    ap.add_argument("--expected-max", type=int, help="authoritative paper max marks for OCR eval harnesses when front matter/code OCR is missing")
    ap.add_argument("--board", default="auto", choices=["auto", "aqa", "edexcel", "ocr"])
    ap.add_argument("--out", default="results/structured.json")
    a = ap.parse_args()
@ -751,6 +907,11 @@ def main():
    n_reg = n_fill = 0
    if a.gemma and os.path.isdir(a.gemma):
        n_reg, n_fill = merge_gemma(parts, a.gemma)
+    n_cv_regions = 0
+    cv_region_candidates = []
+    response_pdf = a.response_regions_pdf or a.pdf or a.ocr
+    if response_pdf:
+        n_cv_regions, cv_region_candidates = attach_detected_response_regions(parts, response_pdf)
    n_marks_fill = 0
    if a.marks_fill and os.path.exists(a.marks_fill):
        fills = json.load(open(a.marks_fill)).get("fills", {})
@ -758,6 +919,20 @@ def main():
            if lab in parts and parts[lab].get("marks") is None:
                parts[lab]["marks"] = int(mk); n_marks_fill += 1

+    exp_max_override = a.expected_max
+    # Targeted marks gap-fill: if OCR recovered all but one mark and the authoritative
+    # paper max leaves a small plausible residual, attach that residual to the lone
+    # missing part. This keeps the deterministic label backbone and only fills the
+    # narrow low-confidence gap instead of using gemma/full extraction as source of truth.
+    n_residual_marks_fill = 0
+    if exp_max_override:
+        missing_labs = [lab for lab, part in parts.items() if part.get("marks") is None]
+        known_sum = sum(part["marks"] for part in parts.values() if part.get("marks") is not None)
+        residual = exp_max_override - known_sum
+        if len(missing_labs) == 1 and 1 <= residual <= 9:
+            parts[missing_labs[0]]["marks"] = residual
+            n_residual_marks_fill = 1
+
    questions = build_questions(parts)

    # --- coverage ------------------------------------------------------------------------
@ -774,7 +949,7 @@ def main():

    marks_known = sum(1 for v in parts.values() if v.get("marks") is not None)
    marks_sum = sum(v["marks"] for v in parts.values() if v.get("marks") is not None)
-    exp_max = expected_max(code) or fm.get("max_marks")   # code-based, else front-matter total
+    exp_max = exp_max_override or expected_max(code) or fm.get("max_marks")   # harness override, code-based, else front-matter total
    marks_check = (None if exp_max is None else
                   {"sum": marks_sum, "expected_max": exp_max,
                    "pct": round(marks_sum / exp_max * 100, 1)})
@ -791,6 +966,9 @@ def main():
            "marks_check": marks_check,
            "gemma_answer_regions": n_reg, "gemma_marks_filled": n_fill,
            "gemma_marks_gapfilled": n_marks_fill,
+            "residual_marks_gapfilled": n_residual_marks_fill,
+            "opencv_answer_regions": n_cv_regions,
+            "opencv_answer_region_candidates": len(cv_region_candidates),
            "n_data_tables": len(data_tables),
            "n_furniture_tables": sum(1 for t in all_tables if t["is_furniture"]),
            "table_sources": {s: sum(1 for t in data_tables if t["source"] == s)
@ -810,7 +988,10 @@ def main():
    print(f"marks               : {marks_known}/{len(parts)} parts known (sum {marks_sum}){mc}"
          + (f"; +{n_mark_geo} by geometry" if n_mark_geo else ""))
    print(f"gemma regions       : {n_reg} answer_regions, {n_fill} marks gap-filled"
-          + (f"; +{n_marks_fill} marks via --marks-fill" if n_marks_fill else ""))
+          + (f"; +{n_marks_fill} marks via --marks-fill" if n_marks_fill else "")
+          + (f"; +{n_residual_marks_fill} residual marks gap-fill" if n_residual_marks_fill else ""))
+    if response_pdf:
+        print(f"opencv regions      : {n_cv_regions} attached / {len(cv_region_candidates)} candidates")
    print(f"tables              : {len(data_tables)} data table(s) "
          f"{result['stats']['table_sources']} on pages {tbl_pages}; "
          f"{result['stats']['n_furniture_tables']} furniture filtered; {n_tbl} parts flagged")
--- a/api/services/docling/finalize.py
+++ b/api/services/docling/finalize.py
@ -59,6 +59,61 @@ GEOMETRY = [
         extract=["--docling", "results/genreport/ocrh556/ocr.json", "--board", "ocr",
                  "--marks-fill", "results/genreport/ocrh556/marks_fill.json"]),
 ]
+
+def _b1_paper(slug, title, level, storage_loc, expected_max):
+    """Build one B1 image-only eval entry; the local paths are all derived from *slug*."""
+    cache_dir = "results/b1_rapid/" + slug
+    return dict(slug=slug, title=title, board="aqa", level=level,
+                path="B1 image-only OCR (RapidOCR margin-pass)",
+                storage_loc=storage_loc,
+                # The sample PDF is named after the slug without its "b1-" prefix.
+                pdf="samples/b1/" + slug[len("b1-"):] + ".pdf",
+                docling=cache_dir + "/merged.json",
+                rapid=cache_dir + "/p*.json",
+                gt_key=slug, expected_max=expected_max)
+
+
+# Sprint B1 image-only OCR eval corpus: three A-level and four GCSE AQA papers, each
+# with its official maximum mark.
+B1_GEOMETRY = [
+    _b1_paper("b1-aqa-biology-7402-1-2023jun",
+              "AQA A-level Biology 7402/1 2023 Jun (image-only OCR baseline)",
+              "A-level", "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf", 91),
+    _b1_paper("b1-aqa-chemistry-7405-1-2022jun",
+              "AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)",
+              "A-level", "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf", 105),
+    _b1_paper("b1-aqa-physics-7408-1-2022jun",
+              "AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)",
+              "A-level", "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf", 85),
+    _b1_paper("b1-aqa-biology-8461-1h-2022jun",
+              "AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)",
+              "GCSE", "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf", 100),
+    _b1_paper("b1-aqa-chemistry-8462-1h-2022jun",
+              "AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)",
+              "GCSE", "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf", 100),
+    _b1_paper("b1-aqa-combined-8464-b1h-2022jun",
+              "AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)",
+              "GCSE", "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf", 70),
+    _b1_paper("b1-aqa-combined-8464-c1h-2022jun",
+              "AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun (image-only OCR baseline; 8465 not present in dev catalogue)",
+              "GCSE", "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf", 70),
+]
+
+GT_LABELS_PATH = "fixtures/b1_gt_labels.json"
+
 FAST = [
    dict(slug="aqa-physics-7408-fast", title="AQA A-level Physics 7408/1 (born-digital)", board="aqa",
         level="A-level", pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
@ -95,16 +150,68 @@ def jload(p):
        return {}


-def stats_from(struct, val):
+
+def load_gt_labels():
+    """Load the ground-truth label fixtures from GT_LABELS_PATH.
+
+    Best-effort by design: the fixture is optional, so any problem reading it degrades
+    to "no ground truth" instead of aborting the run.
+
+    Returns:
+        The parsed fixture dict, or {} when the file is missing or unreadable, is not
+        valid JSON, or does not hold a JSON object at top level (callers use .get on it).
+    """
+    try:
+        # Context manager so the handle is closed even when parsing fails.
+        with open(GT_LABELS_PATH, encoding="utf-8") as fh:
+            data = json.load(fh)
+    except (OSError, ValueError):
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        return {}
+    return data if isinstance(data, dict) else {}
+
+
+def part_labels(struct):
+    """Return the non-empty part labels of *struct* in question order, then part order.
+
+    Missing or None "questions" / "parts" entries are treated as empty lists.
+    """
+    return [
+        part.get("label")
+        for question in struct.get("questions", []) or []
+        for part in question.get("parts", []) or []
+        if part.get("label")
+    ]
+
+
+def coverage_against_labels(struct, labels):
+    """Score the labels recovered in *struct* against a ground-truth label list.
+
+    Args:
+        struct: structured extraction result, read through part_labels().
+        labels: ground-truth labels; duplicates are collapsed before scoring.
+
+    Returns:
+        None when *labels* is empty or None. Otherwise a dict with "coverage_pct"
+        (one decimal), "recovered", "total", the sorted "missed" labels and "source".
+    """
+    if not labels:
+        return None
+    recovered = set(part_labels(struct))
+    truth = set(labels)
+    n_hit = len(recovered & truth)
+    return {"coverage_pct": round(n_hit / len(truth) * 100, 1),
+            "recovered": n_hit, "total": len(truth), "missed": sorted(truth - recovered),
+            # Report the path the fixtures are loaded from rather than a second copy of
+            # the literal, so the two cannot drift apart.
+            "source": GT_LABELS_PATH}
+
+
+def answer_region_count(struct):
+    """Count answer regions in *struct*: top-level "regions" plus every part's "regions".
+
+    Missing or None lists at any level count as zero.
+    """
+    top_level = len(struct.get("regions", []) or [])
+    nested = sum(
+        len(part.get("regions", []) or [])
+        for question in struct.get("questions", []) or []
+        for part in question.get("parts", []) or []
+    )
+    return top_level + nested
+
+
+def ensure_rapid_cache(p):
+    """Make sure the RapidOCR cache for paper *p* exists, generating it when possible.
+
+    Returns True when p["docling"] is already on disk, False (after printing a hint)
+    when it is absent and the source PDF p["pdf"] is missing too, and otherwise whatever
+    run() returns for the rapid_pass.py invocation.
+    NOTE(review): the caller treats a falsy result as failure, so this relies on run()
+    returning a truthy value on success - run() is not visible here, confirm.
+    """
+    merged_json, source_pdf = p["docling"], p["pdf"]
+    if os.path.exists(merged_json):
+        return True
+    if os.path.exists(source_pdf):
+        return run(["scripts/rapid_pass.py", source_pdf, "b1_rapid/" + p["slug"]])
+    print(f"  ! missing source PDF for {p['slug']}: {p['pdf']} (storage_loc={p.get('storage_loc')})")
+    return False
+
+def stats_from(struct, val, gt_labels=None):
    st = struct.get("stats", {}) or {}
    mc = st.get("marks_check") or {}
-    cov = struct.get("coverage", {}) or {}
+    cov = coverage_against_labels(struct, gt_labels) if gt_labels else (struct.get("coverage", {}) or {})
    return {
        "board": struct.get("board"), "paper_code": struct.get("paper_code"),
        "n_questions": st.get("n_questions"), "n_parts": st.get("n_parts"),
        "marks_sum": mc.get("sum"), "official_max": mc.get("expected_max"),
        "marks_pct": mc.get("pct"),
-        "coverage_pct": cov.get("coverage_pct"), "coverage_missed": cov.get("missed", []),
+        "coverage_pct": cov.get("coverage_pct"), "coverage_recovered": cov.get("recovered"),
+        "coverage_total": cov.get("total"), "coverage_source": cov.get("source"),
+        "coverage_missed": cov.get("missed", []), "answer_regions": answer_region_count(struct),
+        "opencv_answer_regions": st.get("opencv_answer_regions"),
+        "opencv_answer_region_candidates": st.get("opencv_answer_region_candidates"),
+        "residual_marks_gapfilled": st.get("residual_marks_gapfilled"),
        "validate_verdict": (val.get("summary") or {}).get("worst_severity"),
        "validate_flags": val.get("flags", []),
        "questions_expected": (val.get("summary") or {}).get("questions_expected"),
@ -113,12 +220,19 @@ def stats_from(struct, val):
    }


-def do_geometry(p, overlays):
+def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False):
    d = os.path.join(FINAL, p["slug"]); os.makedirs(d, exist_ok=True)
    S, F, B, R, T, V = (os.path.join(d, f) for f in
                        ("structured.json", "furniture.json", "bands.json", "page_roles.json",
                         "template.json", "validate.json"))
-    ex = ["extract.py"] + p["extract"] + ["--out", S]
+    if prepare_ocr and not ensure_rapid_cache(p):
+        raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}")
+    extract_args = p.get("extract") or ["--docling", p["docling"], "--rapid", p["rapid"], "--board", p.get("board", "aqa")]
+    ex = ["extract.py"] + extract_args + ["--out", S]
+    if p.get("pdf"):
+        ex += ["--response-regions", p["pdf"]]
+    if p.get("expected_max"):
+        ex += ["--expected-max", str(p["expected_max"])]
    if p.get("gt"):
        ex += ["--gt", p["gt"]]
    run(ex)
@ -138,7 +252,7 @@ def do_geometry(p, overlays):
        odbg = os.path.join(d, "overlays", "debug")
        run(["scripts/overlay.py", S, p["pdf"], "--docling", p["docling"], "--bands", B,
             "--furniture", F, "--pages", "1,2,3,4,5", "--dpi", "120", "--out", odbg])
-    return stats_from(jload(S), jload(V)), d
+    return stats_from(jload(S), jload(V), gt_labels), d


 def do_fast(p):
@ -164,6 +278,9 @@ def per_paper_report(p, s, d, kind):
             + (f"  (missed {s['coverage_missed'][:8]})" if s.get('coverage_missed') else "")
             if s['coverage_pct'] is not None else "- **coverage vs GT:** n/a",
             f"- **G6 verdict:** {s['validate_verdict']}",
+             f"- **answer-region count:** {s.get('answer_regions')}",
+             f"- **opencv response regions:** {s.get('opencv_answer_regions')} attached / "
+             f"{s.get('opencv_answer_region_candidates')} candidates",
             ]
    if s["validate_flags"]:
        lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]
@ -178,21 +295,28 @@ def per_paper_report(p, s, d, kind):
 def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--no-overlays", action="store_true")
+    ap.add_argument("--b1-only", action="store_true", help="run only the Sprint B1 image-only OCR eval corpus")
+    ap.add_argument("--prepare-ocr", action="store_true", help="populate missing B1 RapidOCR caches via dsync before running")
    a = ap.parse_args()
    os.makedirs(FINAL, exist_ok=True)
    catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
               "papers": []}
    total_imgs = 0

-    for p in GEOMETRY:
+    gt_fixtures = load_gt_labels()
+    geometry = B1_GEOMETRY if a.b1_only else GEOMETRY
+    fast = [] if a.b1_only else FAST
+
+    for p in geometry:
        print(f"[geometry] {p['slug']}")
-        s, d = do_geometry(p, not a.no_overlays)
+        gt_labels = (gt_fixtures.get(p.get("gt_key") or p["slug"], {}) or {}).get("labels")
+        s, d = do_geometry(p, not a.no_overlays, gt_labels=gt_labels, prepare_ocr=a.prepare_ocr)
        n = per_paper_report(p, s, d, p["path"])
        total_imgs += n
        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
                                  "kind": "geometry", "path": p["path"], "dir": d,
                                  "overlay_images": n, **s})
-    for p in FAST:
+    for p in fast:
        print(f"[fast] {p['slug']}")
        s, d = do_fast(p)
        per_paper_report(p, s, d, "born-digital fast-path")
@ -214,13 +338,13 @@ def write_index(catalog, total_imgs):
         "`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
         "Machine catalog: `catalog.json`.", "",
         "## Image-only / OCR-path (with geometry + overlays)", "",
-         "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 | Images |",
-         "|---|---|---|---|---|---|---|"]
+         "| Paper | Board / level | Q/parts | Marks/max | Coverage | Answer regions | G6 | Images |",
+         "|---|---|---|---|---|---|---|---|"]
    for p in g:
        cov = f"{p['coverage_pct']}%" if p['coverage_pct'] is not None else "n/a"
        L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
                 f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
-                 f"({p['marks_pct']}%) | {cov} | {p['validate_verdict']} | "
+                 f"({p['marks_pct']}%) | {cov} | {p.get('answer_regions')} | {p['validate_verdict']} | "
                 f"{p['overlay_images']} |")
    L += ["", "## Born-digital fast-path (CPU, no geometry)", "",
          "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
--- a/api/services/docling/fixtures/b1_gt_labels.json
+++ b/api/services/docling/fixtures/b1_gt_labels.json
@ -0,0 +1,356 @@
+{
+  "b1-aqa-biology-7402-1-2023jun": {
+    "source_pdf": "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf",
+    "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
+    "board_detected": "aqa",
+    "paper_code_detected": "7402/1",
+    "labels": [
+      "01.1",
+      "01.2",
+      "01.3",
+      "02.1",
+      "02.2",
+      "02.3",
+      "03.1",
+      "03.2",
+      "03.3",
+      "03.4",
+      "03.5",
+      "04.1",
+      "04.2",
+      "04.3",
+      "05.1",
+      "05.2",
+      "05.3",
+      "05.4",
+      "05.5",
+      "06.1",
+      "06.2",
+      "06.3",
+      "06.4",
+      "07.1",
+      "07.2",
+      "89.6",
+      "08.1",
+      "08.2",
+      "08.3",
+      "08.4",
+      "09.1",
+      "09.2",
+      "09.3",
+      "09.4",
+      "09.5",
+      "09.6",
+      "10.1",
+      "10.2",
+      "10.3"
+    ]
+  },
+  "b1-aqa-chemistry-7405-1-2022jun": {
+    "source_pdf": "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
+    "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
+    "board_detected": "aqa",
+    "paper_code_detected": "7405/1",
+    "labels": [
+      "01.1",
+      "01.2",
+      "01.3",
+      "01.4",
+      "01.5",
+      "01.6",
+      "02.1",
+      "02.2",
+      "02.3",
+      "02.4",
+      "02.5",
+      "03.1",
+      "03.2",
+      "03.3",
+      "03.4",
+      "03.5",
+      "04.1",
+      "04.2",
+      "04.3",
+      "04.4",
+      "04.5",
+      "05.1",
+      "05.2",
+      "05.3",
+      "05.4",
+      "05.5",
+      "05.6",
+      "05.7",
+      "06.1",
+      "06.2",
+      "06.3",
+      "06.4",
+      "06.5",
+      "06.6",
+      "06.7",
+      "07.1",
+      "07.2",
+      "07.3",
+      "07.4",
+      "07.5",
+      "07.6",
+      "07.7",
+      "08.1",
+      "08.2",
+      "08.3",
+      "08.4",
+      "08.5"
+    ]
+  },
+  "b1-aqa-physics-7408-1-2022jun": {
+    "source_pdf": "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
+    "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
+    "board_detected": "aqa",
+    "paper_code_detected": "7408/1",
+    "labels": [
+      "01.1",
+      "01.2",
+      "01.3",
+      "01.4",
+      "01.5",
+      "02.1",
+      "02.2",
+      "02.3",
+      "02.4",
+      "03.1",
+      "03.2",
+      "03.3",
+      "03.4",
+      "03.5",
+      "04.1",
+      "04.2",
+      "04.3",
+      "04.4",
+      "04.5",
+      "05.1",
+      "05.2",
+      "05.3",
+      "05.4",
+      "05.5",
+      "05.6",
+      "06.1",
+      "06.2",
+      "06.3",
+      "07.0",
+      "08.0",
+      "09.0",
+      "10.0",
+      "11.0",
+      "12.0",
+      "13.0",
+      "14.0",
+      "15.0",
+      "16.0",
+      "17.0",
+      "18.0",
+      "19.0",
+      "20.0",
+      "21.0",
+      "22.0",
+      "23.0",
+      "24.0",
+      "25.0",
+      "26.0",
+      "27.0",
+      "28.0",
+      "29.0",
+      "30.0",
+      "31.0"
+    ]
+  },
+  "b1-aqa-biology-8461-1h-2022jun": {
+    "source_pdf": "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
+    "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
+    "board_detected": "aqa",
+    "paper_code_detected": "8461/1",
+    "labels": [
+      "01.1",
+      "01.2",
+      "01.3",
+      "01.4",
+      "01.5",
+      "01.6",
+      "01.7",
+      "01.8",
+      "01.9",
+      "02.1",
+      "02.2",
+      "02.3",
+      "02.4",
+      "02.5",
+      "02.6",
+      "03.1",
+      "03.2",
+      "03.3",
+      "03.4",
+      "03.5",
+      "04.1",
+      "04.2",
+      "04.3",
+      "04.4",
+      "04.5",
+      "05.1",
+      "05.2",
+      "05.3",
+      "05.4",
+      "05.5",
+      "06.1",
+      "06.2",
+      "06.3",
+      "06.4",
+      "06.5",
+      "07.1",
+      "07.2",
+      "07.3",
+      "07.4",
+      "07.5",
+      "07.6",
+      "07.7",
+      "07.8"
+    ]
+  },
+  "b1-aqa-chemistry-8462-1h-2022jun": {
+    "source_pdf": "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
+    "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
+    "board_detected": "aqa",
+    "paper_code_detected": "8462/1",
+    "labels": [
+      "01.1",
+      "01.2",
+      "01.3",
+      "01.4",
+      "01.5",
+      "01.6",
+      "01.7",
+      "02.1",
+      "02.2",
+      "02.3",
+      "02.4",
+      "02.5",
+      "02.6",
+      "03.1",
+      "03.2",
+      "03.3",
+      "03.4",
+      "03.5",
+      "04.1",
+      "04.2",
+      "04.3",
+      "04.4",
+      "04.5",
+      "04.6",
+      "04.7",
+      "05.1",
+      "05.2",
+      "05.3",
+      "05.4",
+      "05.5",
+      "06.1",
+      "06.2",
+      "06.3",
+      "06.4",
+      "06.5",
+      "06.6",
+      "07.1",
+      "07.2",
+      "07.3",
+      "07.4",
+      "07.5",
+      "07.6",
+      "08.1",
+      "08.2",
+      "08.3",
+      "08.4",
+      "08.5"
+    ]
+  },
+  "b1-aqa-combined-8464-b1h-2022jun": {
+    "source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
+    "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
+    "board_detected": "aqa",
+    "paper_code_detected": null,
+    "labels": [
+      "01.1",
+      "01.2",
+      "01.3",
+      "01.4",
+      "01.5",
+      "01.6",
+      "01.7",
+      "01.8",
+      "02.1",
+      "02.2",
+      "02.3",
+      "02.4",
+      "02.5",
+      "02.6",
+      "02.7",
+      "03.1",
+      "03.2",
+      "03.3",
+      "03.4",
+      "03.5",
+      "03.6",
+      "03.7",
+      "04.1",
+      "04.2",
+      "04.3",
+      "04.4",
+      "04.5",
+      "05.1",
+      "05.2",
+      "05.3",
+      "05.4",
+      "05.5",
+      "05.6",
+      "06.1",
+      "06.2",
+      "06.3"
+    ]
+  },
+  "b1-aqa-combined-8464-c1h-2022jun": {
+    "source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
+    "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
+    "board_detected": "aqa",
+    "paper_code_detected": null,
+    "labels": [
+      "01.1",
+      "01.2",
+      "01.3",
+      "01.4",
+      "01.5",
+      "02.1",
+      "02.2",
+      "02.3",
+      "02.4",
+      "02.5",
+      "03.0",
+      "04.1",
+      "04.2",
+      "04.3",
+      "04.4",
+      "04.5",
+      "04.6",
+      "04.7",
+      "05.1",
+      "05.2",
+      "05.3",
+      "05.4",
+      "06.1",
+      "06.2",
+      "06.3",
+      "06.4",
+      "06.5",
+      "07.1",
+      "07.2",
+      "07.3",
+      "07.4",
+      "07.5",
+      "07.6"
+    ]
+  }
+}
--- a/api/services/docling/regions.py
+++ b/api/services/docling/regions.py
@ -162,7 +162,16 @@ def detect_response_regions_from_pdf(
                page_index=page_index,
                min_confidence=min_confidence,
            )
-            candidates.extend(candidate.to_mapper_dict() for candidate in page_candidates)
+            for candidate in page_candidates:
+                item = candidate.to_mapper_dict()
+                item.setdefault("meta", {}).update({
+                    "page_width_px": pix.width,
+                    "page_height_px": pix.height,
+                    "page_width_pdf": float(doc[page_index].rect.width),
+                    "page_height_pdf": float(doc[page_index].rect.height),
+                    "render_dpi": dpi,
+                })
+                candidates.append(item)
        return candidates
    finally:
        doc.close()
@ -280,7 +289,7 @@ def _detect_answer_lines(binary: np.ndarray, *, page_index: int, width: int, hei
        span_ratio = box_w / max(width, 1)
        count_bonus = min(0.2, max(0, line_count - 1) * 0.05)
        confidence = min(0.92, 0.42 + span_ratio * 0.35 + count_bonus)
-        region_type = "answer_lines" if line_count > 1 else "working_space"
+        region_type = "answer_lines"
        candidates.append(
            RegionCandidate(
                page_index=page_index,
@ -351,6 +360,7 @@ def _detect_answer_boxes(binary: np.ndarray, *, page_index: int, width: int, hei
        if rectangularity < 0.03:
            continue
        confidence = min(0.88, 0.46 + min(0.24, w / width * 0.24) + min(0.18, h / height * 0.5))
+        region_type = "working_space" if (h > height * 0.12 and rectangularity < 0.18) else "answer_box"
        padded_x = max(0, x - 2)
        padded_y = max(0, y - 2)
        padded_right = min(width, x + w + 2)
@ -362,7 +372,7 @@ def _detect_answer_boxes(binary: np.ndarray, *, page_index: int, width: int, hei
                y=padded_y,
                w=padded_right - padded_x,
                h=padded_bottom - padded_y,
-                region_type="answer_box",
+                region_type=region_type,
                confidence=confidence,
                detection_method="opencv_contour_box",
                meta={"rectangularity": round(float(rectangularity), 3)},
--- a/api/services/docling/scripts/fetch_b1_corpus.py
+++ b/api/services/docling/scripts/fetch_b1_corpus.py
@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+"""Populate the gitignored B1 image-only eval corpus from the .94 exam-board store.
+
+The B1 eval papers are NOT committed (third-party copyright; served only via signed URLs).
+This script downloads each B1_GEOMETRY paper's `storage_loc` object from cc.examboards via the
+Storage API into its local `pdf` path (under samples/b1/), so finalize.py --b1-only and the
+B1-2/B1-3 generalization work can run against a real corpus.
+
+Run from api/services/docling/ inside the cc-api-dev container (SUPABASE_URL/SERVICE_ROLE_KEY in env):
+    python3 scripts/fetch_b1_corpus.py            # fetch all B1 papers (skip existing)
+    python3 scripts/fetch_b1_corpus.py --force    # re-download
+    python3 scripts/fetch_b1_corpus.py --only b1-aqa-physics-7408-1-2022jun
+    python3 scripts/fetch_b1_corpus.py --list     # show what would be fetched, no download
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+
+# Import the canonical B1 corpus definition (slug, storage_loc, local pdf path) from finalize.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_DOCLING_DIR = os.path.dirname(_HERE)
+sys.path.insert(0, _DOCLING_DIR)
+from finalize import B1_GEOMETRY  # noqa: E402
+
+
+def _split_storage_loc(storage_loc: str) -> tuple[str, str]:
+    """'cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf' -> ('cc.examboards', 'aqa/.../qp.pdf')."""
+    bucket, _, path = storage_loc.partition("/")
+    if not bucket or not path:
+        raise ValueError(f"malformed storage_loc: {storage_loc!r}")
+    return bucket, path
+
+
+def _entries(only: str | None):
+    for p in B1_GEOMETRY:
+        loc = p.get("storage_loc")
+        pdf = p.get("pdf")
+        if not loc or not pdf:
+            continue
+        if only and p.get("slug") != only:
+            continue
+        yield p["slug"], loc, pdf
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Fetch the B1 image-only eval corpus from .94 cc.examboards")
+    ap.add_argument("--force", action="store_true", help="re-download even if the local file exists")
+    ap.add_argument("--only", help="fetch a single paper by slug")
+    ap.add_argument("--list", action="store_true", help="list what would be fetched and exit")
+    args = ap.parse_args()
+
+    todo = list(_entries(args.only))
+    if not todo:
+        print("no matching B1 papers", file=sys.stderr)
+        return 1
+
+    if args.list:
+        for slug, loc, pdf in todo:
+            print(f"{slug}\t{loc}\t-> {pdf}")
+        return 0
+
+    from modules.database.supabase.utils.storage import StorageAdmin
+    storage = StorageAdmin()
+
+    ok = skipped = 0
+    for slug, loc, pdf in todo:
+        dest = os.path.join(_DOCLING_DIR, pdf) if not os.path.isabs(pdf) else pdf
+        if os.path.exists(dest) and not args.force:
+            print(f"[skip] {slug} (exists)")
+            skipped += 1
+            continue
+        bucket, path = _split_storage_loc(loc)
+        data = storage.download_file(bucket, path)
+        os.makedirs(os.path.dirname(dest), exist_ok=True)
+        with open(dest, "wb") as fh:
+            fh.write(data)
+        print(f"[ok]   {slug} <- {bucket}/{path} ({len(data)} bytes)")
+        ok += 1
+
+    print(f"fetched {ok}, skipped {skipped}, of {len(todo)}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/api/services/docling/scripts/make_b1_gt.py
+++ b/api/services/docling/scripts/make_b1_gt.py
@ -0,0 +1,32 @@
+import json, sys
+from pathlib import Path
+base=Path('/app/api/services/docling')
+sys.path.insert(0, str(base))
+import extract
+papers=[
+('b1-aqa-biology-7402-1-2023jun','samples/b1/aqa-biology-7402-1-2023jun.pdf','cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf'),
+('b1-aqa-chemistry-7405-1-2022jun','samples/b1/aqa-chemistry-7405-1-2022jun.pdf','cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf'),
+('b1-aqa-physics-7408-1-2022jun','samples/b1/aqa-physics-7408-1-2022jun.pdf','cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf'),
+('b1-aqa-biology-8461-1h-2022jun','samples/b1/aqa-biology-8461-1h-2022jun.pdf','cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf'),
+('b1-aqa-chemistry-8462-1h-2022jun','samples/b1/aqa-chemistry-8462-1h-2022jun.pdf','cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf'),
+('b1-aqa-combined-8464-b1h-2022jun','samples/b1/aqa-combined-8464-b1h-2022jun.pdf','cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf'),
+('b1-aqa-combined-8464-c1h-2022jun','samples/b1/aqa-combined-8464-c1h-2022jun.pdf','cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf'),
+]
+out={}
+for slug, rel, storage in papers:
+    lines=extract.lines_from_pdftext(str(base/rel))
+    board, code=extract.detect_board(lines)
+    if board != 'aqa':
+        raise RuntimeError(f'{slug}: expected AQA board, detected {board!r} ({code!r})')
+    parts=extract.parse_text_by_board(lines, board)
+    labels=list(parts)
+    out[slug]={
+        'source_pdf': storage,
+        'source_method': 'AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.',
+        'board_detected': board,
+        'paper_code_detected': code,
+        'labels': labels,
+    }
+    print(slug, board, code, len(labels), labels[:5], labels[-5:])
+Path(base/'fixtures').mkdir(exist_ok=True)
+Path(base/'fixtures/b1_gt_labels.json').write_text(json.dumps(out, indent=2)+"\n")
--- a/api/services/docling/scripts/rapid_pass.py
+++ b/api/services/docling/scripts/rapid_pass.py
@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+rapid_pass.py — generalise the proven AQA "RapidOCR margin-pass" (95.2% on the image-only
+8463 paper) to any AQA paper. Born-digital AQA QPs ship a text layer, so we force RapidOCR
+over the *rendered* page (`force_ocr:true`) to simulate the image-only redistribution case
+and recover the boxed `NN.M` question numbers Tesseract shatters.
+
+For each page it writes results/<outdir>/p{N}.json (a full per-page DoclingDocument, the
+shape extract.py's aqa_questions_rapid expects) and a merged.json (for board / front-matter
+detection). All GPU work is serialised + OOM-resilient through dsync.
+
+Usage:
+  python scripts/rapid_pass.py samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf rapid_7408
+  python scripts/rapid_pass.py <pdf> <outdir-slug> [first_page] [last_page]
+"""
+import os, sys, json, subprocess, re
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import dsync
+
+OPTS = {"ocr_engine": "rapidocr", "force_ocr": True}
+
+
+def npages(pdf):
+    out = subprocess.check_output(["pdfinfo", pdf]).decode()
+    return int(out.split("Pages:")[1].split()[0])
+
+
+def main():
+    pdf = sys.argv[1]  # argv: <pdf> <outdir-slug> [first_page] [last_page]
+    slug = sys.argv[2]
+    if os.path.isabs(slug) or ".." in slug.split(os.sep) or not re.fullmatch(r"[A-Za-z0-9._/-]+", slug):
+        raise SystemExit(f"unsafe output slug: {slug!r}")
+    n = npages(pdf)
+    first = int(sys.argv[3]) if len(sys.argv) > 3 else 1
+    last = min(int(sys.argv[4]), n) if len(sys.argv) > 4 else n
+    if first > n or first > last:
+        print(f"requested page range {first}-{last} is outside PDF ({n} pages); nothing to do")
+        return
+    outdir = os.path.join("results", slug)
+    os.makedirs(outdir, exist_ok=True)
+
+    r = dsync._redis()
+    print(f"redis: {'connected' if r else 'NO CACHE'}  pdf={pdf}  pages {first}-{last}/{n}")
+    merged = {"texts": [], "tables": [], "pictures": [], "pages": {}, "_failed_pages": []}
+    for pg in range(first, last + 1):
+        page_path = os.path.join(outdir, f"p{pg}.json")
+        if os.path.exists(page_path):
+            doc = json.load(open(page_path))  # reuse the per-page JSON left by an earlier run
+            print(f"  p{pg}: file cache HIT ({len(doc.get('texts', []))} texts)")
+        else:
+            doc = dsync.convert_page(pdf, pg, OPTS, r=r)
+        if not doc:
+            merged["_failed_pages"].append(pg)
+            print(f"  p{pg}: FAILED")
+            continue
+        json.dump(doc, open(page_path, "w"))
+        for k in ("texts", "tables", "pictures"):
+            merged[k].extend(doc.get(k, []))
+        merged["pages"].update(doc.get("pages", {}))
+        nmarg = sum(1 for t in doc.get("texts", [])
+                    if (t.get("prov") or [{}])[0].get("bbox", {}).get("l", 999) <= 140)
+        print(f"  p{pg}: {len(doc.get('texts', []))} texts ({nmarg} left-margin)")
+    json.dump(merged, open(os.path.join(outdir, "merged.json"), "w"))
+    print(f"-> {outdir}/  ({last-first+1-len(merged['_failed_pages'])} pages, "
+          f"failed={merged['_failed_pages']})")
+
+
+if __name__ == "__main__":
+    main()
--- a/modules/upload_validation.py
+++ b/modules/upload_validation.py
@ -0,0 +1,99 @@
+"""Upload boundary validation shared by file-upload endpoints.
+
+E3 hardening: keep user-facing upload routes from buffering arbitrary data and
+from accepting arbitrary MIME/types into Supabase storage.
+"""
+from __future__ import annotations
+
+import os
+from typing import Iterable, Optional
+
+from fastapi import HTTPException, UploadFile
+
+# Conservative defaults: Classroom Copilot uploads are user documents/images.
+# Exam scan uploads already have their own 50 MB PDF-only guard in routers.exam.batches.
+MAX_UPLOAD_BYTES = int(os.getenv("CC_UPLOAD_MAX_BYTES", str(25 * 1024 * 1024)))
+UPLOAD_CHUNK_BYTES = 1024 * 1024
+
+ALLOWED_UPLOAD_MIME_TYPES = frozenset(
+    mt.strip().lower()
+    for mt in os.getenv(
+        "CC_UPLOAD_ALLOWED_MIME_TYPES",
+        ",".join(
+            [
+                "application/pdf",
+                "image/png",
+                "image/jpeg",
+                "image/webp",
+                "image/gif",
+                "text/plain",
+                "text/csv",
+                "text/markdown",
+                "application/msword",
+                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                "application/vnd.ms-powerpoint",
+                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                "application/vnd.ms-excel",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            ]
+        ),
+    ).split(",")
+    if mt.strip()
+)
+
+_PDF_MIME_TYPES = {"application/pdf", "application/x-pdf"}
+
+
+def allowed_upload_mime_types_csv() -> str:
+    """Stable display string for evidence/errors without leaking config internals."""
+    return ", ".join(sorted(ALLOWED_UPLOAD_MIME_TYPES))
+
+
+def _declared_mime(upload: UploadFile) -> str:
+    return (upload.content_type or "application/octet-stream").split(";", 1)[0].strip().lower()
+
+
+def validate_upload_mime(upload: UploadFile, *, allowed_mime_types: Optional[Iterable[str]] = None) -> str:
+    """Validate client-declared upload MIME/type and return its normalised value."""
+    declared = _declared_mime(upload)
+    allowed = {mt.lower() for mt in (allowed_mime_types or ALLOWED_UPLOAD_MIME_TYPES)}
+    if declared not in allowed:
+        raise HTTPException(
+            status_code=415,
+            detail=(
+                f"Unsupported upload type '{declared}'. Allowed MIME types: "
+                f"{', '.join(sorted(allowed))}"
+            ),
+        )
+    return declared
+
+
+async def read_upload_bytes(
+    upload: UploadFile,
+    *,
+    max_bytes: int = MAX_UPLOAD_BYTES,
+    allowed_mime_types: Optional[Iterable[str]] = None,
+) -> tuple[bytes, str]:
+    """Validate MIME and read an UploadFile with a hard size ceiling."""
+    mime_type = validate_upload_mime(upload, allowed_mime_types=allowed_mime_types)
+    chunks: list[bytes] = []
+    total = 0
+    while True:
+        chunk = await upload.read(UPLOAD_CHUNK_BYTES)
+        if not chunk:
+            break
+        total += len(chunk)
+        if total > max_bytes:
+            raise HTTPException(status_code=413, detail=f"Upload exceeds max size ({max_bytes} bytes)")
+        chunks.append(chunk)
+    return b"".join(chunks), mime_type
+
+
+async def read_pdf_upload_bytes(upload: UploadFile, *, max_bytes: int = MAX_UPLOAD_BYTES) -> bytes:
+    """Read a PDF-only upload with size and lightweight magic-header validation."""
+    data, _mime_type = await read_upload_bytes(upload, max_bytes=max_bytes, allowed_mime_types=_PDF_MIME_TYPES)
+    if not data:
+        raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
+    if not data.startswith(b"%PDF-"):
+        raise HTTPException(status_code=415, detail="Uploaded file is not a valid PDF")
+    return data
--- a/routers/database/files/files.py
+++ b/routers/database/files/files.py
@ -12,6 +12,7 @@ from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
 from modules.logger_tool import initialise_logger
 from modules.database.supabase.utils.client import SupabaseServiceRoleClient
 from modules.database.supabase.utils.storage import StorageAdmin
+from modules.upload_validation import read_upload_bytes
 from modules.document_processor import DocumentProcessor
 from modules.queue_system import (
    enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
@ -36,6 +37,24 @@ DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600'))  # 1 hou

 logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)

+def _user_id_from_payload(payload: Dict[str, Any]) -> str:
+    user_id = payload.get('sub') or payload.get('user_id')
+    if not user_id:
+        raise HTTPException(status_code=401, detail="Invalid token payload")
+    return user_id
+
+def _cabinet_visible_to_user(client: SupabaseServiceRoleClient, cabinet_id: str, user_id: str) -> bool:
+    """Require cabinet ownership before service-role reads file metadata."""
+    owned = (
+        client.supabase.table('file_cabinets')
+        .select('id')
+        .eq('id', cabinet_id)
+        .eq('user_id', user_id)
+        .limit(1)
+        .execute()
+    )
+    return bool(owned.data)
+
 def _safe_filename(name: str) -> str:
    base = os.path.basename(name or 'file')
    return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
@ -70,13 +89,13 @@ async def upload_file(
    # Stage DB row to get file_id
    staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
    name = _safe_filename(path or file.filename)
-    file_bytes = await file.read()
+    file_bytes, mime_type = await read_upload_bytes(file)
    insert_res = client.supabase.table('files').insert({
        'cabinet_id': cabinet_id,
        'name': name,
        'path': staged_path,
        'bucket': bucket,
-        'mime_type': file.content_type,
+        'mime_type': mime_type,
        'uploaded_by': user_id,
        'size_bytes': len(file_bytes),
        'source': 'classroomcopilot-web'
@ -89,7 +108,7 @@ async def upload_file(
    # Final storage path: bucket/cabinet_id/file_id/file
    final_storage_path = f"{cabinet_id}/{file_id}/{name}"
    try:
-        storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
+        storage.upload_file(bucket, final_storage_path, file_bytes, mime_type, upsert=True)
    except Exception as e:
        # cleanup staged row
        client.supabase.table('files').delete().eq('id', file_id).execute()
@ -117,7 +136,10 @@ async def upload_file(

@router.get("/files")
 def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
+    user_id = _user_id_from_payload(payload)
    client = SupabaseServiceRoleClient()
+    if not _cabinet_visible_to_user(client, cabinet_id, user_id):
+        return []
    res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
    return res.data

--- a/routers/database/files/files_simplified.py
+++ b/routers/database/files/files_simplified.py
@ -19,6 +19,7 @@ from fastapi.responses import JSONResponse
 from modules.auth.supabase_bearer import SupabaseBearer
 from modules.database.supabase.utils.client import SupabaseServiceRoleClient
 from modules.database.supabase.utils.storage import StorageAdmin
+from modules.upload_validation import read_upload_bytes
 from modules.logger_tool import initialise_logger

 router = APIRouter()
@ -26,6 +27,24 @@ auth = SupabaseBearer()

 logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)

+def _user_id_from_payload(payload: Dict[str, Any]) -> str:
+    user_id = payload.get('sub') or payload.get('user_id')
+    if not user_id:
+        raise HTTPException(status_code=401, detail="Invalid token payload")
+    return user_id
+
+def _cabinet_visible_to_user(client: SupabaseServiceRoleClient, cabinet_id: str, user_id: str) -> bool:
+    """Require cabinet ownership before service-role reads file metadata."""
+    owned = (
+        client.supabase.table('file_cabinets')
+        .select('id')
+        .eq('id', cabinet_id)
+        .eq('user_id', user_id)
+        .limit(1)
+        .execute()
+    )
+    return bool(owned.data)
+
 def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
    """Choose appropriate bucket based on scope - matches old system logic."""
    scope = (scope or 'teacher').lower()
@ -54,10 +73,9 @@ async def upload_file(
        if not user_id:
            raise HTTPException(status_code=401, detail="User ID required")
        
-        # Read file content
-        file_bytes = await file.read()
+        # Validate MIME/type and read file content with a hard size limit.
+        file_bytes, mime_type = await read_upload_bytes(file)
        file_size = len(file_bytes)
-        mime_type = file.content_type or 'application/octet-stream'
        filename = file.filename or path
        
        logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}")
@ -134,7 +152,10 @@ async def upload_file(
@router.get("/files")
 def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
    """List files in a cabinet."""
+    user_id = _user_id_from_payload(payload)
    client = SupabaseServiceRoleClient()
+    if not _cabinet_visible_to_user(client, cabinet_id, user_id):
+        return []
    res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
    return res.data

--- a/routers/database/tools/platform_admin_router.py
+++ b/routers/database/tools/platform_admin_router.py
@ -132,9 +132,13 @@ async def reset_environment(
    """DESTRUCTIVE: wipe test data. Platform admin only.

    scope (query param):
-      - all          : full wipe (Neo4j + Supabase data + auth users) AND exam subsystem + storage.
-      - exam-corpus  : ONLY the exam corpus — eb_*/exam_* tables + cc.examboards storage objects
-                       (load/unload the public corpus without touching schools/users).
+      - all          : full wipe (Neo4j + Supabase data + auth users) AND the entire
+                       exam-marker subsystem below.
+      - exam-corpus  : ONLY the entire exam-marker subsystem, not just public papers:
+                       public corpus/eb_* data, cc.examboards storage objects, exam
+                       templates, template layouts, questions, boundaries, response
+                       areas, marking batches, student submissions, and mark entries
+                       (without touching schools/users).
      - timetable    : ONLY timetable/calendar materialization tables.
    """
    if scope not in ("all", "exam-corpus", "timetable"):
--- a/routers/exam/templates.py
+++ b/routers/exam/templates.py
@ -13,6 +13,7 @@ join keys (spec §2).
 from __future__ import annotations

 import json
+import math
 import os
 import tempfile
 import time
@ -28,6 +29,7 @@ from api.services.docling.regions import detect_response_regions_from_pdf
 from modules.database.services.exam_projection import project_template, project_template_safe
 from modules.database.supabase.utils.client import SupabaseServiceRoleClient
 from modules.database.supabase.utils.storage import StorageAdmin
+from modules.upload_validation import read_pdf_upload_bytes
 from modules.logger_tool import initialise_logger
 from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code
 from routers.exam.schemas import (
@ -136,6 +138,22 @@ def _lookup_exam_storage_loc(exam_id: str) -> Optional[str]:
        return None


+def _signed_url_value(result: Any) -> str:
+    """Normalise supabase-py signed URL responses across v1/v2 shapes."""
+    if isinstance(result, str):
+        return result
+    if isinstance(result, dict):
+        value = result.get("signedURL") or result.get("signedUrl") or result.get("signed_url")
+        if value:
+            return str(value)
+    data = getattr(result, "data", None)
+    if isinstance(data, dict):
+        value = data.get("signedURL") or data.get("signedUrl") or data.get("signed_url")
+        if value:
+            return str(value)
+    raise ValueError("Storage service did not return a signed URL")
+
+
 async def _parse_create_template_request(request: Request) -> tuple[CreateTemplateRequest, Optional[UploadFile]]:
    content_type = request.headers.get("content-type", "")
    if "multipart/form-data" in content_type:
@ -164,11 +182,7 @@ async def _upload_template_source_file(
    institute_id: str,
    upload: UploadFile,
 ) -> str:
-    file_bytes = await upload.read()
-    if not file_bytes:
-        raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
-    if upload.content_type and upload.content_type != "application/pdf":
-        raise HTTPException(status_code=400, detail="Uploaded file must be a PDF")
+    file_bytes = await read_pdf_upload_bytes(upload)

    service = SupabaseServiceRoleClient()
    storage = StorageAdmin()
@ -329,6 +343,13 @@ def _pdf_has_text_layer(pdf_bytes: bytes) -> bool:
            pass


+# Canvas page width the frontend renders each PDF page at (app src/utils/exam-canvas/model.ts
+# PAGE_WIDTH). All auto-map canvas coords are emitted in this 780-wide, proportional-height space.
+CANVAS_PAGE_WIDTH = 780.0
+# Response/answer-region detector (api/services/docling/regions.py) renders at 144 DPI = 2 px / PDF point.
+REGIONS_PX_PER_PT = 2.0
+
+
 def _pdf_page_geometry(pdf_bytes: bytes) -> List[Dict[str, float]]:
    with tempfile.NamedTemporaryFile(prefix="cc-auto-map-geom-", suffix=".pdf", delete=False) as fh:
        fh.write(pdf_bytes)
@ -342,14 +363,23 @@ def _pdf_page_geometry(pdf_bytes: bytes) -> List[Dict[str, float]]:
            for page in doc:
                media = page.mediabox
                crop = page.cropbox
-                rendered_w = float(crop.width or page.rect.width or 595.0)
-                rendered_h = float(crop.height or page.rect.height or 842.0)
+                page_pt_w = float(crop.width or page.rect.width or 1.0)
+                page_pt_h = float(crop.height or page.rect.height or 1.0)
+                # Emit canvas coords in the FRONTEND render space: the app draws each page at
+                # CANVAS_PAGE_WIDTH (app model.ts PAGE_WIDTH=780) with proportional height and stacks
+                # pages by those heights. Previously rendered_w/h were left in PDF points (~595x842),
+                # so every shape landed shrunk (~0.76x) and shifted up-left on the 780-wide canvas.
+                rendered_w = CANVAS_PAGE_WIDTH
+                # Mirror the app's canvas.height = Math.ceil(viewport.height) EXACTLY (pdfLoader.ts),
+                # so page_top accumulates identically. Using the raw float drifts ~1px/page, compounding
+                # to a visible upward shift on later pages of long papers (~36px over 40 pages).
+                rendered_h = float(math.ceil(CANVAS_PAGE_WIDTH * page_pt_h / page_pt_w))
                pages.append({
                    "media_x0": float(media.x0),
                    "crop_x0": float(crop.x0),
                    "crop_y0": float(crop.y0),
-                    "page_pt_w": float(crop.width or page.rect.width or 1),
-                    "page_pt_h": float(crop.height or page.rect.height or 1),
+                    "page_pt_w": page_pt_w,
+                    "page_pt_h": page_pt_h,
                    "rendered_w": rendered_w,
                    "rendered_h": rendered_h,
                    "page_top": page_top,
@ -371,11 +401,12 @@ def _pdf_page_geometry(pdf_bytes: bytes) -> List[Dict[str, float]]:
 def _page_geom(pages: List[Dict[str, float]], page_number: int) -> Dict[str, float]:
    if 1 <= page_number <= len(pages):
        return pages[page_number - 1]
+    _fallback_h = float(math.ceil(CANVAS_PAGE_WIDTH * 842.0 / 595.0))
    return {
        "media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0,
        "page_pt_w": 595.0, "page_pt_h": 842.0,
-        "rendered_w": 595.0, "rendered_h": 842.0,
-        "page_top": (page_number - 1) * 842.0,
+        "rendered_w": CANVAS_PAGE_WIDTH, "rendered_h": _fallback_h,
+        "page_top": (page_number - 1) * _fallback_h,
    }


@ -384,12 +415,16 @@ def _box_to_canvas(box: Optional[Dict[str, Any]], page_number: int, pages: List[
        return None
    g = _page_geom(pages, page_number)
    if box.get("coord_origin") == "TOPLEFT" and {"x", "y", "w", "h"}.issubset(box):
-        scale = 0.5 if box.get("unit") == "px" else 1.0
+        # Scale the box into the 780-wide canvas space. Boxes with unit == "px" (opencv/gemma regions)
+        # are in rendered-image px at REGIONS_PX_PER_PT px/point; all other TOPLEFT boxes are in PDF points.
+        px_per_pt = REGIONS_PX_PER_PT if box.get("unit") == "px" else 1.0
+        sx = g["rendered_w"] / (g["page_pt_w"] * px_per_pt)
+        sy = g["rendered_h"] / (g["page_pt_h"] * px_per_pt)
        return {
-            "x": round(float(box["x"]) * scale, 2),
-            "y": round(g["page_top"] + float(box["y"]) * scale, 2),
-            "w": round(float(box["w"]) * scale, 2),
-            "h": round(float(box["h"]) * scale, 2),
+            "x": round(float(box["x"]) * sx, 2),
+            "y": round(g["page_top"] + float(box["y"]) * sy, 2),
+            "w": round(float(box["w"]) * sx, 2),
+            "h": round(float(box["h"]) * sy, 2),
        }
    if not {"l", "t", "r", "b"}.issubset(box):
        return None
@ -494,6 +529,12 @@ def _map_first_pass_to_rows(template_id: str, first_pass: Dict[str, Any], pdf_by
                questions.append({"id": parent_id, "template_id": template_id, "label": parent_label, "order": len(q_ids) - 1, "max_marks": 0, "is_container": True, "source": "ai", "confirmed": False, "confidence": 0.7, "derivation": "docling-inferred-main-question"})
            pid = _ai_id(template_id, "part", label)
            first_part_by_page.setdefault(page_index, pid)
+            # B1 live-route papers can carry continuation bands for the same part label
+            # on later pages. The UUID is intentionally stable per template+part label,
+            # so only insert the first question row; later continuations still map
+            # response/context regions through first_part_by_page.
+            if any(q["id"] == pid for q in questions):
+                continue
            bounds = None
            y1, y2 = band.get("y_start"), band.get("y_end")
            if margins["left"] is not None and margins["right"] is not None and y1 is not None and y2 is not None:
@ -521,15 +562,47 @@ def _map_first_pass_to_rows(template_id: str, first_pass: Dict[str, Any], pdf_by
            response_form = _response_form_from_region_type(region.get("region_type"))
            if response_form:
                response_areas.append({"id": _ai_id(template_id, "region", page_index, idx), "template_id": template_id, "question_id": first_part_by_page.get(page_index, default_qid), "page": page_index + 1, "bounds": bounds, "kind": "response", "response_form": response_form, "source": "ai", "confirmed": False, "confidence": _safe_confidence(region.get("confidence")), "derivation": region.get("detection_method") or "opencv-response-region"})
+    # Integrity guard: every response_area/boundary question_id must reference an inserted question
+    # (FK exam_response_areas/exam_boundaries -> exam_questions). On papers where band detection yields
+    # few/no questions but opencv/gemma still emit regions, those regions point at the synthetic
+    # default_qid which was never inserted. Ensure that fallback container question exists and reattach
+    # any orphan child rows to it, so persistence can't violate the FK.
+    qid_set = {q["id"] for q in questions}
+    orphans = [r for r in (response_areas + boundaries) if r.get("question_id") not in qid_set]
+    if orphans:
+        if default_qid not in qid_set:
+            questions.insert(0, {"id": default_qid, "template_id": template_id, "label": "Unassigned",
+                                 "order": 0, "max_marks": 0, "is_container": True, "source": "ai",
+                                 "confirmed": False, "confidence": 0.5,
+                                 "derivation": "auto-map-fallback-container"})
+            qid_set.add(default_qid)
+        for r in orphans:
+            r["question_id"] = default_qid
+
    return {"questions": questions, "response_areas": response_areas, "boundaries": boundaries, "layout": layout}


+def _dedupe_rows_by_id(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Preserve first occurrence of stable AI row ids emitted by noisy OCR detectors."""
+    out: List[Dict[str, Any]] = []
+    seen: set[str] = set()
+    for row in rows:
+        row_id = row.get("id")
+        if row_id:
+            key = str(row_id)
+            if key in seen:
+                continue
+            seen.add(key)
+        out.append(row)
+    return out
+
+
 def _refresh_ai_rows(ctx: ExamContext, template_id: str, rows: Dict[str, List[Dict[str, Any]]]) -> None:
    sb = ctx.supabase
    for table in ("exam_response_areas", "exam_boundaries", "exam_template_layout", "exam_questions"):
        sb.table(table).delete().eq("template_id", template_id).eq("source", "ai").eq("confirmed", False).execute()
    for table, key in (("exam_questions", "questions"), ("exam_response_areas", "response_areas"), ("exam_boundaries", "boundaries"), ("exam_template_layout", "layout")):
-        payload = rows.get(key) or []
+        payload = _dedupe_rows_by_id(rows.get(key) or [])
        if payload:
            sb.table(table).insert(payload).execute()

@ -611,12 +684,13 @@ async def create_template(


@router.get("/catalogue")
-async def list_catalogue_papers() -> Dict[str, Any]:
-    """Lightweight exam-board paper catalogue for the create dialog."""
+async def list_catalogue_papers(
+    ctx: ExamContext = Depends(get_exam_context),
+) -> Dict[str, Any]:
+    """Lightweight authenticated exam-board metadata catalogue for the create dialog."""
    try:
-        sb = SupabaseServiceRoleClient().supabase
        res = (
-            sb.table("eb_exams")
+            ctx.supabase.table("eb_exams")
            .select("id, exam_code, spec_code, paper_code, tier, session, type_code, storage_loc")
            .eq("type_code", "QP")
            .order("exam_code")
@ -627,6 +701,50 @@ async def list_catalogue_papers() -> Dict[str, Any]:
        raise HTTPException(status_code=502, detail=f"Could not load catalogue papers: {exc}")


+@router.get("/catalogue/{exam_id}/signed-url")
+async def get_catalogue_paper_signed_url(
+    exam_id: str,
+    expires_in: int = 300,
+    ctx: ExamContext = Depends(get_exam_context),
+) -> Dict[str, Any]:
+    """Return a short-lived signed URL for an authenticated user's catalogue PDF access.
+
+    The storage operation uses service role as a scoped backend exception for signing only;
+    raw cc.examboards object reads remain denied by storage.objects RLS.
+    """
+    expires_in = max(60, min(int(expires_in or 300), 3600))
+    try:
+        row = _first(
+            ctx.supabase.table("eb_exams")
+            .select("id, exam_code, storage_loc")
+            .eq("id", exam_id)
+            .eq("type_code", "QP")
+            .limit(1)
+            .execute()
+        )
+        if not row or not row.get("storage_loc"):
+            raise HTTPException(status_code=404, detail="Catalogue paper not found")
+        try:
+            bucket, path = _parse_storage_loc(row["storage_loc"])
+        except ValueError:
+            raise HTTPException(status_code=404, detail="Catalogue paper not found")
+        if bucket != "cc.examboards":
+            raise HTTPException(status_code=404, detail="Catalogue paper not found")
+        signed_url = _signed_url_value(StorageAdmin().create_signed_url(bucket, path, expires_in))
+        return {
+            "exam_id": row["id"],
+            "exam_code": row.get("exam_code"),
+            "bucket": bucket,
+            "path": path,
+            "expires_in": expires_in,
+            "signed_url": signed_url,
+        }
+    except HTTPException:
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=502, detail=f"Could not sign catalogue paper URL: {exc}")
+
+
@router.get("/templates")
 async def list_templates(
    include_archived: bool = False,
--- a/routers/simple_upload.py
+++ b/routers/simple_upload.py
@ -26,6 +26,7 @@ from fastapi.responses import JSONResponse
 from modules.auth.supabase_bearer import SupabaseBearer
 from modules.database.supabase.utils.client import SupabaseServiceRoleClient
 from modules.database.supabase.utils.storage import StorageAdmin
+from modules.upload_validation import read_upload_bytes
 from modules.logger_tool import initialise_logger

 router = APIRouter()
@ -59,10 +60,9 @@ async def upload_single_file(
        if not user_id:
            raise HTTPException(status_code=401, detail="User ID required")
        
-        # Read file content
-        file_bytes = await file.read()
+        # Validate MIME/type and read file content with a hard size limit.
+        file_bytes, mime_type = await read_upload_bytes(file)
        file_size = len(file_bytes)
-        mime_type = file.content_type or 'application/octet-stream'
        filename = file.filename or path
        
        logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}")
@ -234,10 +234,9 @@ async def upload_directory(
            # Process each file
            for i, (file, relative_path) in enumerate(zip(files, relative_paths)):
                try:
-                    # Read file content
-                    file_bytes = await file.read()
+                    # Validate MIME/type and read file content with a hard size limit.
+                    file_bytes, mime_type = await read_upload_bytes(file)
                    file_size = len(file_bytes)
-                    mime_type = file.content_type or 'application/octet-stream'
                    filename = file.filename or f"file_{i}"
                    
                    total_size += file_size
@ -291,6 +290,8 @@ async def upload_directory(
                    
                    logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}")
                    
+                except HTTPException:
+                    raise
                except Exception as e:
                    logger.error(f"Failed to upload file {relative_path}: {e}")
                    # Continue with other files, don't fail entire upload
--- a/run/initialization/reset_environment.py
+++ b/run/initialization/reset_environment.py
@ -5,6 +5,7 @@ Clears:
  - Neo4j: drops ALL databases except system, neo4j (including gaisdata, cc.users.*, cc.institutes.*)
  - Supabase: deletes ALL data tables except gais_local_authorities and gais_schools
  - Supabase: deletes all auth users except kcar, then re-seeds kcar profile state
+  - Granular scopes can clear exam corpus, timetable data, or --user-subset seed copies

 Safe invariants (never touched):
  - kcar auth account
@ -82,8 +83,11 @@ SUPABASE_TABLES_TO_CLEAR = [
    "admin_profiles",
 ]

-# Exam subsystem tables, FK child-first. NOT in the list above — the previous full reset()
-# never cleared exam data or storage at all; the granular scopes below fold it in.
+# Exam-marker subsystem tables, FK child-first. scope="exam-corpus" is deliberately
+# broader than "public papers": it wipes public corpus eb_* rows, templates, layouts,
+# questions, boundaries, response areas, marking batches, student submissions, and mark
+# entries. NOT in the list above — the previous full reset() never cleared exam data
+# or storage at all; the granular scopes below fold it in.
 EXAM_CORPUS_TABLES = [
    "mark_entries",
    "student_submissions",
@ -114,7 +118,8 @@ TIMETABLE_TABLES = [
    "planned_lessons",
 ]

-# Buckets whose objects the exam-corpus reset clears (Storage API — protect_delete blocks raw SQL).
+# Bucket whose objects scope="exam-corpus" clears for the whole exam-marker subsystem
+# (Storage API — protect_delete blocks raw SQL).
 EXAM_STORAGE_BUCKET = "cc.examboards"


@ -129,6 +134,28 @@ def _sb_headers():
    }


+# Markers that identify a production Supabase target. Destructive reset against any of these is
+# refused by default (project rule: ".94 only; .156 human-gated") — set RESET_ALLOW_PROD=1 to override.
+PROD_TARGET_MARKERS = ("192.168.0.156", "supabase.classroomcopilot")
+
+
+def _assert_reset_allowed(url: str, scope: str) -> None:
+    """Default-deny destructive reset against a production-looking Supabase target.
+
+    The /admin/reset route and this module both act on os.environ['SUPABASE_URL']; without this guard
+    a platform-admin call on a prod-deployed API would wipe prod data + exam corpus + storage. We refuse
+    when the target matches a known prod marker unless an explicit RESET_ALLOW_PROD opt-in is set.
+    """
+    target = (url or "").lower()
+    looks_prod = any(m in target for m in PROD_TARGET_MARKERS)
+    override = os.environ.get("RESET_ALLOW_PROD", "").strip().lower() in ("1", "true", "yes")
+    if looks_prod and not override:
+        raise RuntimeError(
+            f"refusing destructive reset (scope={scope}) against production-looking target {target!r}; "
+            f"this is human-gated — set RESET_ALLOW_PROD=1 to override."
+        )
+
+
 # ─── Neo4j helpers ────────────────────────────────────────────────────────────

 def _neo4j_drop_all_non_system() -> Dict[str, List[str]]:
@ -195,8 +222,13 @@ def _clear_tables(url: str, headers: dict, tables: List[str]) -> "tuple[List[str


 def _clear_exam_storage() -> Dict[str, Any]:
-    """Remove cc.examboards objects via the Storage API (protect_delete blocks raw SQL deletes).
-    Gathers storage_loc from eb_exams/eb_specifications BEFORE the rows are cleared."""
+    """Remove cc.examboards objects for the exam-marker subsystem.
+
+    scope="exam-corpus" is not limited to public-paper metadata: it also removes the
+    storage objects that back exam board corpus files and any downstream exam-marker
+    artifacts referenced from eb_exams/eb_specifications. Gathers storage_loc from
+    eb_exams/eb_specifications BEFORE the rows are cleared.
+    """
    try:
        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
        from modules.database.supabase.utils.storage import StorageAdmin
@ -230,31 +262,75 @@ def _clear_exam_storage() -> Dict[str, Any]:
    return {"removed": removed, "buckets": list(by_bucket)}


+def _clear_user_subset_files() -> Dict[str, Any]:
+    """Remove files rows and cc.users storage objects created by --user-subset seeding.
+
+    Reuses the seed/unseed implementation so reset(scope="user-subset") has the
+    same storage-before-row deletion order and idempotency guarantees as
+    seed_exam_corpus.py --unseed. The helper only targets rows marked by the seeder:
+    bucket='cc.users', source='exam-corpus-seed', path LIKE 'exam-marker/%'.
+    """
+    try:
+        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
+        from modules.database.supabase.utils.storage import StorageAdmin
+        from run.initialization.seed_exam_corpus import LoadReport, _delete_user_subset_files
+    except Exception as exc:
+        logger.warning(f"  user-subset clear skipped (import): {exc}")
+        return {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": [str(exc)]}
+
+    rep = LoadReport()
+    _delete_user_subset_files(
+        SupabaseServiceRoleClient(),
+        StorageAdmin(),
+        exam_codes=None,
+        rep=rep,
+    )
+    return {
+        "files_rows_deleted": rep.unseed_user_files,
+        "storage_objects_removed": rep.unseed_objects,
+        "errors": rep.errors,
+    }
+
+
 # ─── Main reset ───────────────────────────────────────────────────────────────

 def reset(scope: str = "all") -> Dict[str, Any]:
-    """Destructive reset. scope ∈ {all, exam-corpus, timetable}.
+    """Destructive reset. scope ∈ {all, exam-corpus, timetable, user-subset}.

-    - all          : full wipe (Neo4j + Supabase data + auth users) AND the exam subsystem + storage.
-    - exam-corpus  : ONLY eb_*/exam_* tables + cc.examboards storage objects (load/unload the corpus).
+    - all          : full wipe (Neo4j + Supabase data + auth users) AND the entire
+                     exam-marker subsystem listed below, including --user-subset copies.
+    - exam-corpus  : ONLY the exam-marker subsystem, but all of it (not just public papers):
+                     public corpus/eb_* data, cc.examboards storage objects, exam
+                     templates, template layouts, questions, boundaries, response
+                     areas, marking batches, student submissions, mark entries, and
+                     --user-subset cc.users copies.
    - timetable    : ONLY timetable/calendar materialization tables.
+    - user-subset  : ONLY files rows and cc.users storage objects created by
+                     seed_exam_corpus.py --user-subset.
    """
    scope = (scope or "all").lower()
-    if scope not in ("all", "exam-corpus", "timetable"):
-        raise ValueError(f"invalid scope {scope!r} (want all|exam-corpus|timetable)")
+    if scope not in ("all", "exam-corpus", "timetable", "user-subset"):
+        raise ValueError(f"invalid scope {scope!r} (want all|exam-corpus|timetable|user-subset)")
    url, headers = _sb_headers()
+    _assert_reset_allowed(url, scope)

    if scope == "exam-corpus":
-        logger.info("RESET (scope=exam-corpus) — exam tables + cc.examboards storage")
+        logger.info("RESET (scope=exam-corpus) — entire exam-marker subsystem: public corpus/eb_* data, cc.examboards storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries, and --user-subset copies")
+        user_subset = _clear_user_subset_files()
        storage = _clear_exam_storage()
        cleared, failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
-        return {"scope": scope, "exam_storage": storage, "tables_cleared": cleared, "tables_failed": failed}
+        return {"scope": scope, "user_subset": user_subset, "exam_storage": storage, "tables_cleared": cleared, "tables_failed": failed}

    if scope == "timetable":
        logger.info("RESET (scope=timetable) — timetable/calendar tables")
        cleared, failed = _clear_tables(url, headers, TIMETABLE_TABLES)
        return {"scope": scope, "tables_cleared": cleared, "tables_failed": failed}

+    if scope == "user-subset":
+        logger.info("RESET (scope=user-subset) — --user-subset cc.users storage objects and files rows")
+        user_subset = _clear_user_subset_files()
+        return {"scope": scope, "user_subset": user_subset}
+
    logger.info("=" * 60)
    logger.info("RESET ENVIRONMENT — full destructive wipe starting")
    logger.info("=" * 60)
@ -267,6 +343,9 @@ def reset(scope: str = "all") -> Dict[str, Any]:
    results["neo4j"] = {"dropped": dropped}

    # ── 2. Supabase: clear all data tables (GAIS preserved) ──────────────────
+    # First remove --user-subset cc.users storage objects (+ their files rows) via the
+    # Storage API, so the generic files-table clear below doesn't strand orphaned objects.
+    results["user_subset"] = _clear_user_subset_files()
    logger.info("\n[Supabase] Clearing data tables (preserving gais_*)...")
    url, headers = _sb_headers()
    cleared, failed = [], []
@ -319,9 +398,12 @@ def reset(scope: str = "all") -> Dict[str, Any]:
    )
    logger.info("  kcar → admin_profiles restored ✓")

-    # ── 5. Exam subsystem: storage objects (Storage API) + exam tables ───────────
-    # (The legacy full reset cleared neither exam tables nor storage — folded in here.)
-    logger.info("\n[Supabase] Clearing exam subsystem (storage + eb_*/exam_* tables)...")
+    # ── 5. Exam-marker subsystem: storage objects (Storage API) + all exam tables ──
+    # This is the same destructive surface as scope="exam-corpus": public corpus/eb_*
+    # rows, cc.examboards storage, templates/layout/questions/boundaries/response
+    # areas, marking batches, submissions, and mark entries. (The legacy full reset
+    # cleared neither exam tables nor storage — folded in here.)
+    logger.info("\n[Supabase] Clearing entire exam-marker subsystem (public corpus, storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries)...")
    exam_storage = _clear_exam_storage()
    exam_cleared, exam_failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)

--- a/run/initialization/seed_exam_corpus.py
+++ b/run/initialization/seed_exam_corpus.py
@ -105,6 +105,7 @@ class LoadReport:
    downloaded: int = 0
    download_cached: int = 0
    unseed_objects: int = 0
+    unseed_user_files: int = 0
    unseed_exams: int = 0
    unseed_specs: int = 0
    unseed_templates: int = 0
@ -117,6 +118,7 @@ class LoadReport:
            "downloaded": self.downloaded,
            "download_cached": self.download_cached,
            "unseed_objects": self.unseed_objects,
+            "unseed_user_files": self.unseed_user_files,
            "unseed_exams": self.unseed_exams,
            "unseed_specs": self.unseed_specs,
            "unseed_templates": self.unseed_templates,
@ -579,6 +581,84 @@ def _chunks(seq: List[Any], n: int = 100):
    for i in range(0, len(seq), n):
        yield seq[i:i + n]

+def _storage_remove(storage: StorageAdmin, bucket: str, paths: List[str]) -> None:
+    """Remove object paths from a bucket through the Supabase Storage API.
+
+    The python client treats missing objects as a successful no-op, which is useful for
+    unseed idempotency. Any API/permission failure is raised so callers can avoid
+    deleting the matching DB rows while storage may still exist.
+    """
+    result = storage.client.supabase.storage.from_(bucket).remove(paths)
+    error = getattr(result, "error", None)
+    if error:
+        raise StorageError(str(error))
+    if isinstance(result, dict) and result.get("error"):
+        raise StorageError(str(result["error"]))
+
+def _delete_user_subset_files(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
+                              exam_codes: Optional[List[str]], rep: LoadReport) -> None:
+    """Delete --user-subset files from cc.users storage, then their files rows.
+
+    User-subset seeding writes rows with source='exam-corpus-seed', bucket='cc.users',
+    and paths under exam-marker/. Storage must be removed before the files rows: the
+    files GC trigger also tries to delete storage when rows are deleted, so removing
+    objects first avoids trigger failures and keeps this operation idempotent.
+
+    exam_codes=None removes every user-subset seed row (used by unscoped unseed, even if a
+    prior partial run already removed the eb_* rows); an empty list removes nothing.
+    """
+    sb = client.supabase
+    seeded_files: List[Dict[str, Any]] = []
+
+    def _base_query():
+        return sb.table("files").select("id, bucket, path, name, source") \
+            .eq("bucket", "cc.users").eq("source", "exam-corpus-seed") \
+            .like("path", "exam-marker/%")
+
+    if exam_codes is None:
+        seeded_files.extend(getattr(_base_query().execute(), "data", None) or [])
+    elif exam_codes:
+        for chunk in _chunks([f"{code}.pdf" for code in exam_codes if code], 100):
+            seeded_files.extend(getattr(_base_query().in_("name", chunk).execute(), "data", None) or [])
+
+    rows_by_id: Dict[str, Dict[str, Any]] = {}
+    paths_by_bucket: Dict[str, List[str]] = {}
+    seen_paths: set = set()
+    for row in seeded_files:
+        row_id = row.get("id")
+        bucket = row.get("bucket")
+        path = row.get("path")
+        if row_id:
+            rows_by_id[str(row_id)] = row
+        if bucket == "cc.users" and isinstance(path, str) and path.startswith("exam-marker/"):
+            key = (bucket, path)
+            if key not in seen_paths:
+                seen_paths.add(key)
+                paths_by_bucket.setdefault(bucket, []).append(path)
+
+    removable_ids = list(rows_by_id)
+    if not removable_ids and not paths_by_bucket:
+        logger.info("[unseed] no user-subset cc.users files to remove")
+        return
+
+    for bkt, paths in paths_by_bucket.items():
+        for chunk in _chunks(paths, 100):
+            try:
+                _storage_remove(storage, bkt, chunk)
+                rep.unseed_objects += len(chunk)
+            except Exception as exc:
+                logger.warning(f"[unseed] user-subset storage remove failed ({bkt}, {len(chunk)} objs): {exc}")
+                rep.errors.append(f"user-subset storage remove {bkt}: {exc}")
+                return
+
+    for chunk in _chunks(removable_ids, 100):
+        try:
+            sb.table("files").delete().in_("id", chunk).execute()
+            rep.unseed_user_files += len(chunk)
+        except Exception as exc:
+            logger.warning(f"[unseed] user-subset files delete failed: {exc}")
+            rep.errors.append(f"user-subset files delete: {exc}")
+
 def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
           board_filter: Optional[str], spec_filter: Optional[str],
           drop_specs: bool = True, drop_seed_templates: bool = True, rep: LoadReport) -> None:
@ -597,6 +677,8 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
    specs = getattr(q.execute(), "data", None) or []
    spec_codes = [s["spec_code"] for s in specs]
    if not spec_codes:
+        if not board_filter and not spec_filter:
+            _delete_user_subset_files(client, storage, exam_codes=None, rep=rep)
        logger.info("[unseed] no matching specifications; nothing to do")
        return

@ -605,7 +687,14 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
        res = sb.table("eb_exams").select("id, exam_code, storage_loc").in_("spec_code", chunk).execute()
        exams.extend(getattr(res, "data", None) or [])

-    # 1) Storage objects (Storage API; batch-remove per bucket). Specs may carry a spec PDF too.
+    # 1) User-subset storage/rows. Storage is removed before files rows so trg_files_gc has
+    # nothing left to collect when rows are deleted.
+    user_subset_exam_codes = None if not board_filter and not spec_filter else [
+        e.get("exam_code") for e in exams if e.get("exam_code")
+    ]
+    _delete_user_subset_files(client, storage, exam_codes=user_subset_exam_codes, rep=rep)
+
+    # 2) Storage objects (Storage API; batch-remove per bucket). Specs may carry a spec PDF too.
    by_bucket: Dict[str, List[str]] = {}
    for row in exams + specs:
        loc = row.get("storage_loc")
@ -621,7 +710,7 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
            except Exception as exc:
                logger.warning(f"[unseed] storage remove failed ({bkt}, {len(chunk)} objs): {exc}")

-    # 2) First-sweep templates created by the seed (cascades questions/regions/boundaries/layout).
+    # 3) First-sweep templates created by the seed (cascades questions/regions/boundaries/layout).
    if drop_seed_templates and exams:
        exam_codes = [e["exam_code"] for e in exams if e.get("exam_code")]
        for chunk in _chunks(exam_codes, 100):
@ -632,7 +721,7 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
            except Exception as exc:
                logger.warning(f"[unseed] template delete failed: {exc}")

-    # 3) Catalogue rows: eb_exams (by id), then eb_specifications (by spec_code).
+    # 4) Catalogue rows: eb_exams (by id), then eb_specifications (by spec_code).
    exam_ids = [e["id"] for e in exams]
    for chunk in _chunks(exam_ids, 100):
        try:
@ -648,8 +737,8 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
            except Exception as exc:
                logger.warning(f"[unseed] eb_specifications delete failed: {exc}")

-    logger.info(f"unseed done: storage_objects={rep.unseed_objects} templates={rep.unseed_templates} "
-                f"exams={rep.unseed_exams} specs={rep.unseed_specs}")
+    logger.info(f"unseed done: storage_objects={rep.unseed_objects} user_files={rep.unseed_user_files} "
+                f"templates={rep.unseed_templates} exams={rep.unseed_exams} specs={rep.unseed_specs}")


 # ─────────────────────────────── orchestration ───────────────────────────────
--- a/tests/test_docling_extract.py
+++ b/tests/test_docling_extract.py
@ -0,0 +1,81 @@
+from api.services.docling.extract import aqa_questions_rapid
+
+
+def _text(raw, page, l, t, r=120, b=None):
+    return {
+        "text": raw,
+        "prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
+    }
+
+
+def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
+    (tmp_path / "p1.json").write_text(
+        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
+    )
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "02.3" in parts
+
+
+def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
+    (tmp_path / "p1.json").write_text(
+        '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}'
+    )
+    (tmp_path / "p2.json").write_text(
+        '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}'
+    )
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert parts["07.1"]["page"] == 2
+    assert parts["07.1"]["bbox"]["l"] == 49
+    assert "07.2" in parts
+
+
+def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
+    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
+    for idx, n in enumerate(["07", "08", "11", "12", "13"]):
+        texts.append(_text(n + " Which statement is correct?", 1, 49, 780 - idx * 40))
+    import json
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    for label in ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]:
+        assert label in parts
+
+
+def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
+    import json
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": [_text("01.③", 1, 49, 515, r=102, b=500)]}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "01.3" in parts
+
+
+def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
+    import json
+    texts = [
+        _text("05.2 Some question text", 1, 49, 700),
+        _text("05.3 Middle question text", 1, 49, 620),
+        _text("05.5 Later question text", 2, 49, 740),
+    ]
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts[:2]}))
+    (tmp_path / "p2.json").write_text(json.dumps({"texts": texts[2:]}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "05.1" in parts
+    assert "05.4" in parts
+    assert "05.5" in parts
+
+
+def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
+    import json
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": [_text("03", 1, 49, 775)]}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "03.0" in parts
--- a/tests/test_docling_regions.py
+++ b/tests/test_docling_regions.py
@ -2,6 +2,7 @@ from __future__ import annotations

 from PIL import Image, ImageDraw

+from api.services.docling import extract
 from api.services.docling.regions import detect_response_regions_from_image


@ -37,3 +38,46 @@ def test_detects_answer_box() -> None:
    assert boxes
    assert boxes[0]["bbox"]["w"] > 600
    assert boxes[0]["bbox"]["h"] > 200
+
+
+def test_detect_response_region_taxonomy_for_lines_and_boxes():
+    img = Image.new("RGB", (800, 1000), "white")
+    draw = ImageDraw.Draw(img)
+    for y in (220, 260, 300):
+        draw.line((120, y, 680, y), fill="black", width=2)
+    draw.rectangle((140, 520, 660, 640), outline="black", width=3)
+
+    regions = detect_response_regions_from_image(img, min_confidence=0.1)
+    types = {r.region_type for r in regions}
+
+    assert "answer_lines" in types
+    assert "answer_box" in types
+
+
+def test_attach_detected_response_regions_normalizes_taxonomy_and_uses_pdf_y(monkeypatch, tmp_path):
+    pdf = tmp_path / "paper.pdf"
+    pdf.write_bytes(b"%PDF test placeholder")
+    parts = {
+        "01.1": {"q": "01", "page": 1, "bbox": {"l": 50, "r": 80, "t": 700, "b": 680}, "regions": []},
+        "01.2": {"q": "01", "page": 1, "bbox": {"l": 50, "r": 80, "t": 500, "b": 480}, "regions": []},
+    }
+
+    def fake_detect(path, min_confidence=0.32):
+        return [{
+            "page_index": 0,
+            "region_type": "answer-box",
+            "confidence": 0.77,
+            "bbox": {"x": 100, "y": 335, "w": 500, "h": 40},
+            "detection_method": "test",
+            "meta": {"page_height_px": 1000, "page_height_pdf": 800},
+        }]
+
+    monkeypatch.setattr(extract.region_mod, "detect_response_regions_from_pdf", fake_detect)
+
+    attached, candidates = extract.attach_detected_response_regions(parts, str(pdf))
+
+    assert attached == 1
+    assert len(candidates) == 1
+    assert parts["01.1"]["regions"] == []
+    assert parts["01.2"]["regions"][0]["type"] == "answer_box"
+    assert parts["01.2"]["regions"][0]["source"] == "opencv"
--- a/tests/test_exam_templates.py
+++ b/tests/test_exam_templates.py
@ -143,6 +143,9 @@ class _FakeStorageAdmin:
    def download_file(self, bucket_id, file_path):
        return b"%PDF-1.7 fake"

+    def create_signed_url(self, bucket_id, file_path, expires_in=3600):
+        return {"signedURL": f"https://storage.test/{bucket_id}/{file_path}?token=fake&expires_in={expires_in}"}
+

 class _FakeServiceRoleClient:
    def __init__(self, store):
@ -171,6 +174,65 @@ def test_requires_auth_when_not_overridden():
    assert resp.status_code in (401, 403)  # unauthenticated, not processed


+def test_catalogue_requires_auth_when_not_overridden():
+    app = FastAPI()
+    app.include_router(router, prefix="/api/exam")
+    resp = TestClient(app).get("/api/exam/catalogue")
+    assert resp.status_code in (401, 403)
+
+
+def test_list_catalogue_papers_uses_as_user_metadata():
+    store = {
+        "eb_exams": [
+            {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.examboards/aqa/p.pdf"},
+            {"id": "e2", "exam_code": "AQA-MS", "type_code": "MS", "storage_loc": "cc.examboards/aqa/ms.pdf"},
+        ]
+    }
+    client, _ = make_client(store=store)
+    resp = client.get("/api/exam/catalogue")
+    assert resp.status_code == 200
+    assert [p["id"] for p in resp.json()["papers"]] == ["e1"]
+
+
+def test_catalogue_signed_url_requires_auth_and_signs_examboard_pdf(monkeypatch):
+    store = {
+        "eb_exams": [
+            {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.examboards/aqa/physics/qp.pdf"},
+        ]
+    }
+    client, _ = make_client(store=store)
+    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)
+    resp = client.get("/api/exam/catalogue/e1/signed-url?expires_in=120")
+    assert resp.status_code == 200
+    body = resp.json()
+    assert body["bucket"] == "cc.examboards"
+    assert body["path"] == "aqa/physics/qp.pdf"
+    assert body["expires_in"] == 120
+    assert "token=fake" in body["signed_url"]
+
+
+def test_catalogue_signed_url_rejects_non_examboard_storage(monkeypatch):
+    store = {
+        "eb_exams": [
+            {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.public/aqa/physics/qp.pdf"},
+        ]
+    }
+    client, _ = make_client(store=store)
+    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)
+    assert client.get("/api/exam/catalogue/e1/signed-url").status_code == 404
+
+
+def test_catalogue_signed_url_rejects_non_catalogue_doc_type(monkeypatch):
+    store = {
+        "eb_exams": [
+            {"id": "e1", "exam_code": "AQA-MS", "type_code": "MS", "storage_loc": "cc.examboards/aqa/physics/ms.pdf"},
+        ]
+    }
+    client, _ = make_client(store=store)
+    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)
+    assert client.get("/api/exam/catalogue/e1/signed-url").status_code == 404
+
+
 def test_create_template_sets_owner_and_institute():
    client, store = make_client()
    resp = client.post("/api/exam/templates", json={"title": "AQA Physics 1H", "subject": "Physics"})
@ -533,6 +595,27 @@ def test_box_to_canvas_uses_cropbox_as_page_origin():
    assert templates_mod._box_to_canvas(box, 1, pages) == {"x": 0.0, "y": 25.0, "w": 80.0, "h": 40.0}


+def test_auto_map_deduplicates_continued_part_labels(monkeypatch):
+    monkeypatch.setattr(templates_mod, "_pdf_page_geometry", lambda _pdf: [
+        {"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 0.0},
+        {"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 800.0},
+    ])
+    first_pass = _first_pass_template()
+    first_pass["meta"]["n_pages"] = 2
+    first_pass["pages"]["2"] = {
+        "role": "question", "role_source": "auto", "margins_enabled": True,
+        "main_bands": [],
+        "part_bands": [{"label": "01.1", "question": "01", "y_start": 760, "y_end": 600, "label_box": {"l": 50, "t": 760, "r": 90, "b": 740, "coord_origin": "BOTTOMLEFT"}, "source": "auto", "confirmed": False}],
+        "furniture": [], "figures": [], "tables": [],
+    }
+
+    rows = templates_mod._map_first_pass_to_rows("t1", first_pass, b"%PDF", [])
+
+    question_ids = [q["id"] for q in rows["questions"]]
+    assert len(question_ids) == len(set(question_ids))
+    assert [q["label"] for q in rows["questions"]].count("01.1") == 1
+
+
 def test_response_region_types_are_mapped_to_response_form_enum(monkeypatch):
    monkeypatch.setattr(templates_mod, "_pdf_page_geometry", lambda _pdf: [{"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 0.0}])
    first_pass = _first_pass_template()
@ -559,6 +642,20 @@ def test_auto_map_fast_path_merges_ai_rows_and_returns_detail(monkeypatch):
    assert store["exam_boundaries"] and store["exam_boundaries"][0]["derivation"] == "docling-main-band"


+def test_auto_map_deduplicates_repeated_response_area_ids(monkeypatch):
+    store = _template_with_source()
+    client, store = make_client(store=store)
+    _patch_auto_map(monkeypatch, store, fast=True)
+    dup = {"page_index": 0, "bbox": {"l": 50, "t": 700, "r": 100, "b": 680, "coord_origin": "BOTTOMLEFT"}, "region_type": "answer_lines", "confidence": 0.9}
+    monkeypatch.setattr(templates_mod, "detect_response_regions_from_pdf", lambda *_a, **_k: [dup, dict(dup)])
+
+    resp = client.post("/api/exam/templates/t1/auto-map")
+
+    assert resp.status_code == 200
+    response_area_ids = [r["id"] for r in store["exam_response_areas"]]
+    assert len(response_area_ids) == len(set(response_area_ids))
+
+
 def test_auto_map_preserves_manual_and_confirmed_rows_on_rerun(monkeypatch):
    store = _template_with_source()
    store.update({
--- a/tests/test_files_idor.py
+++ b/tests/test_files_idor.py
@ -0,0 +1,103 @@
+from types import SimpleNamespace
+
+import pytest
+
+import routers.database.files.files as files_router
+import routers.database.files.files_simplified as files_simplified_router
+
+
+ROUTERS = [files_router, files_simplified_router]
+
+USER_A = "00000000-0000-0000-0000-000000000001"
+USER_B = "00000000-0000-0000-0000-000000000002"
+CAB_A = "10000000-0000-0000-0000-000000000001"
+CAB_B = "10000000-0000-0000-0000-000000000002"
+
+
+class FakeQuery:
+    def __init__(self, rows):
+        self.rows = list(rows)
+
+    def select(self, *_args, **_kwargs):
+        return self
+
+    def eq(self, key, value):
+        self.rows = [row for row in self.rows if row.get(key) == value]
+        return self
+
+    def limit(self, _n):
+        return self
+
+    def execute(self):
+        return SimpleNamespace(data=self.rows)
+
+
+class FakeSupabase:
+    def __init__(self, store):
+        self.store = store
+
+    def table(self, name):
+        return FakeQuery(self.store.get(name, []))
+
+
+class FakeServiceRoleClient:
+    def __init__(self, store):
+        self.supabase = FakeSupabase(store)
+
+
+@pytest.mark.parametrize("router_module", ROUTERS)
+def test_list_files_hides_unowned_unshared_cabinet(monkeypatch, router_module):
+    store = {
+        "file_cabinets": [
+            {"id": CAB_A, "user_id": USER_A},
+            {"id": CAB_B, "user_id": USER_B},
+        ],
+        "cabinet_memberships": [],
+        "files": [
+            {"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A},
+            {"id": "file-b", "cabinet_id": CAB_B, "uploaded_by": USER_B},
+        ],
+    }
+    monkeypatch.setattr(
+        router_module,
+        "SupabaseServiceRoleClient",
+        lambda: FakeServiceRoleClient(store),
+    )
+
+    assert router_module.list_files(CAB_B, {"sub": USER_A}) == []
+
+
+@pytest.mark.parametrize("router_module", ROUTERS)
+def test_list_files_allows_own_cabinet(monkeypatch, router_module):
+    store = {
+        "file_cabinets": [{"id": CAB_A, "user_id": USER_A}],
+        "cabinet_memberships": [],
+        "files": [{"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A}],
+    }
+    monkeypatch.setattr(
+        router_module,
+        "SupabaseServiceRoleClient",
+        lambda: FakeServiceRoleClient(store),
+    )
+
+    assert router_module.list_files(CAB_A, {"sub": USER_A}) == [
+        {"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A}
+    ]
+
+
+@pytest.mark.parametrize("router_module", ROUTERS)
+def test_list_files_denies_non_owner_even_with_cabinet_membership(monkeypatch, router_module):
+    store = {
+        "file_cabinets": [{"id": CAB_B, "user_id": USER_B}],
+        "cabinet_memberships": [
+            {"cabinet_id": CAB_B, "profile_id": USER_A, "role": "viewer"}
+        ],
+        "files": [{"id": "file-b", "cabinet_id": CAB_B, "uploaded_by": USER_B}],
+    }
+    monkeypatch.setattr(
+        router_module,
+        "SupabaseServiceRoleClient",
+        lambda: FakeServiceRoleClient(store),
+    )
+
+    assert router_module.list_files(CAB_B, {"sub": USER_A}) == []
--- a/tests/test_reset_environment_user_subset.py
+++ b/tests/test_reset_environment_user_subset.py
@ -0,0 +1,51 @@
+from run.initialization import reset_environment
+
+
+def test_reset_user_subset_scope_only_runs_user_subset_cleanup(monkeypatch):
+    calls = []
+
+    monkeypatch.setattr(
+        reset_environment,
+        "_sb_headers",
+        lambda: ("http://192.168.0.94:8000", {"Authorization": "Bearer redacted"}),
+    )
+    monkeypatch.setattr(
+        reset_environment,
+        "_assert_reset_allowed",
+        lambda url, scope: calls.append(("guard", url, scope)),
+    )
+    monkeypatch.setattr(
+        reset_environment,
+        "_clear_user_subset_files",
+        lambda: {"files_rows_deleted": 2, "storage_objects_removed": 2, "errors": []},
+    )
+
+    def fail_if_called(*_args, **_kwargs):
+        raise AssertionError("reset(scope='user-subset') must not clear unrelated tables or databases")
+
+    monkeypatch.setattr(reset_environment, "_clear_tables", fail_if_called)
+    monkeypatch.setattr(reset_environment, "_neo4j_drop_all_non_system", fail_if_called)
+    monkeypatch.setattr(reset_environment, "_clear_exam_storage", fail_if_called)
+
+    result = reset_environment.reset(scope="user-subset")
+
+    assert calls == [("guard", "http://192.168.0.94:8000", "user-subset")]
+    assert result == {
+        "scope": "user-subset",
+        "user_subset": {"files_rows_deleted": 2, "storage_objects_removed": 2, "errors": []},
+    }
+
+
+def test_reset_accepts_case_insensitive_user_subset_scope(monkeypatch):
+    monkeypatch.setattr(reset_environment, "_sb_headers", lambda: ("http://192.168.0.94:8000", {}))
+    monkeypatch.setattr(reset_environment, "_assert_reset_allowed", lambda *_args, **_kwargs: None)
+    monkeypatch.setattr(
+        reset_environment,
+        "_clear_user_subset_files",
+        lambda: {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": []},
+    )
+
+    assert reset_environment.reset(scope="USER-SUBSET") == {
+        "scope": "user-subset",
+        "user_subset": {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": []},
+    }
--- a/tests/test_seed_exam_corpus_unseed.py
+++ b/tests/test_seed_exam_corpus_unseed.py
@ -0,0 +1,171 @@
+from run.initialization.seed_exam_corpus import LoadReport, _delete_user_subset_files
+
+
+class _Result:
+    def __init__(self, data=None):
+        self.data = data or []
+
+
+class _FilesQuery:
+    def __init__(self, db, op="select"):
+        self.db = db
+        self.op = op
+        self.filters = []
+        self.in_filters = []
+
+    def select(self, *_args, **_kwargs):
+        return self
+
+    def delete(self, *_args, **_kwargs):
+        self.op = "delete"
+        return self
+
+    def eq(self, key, value):
+        self.filters.append(("eq", key, value))
+        return self
+
+    def like(self, key, pattern):
+        self.filters.append(("like", key, pattern))
+        return self
+
+    def in_(self, key, values):
+        self.in_filters.append((key, set(values)))
+        return self
+
+    def _matches(self, row):
+        for kind, key, value in self.filters:
+            actual = row.get(key)
+            if kind == "eq" and actual != value:
+                return False
+            if kind == "like":
+                assert value.endswith("%")
+                if not isinstance(actual, str) or not actual.startswith(value[:-1]):
+                    return False
+        for key, values in self.in_filters:
+            if row.get(key) not in values:
+                return False
+        return True
+
+    def execute(self):
+        matched = [row for row in self.db.rows if self._matches(row)]
+        if self.op == "delete":
+            self.db.ops.append(("delete", [row["id"] for row in matched]))
+            self.db.rows = [row for row in self.db.rows if not self._matches(row)]
+            return _Result(matched)
+        return _Result(matched)
+
+
+class _FakeDb:
+    def __init__(self, rows):
+        self.rows = list(rows)
+        self.ops = []
+
+    def table(self, name):
+        assert name == "files"
+        return _FilesQuery(self)
+
+
+class _FakeStorageBucket:
+    def __init__(self, storage, bucket):
+        self.storage = storage
+        self.bucket = bucket
+
+    def remove(self, paths):
+        self.storage.ops.append(("remove", self.bucket, list(paths)))
+        if self.storage.fail:
+            raise RuntimeError("storage unavailable")
+        if self.storage.result_error:
+            return {"error": self.storage.result_error}
+        return []
+
+
+class _FakeStorageRoot:
+    def __init__(self, storage):
+        self.storage = storage
+
+    def from_(self, bucket):
+        return _FakeStorageBucket(self.storage, bucket)
+
+
+class _FakeStorage:
+    def __init__(self, fail=False, result_error=None):
+        self.fail = fail
+        self.result_error = result_error
+        self.ops = []
+        self.client = type("Client", (), {"supabase": type("SB", (), {"storage": _FakeStorageRoot(self)})()})()
+
+
+class _FakeClient:
+    def __init__(self, db):
+        self.supabase = db
+
+
+def test_delete_user_subset_storage_before_files_rows_for_scoped_exams():
+    db = _FakeDb([
+        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
+        {"id": "f2", "bucket": "cc.users", "path": "exam-marker/i/c/f2/B.pdf", "name": "B.pdf", "source": "exam-corpus-seed"},
+        {"id": "f3", "bucket": "cc.users", "path": "exam-marker/i/c/f3/A.pdf", "name": "A.pdf", "source": "manual"},
+        {"id": "f4", "bucket": "cc.users", "path": "other/f4/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
+    ])
+    storage = _FakeStorage()
+    rep = LoadReport()
+
+    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)
+
+    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
+    assert db.ops == [("delete", ["f1"])]
+    assert [row["id"] for row in db.rows] == ["f2", "f3", "f4"]
+    assert rep.unseed_objects == 1
+    assert rep.unseed_user_files == 1
+    assert rep.errors == []
+
+
+def test_delete_user_subset_keeps_files_rows_when_storage_remove_fails():
+    db = _FakeDb([
+        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
+    ])
+    storage = _FakeStorage(fail=True)
+    rep = LoadReport()
+
+    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)
+
+    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
+    assert db.ops == []
+    assert [row["id"] for row in db.rows] == ["f1"]
+    assert rep.unseed_objects == 0
+    assert rep.unseed_user_files == 0
+    assert rep.errors
+
+
+def test_delete_user_subset_keeps_files_rows_when_storage_remove_returns_error():
+    db = _FakeDb([
+        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
+    ])
+    storage = _FakeStorage(result_error="permission denied")
+    rep = LoadReport()
+
+    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)
+
+    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
+    assert db.ops == []
+    assert [row["id"] for row in db.rows] == ["f1"]
+    assert rep.unseed_objects == 0
+    assert rep.unseed_user_files == 0
+    assert rep.errors
+
+
+def test_delete_user_subset_unscoped_cleans_all_seeded_exam_marker_rows():
+    db = _FakeDb([
+        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
+        {"id": "f2", "bucket": "cc.users", "path": "exam-marker/i/c/f2/B.pdf", "name": "B.pdf", "source": "exam-corpus-seed"},
+    ])
+    storage = _FakeStorage()
+    rep = LoadReport()
+
+    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=None, rep=rep)
+
+    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf", "exam-marker/i/c/f2/B.pdf"])]
+    assert db.ops == [("delete", ["f1", "f2"])]
+    assert db.rows == []
+    assert rep.unseed_objects == 2
+    assert rep.unseed_user_files == 2
--- a/tests/test_upload_validation.py
+++ b/tests/test_upload_validation.py
@ -0,0 +1,54 @@
+import asyncio
+
+import pytest
+from fastapi import HTTPException
+
+from modules.upload_validation import MAX_UPLOAD_BYTES, read_pdf_upload_bytes, read_upload_bytes
+
+
+class FakeUpload:
+    def __init__(self, data: bytes, content_type: str, filename: str = "file.bin"):
+        self._data = data
+        self._pos = 0
+        self.content_type = content_type
+        self.filename = filename
+
+    async def read(self, size: int = -1) -> bytes:
+        if self._pos >= len(self._data):
+            return b""
+        if size is None or size < 0:
+            size = len(self._data) - self._pos
+        chunk = self._data[self._pos : self._pos + size]
+        self._pos += len(chunk)
+        return chunk
+
+
+def run(coro):
+    return asyncio.run(coro)
+
+
+def test_valid_pdf_upload_passes_and_returns_mime():
+    data, mime = run(read_upload_bytes(FakeUpload(b"%PDF-1.7\n", "application/pdf")))
+    assert data.startswith(b"%PDF-")
+    assert mime == "application/pdf"
+
+
+def test_disallowed_mime_rejected_with_415():
+    with pytest.raises(HTTPException) as exc:
+        run(read_upload_bytes(FakeUpload(b"print(1)", "application/x-python")))
+    assert exc.value.status_code == 415
+    assert "Unsupported upload type" in exc.value.detail
+
+
+def test_oversize_upload_rejected_with_413():
+    with pytest.raises(HTTPException) as exc:
+        run(read_upload_bytes(FakeUpload(b"x" * (MAX_UPLOAD_BYTES + 1), "text/plain")))
+    assert exc.value.status_code == 413
+    assert "exceeds max size" in exc.value.detail
+
+
+def test_pdf_helper_rejects_spoofed_pdf_mime():
+    with pytest.raises(HTTPException) as exc:
+        run(read_pdf_upload_bytes(FakeUpload(b"not a pdf", "application/pdf")))
+    assert exc.value.status_code == 415
+    assert "not a valid PDF" in exc.value.detail
Author	SHA1	Message	Date
CC Worker	6c73174829	fix(exam): match app's per-page ceil so shapes don't drift up on long papers [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] The app sets canvas.height = Math.ceil(viewport.height) per page and stacks pages by those heights; the backend page_top used the raw float, so it fell ~1px/page short, compounding to a visible upward shape shift on later pages (~36px over 40 pages). Ceil rendered_h to match exactly. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-08 20:11:28 +00:00
CC Worker	5434a5bf21	fix(exam): emit auto-map canvas coords in the frontend 780-wide page space [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] _pdf_page_geometry left rendered_w/h in PDF points (~595x842), but the app renders each PDF page at PAGE_WIDTH=780 with proportional height and places shapes at the raw bounds. Result: every detected region rendered shrunk (~0.76x) and shifted up-left. Set rendered_w=780 + rendered_h=780*aspect (matches pdfLoader + pageGeometryFromImages), and scale px/point TOPLEFT boxes into that space (was a hardcoded 0.5). Path-2 point boxes auto-correct via rendered_w/page_pt_w. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-08 19:18:09 +00:00
CC Worker	44ccba2151	fix(exam): guarantee auto-map child rows reference an inserted question [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] On papers where band detection yields few/no questions but opencv/gemma still emit response regions, those regions referenced a synthetic default_qid that was never inserted -> FK violation (exam_response_areas/exam_boundaries -> exam_questions). Ensure the fallback container question exists and reattach orphan child rows to it. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-08 18:45:09 +00:00
CC Worker	e83873e822	fix(exam): dedupe all AI auto-map rows by id before insert [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] B1-4 live-route validation: continuation bands re-emit the same stable AI id for response_areas/boundaries/layout (not just questions), causing duplicate-pkey insert failures. Add _dedupe_rows_by_id applied to all four tables in _refresh_ai_rows. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-08 18:02:51 +00:00
kcar	150b915282	[verified] fix exam auto-map duplicate continued parts [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] (cherry picked from commit 31c51cb7aa33d7f2e1102cea4ffabfefee259faa)	2026-06-08 17:47:56 +00:00
kcar	76e11b0b06	feat(docling): B1-2 AQA label normalization + missing-.1 inference + MCQ gap (salvaged) [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] (cherry picked from commit a707a5afd92c5c9fb042486229d0ef11549a3f53)	2026-06-08 04:03:17 +00:00
kcar	52d1ece212	[verified] generalize B1 response regions and marks gap fill [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled]	2026-06-08 04:49:21 +01:00
CC Worker	69d9c46abe	feat(docling): B1 image-only OCR eval harness (overwatch-cleaned) [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] Eval harness for AQA A-level + GCSE-science image-only papers: finalize.py --b1-only, RapidOCR runner (rapid_pass.py via dsync), GT fixtures (make_b1_gt.py + b1_gt_labels.json), and fetch_b1_corpus.py to pull the eval corpus from .94 cc.examboards at runtime. Salvaged from t_15be12ed (which timed out on iteration budget re-running OCR): exam PDFs and generated OCR caches/reports are NOT committed (third-party copyright + reproducible) — gitignored and fetched/generated at runtime. Baseline coverage recorded in the task evidence file. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-08 03:10:10 +00:00
CC Worker	34fc7edd68	[verified] add exam-board signed URL endpoint [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] (cherry picked from commit c65d18ca6badab193469d88e8e8b32279cca8f98)	2026-06-08 01:51:55 +00:00
kcar	c69451fba2	[verified] add upload size and MIME guards [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] (cherry picked from commit f5e05376f637f55b73e474cac8199529682ca398)	2026-06-08 01:18:39 +00:00
kcar	e98fed661f	[verified] fix files list owner scoping [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled]	2026-06-08 02:08:38 +01:00
CC Worker	a6753d092f	fix(reset): fold --user-subset cleanup into scope=all and scope=exam-corpus [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] t_d1600327 added a standalone scope=user-subset, but a full reset (scope=all) and scope=exam-corpus still left the --user-subset cc.users storage objects orphaned (files rows are wiped by the table clear, but the Storage API objects are not). Call the same _clear_user_subset_files() helper in both paths so the finding-#2 gap is fully closed: storage removed before rows, idempotent. Closes overwatch review finding #2 (user-subset not cleaned by reset). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-08 00:26:24 +00:00
kcar	7f7e843563	[verified] add user subset reset scope (cherry picked from commit e1e3ec96a2d314d39e35ce2c34f6f67df1c2f182)	2026-06-08 00:25:46 +00:00
kcar	7819e6e346	fix(seed): unseed user-subset storage objects [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] (cherry picked from commit 9328ec2e062d039c0bcfabb086ce0693fe1ebe50)	2026-06-08 00:13:40 +00:00
kcar	5da108df13	docs(reset): clarify exam-corpus scope [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled]	2026-06-08 00:57:57 +01:00
CC Worker	25d02aedeb	fix(reset): default-deny destructive reset against prod target [CI: some checks failed; api-ci-deploy / test-build-deploy (push) has been cancelled] /admin/reset and reset_environment.reset() act on os.environ['SUPABASE_URL']. A platform-admin call on a prod-deployed API would wipe prod data + exam corpus + storage. Refuse when the target matches a known prod marker (.156 / supabase.classroomcopilot) unless RESET_ALLOW_PROD=1 is set. Addresses overwatch review finding #1 on feature/exam-seeding-overhaul. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>	2026-06-07 23:49:53 +00:00