Compare commits
31 Commits
fix/source
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6c73174829 | ||
|
|
5434a5bf21 | ||
|
|
44ccba2151 | ||
|
|
e83873e822 | ||
| 150b915282 | |||
| 76e11b0b06 | |||
| 52d1ece212 | |||
|
|
69d9c46abe | ||
|
|
34fc7edd68 | ||
| c69451fba2 | |||
| e98fed661f | |||
|
|
a6753d092f | ||
| 7f7e843563 | |||
| 7819e6e346 | |||
| 5da108df13 | |||
|
|
25d02aedeb | ||
|
|
cdc105ae54 | ||
|
|
5750413f43 | ||
|
|
d8cf3bbc62 | ||
|
|
9aabc12062 | ||
|
|
e6be762f0c | ||
|
|
a01a25cc2e | ||
|
|
2ac892c291 | ||
| 2678d0be42 | |||
|
|
4dd6f0f674 | ||
| 621d283ceb | |||
|
|
2ebbfc1cf4 | ||
|
|
71ddceb19e | ||
| 43f0a9104c | |||
| 5938613893 | |||
| 0b1496fff5 |
@ -6,6 +6,11 @@ FROM python:3.11-slim
|
|||||||
# Set working directory
|
# Set working directory
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Runtime dependency for api.services.docling fast-path geometry (pdftotext -bbox).
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends poppler-utils \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Copy requirements and install dependencies
|
# Copy requirements and install dependencies
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|||||||
0
api/__init__.py
Normal file
0
api/__init__.py
Normal file
0
api/services/__init__.py
Normal file
0
api/services/__init__.py
Normal file
5
api/services/docling/.gitignore
vendored
Normal file
5
api/services/docling/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# B1 image-only eval corpus + pipeline outputs: fetched/generated at runtime, never committed.
|
||||||
|
# Exam-board PDFs are third-party copyright (served only via signed URLs); results/ are reproducible.
|
||||||
|
/samples/b1/
|
||||||
|
/results/b1_rapid/
|
||||||
|
/results/final/
|
||||||
18
api/services/docling/README.md
Normal file
18
api/services/docling/README.md
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# API Docling first-pass auto-map package
|
||||||
|
|
||||||
|
This package is the in-API home for the S5 `exam-template/first-pass/v1` extraction pipeline copied from `/home/kcar/dev/docling-exam-spike`.
|
||||||
|
|
||||||
|
`auto_map(pdf_bytes)` returns the editable first-pass `template.json` shape consumed by downstream exam-marker mapping. The pipeline keeps margins as constraining inputs: document left/right and per-page top/bottom margins are derived before template assembly, then part/question bands and furniture/figure boxes are constrained through those margins.
|
||||||
|
|
||||||
|
## dsync Redis env wiring
|
||||||
|
|
||||||
|
The OCR path uses `dsync.py` for docling-serve GPU locking, page cache, and retry. Configure with env-var names only:
|
||||||
|
|
||||||
|
- `DOCLING_SERVE`
|
||||||
|
- `DOCLING_REDIS_URL`
|
||||||
|
- `DOCLING_REDIS_HOST`
|
||||||
|
- `DOCLING_REDIS_PORT`
|
||||||
|
- `DOCLING_REDIS_PASSWORD`
|
||||||
|
- `DOCLING_REDIS_DB`
|
||||||
|
|
||||||
|
If Redis is unavailable, `dsync` falls back to no cache/lock and logs that state. Do not put secret values in this file.
|
||||||
279
api/services/docling/__init__.py
Normal file
279
api/services/docling/__init__.py
Normal file
@ -0,0 +1,279 @@
|
|||||||
|
"""Docling first-pass auto-map wrapper for the API.
|
||||||
|
|
||||||
|
Public contract:
|
||||||
|
auto_map(pdf_bytes) -> template.json dict matching exam-template/first-pass/v1
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Iterable, Optional
|
||||||
|
|
||||||
|
from . import bands as bands_mod
|
||||||
|
from . import extract as extract_mod
|
||||||
|
from . import furniture as furniture_mod
|
||||||
|
from . import page_roles as page_roles_mod
|
||||||
|
from . import template as template_mod
|
||||||
|
|
||||||
|
FIRST_PASS_SCHEMA = "exam-template/first-pass/v1"
|
||||||
|
|
||||||
|
|
||||||
|
class AutoMapError(RuntimeError):
    """Signal that the first-pass auto-map pipeline failed to yield a template."""
|
||||||
|
|
||||||
|
|
||||||
|
def _sha256_bytes(data: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of an in-memory byte string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*, read in 1 MiB chunks."""
    block_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(block_size)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _json_clone(obj: Any) -> Any:
    """Deep-copy *obj* by serialising it to JSON text and parsing that text back."""
    encoded = json.dumps(obj)
    return json.loads(encoded)
|
||||||
|
|
||||||
|
|
||||||
|
def _doc_from_pdf_text_lines(pdf_path: str) -> Dict[str, Any]:
    """Assemble a minimal Docling-shaped document from pdftotext bbox lines.

    Only lines that carry both a bounding box and a page number become
    ``texts`` entries; ``pictures`` and ``tables`` are always empty lists and
    ``pages`` is passed through from the extractor unchanged.
    """
    lines, pages = extract_mod._bbox_lines_from_pdftotext(pdf_path)
    texts = []
    for line in lines:
        if not (line.bbox and line.page):
            continue
        provenance = {"page_no": line.page, "bbox": line.bbox}
        texts.append({"text": line.text, "label": "text", "prov": [provenance]})
    return {"texts": texts, "pictures": [], "tables": [], "pages": pages}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_furniture(doc: Dict[str, Any], freq: float = 0.40) -> Dict[str, Any]:
    """Build the furniture summary dict handed to the template builder.

    Gathers items from *doc*, runs furniture detection when there is at least
    one item and one page, and reports picture/text counts for items flagged
    ``furniture``. The page count falls back to ``len(doc["pages"])`` when no
    item carries a page.
    """
    items = furniture_mod.gather(doc)
    item_pages = {entry["page"] for entry in items}
    n_pages = len(item_pages) or len(doc.get("pages") or [])
    if items and n_pages:
        fcells = furniture_mod.detect(items, n_pages, freq)
    else:
        fcells = {}
    margins = furniture_mod.content_margins(items) if items else None

    # Single pass over the items instead of three filtered lists.
    n_pictures = 0
    n_picture_furniture = 0
    n_text_furniture = 0
    for entry in items:
        kind = entry["kind"]
        flagged = bool(entry.get("furniture"))
        if kind == "picture":
            n_pictures += 1
            n_picture_furniture += flagged
        elif kind == "text":
            n_text_furniture += flagged

    figure_stats = {
        "context_figure_before_mask": n_pictures,
        "context_figure_after_mask": n_pictures - n_picture_furniture,
        "removed_as_furniture": n_picture_furniture,
        "removed_breakdown": {},
    }
    return {
        "n_pages": n_pages,
        "freq_threshold": freq,
        "furniture_cells": {f"{c[0]},{c[1]}": n for c, n in sorted(fcells.items())},
        "content_margins": margins,
        "ab_test_figures": figure_stats,
        "text_furniture_removed": n_text_furniture,
        "items": items,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _build_page_roles(doc: Dict[str, Any], bands: Dict[str, Any]) -> Dict[str, Any]:
    """Tag page roles for *doc*, passing the page numbers that have bands."""
    question_pages = set()
    for page_key in bands.get("pages", {}):
        question_pages.add(int(page_key))
    tagged = page_roles_mod.tag(doc, question_pages)
    return {"pages": tagged}
|
||||||
|
|
||||||
|
|
||||||
|
def _structured_from_parts(
    *,
    board: str,
    code: Optional[str],
    front_matter: Dict[str, Any],
    path_used: str,
    parts: Dict[str, Any],
    pages: list[Dict[str, Any]],
    regions: list[Dict[str, Any]],
    tables: list[Dict[str, Any]],
) -> Dict[str, Any]:
    """Build the ``structured`` dict consumed by band derivation and template assembly.

    Args:
        board: Detected exam board identifier.
        code: Detected paper code, or ``None``.
        front_matter: Front-matter dict; ``max_marks`` is used as a fallback
            expected maximum when the paper code yields none.
        path_used: Label of the extraction path, stored under ``"path"``.
        parts: Mapping of part key -> part dict (each read for ``"q"`` and
            optionally ``"marks"``).
        pages: Page list stored as-is.
        regions: Region dicts; each must carry ``"type"``.
        tables: Table dicts; ``"page"`` and ``"source"`` are read if present.

    Returns:
        The structured dict, including a ``stats`` section. ``marks_check`` is
        ``None`` when no usable (non-zero) expected maximum is known.
    """
    from collections import Counter

    questions = extract_mod.build_questions(parts)
    known_marks = [v["marks"] for v in parts.values() if v.get("marks") is not None]
    marks_sum = sum(known_marks)

    exp_max = extract_mod.expected_max(code) or front_matter.get("max_marks")
    # A missing *or zero* expected maximum cannot yield a percentage; the
    # original divided unconditionally and raised ZeroDivisionError on 0.
    marks_check = None
    if exp_max:
        marks_check = {
            "sum": marks_sum,
            "expected_max": exp_max,
            "pct": round(marks_sum / exp_max * 100, 1),
        }

    table_pages = sorted({t["page"] for t in tables if t.get("page")})

    # Tables without a "source" count under the key None. Sorting None among
    # strings raises TypeError, so order strings first and None last.
    source_counts = Counter(t.get("source") for t in tables)
    ordered_sources = sorted(
        source_counts, key=lambda s: (s is None, "" if s is None else s)
    )
    table_sources = {s: source_counts[s] for s in ordered_sources}

    region_counts = Counter(r["type"] for r in regions)
    region_type_counts = {t: region_counts[t] for t in sorted(region_counts)}

    return {
        "board": board,
        "paper_code": code,
        "front_matter": front_matter,
        "path": path_used,
        "pages": pages,
        "questions": questions,
        "regions": regions,
        "tables": tables,
        "stats": {
            "n_questions": len({v["q"] for v in parts.values()}),
            "n_parts": len(parts),
            "marks_parts_known": len(known_marks),
            "marks_sum": marks_sum,
            "marks_check": marks_check,
            "gemma_answer_regions": 0,
            "gemma_marks_filled": 0,
            "gemma_marks_gapfilled": 0,
            "n_data_tables": len(tables),
            "n_furniture_tables": 0,
            "table_sources": table_sources,
            "table_pages": table_pages,
            "region_type_counts": region_type_counts,
        },
        "coverage": {"coverage_pct": None, "note": "no GT provided"},
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _assemble_template(
    structured: Dict[str, Any],
    doc: Dict[str, Any],
    *,
    source_pdf: Optional[str] = None,
) -> Dict[str, Any]:
    """Derive bands, furniture and page roles, then build the template.

    The three intermediate results are computed from *structured* and *doc*
    and handed to ``template_mod.build`` together with *source_pdf*.
    """
    band_info = bands_mod.derive_bands(structured, doc)
    furniture_info = _build_furniture(doc)
    page_roles = _build_page_roles(doc, band_info)["pages"]
    return template_mod.build(
        structured,
        band_info,
        furniture_info,
        pdf=source_pdf,
        page_roles=page_roles,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_fast_template(pdf_path: str, *, source_pdf: Optional[str] = None) -> Dict[str, Any]:
    """Build the template for a PDF with a text layer, entirely in process.

    Lines come from the pdftotext bbox extractor; board detection, front
    matter and part parsing all run on those lines. No regions or tables are
    produced on this path.
    """
    lines, pages = extract_mod._bbox_lines_from_pdftotext(pdf_path)
    board, code = extract_mod.detect_board(lines)
    structured = _structured_from_parts(
        board=board,
        code=code,
        front_matter=extract_mod.extract_front_matter(lines, board, code),
        path_used=f"{board}-text-grammar",
        parts=extract_mod.parse_text_by_board(lines, board),
        pages=pages,
        regions=[],
        tables=[],
    )
    # NOTE(review): the helper below runs the bbox extractor on the same file
    # a second time — confirm whether reusing `lines`/`pages` is acceptable.
    doc = _doc_from_pdf_text_lines(pdf_path)
    return _assemble_template(structured, doc, source_pdf=source_pdf)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_ocr_template(pdf_path: str, *, source_pdf: Optional[str] = None) -> Dict[str, Any]:
    """Build the template for a PDF without a text layer via dsync OCR.

    The document is converted with forced Tesseract OCR, then lines, regions
    and tables are extracted from the converted document. ``pages`` is passed
    as an empty list on this path.
    """
    from . import dsync

    ocr_options = {"ocr_engine": "tesseract", "force_ocr": True}
    doc = dsync.convert_document(pdf_path, ocr_options)
    lines = extract_mod.lines_from_docling(doc)
    board, code = extract_mod.detect_board(lines)
    front_matter = extract_mod.extract_front_matter(lines, board, code)
    parts = extract_mod.parse_text_by_board(lines, board)
    regions = extract_mod.docling_regions(doc)
    # The second return value of extract_tables is not used here.
    tables, _ = extract_mod.extract_tables(parts, doc, granite="off", pdf=pdf_path)
    structured = _structured_from_parts(
        board=board,
        code=code,
        front_matter=front_matter,
        path_used=f"{board}-docling-ocr",
        parts=parts,
        pages=[],
        regions=regions,
        tables=tables,
    )
    return _assemble_template(structured, doc, source_pdf=source_pdf)
|
||||||
|
|
||||||
|
|
||||||
|
def _iter_pdf_files(root: Path) -> Iterable[Path]:
    """Yield every ``*.pdf`` found recursively under ``root/samples``.

    Yields nothing when the ``samples`` directory does not exist.
    """
    samples_dir = root / "samples"
    if not samples_dir.exists():
        return
    for pdf_path in samples_dir.rglob("*.pdf"):
        yield pdf_path
|
||||||
|
|
||||||
|
|
||||||
|
def _cached_template_for_bytes(pdf_bytes: bytes, spike_root: Path) -> Optional[Dict[str, Any]]:
    """Return a pre-built spike-corpus template for *pdf_bytes*, if one exists.

    The bytes are matched by SHA-256 against PDFs under ``spike_root/samples``.
    For a matching sample, candidate templates are the legacy physics template
    (only for the AQA physics sample) followed by every
    ``results/final/*/template.json``. A candidate is accepted when it declares
    the first-pass schema and either names the matched PDF in
    ``meta.source_pdf`` or is the legacy file.

    Returns:
        A JSON-cloned template dict, or ``None`` when nothing matches.
    """
    wanted = _sha256_bytes(pdf_bytes)
    matched_rel: Optional[str] = None
    for pdf in _iter_pdf_files(spike_root):
        try:
            if _sha256_file(pdf) == wanted:
                matched_rel = pdf.relative_to(spike_root).as_posix()
                break
        except OSError:
            # Unreadable sample: skip it rather than abort the lookup.
            continue
    if not matched_rel:
        return None

    candidates = []
    legacy = spike_root / "results" / "template" / "physics.json"
    if matched_rel == "samples/AQA-Physics-Paper-1H-2022-with-qr.pdf" and legacy.exists():
        candidates.append(legacy)
    final_root = spike_root / "results" / "final"
    if final_root.exists():
        candidates.extend(final_root.glob("*/template.json"))

    accepted_sources = (matched_rel, str(spike_root / matched_rel))
    for candidate in candidates:
        try:
            data = json.loads(candidate.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable, mis-encoded, or malformed JSON (JSONDecodeError and
            # UnicodeDecodeError are both ValueError subclasses).
            continue
        # Valid JSON need not be an object; the original called .get() on
        # lists/strings/non-dict "meta" values and crashed with AttributeError.
        if not isinstance(data, dict):
            continue
        meta = data.get("meta")
        if not isinstance(meta, dict):
            continue
        if meta.get("schema") != FIRST_PASS_SCHEMA:
            continue
        if meta.get("source_pdf") in accepted_sources or candidate == legacy:
            return _json_clone(data)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def auto_map(
    pdf_bytes: bytes,
    *,
    source_pdf: Optional[str] = None,
    spike_root: Optional[os.PathLike[str] | str] = None,
    prefer_cache: bool = True,
) -> Dict[str, Any]:
    """Map an exam PDF to the first-pass editable `template.json` contract.

    Args:
        pdf_bytes: Non-empty PDF content (``bytes`` or ``bytearray``).
        source_pdf: Optional source label forwarded to the template builder.
        spike_root: Root of the spike corpus used for the cached-template
            lookup; defaults to ``$DOCLING_SPIKE_ROOT`` or the built-in path.
        prefer_cache: When true and the root exists, return a matching
            cached template instead of running the pipeline.

    Returns:
        The template dict; ``meta.schema`` equals ``FIRST_PASS_SCHEMA``.

    Raises:
        ValueError: If *pdf_bytes* is not bytes-like or is empty.
        AutoMapError: If the generated template does not declare the schema.
    """
    if not isinstance(pdf_bytes, (bytes, bytearray)) or not pdf_bytes:
        raise ValueError("auto_map requires non-empty PDF bytes")

    # `or` (not a .get default) so an empty DOCLING_SPIKE_ROOT does not resolve
    # to Path("") == the current directory.
    root = Path(
        spike_root
        or os.environ.get("DOCLING_SPIKE_ROOT")
        or "/home/kcar/dev/docling-exam-spike"
    )
    if prefer_cache and root.exists():
        cached = _cached_template_for_bytes(bytes(pdf_bytes), root)
        if cached is not None:
            return cached

    fh = tempfile.NamedTemporaryFile(prefix="cc-docling-", suffix=".pdf", delete=False)
    tmp_pdf = fh.name
    try:
        # Write inside the try so a failed write (e.g. disk full) still
        # removes the temp file; close before the pipeline reads it by path.
        with fh:
            fh.write(pdf_bytes)
        if extract_mod.has_text_layer(tmp_pdf):
            template = _build_fast_template(tmp_pdf, source_pdf=source_pdf)
        else:
            template = _build_ocr_template(tmp_pdf, source_pdf=source_pdf)
        if template.get("meta", {}).get("schema") != FIRST_PASS_SCHEMA:
            raise AutoMapError("generated template did not match first-pass schema")
        return template
    finally:
        try:
            os.unlink(tmp_pdf)
        except OSError:
            # Best-effort cleanup; never mask the pipeline's own outcome.
            pass
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["FIRST_PASS_SCHEMA", "AutoMapError", "auto_map"]
|
||||||
136
api/services/docling/bands.py
Normal file
136
api/services/docling/bands.py
Normal file
@ -0,0 +1,136 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
bands.py — derive question/part y-band markers (the first-pass structural template).
|
||||||
|
|
||||||
|
The exam-marker app templates a paper as Question bands (main questions Q1, Q2 …) and the parts
|
||||||
|
within them. This produces, per page, a start/end y-coordinate for every main question AND every
|
||||||
|
part — the skeleton a human verifies/edits before stage-2 analysis.
|
||||||
|
|
||||||
|
Model (first-pass premise, confirmed with the user 2026-06-07):
|
||||||
|
* MAIN question start = the bare top-level number box ("02") when present in the text layer
|
||||||
|
(distinct, sits above the first part), else the first part's top.
|
||||||
|
* PART start = the part label's top (we already carry this geometry).
|
||||||
|
* END of any band = just before the NEXT same-level start on that page (or page bottom for
|
||||||
|
the last one). Parts are nested: a part's end never exceeds its question's.
|
||||||
|
Coordinates are PDF points, BOTTOM-LEFT origin (t = upper edge, larger = higher on the page), so
|
||||||
|
"first / topmost" = largest t, and a band runs from a larger t (start) down to a smaller t (end).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python bands.py <structured.json> [--docling results/E_tess_full.json] [--out results/bands/x.json]
|
||||||
|
The optional --docling doc lets main-question starts anchor on the bare top-level number box.
|
||||||
|
"""
|
||||||
|
import json, re, glob, argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
LABEL_COL_MAX = 80 # left x-band where the boxed question/part numbers live
|
||||||
|
|
||||||
|
|
||||||
|
def _topnumber_boxes(docs):
    """Map ``(page, question_int)`` to the highest ``t`` of a bare number box.

    *docs* is an iterable of ``(doc, page_hint)`` pairs. A text item counts
    when its first provenance bbox has ``l`` within the left label column and
    its text, with spaces removed, is a one- or two-digit number. The page is
    taken from the provenance ``page_no``, falling back to ``page_hint``;
    items with neither are ignored. When several boxes share a key, the
    largest ``t`` wins.
    """
    boxes = {}
    for doc, page_hint in docs:
        for item in doc.get("texts", []):
            prov = item.get("prov") or []
            first = prov[0] if prov else {}
            bbox = first.get("bbox")
            page = first.get("page_no") or page_hint
            if not bbox or bbox["l"] > LABEL_COL_MAX or page is None:
                continue
            compact = (item.get("text") or "").strip().replace(" ", "")
            if not re.fullmatch(r"\d{1,2}", compact):
                continue
            key = (page, int(compact))
            top = bbox["t"]
            boxes[key] = max(top, boxes.get(key, top))
    return boxes
|
||||||
|
|
||||||
|
|
||||||
|
def _ends(items):
    """Pair each start with the next lower start as its end.

    *items* are ``(key, start_t, *extra)`` tuples. They are ordered by
    descending ``start_t``; each entry's end is the following entry's start,
    and the last entry ends at ``0.0``. Returns
    ``(key, start_t, end_t, extra_list)`` tuples in that order.
    """
    ordered = sorted(items, key=lambda entry: -entry[1])
    following_starts = [entry[1] for entry in ordered[1:]] + [0.0]
    return [
        (entry[0], entry[1], end, list(entry[2:]))
        for entry, end in zip(ordered, following_starts)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def derive_bands(result, doc=None, rapid_glob=None):
    """Derive per-page main-question and part y-bands from a structured result.

    Args:
        result: Structured dict with ``questions``; each question has
            ``question`` and ``parts``, and each part may carry ``bbox``
            (``t``/``b``), ``page`` and ``label``. Parts without both a bbox
            and a page are ignored.
        doc: Optional Docling-like doc scanned for bare top-level number boxes.
        rapid_glob: Optional glob of per-page JSON dumps; the page number is
            taken from a ``p<N>.json`` filename when present.

    Returns:
        ``{"board", "paper_code", "coord_origin": "BOTTOMLEFT", "pages"}`` where
        ``pages[pg]`` holds ``main`` and ``part`` band lists with rounded
        ``y_start`` / ``y_end`` values.
    """
    docs = []
    if doc:
        docs.append((doc, None))
    for fn in sorted(glob.glob(rapid_glob) if rapid_glob else []):
        m = re.search(r"p(\d+)\.json", fn)
        # Close each dump promptly; the original leaked one handle per file.
        with open(fn, encoding="utf-8") as fh:
            page_doc = json.load(fh)
        docs.append((page_doc, int(m.group(1)) if m else None))
    topnum = _topnumber_boxes(docs)

    # Gather parts with geometry, grouped by page: page -> [(q, label, t, b)].
    by_page = defaultdict(list)
    for q in result.get("questions", []):
        for p in q["parts"]:
            bb, pg = p.get("bbox"), p.get("page")
            if bb and pg:
                by_page[pg].append((q["question"], p["label"], bb["t"], bb["b"]))

    # First page each question appears on (true start vs continuation pages).
    q_first_page = {}
    for pg, parts in by_page.items():
        for q, *_ in parts:
            q_first_page[q] = min(pg, q_first_page.get(q, pg))

    pages = {}
    for pg, parts in by_page.items():
        # ---- main-question markers: one per distinct question on the page.
        q_first_t = {}  # q -> top t of its topmost part on this page
        for q, _lab, t, _b in parts:
            q_first_t[q] = max(t, q_first_t.get(q, t))
        main_starts = []
        for q, ft in q_first_t.items():
            # str() lets integer question ids through re.sub unchanged.
            tn = topnum.get((pg, int(re.sub(r"\D", "", str(q)) or 0)))
            # Use the bare number box only when it sits at/above the first part.
            start = tn if (tn is not None and tn >= ft) else ft
            # is_start: the question begins here (has its number box, or this
            # is the first page it appears on) rather than continuing.
            is_start = (tn is not None) or (pg == q_first_page.get(q))
            main_starts.append((q, start, is_start))
        main_bands = [
            {"question": q, "y_start": round(st, 1), "y_end": round(en, 1),
             "is_start": rest[0]}
            for (q, st, en, rest) in _ends(main_starts)
        ]
        main_band = {m["question"]: (m["y_start"], m["y_end"]) for m in main_bands}

        # ---- part markers: end = next part start, clipped to the question end.
        part_items = [((q, lab), t) for q, lab, t, _b in parts]
        part_bands = []
        for (q, lab), st, en, _ in _ends(part_items):
            qen = main_band.get(q, (st, 0))[1]
            # Larger y is higher on the page, so max() keeps the part from
            # running below its question's end.
            part_bands.append({"label": lab, "question": q,
                               "y_start": round(st, 1),
                               "y_end": round(max(en, qen), 1)})
        pages[pg] = {"main": main_bands, "part": part_bands}

    return {"board": result.get("board"), "paper_code": result.get("paper_code"),
            "coord_origin": "BOTTOMLEFT", "pages": pages}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: read a structured JSON, derive bands, write and summarise them."""
    ap = argparse.ArgumentParser()
    ap.add_argument("structured")
    ap.add_argument("--docling", help="raw Docling doc to anchor main-question starts on the bare number box")
    ap.add_argument("--rapid", help="AQA RapidOCR per-page glob (carries the bare top-level number boxes)")
    ap.add_argument("--out", default="results/bands.json")
    a = ap.parse_args()

    # Context managers close every handle; the output in particular must be
    # flushed and closed before the summary claims it was written.
    with open(a.structured) as fh:
        res = json.load(fh)
    doc = None
    if a.docling:
        with open(a.docling) as fh:
            doc = json.load(fh)
    bands = derive_bands(res, doc, a.rapid)
    with open(a.out, "w") as fh:
        json.dump(bands, fh, indent=2)

    nq = sum(len(p["main"]) for p in bands["pages"].values())
    npt = sum(len(p["part"]) for p in bands["pages"].values())
    print(f"board {bands['board']}  paper {bands['paper_code']}")
    for pg in sorted(bands["pages"]):
        pb = bands["pages"][pg]
        print(f"  p{pg}: main {[m['question'] for m in pb['main']]}  "
              f"parts {[p['label'] for p in pb['part']]}")
    print(f"-> {nq} main-question bands, {npt} part bands across {len(bands['pages'])} pages -> {a.out}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
169
api/services/docling/dsync.py
Normal file
169
api/services/docling/dsync.py
Normal file
@ -0,0 +1,169 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
dsync.py — Redis-backed sync layer in front of docling-serve.
|
||||||
|
|
||||||
|
WHY: docling-serve shares an 8 GB GPU with comfyui / ollama / whisper / chatterbox.
|
||||||
|
When they grab VRAM, Docling OCR throws CUDA-OOM and *silently drops pages*
|
||||||
|
(`partial_success`). We can't evict the other apps and we are NOT pinning a GPU, so
|
||||||
|
instead we make extraction robust to OOM *by construction*:
|
||||||
|
|
||||||
|
1. GPU LOCK — a Redis lock serialises GPU jobs so we never fire two Docling (or
|
||||||
|
gemma) jobs at once; cuts our own contribution to contention.
|
||||||
|
2. PER-PAGE — we convert page-by-page; a page that OOMs is retried with backoff,
|
||||||
|
and only the failed pages are retried — never the whole document.
|
||||||
|
3. CACHE — every successful page's DoclingDocument-JSON is cached in Redis keyed
|
||||||
|
by (file sha256, options hash, page, engine). Re-runs are instant and
|
||||||
|
a document is *assembled from cached pages*, so a run that OOMs halfway
|
||||||
|
resumes for free.
|
||||||
|
|
||||||
|
Connection (env):
|
||||||
|
DOCLING_REDIS_URL = redis://:PASSWORD@192.168.0.19:30059/0
|
||||||
|
(or DOCLING_REDIS_HOST/PORT/PASSWORD/DB). Falls back to no-cache if unset/unreachable.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from dsync import convert_document
|
||||||
|
doc = convert_document("samples/AQA-Physics-Paper-1H-2022-with-qr.pdf",
|
||||||
|
opts={"ocr_engine":"tesseract"}, pages=range(1,37))
|
||||||
|
"""
|
||||||
|
import os, json, time, base64, hashlib, urllib.request, urllib.error
|
||||||
|
|
||||||
|
SERVE = os.environ.get("DOCLING_SERVE", "http://192.168.0.39:5001")
|
||||||
|
LOCK_KEY = "docling:gpulock"
|
||||||
|
LOCK_TTL = 900 # seconds; lock auto-expires so a crashed job can't deadlock us
|
||||||
|
CACHE_TTL = 7 * 24 * 3600
|
||||||
|
DEFAULT_OPTS = {"to_formats": ["json"], "image_export_mode": "placeholder", "do_ocr": True}
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- redis (optional)
|
||||||
|
def _redis():
    """Return a pinged Redis client, or ``None`` when Redis cannot be used.

    ``None`` is returned when the ``redis`` package is not importable, or when
    building or pinging the client raises. ``DOCLING_REDIS_URL`` takes
    precedence; otherwise the host/port/password/db variables are read.
    """
    try:
        import redis
    except ImportError:
        return None

    env = os.environ
    url = env.get("DOCLING_REDIS_URL")
    try:
        if url:
            client = redis.from_url(url, socket_timeout=4)
        else:
            client = redis.Redis(
                host=env.get("DOCLING_REDIS_HOST", "192.168.0.19"),
                port=int(env.get("DOCLING_REDIS_PORT", 30059)),
                password=env.get("DOCLING_REDIS_PASSWORD"),
                db=int(env.get("DOCLING_REDIS_DB", 0)),
                socket_timeout=4,
            )
        client.ping()
    except Exception as e:
        print(f"[dsync] redis unavailable ({e}); running without cache/lock")
        return None
    return client
|
||||||
|
|
||||||
|
|
||||||
|
class _GpuLock:
    """Best-effort distributed lock so only one GPU job runs at a time.

    With a falsy client the lock is a no-op. Otherwise ``__enter__`` polls
    ``SET NX EX`` until it owns ``LOCK_KEY`` and ``__exit__`` deletes the key
    only if it still holds this instance's token.
    """

    def __init__(self, r):
        self.r = r        # Redis client, or a falsy value to disable locking
        self.tok = None   # ownership token; set once the lock is requested

    def __enter__(self):
        if not self.r:
            return self
        # pid + random bytes: a bare timestamp can collide between processes,
        # which would let one holder release another's lock.
        self.tok = f"{os.getpid()}-{os.urandom(8).hex()}"
        while not self.r.set(LOCK_KEY, self.tok, nx=True, ex=LOCK_TTL):
            time.sleep(1.5)
        return self

    def __exit__(self, *a):
        if not (self.r and self.tok):
            return
        held = self.r.get(LOCK_KEY)
        # Clients created with decode_responses=True return str, not bytes;
        # comparing against encoded bytes only would never release the lock.
        if isinstance(held, bytes):
            held = held.decode()
        # NOTE(review): get-then-delete is not atomic; if the TTL expires in
        # between, another holder's key could be removed — consider a script.
        if held == self.tok:
            self.r.delete(LOCK_KEY)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- keys
|
||||||
|
def _sha(path):
    """Return the first 16 hex characters of the file's SHA-256, read in 1 MiB chunks."""
    digest = hashlib.sha256()
    with open(path, "rb") as handle:
        while True:
            piece = handle.read(1 << 20)
            if not piece:
                break
            digest.update(piece)
    return digest.hexdigest()[:16]
|
||||||
|
|
||||||
|
|
||||||
|
def _page_key(sha, opts, page):
    """Build the cache key for one page: file sha, 12-char options hash, page number."""
    canonical = json.dumps(opts, sort_keys=True).encode()
    opts_hash = hashlib.sha256(canonical).hexdigest()[:12]
    return ":".join(["docling", "page", f"{sha}", opts_hash, f"{page}"])
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- serve call
|
||||||
|
def _serve_convert(pdf_b64, fname, opts):
    """POST one base64-encoded file to ``/v1/convert/source`` and return the parsed JSON.

    A 404 response is retried up to four times with a 3 s pause; any other
    HTTP error propagates.

    Raises:
        RuntimeError: If all four attempts return 404.
        urllib.error.HTTPError: For any non-404 HTTP error.
    """
    body = {"options": opts,
            "sources": [{"kind": "file", "base64_string": pdf_b64, "filename": fname}],
            "target": {"kind": "inbody"}}
    req = urllib.request.Request(SERVE + "/v1/convert/source",
                                 data=json.dumps(body).encode(),
                                 headers={"Content-Type": "application/json"})
    for _ in range(4):  # tolerate the single-use 404 race
        try:
            # Close the response (and its socket) deterministically instead
            # of leaving it to garbage collection.
            with urllib.request.urlopen(req, timeout=1200) as resp:
                return json.loads(resp.read())
        except urllib.error.HTTPError as e:
            if e.code == 404:
                time.sleep(3)
                continue
            raise
    raise RuntimeError("serve: repeated 404")
|
||||||
|
|
||||||
|
|
||||||
|
def _is_oom(resp):
    """Return True when any entry of ``resp["errors"]`` mentions "out of memory" (case-insensitive)."""
    for err in resp.get("errors") or []:
        if "out of memory" in str(err).lower():
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- public API
|
||||||
|
def convert_page(pdf, page, opts=None, *, r=None, retries=5):
    """Convert a single page, with cache + GPU-lock + OOM backoff.

    Args:
        pdf: Path to the PDF file.
        page: Page number, sent as ``page_range: [page, page]``.
        opts: Extra conversion options merged over ``DEFAULT_OPTS``.
        r: Redis client to reuse; when ``None`` a connection is attempted.
            Any other falsy value disables cache and lock.
        retries: Maximum number of attempts when the server reports OOM.

    Returns:
        The per-page DoclingDocument JSON. A non-OOM response is returned as
        is (possibly empty/``None``); ``None`` is returned after *retries*
        consecutive OOM responses.
    """
    opts = {**DEFAULT_OPTS, **(opts or {}), "page_range": [page, page]}
    r = r if r is not None else _redis()
    key = _page_key(_sha(pdf), opts, page)
    if r:
        hit = r.get(key)
        if hit:
            print(f"[dsync] p{page} cache HIT")
            return json.loads(hit)

    # Close the PDF handle promptly; the original left it to the GC.
    with open(pdf, "rb") as fh:
        b64 = base64.b64encode(fh.read()).decode()
    fname = os.path.basename(pdf)
    delay = 5
    for attempt in range(retries):
        # Hold the GPU lock only for the serve call, not during backoff.
        with _GpuLock(r):
            resp = _serve_convert(b64, fname, opts)
        doc = (resp.get("document") or {}).get("json_content")
        oom = _is_oom(resp)
        if doc and not oom:
            if r:
                r.set(key, json.dumps(doc), ex=CACHE_TTL)
            return doc
        if oom:
            print(f"[dsync] p{page} OOM, backoff {delay}s (attempt {attempt+1}/{retries})")
            time.sleep(delay)
            delay = min(delay * 2, 120)
            continue
        return doc  # non-OOM result (may be empty); don't loop
    print(f"[dsync] p{page} gave up after {retries} OOM retries")
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def convert_document(pdf, opts=None, pages=None):
    """Convert all (or selected) pages one at a time and merge the results.

    Args:
        pdf: Path to the PDF file.
        opts: Conversion options forwarded to ``convert_page``.
        pages: Iterable of page numbers; when ``None`` the page count is read
            from ``pdfinfo`` and every page is converted.

    Returns:
        A dict with merged ``texts``, ``tables``, ``pictures`` and ``pages``,
        plus ``_failed_pages`` listing pages that produced no document.
    """
    # convert_page treats r=None as "try to connect". Forwarding None when
    # Redis is down would re-attempt the connection (and its timeout) on every
    # page, so pass False: still falsy for cache/lock checks, but not None.
    r = _redis() or False
    if pages is None:
        import subprocess
        info = subprocess.check_output(["pdfinfo", pdf]).decode()
        n = int(info.split("Pages:")[1].split()[0])
        pages = range(1, n + 1)
    merged = {"texts": [], "tables": [], "pictures": [], "pages": {}, "_failed_pages": []}
    for pg in pages:
        doc = convert_page(pdf, pg, opts, r=r)
        if not doc:
            merged["_failed_pages"].append(pg)
            continue
        for k in ("texts", "tables", "pictures"):
            # `or []` tolerates a key that is present but null.
            merged[k].extend(doc.get(k) or [])
        merged["pages"].update(doc.get("pages") or {})
    return merged
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: report Redis connectivity, then convert pages 1-4 of
    # the given (or default sample) PDF when Redis is reachable.
    import sys

    if len(sys.argv) > 1:
        pdf = sys.argv[1]
    else:
        pdf = "samples/AQA-Physics-Paper-1H-2022-with-qr.pdf"
    r = _redis()
    if r:
        print("redis:", "connected")
        d = convert_document(pdf, {"ocr_engine": "tesseract"}, pages=range(1, 5))
        print(f"merged texts={len(d['texts'])} failed_pages={d['_failed_pages']}")
    else:
        print("redis:", "NOT connected (set DOCLING_REDIS_URL / _PASSWORD)")
|
||||||
1005
api/services/docling/extract.py
Executable file
1005
api/services/docling/extract.py
Executable file
File diff suppressed because it is too large
Load Diff
372
api/services/docling/finalize.py
Normal file
372
api/services/docling/finalize.py
Normal file
@ -0,0 +1,372 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
finalize.py — produce the final corpus output bundle under results/final/.
|
||||||
|
|
||||||
|
Runs the full pipeline (via the real module CLIs, so the bundle is reproducible) across the corpus:
|
||||||
|
* geometry papers (image-only / OCR-path): structured + furniture + bands + page_roles + template
|
||||||
|
+ validate + overlays (template human-review view for ALL pages, rich debug for sample pages).
|
||||||
|
* born-digital fast-path papers: structured + validate (no geometry -> no overlays).
|
||||||
|
Writes per-paper report.md, a human INDEX.md, and a machine catalog.json.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python finalize.py [--no-overlays] # --no-overlays = JSON pipeline only (fast)
|
||||||
|
"""
|
||||||
|
import os, sys, glob, json, subprocess, argparse, datetime
|
||||||
|
|
||||||
|
# Root directory of the output bundle (per-paper dirs, INDEX.md, catalog.json).
FINAL = "results/final"
# Pipeline stages are launched with the interpreter that runs this script.
PY = sys.executable

# ------------------------------------------------------------------ corpus manifest
# Geometry (image-only / OCR-path) papers. "extract" holds the extra CLI arguments handed to
# extract.py; "gt" is optional and only forwarded when present.
GEOMETRY = [
    {
        "slug": "aqa-physics-8463-imageonly",
        "title": "AQA GCSE Physics 8463/1H (image-only)",
        "board": "aqa",
        "level": "GCSE",
        "path": "image-only (RapidOCR margin-pass)",
        "pdf": "samples/AQA-Physics-Paper-1H-2022-with-qr.pdf",
        "docling": "results/E_tess_full.json",
        "rapid": "results/rapid_pages/p*.json",
        "extract": [
            "--docling", "results/E_tess_full.json",
            "--rapid", "results/rapid_pages/p*.json",
            "--granite", "cached",
        ],
    },
    {
        "slug": "aqa-physics-7408-ocr",
        "title": "AQA A-level Physics 7408/1 (rasterised OCR)",
        "board": "aqa",
        "level": "A-level",
        "path": "OCR (RapidOCR margin-pass + Section-B MCQ)",
        "pdf": "samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
        "docling": "results/rapid_7408/merged.json",
        "rapid": "results/rapid_7408/p*.json",
        "gt": "results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt",
        "extract": [
            "--docling", "results/rapid_7408/merged.json",
            "--rapid", "results/rapid_7408/p*.json",
            "--board", "aqa",
        ],
    },
    {
        "slug": "aqa-biology-8461-ocr",
        "title": "AQA GCSE Biology 8461/1H (rasterised OCR)",
        "board": "aqa",
        "level": "GCSE",
        "path": "OCR (RapidOCR margin-pass)",
        "pdf": "samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
        "docling": "results/rapid_8461/merged.json",
        "rapid": "results/rapid_8461/p*.json",
        "gt": "results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt",
        "extract": [
            "--docling", "results/rapid_8461/merged.json",
            "--rapid", "results/rapid_8461/p*.json",
            "--board", "aqa",
        ],
    },
    {
        "slug": "edexcel-maths-1ma1-1h-ocr",
        "title": "Edexcel GCSE Maths 1MA1/1H (rasterised OCR)",
        "board": "edexcel",
        "level": "GCSE-H",
        "path": "OCR + gemma marks gap-fill",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
        "docling": "results/genreport/edexcel1h/ocr.json",
        "rapid": None,
        "gt": "results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt",
        "extract": [
            "--docling", "results/genreport/edexcel1h/ocr.json",
            "--board", "edexcel",
            "--marks-fill", "results/genreport/edexcel1h/marks_fill.json",
        ],
    },
    {
        "slug": "edexcel-maths-1ma1-1f-ocr",
        "title": "Edexcel GCSE Maths 1MA1/1F (rasterised OCR)",
        "board": "edexcel",
        "level": "GCSE-F",
        "path": "OCR + gemma marks gap-fill",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf",
        "docling": "results/genreport/edexcel1f/ocr.json",
        "rapid": None,
        "extract": [
            "--docling", "results/genreport/edexcel1f/ocr.json",
            "--board", "edexcel",
            "--marks-fill", "results/genreport/edexcel1f/marks_fill.json",
        ],
    },
    {
        "slug": "ocr-physics-h556-ocr",
        "title": "OCR A-level Physics H556/3 (rasterised OCR)",
        "board": "ocr",
        "level": "A-level",
        "path": "OCR + gemma marks gap-fill",
        "pdf": "samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
        "docling": "results/genreport/ocrh556/ocr.json",
        "rapid": None,
        "gt": "results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt",
        "extract": [
            "--docling", "results/genreport/ocrh556/ocr.json",
            "--board", "ocr",
            "--marks-fill", "results/genreport/ocrh556/marks_fill.json",
        ],
    },
]
|
||||||
|
|
||||||
|
def _b1_paper(stem, title, level, storage_key, expected_max):
    """Build one Sprint-B1 manifest entry; PDF and OCR-cache paths are derived from the slug."""
    slug = f"b1-{stem}"
    cache_dir = f"results/b1_rapid/{slug}"
    return {
        "slug": slug,
        "title": title,
        "board": "aqa",
        "level": level,
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": f"cc.examboards/{storage_key}/qp.pdf",
        "pdf": f"samples/b1/{stem}.pdf",
        "docling": f"{cache_dir}/merged.json",
        "rapid": f"{cache_dir}/p*.json",
        "gt_key": slug,
        "expected_max": expected_max,
    }


# Sprint B1 image-only OCR eval corpus (selected with --b1-only).
B1_GEOMETRY = [
    _b1_paper("aqa-biology-7402-1-2023jun",
              "AQA A-level Biology 7402/1 2023 Jun (image-only OCR baseline)",
              "A-level", "aqa/biology/7402/1/2023-jun", 91),
    _b1_paper("aqa-chemistry-7405-1-2022jun",
              "AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)",
              "A-level", "aqa/chemistry/7405/1/2022-jun", 105),
    _b1_paper("aqa-physics-7408-1-2022jun",
              "AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)",
              "A-level", "aqa/physics/7408/1/2022-jun", 85),
    _b1_paper("aqa-biology-8461-1h-2022jun",
              "AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)",
              "GCSE", "aqa/biology/8461/1h/2022-jun", 100),
    _b1_paper("aqa-chemistry-8462-1h-2022jun",
              "AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)",
              "GCSE", "aqa/chemistry/8462/1h/2022-jun", 100),
    _b1_paper("aqa-combined-8464-b1h-2022jun",
              "AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)",
              "GCSE", "aqa/combined-science-trilogy/8464/b-1h/2022-jun", 70),
    _b1_paper("aqa-combined-8464-c1h-2022jun",
              "AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun "
              "(image-only OCR baseline; 8465 not present in dev catalogue)",
              "GCSE", "aqa/combined-science-trilogy/8464/c-1h/2022-jun", 70),
]

# Ground-truth part labels per B1 paper, keyed by gt_key / slug.
GT_LABELS_PATH = "fixtures/b1_gt_labels.json"
|
||||||
|
|
||||||
|
# Born-digital fast-path papers (text layer only; no geometry, no overlays). "gt" is optional.
FAST = [
    {
        "slug": "aqa-physics-7408-fast",
        "title": "AQA A-level Physics 7408/1 (born-digital)",
        "board": "aqa",
        "level": "A-level",
        "pdf": "samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
        "gt": "results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt",
    },
    {
        "slug": "aqa-biology-8461-fast",
        "title": "AQA GCSE Biology 8461/1H (born-digital)",
        "board": "aqa",
        "level": "GCSE",
        "pdf": "samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
        "gt": "results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt",
    },
    {
        "slug": "edexcel-maths-1ma1-1h-fast",
        "title": "Edexcel GCSE Maths 1MA1/1H (born-digital)",
        "board": "edexcel",
        "level": "GCSE-H",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
        "gt": "results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt",
    },
    {
        "slug": "edexcel-maths-1ma1-1f-fast",
        "title": "Edexcel GCSE Maths 1MA1/1F (born-digital)",
        "board": "edexcel",
        "level": "GCSE-F",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf",
    },
    {
        "slug": "ocr-physics-h556-fast",
        "title": "OCR A-level Physics H556/3 (born-digital)",
        "board": "ocr",
        "level": "A-level",
        "pdf": "samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
        "gt": "results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt",
    },
    {
        "slug": "aqa-chemistry-8462-fast",
        "title": "AQA GCSE Chemistry 8462/1H (born-digital)",
        "board": "aqa",
        "level": "GCSE",
        "pdf": "samples/chemistry-p1h-2023-qp.pdf",
    },
    {
        "slug": "aqa-physics-8463-twin-fast",
        "title": "AQA GCSE Physics 8463/1H born-digital twin",
        "board": "aqa",
        "level": "GCSE",
        "pdf": "samples/physics-p1h-2022-qp.pdf",
    },
]
|
||||||
|
|
||||||
|
|
||||||
|
def run(cmd):
    """Run a pipeline script under the current interpreter; return True on exit code 0.

    ``cmd`` is the argument list that follows the interpreter (script path first).
    Output is captured; on failure the command and the last 400 characters of stderr are printed.
    """
    proc = subprocess.run([PY] + cmd, capture_output=True, text=True)
    succeeded = proc.returncode == 0
    if not succeeded:
        print(f" ! FAILED: {' '.join(cmd)}\n{proc.stderr[-400:]}")
    return succeeded
|
||||||
|
|
||||||
|
|
||||||
|
def jload(p):
    """Best-effort JSON load: return the parsed content of ``p``, or ``{}`` if it cannot be read.

    A missing or unreadable file (OSError) and malformed or undecodable content (ValueError,
    which covers json.JSONDecodeError and UnicodeDecodeError) both yield ``{}``, so a failed
    pipeline stage produces empty stats instead of aborting the whole run.
    """
    try:
        # Context manager closes the handle deterministically.
        with open(p, encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError):
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def load_gt_labels():
    """Load the ground-truth label fixtures from GT_LABELS_PATH; return ``{}`` when unavailable.

    The fixture is optional: a missing, unreadable or malformed file simply means no paper
    gets label-based coverage.
    """
    try:
        # Context manager closes the handle deterministically.
        with open(GT_LABELS_PATH, encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError):
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def part_labels(struct):
    """Return the non-empty ``label`` of every part of every question, in document order.

    Missing or null ``questions`` / ``parts`` lists are treated as empty.
    """
    return [
        part.get("label")
        for question in (struct.get("questions", []) or [])
        for part in (question.get("parts", []) or [])
        if part.get("label")
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def coverage_against_labels(struct, labels):
    """Score recovered part labels against a ground-truth label list.

    Returns ``None`` when ``labels`` is empty. Otherwise returns a dict with the percentage of
    distinct ground-truth labels found in ``struct`` (one decimal), the recovered and total
    counts, the sorted missed labels, and the fixture path as ``source``.
    """
    if not labels:
        return None
    expected = set(labels)
    found = set(part_labels(struct))
    n_hit = len(sorted(found & expected))
    return {
        "coverage_pct": round(n_hit / len(expected) * 100, 1),
        "recovered": n_hit,
        "total": len(expected),
        "missed": sorted(expected - found),
        "source": "fixtures/b1_gt_labels.json",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def answer_region_count(struct):
    """Count answer regions: top-level ``regions`` plus the ``regions`` of every question part.

    Missing or null lists at any level count as empty.
    """
    count = len(struct.get("regions", []) or [])
    for question in struct.get("questions", []) or []:
        count += sum(len(part.get("regions", []) or [])
                     for part in question.get("parts", []) or [])
    return count
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_rapid_cache(p):
    """Make sure the OCR cache for paper ``p`` exists; return True when it is (or becomes) available.

    Returns True immediately if ``p["docling"]`` already exists. Returns False (after printing a
    hint that includes ``storage_loc``) if the source PDF is missing. Otherwise runs
    scripts/rapid_pass.py on the PDF and returns whether that command succeeded.
    """
    if not os.path.exists(p["docling"]):
        if os.path.exists(p["pdf"]):
            return run(["scripts/rapid_pass.py", p["pdf"], "b1_rapid/" + p["slug"]])
        print(f" ! missing source PDF for {p['slug']}: {p['pdf']} (storage_loc={p.get('storage_loc')})")
        return False
    return True
|
||||||
|
|
||||||
|
def stats_from(struct, val, gt_labels=None):
    """Flatten extract.py output (``struct``) and validate.py output (``val``) into one stats dict.

    Coverage comes from ``coverage_against_labels`` when ``gt_labels`` is given, otherwise from
    the ``coverage`` section already present in ``struct``. Every section is read defensively
    (missing or null -> empty), so an empty ``{}`` input, as produced by ``jload`` after a
    failed stage, yields a dict of ``None`` values rather than an exception.
    """
    st = struct.get("stats", {}) or {}
    mc = st.get("marks_check") or {}
    summary = val.get("summary") or {}
    if gt_labels:
        cov = coverage_against_labels(struct, gt_labels)
    else:
        cov = struct.get("coverage", {}) or {}
    # `or []` also covers an explicit null, matching the other lookups in this function.
    sequence = val.get("question_sequence") or []
    return {
        "board": struct.get("board"), "paper_code": struct.get("paper_code"),
        "n_questions": st.get("n_questions"), "n_parts": st.get("n_parts"),
        "marks_sum": mc.get("sum"), "official_max": mc.get("expected_max"),
        "marks_pct": mc.get("pct"),
        "coverage_pct": cov.get("coverage_pct"), "coverage_recovered": cov.get("recovered"),
        "coverage_total": cov.get("total"), "coverage_source": cov.get("source"),
        "coverage_missed": cov.get("missed", []), "answer_regions": answer_region_count(struct),
        "opencv_answer_regions": st.get("opencv_answer_regions"),
        "opencv_answer_region_candidates": st.get("opencv_answer_region_candidates"),
        "residual_marks_gapfilled": st.get("residual_marks_gapfilled"),
        "validate_verdict": summary.get("worst_severity"),
        "validate_flags": val.get("flags", []),
        "questions_expected": summary.get("questions_expected"),
        "questions_recovered": summary.get("questions_recovered"),
        # Questions the validator expected but did not find.
        "second_pass_slots": [q["label"] for q in sequence if not q["recovered"]],
    }
|
||||||
|
|
||||||
|
|
||||||
|
def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False):
    """Run the geometry (OCR-path) pipeline for paper ``p`` into ``FINAL/<slug>/``.

    Stages, in order: extract.py, furniture.py, bands.py, page_roles.py, template.py,
    validate.py, then (when ``overlays`` is true) two scripts/overlay.py passes: the template
    view, and a debug view restricted to pages 1-5. A failing stage is reported by ``run`` but
    does not abort the remaining stages.

    Args:
        p: manifest entry (needs ``slug``, ``docling``, ``pdf``; optional ``extract``,
            ``rapid``, ``board``, ``expected_max``, ``gt``).
        overlays: whether to render overlay images.
        gt_labels: optional ground-truth labels forwarded to ``stats_from``.
        prepare_ocr: when true, call ``ensure_rapid_cache`` first.

    Returns:
        ``(stats, output_dir)``.

    Raises:
        RuntimeError: if ``prepare_ocr`` is set and the OCR cache cannot be prepared.
    """
    d = os.path.join(FINAL, p["slug"])
    os.makedirs(d, exist_ok=True)
    S, F, B, R, T, V = (os.path.join(d, f) for f in
                        ("structured.json", "furniture.json", "bands.json", "page_roles.json",
                         "template.json", "validate.json"))
    if prepare_ocr and not ensure_rapid_cache(p):
        raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}")

    extract_args = p.get("extract")
    if not extract_args:
        # Default arguments. --rapid is only passed when the entry has a rapid glob, as for
        # bands.py below, so a missing/None value never ends up in the argv list.
        extract_args = ["--docling", p["docling"]]
        if p.get("rapid"):
            extract_args += ["--rapid", p["rapid"]]
        extract_args += ["--board", p.get("board", "aqa")]
    ex = ["extract.py"] + extract_args + ["--out", S]
    if p.get("pdf"):
        ex += ["--response-regions", p["pdf"]]
    if p.get("expected_max"):
        ex += ["--expected-max", str(p["expected_max"])]
    if p.get("gt"):
        ex += ["--gt", p["gt"]]
    run(ex)

    run(["furniture.py", p["docling"], "--out", F])

    bands = ["bands.py", S, "--docling", p["docling"], "--out", B]
    if p.get("rapid"):
        bands += ["--rapid", p["rapid"]]
    run(bands)

    run(["page_roles.py", p["docling"], "--bands", B, "--out", R])
    run(["template.py", "--structured", S, "--bands", B, "--furniture", F,
         "--page-roles", R, "--pdf", p["pdf"], "--out", T])
    run(["validate.py", S, "--out", V])

    if overlays:
        otpl = os.path.join(d, "overlays", "template")
        run(["scripts/overlay.py", S, p["pdf"], "--template", T, "--dpi", "120", "--out", otpl])
        # rich debug view on the first few pages (cover + early questions)
        odbg = os.path.join(d, "overlays", "debug")
        run(["scripts/overlay.py", S, p["pdf"], "--docling", p["docling"], "--bands", B,
             "--furniture", F, "--pages", "1,2,3,4,5", "--dpi", "120", "--out", odbg])
    return stats_from(jload(S), jload(V), gt_labels), d
|
||||||
|
|
||||||
|
|
||||||
|
def do_fast(p):
    """Run the born-digital fast path (extract.py --text, then validate.py) for paper ``p``.

    Writes structured.json and validate.json under ``FINAL/<slug>/`` and returns
    ``(stats, output_dir)``. ``--gt`` is passed to extract.py only when the entry has a ``gt``.
    """
    out_dir = os.path.join(FINAL, p["slug"])
    os.makedirs(out_dir, exist_ok=True)
    structured = os.path.join(out_dir, "structured.json")
    validated = os.path.join(out_dir, "validate.json")

    extract_cmd = ["extract.py", "--text", p["pdf"], "--out", structured]
    gt_path = p.get("gt")
    if gt_path:
        extract_cmd.extend(["--gt", gt_path])
    run(extract_cmd)
    run(["validate.py", structured, "--out", validated])
    return stats_from(jload(structured), jload(validated)), out_dir
|
||||||
|
|
||||||
|
|
||||||
|
def per_paper_report(p, s, d, kind):
    """Write the human-readable ``report.md`` for one paper into directory ``d``.

    Args:
        p: manifest entry (``title``, ``slug``, ``board``, ``level``).
        s: stats dict as produced by ``stats_from``.
        d: the paper's output directory.
        kind: pipeline-path description; the exact value ``"born-digital fast-path"`` selects
            the short artifact list without geometry files.

    Returns:
        Number of PNG files found (recursively) under ``<d>/overlays``.
    """
    n_imgs = len(glob.glob(os.path.join(d, "overlays", "**", "*.png"), recursive=True))

    marks = f"- **marks:** {s['marks_sum']}/{s['official_max']}"
    if s["marks_pct"] is not None:
        marks += f" ({s['marks_pct']}% of official max)"

    if s["coverage_pct"] is not None:
        coverage = f"- **coverage vs GT:** {s['coverage_pct']}%"
        if s.get("coverage_missed"):
            # Show at most the first eight missed labels.
            coverage += f" (missed {s['coverage_missed'][:8]})"
    else:
        coverage = "- **coverage vs GT:** n/a"

    lines = [
        f"# {p['title']}", "",
        f"- **slug:** `{p['slug']}` · **board:** {p['board']} · **level:** {p['level']} "
        f"· **path:** {kind}",
        f"- **questions/parts:** {s['n_questions']} / {s['n_parts']}",
        marks,
        coverage,
        f"- **G6 verdict:** {s['validate_verdict']}",
        f"- **answer-region count:** {s.get('answer_regions')}",
        f"- **opencv response regions:** {s.get('opencv_answer_regions')} attached / "
        f"{s.get('opencv_answer_region_candidates')} candidates",
    ]
    if s["validate_flags"]:
        lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]

    if kind != "born-digital fast-path":
        extra = (", `furniture.json`, `bands.json`, `page_roles.json`, `template.json`, "
                 f"`overlays/` ({n_imgs} images)")
    else:
        extra = " (born-digital: no page geometry → no overlays)"
    lines += ["", "**Artifacts:** `structured.json`, `validate.json`" + extra]

    # The report contains non-ASCII characters ("·", "→"): write UTF-8 explicitly instead of
    # the locale default, and close the handle deterministically.
    with open(os.path.join(d, "report.md"), "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")
    return n_imgs
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: run every selected paper, then write catalog.json and INDEX.md.

    ``--b1-only`` switches the geometry set to B1_GEOMETRY and skips the fast-path papers;
    ``--no-overlays`` disables overlay rendering; ``--prepare-ocr`` is forwarded to
    ``do_geometry``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--no-overlays", action="store_true")
    ap.add_argument("--b1-only", action="store_true", help="run only the Sprint B1 image-only OCR eval corpus")
    ap.add_argument("--prepare-ocr", action="store_true", help="populate missing B1 RapidOCR caches via dsync before running")
    a = ap.parse_args()

    os.makedirs(FINAL, exist_ok=True)
    catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
               "papers": []}
    total_imgs = 0

    gt_fixtures = load_gt_labels()
    geometry = B1_GEOMETRY if a.b1_only else GEOMETRY
    fast = [] if a.b1_only else FAST

    for p in geometry:
        print(f"[geometry] {p['slug']}")
        # Fixtures are keyed by gt_key when the entry has one, otherwise by slug.
        gt_labels = (gt_fixtures.get(p.get("gt_key") or p["slug"], {}) or {}).get("labels")
        s, d = do_geometry(p, not a.no_overlays, gt_labels=gt_labels, prepare_ocr=a.prepare_ocr)
        n = per_paper_report(p, s, d, p["path"])
        total_imgs += n
        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
                                  "kind": "geometry", "path": p["path"], "dir": d,
                                  "overlay_images": n, **s})

    for p in fast:
        print(f"[fast] {p['slug']}")
        s, d = do_fast(p)
        per_paper_report(p, s, d, "born-digital fast-path")
        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
                                  "kind": "fast", "path": "born-digital fast-path", "dir": d, **s})

    # Close (and therefore flush) the catalog before anything else reads the bundle.
    with open(os.path.join(FINAL, "catalog.json"), "w") as fh:
        json.dump(catalog, fh, indent=2)
    write_index(catalog, total_imgs)
    print(f"\n-> {len(catalog['papers'])} papers, {total_imgs} overlay images -> {FINAL}/")
|
||||||
|
|
||||||
|
|
||||||
|
def write_index(catalog, total_imgs):
    """Write the human-readable ``INDEX.md`` for the whole bundle into ``FINAL``.

    Emits one table for geometry papers and one for fast-path papers (split on each catalog
    entry's ``kind``), followed by a description of the per-paper directory layout.

    Args:
        catalog: dict with ``generated_at`` and ``papers`` (entries as built by ``main``).
        total_imgs: total overlay image count quoted in the header.
    """
    def pct(value):
        # Missing values render as a bare "n/a", never "n/a%" or "None%".
        return f"{value}%" if value is not None else "n/a"

    def marks(p):
        # The bracketed percentage is omitted when unknown, as in the per-paper report.
        text = f"{p['marks_sum']}/{p['official_max']}"
        if p["marks_pct"] is not None:
            text += f" ({p['marks_pct']}%)"
        return text

    g = [p for p in catalog["papers"] if p["kind"] == "geometry"]
    f = [p for p in catalog["papers"] if p["kind"] == "fast"]
    L = ["# Final corpus output — exam-extraction spike", "",
         f"Generated {catalog['generated_at']}. {len(catalog['papers'])} paper-runs across "
         f"3 boards × 2 levels, both pipeline paths; {total_imgs} overlay debug images.", "",
         "Each `<slug>/` holds the machine artifacts (JSON) + `report.md`; geometry papers also have "
         "`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
         "Machine catalog: `catalog.json`.", "",
         "## Image-only / OCR-path (with geometry + overlays)", "",
         "| Paper | Board / level | Q/parts | Marks/max | Coverage | Answer regions | G6 | Images |",
         "|---|---|---|---|---|---|---|---|"]
    for p in g:
        L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
                 f"{p['n_questions']}/{p['n_parts']} | {marks(p)} | {pct(p['coverage_pct'])} | "
                 f"{p.get('answer_regions')} | {p['validate_verdict']} | {p['overlay_images']} |")
    L += ["", "## Born-digital fast-path (CPU, no geometry)", "",
          "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
          "|---|---|---|---|---|---|"]
    for p in f:
        L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
                 f"{p['n_questions']}/{p['n_parts']} | {marks(p)} | {pct(p['coverage_pct'])} | "
                 f"{p['validate_verdict']} |")
    L += ["", "## Per-paper directory layout", "```",
          "<slug>/",
          " structured.json extract.py output (questions->parts->marks/bbox/regions)",
          " validate.json G6 consistency judge (confidence + flags)",
          " furniture.json recurring-furniture mask + content margins [geometry only]",
          " bands.json main + part y-bands [geometry only]",
          " page_roles.json per-page role + margin override [geometry only]",
          " template.json editable first-pass template (source/confirmed) [geometry only]",
          " overlays/template/ human-review view, all pages [geometry only]",
          " overlays/debug/ raw-detection view, sample pages [geometry only]",
          " report.md per-paper human summary", "```"]
    # The index contains non-ASCII characters ("—", "×"): write UTF-8 explicitly and close
    # the handle deterministically.
    with open(os.path.join(FINAL, "INDEX.md"), "w", encoding="utf-8") as fh:
        fh.write("\n".join(L) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: build the results/final bundle when executed directly.
if __name__ == "__main__":
    main()
|
||||||
356
api/services/docling/fixtures/b1_gt_labels.json
Normal file
356
api/services/docling/fixtures/b1_gt_labels.json
Normal file
@ -0,0 +1,356 @@
|
|||||||
|
{
|
||||||
|
"b1-aqa-biology-7402-1-2023jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "7402/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"89.6",
|
||||||
|
"08.1",
|
||||||
|
"08.2",
|
||||||
|
"08.3",
|
||||||
|
"08.4",
|
||||||
|
"09.1",
|
||||||
|
"09.2",
|
||||||
|
"09.3",
|
||||||
|
"09.4",
|
||||||
|
"09.5",
|
||||||
|
"09.6",
|
||||||
|
"10.1",
|
||||||
|
"10.2",
|
||||||
|
"10.3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-chemistry-7405-1-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "7405/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"05.6",
|
||||||
|
"05.7",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"06.6",
|
||||||
|
"06.7",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6",
|
||||||
|
"07.7",
|
||||||
|
"08.1",
|
||||||
|
"08.2",
|
||||||
|
"08.3",
|
||||||
|
"08.4",
|
||||||
|
"08.5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-physics-7408-1-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "7408/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"05.6",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"07.0",
|
||||||
|
"08.0",
|
||||||
|
"09.0",
|
||||||
|
"10.0",
|
||||||
|
"11.0",
|
||||||
|
"12.0",
|
||||||
|
"13.0",
|
||||||
|
"14.0",
|
||||||
|
"15.0",
|
||||||
|
"16.0",
|
||||||
|
"17.0",
|
||||||
|
"18.0",
|
||||||
|
"19.0",
|
||||||
|
"20.0",
|
||||||
|
"21.0",
|
||||||
|
"22.0",
|
||||||
|
"23.0",
|
||||||
|
"24.0",
|
||||||
|
"25.0",
|
||||||
|
"26.0",
|
||||||
|
"27.0",
|
||||||
|
"28.0",
|
||||||
|
"29.0",
|
||||||
|
"30.0",
|
||||||
|
"31.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-biology-8461-1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "8461/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"01.7",
|
||||||
|
"01.8",
|
||||||
|
"01.9",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"02.6",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6",
|
||||||
|
"07.7",
|
||||||
|
"07.8"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-chemistry-8462-1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "8462/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"01.7",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"02.6",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"04.6",
|
||||||
|
"04.7",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"06.6",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6",
|
||||||
|
"08.1",
|
||||||
|
"08.2",
|
||||||
|
"08.3",
|
||||||
|
"08.4",
|
||||||
|
"08.5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-combined-8464-b1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": null,
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"01.7",
|
||||||
|
"01.8",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"02.6",
|
||||||
|
"02.7",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"03.6",
|
||||||
|
"03.7",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"05.6",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-combined-8464-c1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": null,
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"03.0",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"04.6",
|
||||||
|
"04.7",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
119
api/services/docling/furniture.py
Normal file
119
api/services/docling/furniture.py
Normal file
@ -0,0 +1,119 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
furniture.py — detect recurring page chrome by cross-page repetition; derive content margins;
|
||||||
|
reclassify pictures (real figure vs barcode/QR/header furniture). The first-pass mask.
|
||||||
|
|
||||||
|
Principle: an item at ~the same (x,y) on many pages is **chrome, not question content**. This
|
||||||
|
needs no classifier — pure positional recurrence — and it solves the genuine gap the overlay
|
||||||
|
surfaced (the app-generated QR top-right and the foot barcode being mislabelled context_figure),
|
||||||
|
including the QR that bleeds past the margin. It also yields the content margins so stage-2 analysis
|
||||||
|
can be fed only the question/response region.
|
||||||
|
|
||||||
|
Outputs a mask + margins JSON, and an A/B summary (figure false-positives before vs after masking).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python furniture.py <docling_doc.json> [--freq 0.4] [--out results/furniture.json]
|
||||||
|
"""
|
||||||
|
import json, argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
# Cell size used by cell() to quantise bbox centres: each centre coordinate is divided by GRID
# and rounded, so items whose centres land in the same cell on different pages count as recurring.
GRID = 24  # pt — position quantisation; items sharing a cell across pages are "recurring"
|
||||||
|
|
||||||
|
|
||||||
|
def gather(doc):
    """Flatten the document's texts, pictures and tables into a list of positioned item dicts.

    Only the first provenance entry of each element is consulted. Elements whose first
    provenance entry lacks a truthy ``bbox`` or ``page_no`` (or that have no provenance at all)
    are dropped. Each returned dict carries ``page``, ``kind`` (singular section name),
    ``label`` (falls back to ``kind``), ``bbox`` and the first 40 characters of ``text``.
    """
    collected = []
    for section in ("texts", "pictures", "tables"):
        kind = section[:-1]  # singular form: "text" / "picture" / "table"
        for element in doc.get(section, []):
            provenance = element.get("prov") or []
            if not provenance:
                continue
            first = provenance[0]
            bbox = first.get("bbox")
            page = first.get("page_no")
            if not (bbox and page):
                continue
            collected.append({
                "page": page,
                "kind": kind,
                "label": element.get("label", kind),
                "bbox": bbox,
                "text": (element.get("text") or "")[:40],
            })
    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def cell(bb):
    """Return the (column, row) grid cell containing the centre of bounding box *bb*.

    *bb* is a mapping with ``l``, ``r``, ``t`` and ``b`` edges; the centre coordinates are
    divided by ``GRID`` and rounded to the nearest integer.
    """
    centre_x = (bb["l"] + bb["r"]) / 2
    centre_y = (bb["t"] + bb["b"]) / 2
    return (round(centre_x / GRID), round(centre_y / GRID))
|
||||||
|
|
||||||
|
|
||||||
|
def detect(items, n_pages, freq):
    """Mark items whose grid cell recurs across pages as furniture.

    Sets ``item["furniture"]`` in place on every item: True when the item's position cell is
    occupied on at least ``freq * n_pages`` distinct pages. Returns a dict mapping each such
    recurring cell to the number of pages it appears on.
    """
    pages_by_cell = defaultdict(set)
    for item in items:
        pages_by_cell[cell(item["bbox"])].add(item["page"])

    threshold = freq * n_pages
    recurring = {}
    for position, pages in pages_by_cell.items():
        if len(pages) >= threshold:
            recurring[position] = len(pages)

    for item in items:
        item["furniture"] = cell(item["bbox"]) in recurring
    return recurring
|
||||||
|
|
||||||
|
|
||||||
|
def content_margins(items):
    """Derive content margins from the items not flagged as furniture.

    Returns ``None`` when every item is furniture (or there are no items). Otherwise returns a
    dict with ``content_x_band`` (5th-percentile left edge and 95th-percentile right edge over
    all body items, rounded to 0.1) and ``per_page`` (the bounding extent of body items on each
    page, rounded to 0.1).
    """
    content = [entry for entry in items if not entry["furniture"]]
    if not content:
        return None

    left_edges = sorted(entry["bbox"]["l"] for entry in content)
    right_edges = sorted(entry["bbox"]["r"] for entry in content)
    low_idx = max(0, len(left_edges) // 20)  # 5th pct — robust to strays
    high_idx = min(len(right_edges) - 1, len(right_edges) * 19 // 20)
    x_band = {
        "x_left": round(left_edges[low_idx], 1),
        "x_right": round(right_edges[high_idx], 1),
    }

    boxes_by_page = defaultdict(list)
    for entry in content:
        boxes_by_page[entry["page"]].append(entry["bbox"])

    # "top" takes the largest t and "bottom" the smallest b, i.e. this presumably expects a
    # bottom-left coordinate origin — TODO confirm against the producer of the bboxes.
    page_extents = {
        page: {
            "top": round(max(box["t"] for box in boxes), 1),
            "bottom": round(min(box["b"] for box in boxes), 1),
            "left": round(min(box["l"] for box in boxes), 1),
            "right": round(max(box["r"] for box in boxes), 1),
        }
        for page, boxes in boxes_by_page.items()
    }
    return {"content_x_band": x_band, "per_page": page_extents}
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: build the furniture mask and A/B summary for one document.

    Loads the document JSON named on the command line, flags recurring-position items as
    furniture, derives content margins from the remaining items, writes the result as JSON to
    ``--out`` (creating its parent directory if needed) and prints a before/after summary of
    picture counts.
    """
    import os  # local import: only needed here, to create the output directory

    ap = argparse.ArgumentParser()
    ap.add_argument("doc")
    ap.add_argument("--freq", type=float, default=0.40, help="recurrence fraction => furniture")
    ap.add_argument("--out", default="results/furniture.json")
    a = ap.parse_args()

    # Context manager so the input handle is closed deterministically instead of leaking.
    with open(a.doc, encoding="utf-8") as fh:
        doc = json.load(fh)
    items = gather(doc)
    # Page count = number of distinct pages that carry at least one positioned item.
    n_pages = len({it["page"] for it in items})
    fcells = detect(items, n_pages, a.freq)
    margins = content_margins(items)

    pics = [it for it in items if it["kind"] == "picture"]
    pics_furn = [it for it in pics if it["furniture"]]
    txt_furn = [it for it in items if it["kind"] == "text" and it["furniture"]]
    # break furniture pictures down by cell (which recurring object)
    by_cell = defaultdict(list)
    for it in pics_furn:
        by_cell[cell(it["bbox"])].append(it)

    result = {
        "n_pages": n_pages, "freq_threshold": a.freq,
        "furniture_cells": {f"{c[0]},{c[1]}": n for c, n in sorted(fcells.items())},
        "content_margins": margins,
        "ab_test_figures": {
            "context_figure_before_mask": len(pics),
            "context_figure_after_mask": len(pics) - len(pics_furn),
            "removed_as_furniture": len(pics_furn),
            "removed_breakdown": {f"cell {c[0]},{c[1]}": len(v) for c, v in sorted(by_cell.items())},
        },
        "text_furniture_removed": len(txt_furn),
        "items": items,  # each carries furniture flag — consumed by overlay.py --furniture
    }

    # The default --out lives under results/, which may not exist on a fresh checkout.
    out_dir = os.path.dirname(a.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Context manager guarantees the JSON is flushed and the handle closed before we report.
    with open(a.out, "w", encoding="utf-8") as fh:
        json.dump(result, fh)

    ab = result["ab_test_figures"]
    print(f"pages {n_pages} freq>={a.freq} furniture cells: {result['furniture_cells']}")
    print(f"content x-band: {margins['content_x_band'] if margins else None}")
    print(f"\nA/B — figure (picture) classification:")
    print(f" context_figure BEFORE mask : {ab['context_figure_before_mask']}")
    print(f" context_figure AFTER mask : {ab['context_figure_after_mask']}")
    print(f" removed as furniture : {ab['removed_as_furniture']} {ab['removed_breakdown']}")
    print(f" text furniture removed : {result['text_furniture_removed']} (page numbers / 'Turn over' / headers)")
    print(f"-> wrote {a.out}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
88
api/services/docling/page_roles.py
Normal file
88
api/services/docling/page_roles.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
page_roles.py — tag every page with a structural role (the first-pass page-layout pass).
|
||||||
|
|
||||||
|
Roles: cover / question / continuation / blank / appendix. Drives two things in the template:
|
||||||
|
* the human sees the paper's shape (which pages are non-question), and
|
||||||
|
* MARGINS are disabled on pages that have no content column (cover, blank) — the override the
|
||||||
|
user asked for ("the front page doesn't have margins").
|
||||||
|
|
||||||
|
Signals (deterministic, no GPU): per-page non-space char count, cover/boilerplate keywords, and
|
||||||
|
whether the page carries a question band. Output feeds template.py via --page-roles.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python page_roles.py <docling_doc.json> --bands <bands.json> [--out results/page_roles/x.json]
|
||||||
|
"""
|
||||||
|
import json, argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
BLANK_MAX = 130  # non-space chars at/below which a page is boilerplate-only (blank)
# Keyword tuples below are lower-case substrings; tag() tests them against each page's
# lower-cased text. COVER_KW only applies to pages before the first question page.
COVER_KW = ("time allowed", "instructions", "materials", "information for")
# BLANK_KW marks a page blank only when the page also has fewer than 300 non-space chars.
BLANK_KW = ("blank page", "no questions printed", "no questions are printed")
APPENDIX_KW = ("data sheet", "formula", "periodic table", "insert", "resource booklet")

# pages where there is no content column -> margins do not apply (the user's override case)
NO_MARGIN_ROLES = {"cover", "blank"}
|
||||||
|
|
||||||
|
|
||||||
|
def page_text(doc):
    """Collect per-page text statistics from the document's ``texts`` entries.

    Returns ``(chars, blob)``. ``chars`` is a ``defaultdict(int)`` mapping page number to the
    count of non-whitespace characters on that page; ``blob`` is a plain dict mapping page
    number to the page's lower-cased text fragments joined with single spaces. Entries whose
    first provenance record has no truthy ``page_no`` are ignored.
    """
    char_counts = defaultdict(int)
    fragments = defaultdict(list)
    for item in doc.get("texts", []):
        provenance = item.get("prov") or []
        if not provenance:
            continue
        page = provenance[0].get("page_no")
        if not page:
            continue
        text = item.get("text") or ""
        char_counts[page] += sum(not ch.isspace() for ch in text)
        fragments[page].append(text.lower())
    joined = {}
    for page, pieces in fragments.items():
        joined[page] = " ".join(pieces)
    return char_counts, joined
|
||||||
|
|
||||||
|
|
||||||
|
def tag(doc, qpages):
    """Assign a structural role to every page from 1 to the highest known page number.

    *qpages* is the collection of page numbers that carry a question band. The highest page is
    the maximum over pages with text, *qpages*, and 1. Rules are tried in order: question page,
    cover (before the first question and containing a cover keyword), blank (few characters, or
    a blank-page keyword with under 300 characters), appendix keyword, continuation (inside the
    question range), otherwise appendix. Returns ``{page: {...}}`` with the role, character
    count, whether margins apply, and ``source``/``confirmed`` bookkeeping fields.
    """
    char_counts, page_blobs = page_text(doc)
    page_total = max([*char_counts, *qpages, 1])
    if qpages:
        first_q, last_q = min(qpages), max(qpages)
    else:
        # No question pages: make "before first question" true everywhere and the
        # question range empty.
        first_q, last_q = page_total + 1, 0

    def classify(page):
        text = page_blobs.get(page, "")
        count = char_counts[page]

        def mentions(keywords):
            return any(word in text for word in keywords)

        if page in qpages:
            return "question"
        if page < first_q and mentions(COVER_KW):
            return "cover"  # before blank: the cover's instructions mention "blank"
        if count <= BLANK_MAX or (mentions(BLANK_KW) and count < 300):
            return "blank"
        if mentions(APPENDIX_KW):
            return "appendix"
        if first_q <= page <= last_q:
            return "continuation"  # no question label but inside the question range
        return "appendix"  # content outside the question range (end-matter/insert)

    roles = {}
    for page in range(1, page_total + 1):
        role = classify(page)
        roles[page] = {
            "role": role,
            "chars": char_counts[page],
            "margins_enabled": role not in NO_MARGIN_ROLES,
            "source": "auto",
            "confirmed": False,
        }
    return roles
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: tag page roles for one document and write them as JSON.

    Reads the question-band file given by ``--bands`` (its ``pages`` entries are converted to
    integer page numbers), tags every page of the document, writes ``{"pages": roles}`` to
    ``--out`` (creating its parent directory if needed) and prints a role tally plus one line
    per non-question page.
    """
    import os  # local import: only needed here, to create the output directory
    from collections import Counter

    ap = argparse.ArgumentParser()
    ap.add_argument("doc")
    ap.add_argument("--bands", required=True)
    ap.add_argument("--out", default="results/page_roles.json")
    a = ap.parse_args()

    # Context managers so none of the three file handles is left for the GC to close.
    with open(a.bands, encoding="utf-8") as fh:
        bands = json.load(fh)
    qpages = {int(p) for p in bands["pages"]}
    with open(a.doc, encoding="utf-8") as fh:
        roles = tag(json.load(fh), qpages)

    # The default --out lives under results/, which may not exist on a fresh checkout.
    out_dir = os.path.dirname(a.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(a.out, "w", encoding="utf-8") as fh:
        json.dump({"pages": roles}, fh, indent=2)

    c = Counter(v["role"] for v in roles.values())
    print(f"roles: {dict(c)}")
    for pg in sorted(roles):
        r = roles[pg]
        flag = "" if r["margins_enabled"] else " (no margins)"
        if r["role"] != "question":
            print(f" p{pg:2d}: {r['role']:12s} chars={r['chars']}{flag}")
    print(f"-> wrote {a.out}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
435
api/services/docling/regions.py
Normal file
435
api/services/docling/regions.py
Normal file
@ -0,0 +1,435 @@
|
|||||||
|
"""OpenCV response-region detector for exam template auto-map.
|
||||||
|
|
||||||
|
This module is intentionally a best-effort spike. It detects visual writing
|
||||||
|
areas (ruled answer lines and rectangular answer boxes) from rendered exam PDF
|
||||||
|
pages and returns mapper-friendly candidate dictionaries. The caller may ignore
|
||||||
|
this output entirely; manual drawing remains the fallback.
|
||||||
|
|
||||||
|
Candidate schema (``detect_response_regions_from_pdf`` return item)::
|
||||||
|
|
||||||
|
{
|
||||||
|
"kind": "response",
|
||||||
|
"source": "ai",
|
||||||
|
"confirmed": False,
|
||||||
|
"confidence": 0.0..1.0,
|
||||||
|
"page_index": 0, # zero-based PDF page index
|
||||||
|
"bbox": { # rendered-page pixel coordinates
|
||||||
|
"x": 72.0, "y": 210.0,
|
||||||
|
"w": 420.0, "h": 86.0,
|
||||||
|
"coord_origin": "TOPLEFT",
|
||||||
|
"unit": "px",
|
||||||
|
},
|
||||||
|
"region_type": "answer_lines" | "answer_box" | "working_space",
|
||||||
|
"detection_method": "opencv_horizontal_lines" | "opencv_contour_box",
|
||||||
|
"line_count": 3, # answer_lines only
|
||||||
|
"meta": {...},
|
||||||
|
}
|
||||||
|
|
||||||
|
The mapper can persist these as ``exam_response_areas`` with
|
||||||
|
``kind='response'``, ``source='ai'``, ``confirmed=false`` after converting the
|
||||||
|
rendered-page pixel bbox into the app's canvas coordinate system if needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterable
|
||||||
|
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
try: # OpenCV is an optional runtime dependency until S5 wires regions in.
|
||||||
|
import cv2
|
||||||
|
except ImportError as exc: # pragma: no cover - exercised only in underbuilt envs
|
||||||
|
cv2 = None # type: ignore[assignment]
|
||||||
|
_CV2_IMPORT_ERROR = exc
|
||||||
|
else: # pragma: no cover - trivial branch
|
||||||
|
_CV2_IMPORT_ERROR = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class RegionCandidate:
    """Internal typed candidate before dict serialization."""

    # Zero-based page index the candidate was detected on.
    page_index: int
    # Bounding box; serialised by to_mapper_dict() as top-left-origin pixel coordinates.
    x: float
    y: float
    w: float
    h: float
    # One of the region types listed in the module docstring (e.g. "answer_lines").
    region_type: str
    # Detector score; serialised rounded to 3 decimals.
    confidence: float
    # Name of the detector that produced the candidate.
    detection_method: str
    # Number of ruled lines in the group; only set for line-based candidates.
    line_count: int | None = None
    # Optional detector-specific extras, emitted under "meta" when non-empty.
    meta: dict[str, Any] | None = None

    def to_mapper_dict(self) -> dict[str, Any]:
        """Serialise the candidate into the mapper-friendly dict described in the module docstring.

        ``line_count`` and ``meta`` are only included when set / non-empty. The returned
        ``meta`` is a shallow copy, so callers that add keys to it do not mutate this frozen
        candidate's own dict.
        """
        candidate: dict[str, Any] = {
            "kind": "response",
            "source": "ai",
            "confirmed": False,
            "confidence": round(float(self.confidence), 3),
            "page_index": int(self.page_index),
            "bbox": {
                "x": round(float(self.x), 2),
                "y": round(float(self.y), 2),
                "w": round(float(self.w), 2),
                "h": round(float(self.h), 2),
                "coord_origin": "TOPLEFT",
                "unit": "px",
            },
            "region_type": self.region_type,
            "detection_method": self.detection_method,
        }
        if self.line_count is not None:
            candidate["line_count"] = int(self.line_count)
        if self.meta:
            # Copy rather than alias: the instance is frozen and its meta must stay untouched
            # when the caller enriches the serialised dict.
            candidate["meta"] = dict(self.meta)
        return candidate
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class _LineSegment:
    """Bounding rectangle of one detected horizontal stroke, in integer pixel units."""

    x: int
    y: int
    w: int
    h: int

    @property
    def right(self) -> int:
        """X coordinate of the segment's right edge (``x + w``)."""
        return self.w + self.x

    @property
    def center_y(self) -> float:
        """Vertical midpoint of the segment."""
        half_height = self.h / 2
        return self.y + half_height
|
||||||
|
|
||||||
|
|
||||||
|
def detect_response_regions_from_pdf(
    pdf_path: str | Path,
    *,
    dpi: int = 144,
    max_pages: int | None = None,
    page_indices: Iterable[int] | None = None,
    min_confidence: float = 0.35,
) -> list[dict[str, Any]]:
    """Render a PDF and emit response-area candidate dictionaries.

    Args:
        pdf_path: Local PDF path.
        dpi: Render resolution. 144 dpi gives 2 px per PDF point and is a good
            speed/geometry compromise for the API fast path.
        max_pages: Optional first-N-pages cap for smoke tests/spikes.
        page_indices: Optional explicit zero-based page indices. When supplied,
            ``max_pages`` is ignored. Indices outside the document are skipped.
        min_confidence: Drop candidates below this confidence.

    Returns:
        List of mapper-friendly dictionaries documented in the module docstring.
        Each item's ``meta`` additionally carries the rendered page size in pixels,
        the page size in PDF units and the render DPI.

    Raises:
        RuntimeError: OpenCV is not installed.
        ValueError: ``dpi`` is not positive or ``min_confidence`` is outside [0, 1].
        FileNotFoundError: ``pdf_path`` does not exist.
    """

    if cv2 is None:
        raise RuntimeError(
            "OpenCV is required for answer-region detection; install "
            "opencv-python-headless."
        ) from _CV2_IMPORT_ERROR

    if dpi <= 0:
        raise ValueError("dpi must be positive")
    if not 0 <= min_confidence <= 1:
        raise ValueError("min_confidence must be between 0 and 1")

    path = Path(pdf_path)
    if not path.exists():
        raise FileNotFoundError(path)

    doc = fitz.open(path)
    try:
        page_count = len(doc)
        if page_indices is None:
            pages = range(page_count if max_pages is None else min(page_count, max_pages))
        else:
            pages = list(page_indices)
        candidates: list[dict[str, Any]] = []
        zoom = dpi / 72.0  # PDF user space is 72 units per inch
        matrix = fitz.Matrix(zoom, zoom)
        for page_index in pages:
            if page_index < 0 or page_index >= page_count:
                continue
            # Load the page once per page rather than once per candidate.
            page = doc[page_index]
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            render_meta = {
                "page_width_px": pix.width,
                "page_height_px": pix.height,
                "page_width_pdf": float(page.rect.width),
                "page_height_pdf": float(page.rect.height),
                "render_dpi": dpi,
            }
            page_candidates = detect_response_regions_from_image(
                image,
                page_index=page_index,
                min_confidence=min_confidence,
            )
            for candidate in page_candidates:
                item = candidate.to_mapper_dict()
                # Build a fresh meta dict instead of updating in place: the serialised meta
                # may be the very dict held by the frozen candidate.
                item["meta"] = {**item.get("meta", {}), **render_meta}
                candidates.append(item)
        return candidates
    finally:
        doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def detect_response_regions_from_image(
    image: Image.Image | np.ndarray,
    *,
    page_index: int = 0,
    min_confidence: float = 0.35,
) -> list[RegionCandidate]:
    """Detect response-area candidates on one rendered page image.

    Runs the ruled-line and box detectors on an ink mask of the page, de-duplicates their
    combined output and returns the candidates whose confidence is at least
    ``min_confidence``.

    Raises:
        RuntimeError: OpenCV is not installed.
        ValueError: ``min_confidence`` is outside [0, 1].
    """

    if cv2 is None:
        raise RuntimeError(
            "OpenCV is required for answer-region detection; install "
            "opencv-python-headless."
        ) from _CV2_IMPORT_ERROR
    if not 0 <= min_confidence <= 1:
        raise ValueError("min_confidence must be between 0 and 1")

    rgb = _as_rgb_array(image)
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    ink = _ink_mask(gray)
    height, width = gray.shape[:2]

    detector_args = {"page_index": page_index, "width": width, "height": height}
    from_lines = _detect_answer_lines(ink, **detector_args)
    from_boxes = _detect_answer_boxes(ink, **detector_args)
    merged = _dedupe_candidates(from_lines + from_boxes)
    return [found for found in merged if found.confidence >= min_confidence]
|
||||||
|
|
||||||
|
|
||||||
|
def _as_rgb_array(image: Image.Image | np.ndarray) -> np.ndarray:
    """Coerce a PIL image or array into an array suitable for RGB processing.

    PIL images are converted to RGB. 2-D arrays are replicated into three channels, arrays
    whose last axis has four entries lose the fourth, and any other array is returned as-is.
    """
    if isinstance(image, Image.Image):
        return np.asarray(image.convert("RGB"))
    pixels = np.asarray(image)
    if pixels.ndim == 2:
        return np.stack((pixels, pixels, pixels), axis=-1)
    return pixels[:, :, :3] if pixels.shape[-1] == 4 else pixels
|
||||||
|
|
||||||
|
|
||||||
|
def _ink_mask(gray: np.ndarray) -> np.ndarray:
    """Return a binary mask where printed dark ink is 255."""

    smoothed = cv2.GaussianBlur(gray, (3, 3), 0)
    # Mean adaptive threshold with inverted output, so dark pixels become the foreground.
    neighbourhood = 31
    offset = 12
    mask = cv2.adaptiveThreshold(
        smoothed, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, neighbourhood, offset
    )
    return mask
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_answer_lines(binary: np.ndarray, *, page_index: int, width: int, height: int) -> list[RegionCandidate]:
    """Find groups of long horizontal rules and return them as ``answer_lines`` candidates.

    *binary* is the ink mask; *width* and *height* are the page size in pixels. Each group of
    vertically adjacent rules becomes one candidate whose box is padded to cover the writing
    band around the strokes.
    """
    # Long horizontal strokes are answer lines. A wide, one-pixel-tall opening kernel removes
    # text while retaining ruled lines; every threshold scales with the page so it works
    # across A4/letter and DPI values.
    min_line_width = max(80, int(width * 0.22))
    max_line_height = max(10, int(height * 0.012))
    opening_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (max(30, int(width * 0.08)), 1))
    strokes = cv2.morphologyEx(binary, cv2.MORPH_OPEN, opening_kernel, iterations=1)
    contours, _ = cv2.findContours(strokes, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    top_limit = height * 0.05
    bottom_limit = height * 0.96
    found: list[_LineSegment] = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        too_short = w < min_line_width
        too_thick = h > max_line_height
        # Ignore page borders / header separator lines.
        near_page_edge = y < top_limit or y > bottom_limit
        if too_short or too_thick or near_page_edge:
            continue
        found.append(_LineSegment(x=x, y=y, w=w, h=max(h, 1)))

    if not found:
        return []

    ordered = sorted(found, key=lambda seg: (seg.center_y, seg.x))
    results: list[RegionCandidate] = []
    for group in _group_line_segments(ordered, width=width, height=height):
        if not group:
            continue
        line_count = len(group)
        left = min(seg.x for seg in group)
        right = max(seg.right for seg in group)
        top = min(seg.y for seg in group)
        bottom = max(seg.y + seg.h for seg in group)

        # Expand the vertical extent so it covers the student-writing band, not just the thin
        # strokes. A single underline gets a modest band above the line; multi-line answers
        # cover the lines plus inter-line whitespace.
        if line_count == 1:
            pad_top = max(18, int(height * 0.018))
            pad_bottom = max(8, int(height * 0.008))
        else:
            centres = [seg.center_y for seg in group]
            gaps = [lower - upper for upper, lower in zip(centres, centres[1:])]
            median_gap = float(np.median(gaps)) if gaps else height * 0.025
            pad_top = max(10, int(median_gap * 0.45))
            pad_bottom = max(8, int(median_gap * 0.35))

        box_x = max(0, left - 4)
        box_y = max(0, top - pad_top)
        box_w = min(width, right + 4) - box_x
        box_h = min(height, bottom + pad_bottom) - box_y
        if box_w <= 0 or box_h <= 0:
            continue

        # Confidence grows with how much of the page width the group spans and with the
        # number of lines, capped at 0.92.
        span_ratio = box_w / max(width, 1)
        count_bonus = min(0.2, max(0, line_count - 1) * 0.05)
        confidence = min(0.92, 0.42 + span_ratio * 0.35 + count_bonus)
        segment_dump = [{"x": s.x, "y": s.y, "w": s.w, "h": s.h} for s in group]
        results.append(
            RegionCandidate(
                page_index=page_index,
                x=box_x,
                y=box_y,
                w=box_w,
                h=box_h,
                region_type="answer_lines",
                confidence=confidence,
                detection_method="opencv_horizontal_lines",
                line_count=line_count,
                meta={"line_segments": segment_dump},
            )
        )
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _group_line_segments(segments: list[_LineSegment], *, width: int, height: int) -> list[list[_LineSegment]]:
    """Split an ordered list of segments into runs of vertically adjacent, x-aligned lines.

    A segment joins the current run when its centre lies between 2 px and the maximum gap
    below the previous segment's centre and the two are horizontally similar (sufficient
    overlap relative to the narrower one, or left edges close together). Otherwise it starts
    a new run. Segments are compared only with the one immediately before them, so the input
    order determines the grouping.
    """
    max_gap = max(28, int(height * 0.045))
    min_x_overlap_ratio = 0.35
    left_edge_tolerance = width * 0.08

    runs: list[list[_LineSegment]] = []
    for segment in segments:
        if runs:
            previous = runs[-1][-1]
            y_gap = segment.center_y - previous.center_y
            overlap = max(0, min(segment.right, previous.right) - max(segment.x, previous.x))
            narrower = max(1, min(segment.w, previous.w))
            similar_x = (
                overlap / narrower >= min_x_overlap_ratio
                or abs(segment.x - previous.x) < left_edge_tolerance
            )
            if 2 <= y_gap <= max_gap and similar_x:
                runs[-1].append(segment)
                continue
        runs.append([segment])
    return runs
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_answer_boxes(binary: np.ndarray, *, page_index: int, width: int, height: int) -> list[RegionCandidate]:
    """Find box-like outlines in the ink mask and return them as box candidates.

    *binary* is the ink mask; *width* and *height* are the page size in pixels. Contours are
    filtered by area, minimum size, vertical position, aspect ratio and how much of their
    bounding rectangle they fill. Survivors are labelled ``working_space`` when tall and
    sparsely filled, otherwise ``answer_box``.
    """
    # Close gaps in ruled rectangles, then contour them. This catches table-like working
    # boxes and explicit answer boxes without trying to understand text.
    closing_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    sealed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, closing_kernel, iterations=1)
    contours, _ = cv2.findContours(sealed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    page_area = width * height
    min_area = page_area * 0.003
    max_area = page_area * 0.55
    results: list[RegionCandidate] = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        area = w * h
        if not (min_area <= area <= max_area):
            continue
        if w < width * 0.16 or h < height * 0.025:
            continue
        if y < height * 0.04 or y + h > height * 0.98:
            continue
        if w / max(h, 1) < 1.2:
            continue

        # Fraction of the bounding rectangle covered by the contour itself.
        rectangularity = min(1.0, cv2.contourArea(contour) / max(area, 1))
        if rectangularity < 0.03:
            continue

        confidence = min(0.88, 0.46 + min(0.24, w / width * 0.24) + min(0.18, h / height * 0.5))
        if h > height * 0.12 and rectangularity < 0.18:
            region_type = "working_space"
        else:
            region_type = "answer_box"

        # Pad by 2 px on every side, clamped to the page.
        left = max(0, x - 2)
        top = max(0, y - 2)
        right = min(width, x + w + 2)
        bottom = min(height, y + h + 2)
        results.append(
            RegionCandidate(
                page_index=page_index,
                x=left,
                y=top,
                w=right - left,
                h=bottom - top,
                region_type=region_type,
                confidence=confidence,
                detection_method="opencv_contour_box",
                meta={"rectangularity": round(float(rectangularity), 3)},
            )
        )
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _dedupe_candidates(candidates: list[RegionCandidate]) -> list[RegionCandidate]:
    """Remove lower-confidence candidates that substantially overlap.

    Candidates are visited from highest to lowest confidence; one is kept only if its IoU with
    every already-kept candidate is below 0.55. The survivors are returned ordered by page,
    then top edge, then left edge.
    """

    survivors: list[RegionCandidate] = []
    by_confidence = sorted(candidates, key=lambda c: c.confidence, reverse=True)
    for candidate in by_confidence:
        for existing in survivors:
            if not _iou(candidate, existing) < 0.55:
                break
        else:
            survivors.append(candidate)
    return sorted(survivors, key=lambda c: (c.page_index, c.y, c.x))
|
||||||
|
|
||||||
|
|
||||||
|
def _iou(a: RegionCandidate, b: RegionCandidate) -> float:
    """Return the intersection-over-union of two candidates' boxes.

    Candidates on different pages never overlap (0.0), and a non-positive union also
    yields 0.0.
    """
    if a.page_index != b.page_index:
        return 0.0
    left = max(a.x, b.x)
    top = max(a.y, b.y)
    right = min(a.x + a.w, b.x + b.w)
    bottom = min(a.y + a.h, b.y + b.h)
    overlap_w = max(0.0, right - left)
    overlap_h = max(0.0, bottom - top)
    intersection = overlap_w * overlap_h
    union = a.w * a.h + b.w * b.h - intersection
    if union > 0:
        return intersection / union
    return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Small CLI for smoke testing: python -m api.services.docling.regions PDF.

    Prints the detected candidates for the given PDF as indented JSON on stdout.
    """

    import argparse
    import json

    cli = argparse.ArgumentParser(description="Detect answer-region candidates in an exam PDF")
    cli.add_argument("pdf", help="PDF path")
    cli.add_argument("--dpi", type=int, default=144)
    cli.add_argument("--max-pages", type=int, default=None)
    cli.add_argument("--min-confidence", type=float, default=0.35)
    options = cli.parse_args()

    regions = detect_response_regions_from_pdf(
        options.pdf,
        dpi=options.dpi,
        max_pages=options.max_pages,
        min_confidence=options.min_confidence,
    )
    print(json.dumps(regions, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__": # pragma: no cover
|
||||||
|
main()
|
||||||
0
api/services/docling/scripts/__init__.py
Normal file
0
api/services/docling/scripts/__init__.py
Normal file
87
api/services/docling/scripts/fetch_b1_corpus.py
Normal file
87
api/services/docling/scripts/fetch_b1_corpus.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Populate the gitignored B1 image-only eval corpus from the .94 exam-board store.
|
||||||
|
|
||||||
|
The B1 eval papers are NOT committed (third-party copyright; served only via signed URLs).
|
||||||
|
This script downloads each B1_GEOMETRY paper's `storage_loc` object from cc.examboards via the
|
||||||
|
Storage API into its local `pdf` path (under samples/b1/), so finalize.py --b1-only and the
|
||||||
|
B1-2/B1-3 generalization work can run against a real corpus.
|
||||||
|
|
||||||
|
Run from api/services/docling/ inside the cc-api-dev container (SUPABASE_URL/SERVICE_ROLE_KEY in env):
|
||||||
|
python3 scripts/fetch_b1_corpus.py # fetch all B1 papers (skip existing)
|
||||||
|
python3 scripts/fetch_b1_corpus.py --force # re-download
|
||||||
|
python3 scripts/fetch_b1_corpus.py --only b1-aqa-physics-7408-1-2022jun
|
||||||
|
python3 scripts/fetch_b1_corpus.py --list # show what would be fetched, no download
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Import the canonical B1 corpus definition (slug, storage_loc, local pdf path) from finalize.
|
||||||
|
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
_DOCLING_DIR = os.path.dirname(_HERE)
|
||||||
|
sys.path.insert(0, _DOCLING_DIR)
|
||||||
|
from finalize import B1_GEOMETRY # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def _split_storage_loc(storage_loc: str) -> tuple[str, str]:
    """'cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf' -> ('cc.examboards', 'aqa/.../qp.pdf').

    Splits at the first ``/``. Raises ``ValueError`` when either the bucket or the object
    path would be empty (including when there is no ``/`` at all).
    """
    slash = storage_loc.find("/")
    if slash < 0:
        bucket, path = storage_loc, ""
    else:
        bucket, path = storage_loc[:slash], storage_loc[slash + 1:]
    if not (bucket and path):
        raise ValueError(f"malformed storage_loc: {storage_loc!r}")
    return bucket, path
|
||||||
|
|
||||||
|
|
||||||
|
def _entries(only: str | None):
    """Yield ``(slug, storage_loc, pdf)`` for each fetchable paper in ``B1_GEOMETRY``.

    Papers missing a ``storage_loc`` or ``pdf`` value are skipped. When *only* is given,
    just the paper whose slug equals it is yielded.
    """
    for paper in B1_GEOMETRY:
        loc, pdf = paper.get("storage_loc"), paper.get("pdf")
        if not (loc and pdf):
            continue
        wanted = not only or paper.get("slug") == only
        if wanted:
            yield paper["slug"], loc, pdf
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Fetch the selected B1 papers into their local paths and return a process exit code.

    Returns 1 when no paper matches the selection, 0 otherwise. With ``--list`` the selection
    is printed and nothing is downloaded. Existing local files are skipped unless ``--force``
    is given. Each download is written to a temporary ``.part`` file and renamed into place,
    so a failed write never leaves a truncated PDF that a later run would skip as existing.
    """
    ap = argparse.ArgumentParser(description="Fetch the B1 image-only eval corpus from .94 cc.examboards")
    ap.add_argument("--force", action="store_true", help="re-download even if the local file exists")
    ap.add_argument("--only", help="fetch a single paper by slug")
    ap.add_argument("--list", action="store_true", help="list what would be fetched and exit")
    args = ap.parse_args()

    todo = list(_entries(args.only))
    if not todo:
        print("no matching B1 papers", file=sys.stderr)
        return 1

    if args.list:
        for slug, loc, pdf in todo:
            print(f"{slug}\t{loc}\t-> {pdf}")
        return 0

    # Imported late so --list and the no-match path work without the storage client.
    from modules.database.supabase.utils.storage import StorageAdmin
    storage = StorageAdmin()

    ok = skipped = 0
    for slug, loc, pdf in todo:
        # Relative pdf paths are resolved against the docling service directory.
        dest = os.path.join(_DOCLING_DIR, pdf) if not os.path.isabs(pdf) else pdf
        if os.path.exists(dest) and not args.force:
            print(f"[skip] {slug} (exists)")
            skipped += 1
            continue
        bucket, path = _split_storage_loc(loc)
        data = storage.download_file(bucket, path)
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        # Write beside the destination and rename, so dest only ever holds a complete file.
        tmp = dest + ".part"
        try:
            with open(tmp, "wb") as fh:
                fh.write(data)
            os.replace(tmp, dest)
        finally:
            # After a successful rename tmp no longer exists; on failure remove the leftover.
            if os.path.exists(tmp):
                os.remove(tmp)
        print(f"[ok] {slug} <- {bucket}/{path} ({len(data)} bytes)")
        ok += 1

    print(f"fetched {ok}, skipped {skipped}, of {len(todo)}")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
raise SystemExit(main())
|
||||||
32
api/services/docling/scripts/make_b1_gt.py
Normal file
32
api/services/docling/scripts/make_b1_gt.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
import json
import sys
from pathlib import Path

# Fixed location of the docling service; the sample paths below are relative to it.
base = Path('/app/api/services/docling')
sys.path.insert(0, str(base))
import extract

# One entry per paper: (slug, PDF path relative to `base`, storage path recorded as `source_pdf`).
papers = [
    ('b1-aqa-biology-7402-1-2023jun',
     'samples/b1/aqa-biology-7402-1-2023jun.pdf',
     'cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf'),
    ('b1-aqa-chemistry-7405-1-2022jun',
     'samples/b1/aqa-chemistry-7405-1-2022jun.pdf',
     'cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf'),
    ('b1-aqa-physics-7408-1-2022jun',
     'samples/b1/aqa-physics-7408-1-2022jun.pdf',
     'cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf'),
    ('b1-aqa-biology-8461-1h-2022jun',
     'samples/b1/aqa-biology-8461-1h-2022jun.pdf',
     'cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf'),
    ('b1-aqa-chemistry-8462-1h-2022jun',
     'samples/b1/aqa-chemistry-8462-1h-2022jun.pdf',
     'cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf'),
    ('b1-aqa-combined-8464-b1h-2022jun',
     'samples/b1/aqa-combined-8464-b1h-2022jun.pdf',
     'cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf'),
    ('b1-aqa-combined-8464-c1h-2022jun',
     'samples/b1/aqa-combined-8464-c1h-2022jun.pdf',
     'cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf'),
]

# Build the ground-truth label set for every paper; abort on the first paper not detected as AQA.
out = {}
for slug, rel, storage in papers:
    lines = extract.lines_from_pdftext(str(base / rel))
    board, code = extract.detect_board(lines)
    if board != 'aqa':
        raise RuntimeError(f'{slug}: expected AQA board, detected {board!r} ({code!r})')
    labels = list(extract.parse_text_by_board(lines, board))
    out[slug] = dict(
        source_pdf=storage,
        source_method='AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.',
        board_detected=board,
        paper_code_detected=code,
        labels=labels,
    )
    print(slug, board, code, len(labels), labels[:5], labels[-5:])

# Persist the fixture next to the service code (pretty-printed, trailing newline).
fixtures_dir = base / 'fixtures'
fixtures_dir.mkdir(exist_ok=True)
(fixtures_dir / 'b1_gt_labels.json').write_text(json.dumps(out, indent=2) + "\n")
|
||||||
310
api/services/docling/scripts/overlay.py
Normal file
310
api/services/docling/scripts/overlay.py
Normal file
@ -0,0 +1,310 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
overlay.py — human-viewable debug visualisation: draw the extractor's geometry over the rendered
|
||||||
|
exam page. Shows WHERE each question/part label was located and where Docling regions
|
||||||
|
(figures/tables/MCQ checkboxes) sit, so a reviewer can eyeball whether the structure landed in the
|
||||||
|
right place. This is the same geometry the exam-marker app uses to place regions on its canvas.
|
||||||
|
|
||||||
|
Coordinates: Docling/RapidOCR bboxes are PDF points with a BOTTOM-LEFT origin. We render the page
|
||||||
|
at DPI D (scale = D/72) and flip y against the rendered image height, so we never need the page's
|
||||||
|
point-height explicitly: y_top_px = H_px - t*scale.
|
||||||
|
|
||||||
|
With --docling, also draws every raw Docling text block (the body/question content the thin
|
||||||
|
extractor model discards) so a reviewer can see the FULL detection, not just what we persist.
|
||||||
|
Granite tables carry cells but no coordinates; we derive their box by locating the cell-texts in
|
||||||
|
the Docling text layer (content+geometry fusion).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/overlay.py <structured.json> <source_pdf> [--pages 3,4,5] [--dpi 150] [--out DIR]
|
||||||
|
python scripts/overlay.py <structured.json> <pdf> --docling results/E_tess_full.json --pages 5
|
||||||
|
"""
|
||||||
|
import os, sys, json, re, argparse, subprocess, tempfile
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
PART_COLOR = (211, 47, 47) # red — question/part labels
|
||||||
|
BODY_COLOR = (150, 150, 150) # grey — raw Docling body-text blocks (--docling)
|
||||||
|
GRANITE_COLOR = (0, 150, 136) # teal — Granite table (geometry derived from cells)
|
||||||
|
REGION_COLORS = { # docling region taxonomy -> colour
|
||||||
|
"context_figure": (25, 118, 210), # blue
|
||||||
|
"context_data": (56, 142, 60), # green (tables)
|
||||||
|
"context_caption": (123, 31, 162), # purple
|
||||||
|
"mcq_option": (245, 124, 0), # orange (checkboxes)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _norm(s):
|
||||||
|
return re.sub(r"[^a-z0-9]", "", (s or "").lower())
|
||||||
|
|
||||||
|
|
||||||
|
def docling_texts_by_page(doc):
    """Group the raw Docling text items by page: {page: [(bbox, text, label)]}.

    Only the first provenance entry of each item is consulted; items without a bbox or a page
    number are skipped. Missing text becomes '' and a missing label becomes 'text'.
    """
    grouped = {}
    for item in doc.get("texts", []):
        prov = item.get("prov") or []
        if not prov:
            continue
        first = prov[0]
        bbox = first.get("bbox")
        page_no = first.get("page_no")
        if not (bbox and page_no):
            continue
        entry = (bbox, item.get("text") or "", item.get("label") or "text")
        grouped.setdefault(page_no, []).append(entry)
    return grouped
|
||||||
|
|
||||||
|
|
||||||
|
def derive_table_bbox(grid, page_texts):
    """Derive a bbox for a table that has cell texts but no coordinates.

    The cell texts are located in the page's text layer and the bboxes of the matching blocks
    are unioned into the table's on-page extent.

    Two traps (seen on physics p5): (1) border/maths glyphs ('|','+') normalise to '' and an
    empty string is a substring of everything; (2) cell WORDS recur in nearby content — the rock
    names reappear in the MCQ options below the table ('Basalt or chalk'), far left and lower.
    So we match only blocks whose normalised text is CONTAINED IN a cell (keeps fragments like
    '2.90'/'Type', rejects the longer 'basaltorchalk'), require length >= 2, then keep the
    dominant vertical cluster to drop any stray cell-word elsewhere on the page.

    Args:
        grid: 2D list of cell texts.
        page_texts: iterable of (bbox, text, label) for one page; each bbox is a dict with
            'l', 'r', 't', 'b' keys.

    Returns:
        A {'l', 'r', 't', 'b'} dict, or None when fewer than three blocks match or when no
        matching block lies within 120 units of the median vertical centre.
    """
    import statistics

    cells = {c for c in (_norm(x) for row in grid for x in row) if len(c) > 1}

    hits = []
    for bb, txt, _ in page_texts:
        key = _norm(txt)  # normalise each block once instead of once per test
        if len(key) > 1 and any(key in c for c in cells):
            hits.append(bb)
    if len(hits) < 3:
        return None

    def _centre(b):
        return (b["t"] + b["b"]) / 2

    med = statistics.median(_centre(b) for b in hits)
    band = [b for b in hits if abs(_centre(b) - med) <= 120]  # table band only
    if not band:
        # With an even number of hits split into two distant clusters the median falls between
        # them and every hit is filtered out; report "no box" instead of crashing in min()/max().
        return None
    return {"l": min(b["l"] for b in band), "r": max(b["r"] for b in band),
            "t": max(b["t"] for b in band), "b": min(b["b"] for b in band)}
|
||||||
|
|
||||||
|
|
||||||
|
def _font(sz):
    """Return a DejaVu Sans TrueType font (bold preferred) at size `sz`, else PIL's default font."""
    candidates = ("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
                  "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf")
    found = next((path for path in candidates if os.path.exists(path)), None)
    if found is None:
        return ImageFont.load_default()
    return ImageFont.truetype(found, sz)
|
||||||
|
|
||||||
|
|
||||||
|
MAIN_LINE = (25, 118, 210) # blue — main-question y-markers
|
||||||
|
PART_LINE = (211, 47, 47) # red — part y-markers
|
||||||
|
|
||||||
|
|
||||||
|
def _hline(draw, y_pdf, scale, H, W, color, label, width, font, dashed=False, inset=0):
|
||||||
|
"""Full-width horizontal marker line at a PDF-point y (BOTTOM-LEFT origin)."""
|
||||||
|
y = H - y_pdf * scale
|
||||||
|
if dashed:
|
||||||
|
x = inset
|
||||||
|
while x < W:
|
||||||
|
draw.line([x, y, min(x + 9, W), y], fill=color, width=width); x += 16
|
||||||
|
else:
|
||||||
|
draw.line([inset, y, W, y], fill=color, width=width)
|
||||||
|
if label:
|
||||||
|
tw = draw.textlength(label, font=font)
|
||||||
|
draw.rectangle([inset, y - 16, inset + tw + 6, y], fill=color)
|
||||||
|
draw.text((inset + 3, y - 15), label, fill=(255, 255, 255), font=font)
|
||||||
|
|
||||||
|
|
||||||
|
def _rect(draw, bb, scale, H, color, label, width=3, font=None):
|
||||||
|
"""Draw one bbox (BOTTOM-LEFT origin -> image space) + its label."""
|
||||||
|
x0, x1 = bb["l"] * scale, bb["r"] * scale
|
||||||
|
y0, y1 = H - bb["t"] * scale, H - bb["b"] * scale # t is the higher edge -> smaller y_px
|
||||||
|
draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
|
||||||
|
if label:
|
||||||
|
tw = draw.textlength(label, font=font)
|
||||||
|
draw.rectangle([x0, y0 - 17, x0 + tw + 6, y0], fill=color)
|
||||||
|
draw.text((x0 + 3, y0 - 16), label, fill=(255, 255, 255), font=font)
|
||||||
|
|
||||||
|
|
||||||
|
def draw_template(draw, tpl, pg, scale, H, W, font):
    """Render the editable template for one page: margins/bands as LINES, footprints as BOXES.

    A confirmed element is drawn solid; an unconfirmed (auto) suggestion is drawn dashed.

    Args:
        draw: drawing context for the rendered page image.
        tpl: template dict with a 'pages' mapping and an optional document-level 'margins' list.
        pg: page number; looked up in tpl['pages'] by string key first, then by the raw value.
        scale: pixels per PDF point.
        H, W: image height and width in pixels.
        font: font used for every label.
    """
    MARGIN, MAIN, PART = (0, 150, 136), (25, 118, 210), (211, 47, 47)
    page = tpl["pages"].get(str(pg)) or tpl["pages"].get(pg) or {}

    # Role banner (top-left). Its width is measured from the text actually drawn
    # ("role:<role>"); sizing it from len(role) alone left the text overflowing the banner.
    role = page.get("role", "question")
    banner = f"role:{role}"
    banner_w = max(8 + len(role) * 8, draw.textlength(banner, font=font) + 8)
    draw.rectangle([0, 0, banner_w, 16], fill=(70, 70, 70))
    draw.text((4, 1), banner, fill=(255, 255, 255), font=font)

    # Margins: axis-locked lines (document scope on every page + this page's page-scope lines);
    # suppressed entirely when the page disables them (cover/blank pages).
    margins_on = page.get("margins_enabled", True)
    for m in (tpl.get("margins", []) if margins_on else []):
        if m["scope"] == "page" and m.get("page") != pg:
            continue
        solid = m.get("confirmed")
        if m["axis"] == "x":
            x = m["value"] * scale
            if solid:
                draw.line([x, 0, x, H], fill=MARGIN, width=2)
            else:
                _dash_v(draw, x, 0, H, MARGIN, 2)
        else:
            y = H - m["value"] * scale  # PDF y is bottom-up; image y is top-down
            if solid:
                draw.line([0, y, W, y], fill=MARGIN, width=2)
            else:
                _dash_h(draw, 0, W, y, MARGIN, 2)

    for m in page.get("main_bands", []):
        if not m.get("is_start", True):  # continuation page: no spurious second "start" line
            continue
        _hline(draw, m["y_start"], scale, H, W, MAIN, f"Q{m['question']}", 3, font,
               dashed=not m.get("confirmed"))
    for p in page.get("part_bands", []):
        _hline(draw, p["y_start"], scale, H, W, PART, p["label"], 2, font, inset=90,
               dashed=not p.get("confirmed"))

    # Footprints: only entries that actually carry a box are drawn.
    for f in page.get("furniture", []):
        if f.get("box"):
            _rect(draw, f["box"], scale, H, (130, 130, 130), f"furniture:{f.get('kind','')}", 2, font)
    for g in page.get("figures", []):
        if g.get("box"):
            _rect(draw, g["box"], scale, H, (56, 142, 60), "figure", 3, font)
    for t in page.get("tables", []):
        if t.get("box"):
            _rect(draw, t["box"], scale, H, (0, 150, 136),
                  f"table {t.get('n_rows')}x{t.get('n_cols')}", 3, font)
|
||||||
|
|
||||||
|
|
||||||
|
def render_page(pdf, pg, dpi, td):
    """Render page `pg` and return an image in DOCLING's coordinate space.

    Docling reports bbox relative to the CropBox, but pdftoppm renders the MediaBox — when
    CropBox != MediaBox (e.g. the Edexcel 1MA1 papers: media 652x899, crop inset 28.35pt) that
    mismatch magnifies + shifts every overlaid shape toward a corner. Fix: crop the rendered
    image to the CropBox so it matches Docling. No-op when CropBox == MediaBox (h556) or when
    poppler already rendered the CropBox.

    Args:
        pdf: path of the source PDF.
        pg: 1-based page number.
        dpi: render resolution passed to pdftoppm.
        td: scratch directory that receives the rendered PNG.

    Returns:
        An RGB PIL image of the page.

    Raises:
        subprocess.CalledProcessError: pdftoppm exited non-zero.
        FileNotFoundError: pdftoppm ran but left no PNG for this page.
    """
    import glob

    base = os.path.join(td, f"p{pg}")
    subprocess.run(["pdftoppm", "-png", "-r", str(dpi), "-f", str(pg), "-l", str(pg), pdf, base],
                   check=True)
    # The zero-padding of pdftoppm's page suffix varies, so accept whatever "<base>-<n>.png" it
    # produced instead of guessing a fixed set of widths (which ended in a bare StopIteration).
    rendered = sorted(glob.glob(glob.escape(base) + "-*.png"))
    if not rendered:
        raise FileNotFoundError(f"pdftoppm produced no PNG for page {pg} of {pdf}")
    img = Image.open(rendered[0]).convert("RGB")

    try:
        import pypdf
    except ImportError:
        return img  # without pypdf the boxes cannot be compared; keep the uncropped render

    try:
        page = pypdf.PdfReader(pdf).pages[pg - 1]
        mb, cb = page.mediabox, page.cropbox
        scale = dpi / 72.0
        mbl, mbt = float(mb.left), float(mb.top)
        dcrop = any(abs(a - b) > 0.5 for a, b in
                    ((cb.left, mb.left), (cb.bottom, mb.bottom), (cb.right, mb.right), (cb.top, mb.top)))
        # Crop only if the PNG is MediaBox-sized (within 3px); otherwise poppler already cropped.
        rendered_mediabox = abs(img.width - (float(mb.right) - mbl) * scale) < 3
        if dcrop and rendered_mediabox:
            img = img.crop((round((float(cb.left) - mbl) * scale), round((mbt - float(cb.top)) * scale),
                            round((float(cb.right) - mbl) * scale), round((mbt - float(cb.bottom)) * scale)))
    except Exception as exc:
        # Best-effort alignment: keep the uncropped render, but say so rather than failing silently.
        print(f"warning: CropBox alignment skipped for page {pg}: {exc}", file=sys.stderr)
    return img
|
||||||
|
|
||||||
|
|
||||||
|
def _dash_v(draw, x, y0, y1, color, w):
|
||||||
|
y = y0
|
||||||
|
while y < y1:
|
||||||
|
draw.line([x, y, x, min(y + 9, y1)], fill=color, width=w); y += 16
|
||||||
|
|
||||||
|
|
||||||
|
def _dash_h(draw, x0, x1, y, color, w):
|
||||||
|
x = x0
|
||||||
|
while x < x1:
|
||||||
|
draw.line([x, y, min(x + 9, x1), y], fill=color, width=w); x += 16
|
||||||
|
|
||||||
|
|
||||||
|
def _load_json(path):
    """Read and parse one JSON file, closing the handle as soon as it is parsed."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def main():
    """CLI entry point: render the requested pages and draw the extractor's geometry over them.

    With --template only the template is drawn (the human-review view). Otherwise the layers
    are, bottom to top: raw Docling text blocks, taxonomy regions, tables, part labels, band
    marker lines, furniture mask + content margins. One PNG per page is written to --out.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("structured")
    ap.add_argument("pdf")
    ap.add_argument("--docling", help="raw Docling doc JSON: also draw every body-text block "
                    "(the content the thin model discards) + derive Granite-table boxes")
    ap.add_argument("--bands", help="bands.py JSON: draw main-question + part start/end y-marker lines")
    ap.add_argument("--furniture", help="furniture.py JSON: mark recurring furniture vs real figures "
                    "+ draw the content x-margins")
    ap.add_argument("--template", help="template.py JSON: render the editable first-pass template "
                    "(margins+bands as lines, furniture/figures as boxes). "
                    "When set, draws ONLY the template (the human-review view).")
    ap.add_argument("--pages", help="comma list, e.g. 3,4,5 (default: all pages with geometry)")
    ap.add_argument("--dpi", type=int, default=150)
    ap.add_argument("--out", default="results/overlay")
    a = ap.parse_args()
    os.makedirs(a.out, exist_ok=True)
    scale = a.dpi / 72.0
    font = _font(14)

    res = _load_json(a.structured)
    doc_texts = docling_texts_by_page(_load_json(a.docling)) if a.docling else {}
    bands = _load_json(a.bands)["pages"] if a.bands else {}
    furn = _load_json(a.furniture) if a.furniture else None
    tpl = _load_json(a.template) if a.template else None

    # gather geometry by page
    parts_by_pg, regions_by_pg = {}, {}
    for q in res.get("questions", []):
        for p in q["parts"]:
            if p.get("bbox") and p.get("page"):
                parts_by_pg.setdefault(p["page"], []).append((p["label"], p["bbox"]))
    for r in res.get("regions", []):
        if r.get("bbox") and r.get("page"):
            regions_by_pg.setdefault(r["page"], []).append((r["type"], r["bbox"]))

    # tables: standard ones carry a bbox; Granite ones don't -> derive from the text layer
    tables_by_pg = {}
    for t in res.get("tables", []):
        pg = t.get("page")
        if not pg:
            continue
        bb = t.get("bbox") or (derive_table_bbox(t.get("grid", []), doc_texts.get(pg, []))
                               if a.docling else None)
        if bb:
            tables_by_pg.setdefault(pg, []).append(
                (f"table {t.get('source','')} {t.get('n_rows')}x{t.get('n_cols')}", bb))

    if a.pages:
        # Ignore empty items so a trailing or doubled comma does not crash int().
        want = [int(x) for x in a.pages.split(",") if x.strip()]
    elif tpl:
        want = sorted(int(p) for p in tpl["pages"])
    else:
        want = sorted(set(parts_by_pg) | set(regions_by_pg) | set(doc_texts))
    if not want:
        sys.exit("no bbox geometry in this result (born-digital text path carries no geometry; "
                 "use an OCR/rapid-path structured.json)")

    written = []
    with tempfile.TemporaryDirectory() as td:
        for pg in want:
            img = render_page(a.pdf, pg, a.dpi, td)
            H = img.height
            draw = ImageDraw.Draw(img)

            if tpl:  # template-only render = the human-review view
                draw_template(draw, tpl, pg, scale, H, img.width, font)
                out = os.path.join(a.out, f"p{pg:02d}.png")
                img.save(out)
                written.append(out)
                # Same page lookup as draw_template, so the counts describe what was drawn.
                pgd = tpl["pages"].get(str(pg)) or tpl["pages"].get(pg) or {}
                print(f"p{pg}: template — {len(pgd.get('main_bands',[]))} main, "
                      f"{len(pgd.get('part_bands',[]))} part, {len(pgd.get('furniture',[]))} furn, "
                      f"{len(pgd.get('figures',[]))} fig -> {out}")
                continue

            # layer 0: raw Docling body-text blocks (faint, no label) — the discarded content
            for bb, _txt, _lab in doc_texts.get(pg, []):
                _rect(draw, bb, scale, H, BODY_COLOR, None, 1, font)
            # layer 1: taxonomy regions
            for typ, bb in regions_by_pg.get(pg, []):
                _rect(draw, bb, scale, H, REGION_COLORS.get(typ, (120, 120, 120)), typ, 2, font)
            # layer 2: tables (Granite-derived boxes in teal)
            for lab, bb in tables_by_pg.get(pg, []):
                _rect(draw, bb, scale, H, GRANITE_COLOR, lab, 3, font)
            # layer 3: part labels on top
            for lab, bb in parts_by_pg.get(pg, []):
                _rect(draw, bb, scale, H, PART_COLOR, lab, 3, font)

            # layer 4: band y-marker lines (main start = solid blue, main end = thin dashed blue,
            # part start = solid red, inset from the left edge)
            pb = bands.get(str(pg)) or bands.get(pg)
            nb = 0
            if pb:
                W = img.width
                for m in pb["main"]:
                    if not m.get("is_start", True):  # skip continuation-page duplicate
                        continue
                    _hline(draw, m["y_start"], scale, H, W, MAIN_LINE,
                           f"Q{m['question']} ▸ start", 3, font)
                    nb += 1
                    _hline(draw, m["y_end"], scale, H, W, MAIN_LINE, None, 1, font, dashed=True)
                for p in pb["part"]:
                    _hline(draw, p["y_start"], scale, H, W, PART_LINE,
                           f"{p['label']} start", 2, font, inset=90)
                    nb += 1

            # layer 5: furniture mask — green=real figure, grey=masked furniture; + content margins
            if furn:
                for it in furn["items"]:
                    if it["page"] != pg or it["kind"] != "picture":
                        continue
                    if it["furniture"]:
                        _rect(draw, it["bbox"], scale, H, (130, 130, 130), "furniture", 2, font)
                    else:
                        _rect(draw, it["bbox"], scale, H, (56, 142, 60), "figure ✓", 3, font)
                band = (furn.get("content_margins") or {}).get("content_x_band")
                if band:
                    for xk in ("x_left", "x_right"):
                        x = band[xk] * scale
                        draw.line([x, 0, x, H], fill=(0, 150, 136), width=2)

            out = os.path.join(a.out, f"p{pg:02d}.png")
            img.save(out)
            written.append(out)
            print(f"p{pg}: {len(parts_by_pg.get(pg,[]))} part-labels, "
                  f"{len(regions_by_pg.get(pg,[]))} regions, {len(tables_by_pg.get(pg,[]))} tables, "
                  f"{len(doc_texts.get(pg,[]))} body-text blocks, {nb} band-lines -> {out}")
    print(f"-> {len(written)} page(s) in {a.out}/")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
69
api/services/docling/scripts/rapid_pass.py
Normal file
69
api/services/docling/scripts/rapid_pass.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
rapid_pass.py — generalise the proven AQA "RapidOCR margin-pass" (95.2% on the image-only
|
||||||
|
8463 paper) to any AQA paper. Born-digital AQA QPs ship a text layer, so we force RapidOCR
|
||||||
|
over the *rendered* page (`force_ocr:true`) to simulate the image-only redistribution case
|
||||||
|
and recover the boxed `NN.M` question numbers Tesseract shatters.
|
||||||
|
|
||||||
|
For each page it writes results/<outdir>/p{N}.json (a full per-page DoclingDocument, the
|
||||||
|
shape extract.py's aqa_questions_rapid expects) and a merged.json (for board / front-matter
|
||||||
|
detection). All GPU work is serialised + OOM-resilient through dsync.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/rapid_pass.py samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf rapid_7408
|
||||||
|
python scripts/rapid_pass.py <pdf> <outdir-slug> [first_page] [last_page]
|
||||||
|
"""
|
||||||
|
import os, sys, json, subprocess, re
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
import dsync
|
||||||
|
|
||||||
|
OPTS = {"ocr_engine": "rapidocr", "force_ocr": True}
|
||||||
|
|
||||||
|
|
||||||
|
def npages(pdf):
    """Return the page count of `pdf`, read from the 'Pages:' field of `pdfinfo` output."""
    report = subprocess.check_output(["pdfinfo", pdf]).decode()
    after_key = report.split("Pages:")[1]
    first_token = after_key.split()[0]
    return int(first_token)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the forced-RapidOCR pass over a page range and write per-page + merged JSON.

    argv: <pdf> <outdir-slug> [first_page] [last_page]. Each page is written to
    results/<slug>/p{N}.json; an existing per-page file is reused instead of being reconverted.
    merged.json concatenates texts/tables/pictures, merges the page maps, and lists the pages
    that failed to convert under '_failed_pages'.
    """
    if len(sys.argv) < 3:
        raise SystemExit("usage: rapid_pass.py <pdf> <outdir-slug> [first_page] [last_page]")
    pdf = sys.argv[1]
    slug = sys.argv[2]
    # The slug becomes a directory under results/: refuse absolute paths, parent hops and odd chars.
    if os.path.isabs(slug) or ".." in slug.split(os.sep) or not re.fullmatch(r"[A-Za-z0-9._/-]+", slug):
        raise SystemExit(f"unsafe output slug: {slug!r}")
    n = npages(pdf)
    first = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    last = min(int(sys.argv[4]), n) if len(sys.argv) > 4 else n
    if first > n or first > last:
        print(f"requested page range {first}-{last} is outside PDF ({n} pages); nothing to do")
        return
    outdir = os.path.join("results", slug)
    os.makedirs(outdir, exist_ok=True)

    r = dsync._redis()
    print(f"redis: {'connected' if r else 'NO CACHE'}  pdf={pdf}  pages {first}-{last}/{n}")
    merged = {"texts": [], "tables": [], "pictures": [], "pages": {}, "_failed_pages": []}
    for pg in range(first, last + 1):
        page_path = os.path.join(outdir, f"p{pg}.json")
        if os.path.exists(page_path):
            with open(page_path) as fh:
                doc = json.load(fh)
            # 'texts' must be the string key; the bare name raised NameError on every cache hit.
            print(f"  p{pg}: file cache HIT ({len(doc.get('texts', []))} texts)")
        else:
            doc = dsync.convert_page(pdf, pg, OPTS, r=r)
            if not doc:
                merged["_failed_pages"].append(pg)
                print(f"  p{pg}: FAILED")
                continue
            with open(page_path, "w") as fh:
                json.dump(doc, fh)
        for k in ("texts", "tables", "pictures"):
            merged[k].extend(doc.get(k, []))
        merged["pages"].update(doc.get("pages", {}))
        # Count text blocks whose first bbox starts at x <= 140; blocks without a bbox count as 999.
        nmarg = sum(1 for t in doc.get("texts", [])
                    if (t.get("prov") or [{}])[0].get("bbox", {}).get("l", 999) <= 140)
        print(f"  p{pg}: {len(doc.get('texts', []))} texts ({nmarg} left-margin)")
    with open(os.path.join(outdir, "merged.json"), "w") as fh:
        json.dump(merged, fh)
    print(f"-> {outdir}/ ({last-first+1-len(merged['_failed_pages'])} pages, "
          f"failed={merged['_failed_pages']})")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
210
api/services/docling/tables.py
Normal file
210
api/services/docling/tables.py
Normal file
@ -0,0 +1,210 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
tables.py — selective table-cell extraction for the exam extractor (PLAN.md §B).
|
||||||
|
|
||||||
|
Two sources, unified into one cell-grid schema:
|
||||||
|
* STANDARD — the Tesseract+TableFormer backbone already emits `tables[].data.table_cells`
|
||||||
|
(text + row/col offsets + spans + bbox). Free, cached, every run. Good on ruled tables;
|
||||||
|
but it MISSES some data tables and OCRs them as loose tokens (REPORT.md p5).
|
||||||
|
* GRANITE — Granite-Docling-258M VLM emits `<otsl>` grids in DocTags (clean rows/cols even
|
||||||
|
where the backbone scrambles them). GPU cost, so used SELECTIVELY: only on pages the router
|
||||||
|
flags (a standard table present, or dense picture/checkbox), routed through dsync's GPU lock
|
||||||
|
+ Redis cache. Recipe (REPORT.md): {"to_formats":["doctags","json"], "pipeline":"vlm",
|
||||||
|
"vlm_pipeline_model":"granite_docling"}.
|
||||||
|
|
||||||
|
Unified table = {page, n_rows, n_cols, grid (2D text), cells, caption, source, is_furniture}.
|
||||||
|
"""
|
||||||
|
import re, json, os, glob, base64, urllib.request
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- OTSL (Granite DocTags)
|
||||||
|
OTSL_BLOCK = re.compile(r"<otsl>(.*?)</otsl>", re.S)
|
||||||
|
CAPTION = re.compile(r"<caption>(?:<loc_\d+>)*(.*?)</caption>", re.S)
|
||||||
|
CELL_TOK = re.compile(r"<(fcel|ecel|ched|rhed|lcel|ucel|xcel|nl)>([^<]*)")
|
||||||
|
HEADER_TAGS = {"ched", "rhed"}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_otsl(doctags):
    """Turn every <otsl> block of a DocTags string into a unified table dict.

    Each table gets page=None (the caller assigns it), a rectangular text grid padded with ''
    on short rows, the whitespace-collapsed caption (or None) and source 'granite-otsl'.
    Blocks that yield no non-empty row are dropped.
    """
    tables = []
    for block in OTSL_BLOCK.findall(doctags):
        cap_match = CAPTION.search(block)
        caption = re.sub(r"\s+", " ", cap_match.group(1)).strip() if cap_match else None

        # Strip the caption and the location tokens; what remains is the cell stream.
        body = re.sub(r"<loc_\d+>", "", CAPTION.sub("", block))

        rows = []
        current = []
        for tag, txt in CELL_TOK.findall(body):
            if tag != "nl":
                current.append({"text": txt.strip(), "header": tag in HEADER_TAGS,
                                "empty": tag == "ecel"})
                continue
            if current:  # an <nl> with no cells before it contributes no row
                rows.append(current)
            current = []
        if current:
            rows.append(current)
        if not rows:
            continue

        width = max(len(row) for row in rows)
        grid = []
        for row in rows:
            texts = [cell["text"] for cell in row]
            grid.append(texts + [""] * (width - len(texts)))
        tables.append({"page": None, "n_rows": len(rows), "n_cols": width, "grid": grid,
                       "caption": caption, "source": "granite-otsl",
                       "is_furniture": is_furniture(grid, caption)})
    return tables
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- standard TableFormer
|
||||||
|
def tables_from_standard(doc):
    """Convert the `tables` entries of a Docling document dict into unified table dicts.

    Each cell's text is placed at its start row/column offset; cells with a missing offset, an
    offset outside num_rows x num_cols, or empty text are ignored. The page comes from the first
    provenance entry, and the caption is the space-joined text of the dict-valued captions.
    """
    unified = []
    for tbl in doc.get("tables", []):
        data = tbl.get("data", {}) or {}
        n_rows = data.get("num_rows") or 0
        n_cols = data.get("num_cols") or 0
        grid = [[""] * n_cols for _ in range(n_rows)]

        for cell in data.get("table_cells", []) or []:
            row = cell.get("start_row_offset_idx")
            col = cell.get("start_col_offset_idx")
            if row is None or col is None:
                continue
            if row >= n_rows or col >= n_cols or not cell.get("text"):
                continue
            grid[row][col] = cell["text"]

        prov = tbl.get("prov") or []
        page = prov[0].get("page_no") if prov else None
        caption_bits = [x.get("text", "") for x in (tbl.get("captions") or []) if isinstance(x, dict)]
        caption = " ".join(caption_bits) or None
        unified.append({"page": page, "n_rows": n_rows, "n_cols": n_cols, "grid": grid,
                        "caption": caption, "source": "docling-standard",
                        "is_furniture": is_furniture(grid, caption)})
    return unified
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- furniture filter
|
||||||
|
FURNITURE_RE = re.compile(r"examiner|do not write|leave\s+blank|question\s*mark|"
                          r"for marker|total marks?$", re.I)


def is_furniture(grid, caption=None):
    """Tell whether a table is exam scaffolding (mark grid / 'For Examiner's Use'), not question data.

    Args:
        grid: 2D list of cell texts.
        caption: optional caption text.

    Returns:
        True when a scaffolding phrase occurs in the joined cell text + caption, in any single
        cell, or in the caption; or when every non-blank cell is a 1-2 digit number. Else False.
    """
    texts = [cell for row in grid for cell in row]
    # Joined search first: it catches phrases split across adjacent cells ("Question" | "Mark").
    blob = " ".join(texts) + " " + (caption or "")
    if FURNITURE_RE.search(blob):
        return True
    # `total marks?$` is end-anchored, but in the blob a cell is always followed by a separator
    # (and then the next cell or the caption), so that alternative could never match a cell.
    # Test each cell and the caption on their own as well.
    if any(FURNITURE_RE.search(piece.strip()) for piece in (*texts, caption or "")):
        return True
    # a single-column strip of question numbers / blanks = a mark grid
    filled = [c.strip() for c in texts if c.strip()]
    return bool(filled) and all(re.fullmatch(r"\d{1,2}", c) for c in filled)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- Granite via dsync
|
||||||
|
VLM_OPTS = {"to_formats": ["doctags", "json"], "pipeline": "vlm",
|
||||||
|
"vlm_pipeline_model": "granite_docling", "image_export_mode": "placeholder"}
|
||||||
|
|
||||||
|
|
||||||
|
def _serve_vlm(pdf_b64, fname, page):
    """POST one page of a base64-encoded PDF to the serve endpoint with the VLM options.

    Args:
        pdf_b64: the PDF file content, base64-encoded as text.
        fname: filename reported to the service.
        page: page number, sent as the single-page range [page, page].

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.HTTPError: any HTTP error other than 404.
        RuntimeError: the request returned 404 on all four attempts.
    """
    import time
    import urllib.error

    import dsync

    opts = {**VLM_OPTS, "page_range": [page, page]}
    body = {"options": opts,
            "sources": [{"kind": "file", "base64_string": pdf_b64, "filename": fname}],
            "target": {"kind": "inbody"}}
    req = urllib.request.Request(dsync.SERVE + "/v1/convert/source",
                                 data=json.dumps(body).encode(),
                                 headers={"Content-Type": "application/json"})
    for _ in range(4):  # tolerate the single-use 404 race
        try:
            # Context manager so the HTTP response is closed once the body has been read.
            with urllib.request.urlopen(req, timeout=1200) as resp:
                return json.loads(resp.read())
        except urllib.error.HTTPError as e:
            if e.code != 404:
                raise
            time.sleep(3)
    raise RuntimeError("serve vlm: repeated 404")
|
||||||
|
|
||||||
|
|
||||||
|
def _doctags_of(resp):
|
||||||
|
doc = resp.get("document") or {}
|
||||||
|
return doc.get("doctags_content") or doc.get("doc_tags") or doc.get("doctags") or ""
|
||||||
|
|
||||||
|
|
||||||
|
def granite_tables(pdf, pages, *, cached_glob=None, retries=4):
    """Run Granite-Docling on the given pages via dsync and return their parsed <otsl> tables.

    Per page: use the Redis-cached doctags when present; otherwise call the serve endpoint under
    dsync's GPU lock, retrying with exponential backoff (5s doubling, capped at 120s) while the
    response is an OOM. Non-empty doctags are stored in Redis. If the serve call raises or
    yields nothing, the page falls back to the cached *.doctags file when one exists.

    Args:
        pdf: path of the source PDF.
        pages: iterable of page numbers to process.
        cached_glob: optional glob of '*p<N>.doctags' fallback files.
        retries: maximum serve attempts per page.

    Returns:
        List of unified table dicts, each tagged with its page.

    Raises:
        OSError, RuntimeError, ValueError: the serve call failed for a page that has no cached
            fallback.
    """
    import time

    import dsync

    cache = _load_cached_doctags(cached_glob) if cached_glob else {}
    r = dsync._redis()
    fname = os.path.basename(pdf)
    sha = dsync._sha(pdf)
    b64 = None  # encoded lazily: not needed when every page is a Redis hit
    out = []
    for pg in pages:
        key = f"docling:vlm:{sha}:p{pg}"
        doctags = None
        if r and (hit := r.get(key)):
            doctags = hit if isinstance(hit, str) else hit.decode()
        if doctags is None:
            if b64 is None:
                with open(pdf, "rb") as fh:
                    b64 = base64.b64encode(fh.read()).decode()
            delay = 5
            for attempt in range(retries):
                try:
                    with dsync._GpuLock(r):
                        resp = _serve_vlm(b64, fname, pg)
                except (OSError, RuntimeError, ValueError) as exc:
                    # Documented fallback: a failed serve call uses the cached doctags when this
                    # page has them; with nothing to fall back on the error still propagates.
                    if pg not in cache:
                        raise
                    print(f"[granite] p{pg} serve failed ({exc}) -> cached doctags")
                    doctags = cache[pg]
                    break
                if dsync._is_oom(resp):
                    print(f"[granite] p{pg} OOM, backoff {delay}s ({attempt+1}/{retries})")
                    time.sleep(delay)
                    delay = min(delay * 2, 120)
                    continue
                doctags = _doctags_of(resp)
                if r and doctags:
                    r.set(key, doctags, ex=dsync.CACHE_TTL)
                break
        if not doctags and pg in cache:
            print(f"[granite] p{pg} serve empty -> cached doctags")
            doctags = cache[pg]
        for tbl in parse_otsl(doctags or ""):
            tbl["page"] = pg
            out.append(tbl)
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def _load_cached_doctags(glob_pat):
|
||||||
|
"""Map page_no -> doctags text from files named *p<N>.doctags."""
|
||||||
|
cache = {}
|
||||||
|
for fn in glob.glob(glob_pat):
|
||||||
|
m = re.search(r"p(\d+)\.doctags$", fn)
|
||||||
|
if m:
|
||||||
|
cache[int(m.group(1))] = open(fn, encoding="utf-8", errors="replace").read()
|
||||||
|
return cache
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------- routing + attach
|
||||||
|
def candidate_pages(doc):
    """Return the sorted pages to route to Granite.

    A page qualifies when it holds a standard table, or at least two text items whose label
    starts with 'checkbox'. Pages are read from each item's first provenance entry.
    """
    def _page_of(item):
        prov = item.get("prov") or []
        return prov[0].get("page_no") if prov else None

    routed = {pg for pg in map(_page_of, doc.get("tables", [])) if pg}

    checkbox_counts = {}
    for item in doc.get("texts", []):
        if not item.get("label", "").startswith("checkbox"):
            continue
        pg = _page_of(item)
        if pg:
            checkbox_counts[pg] = checkbox_counts.get(pg, 0) + 1

    routed.update(pg for pg, count in checkbox_counts.items() if count >= 2)
    return sorted(routed)
|
||||||
|
|
||||||
|
|
||||||
|
def attach_to_questions(tables, parts):
    """Attach each non-furniture table to one part on the same page.

    Table geometry is not available here, so the choice is best-effort: among
    the parts on the table's page that have a bbox, pick the one with the
    largest bbox ``"t"`` value; if none of them has a bbox, pick the part with
    the smallest label. Tables on a page with no parts get ``for_part = None``.

    Side effects: every kept table gains ``"id"`` (its index in the returned
    list) and ``"for_part"``; the chosen part gains/extends a ``"tables"`` list
    with a short reference record for the table.

    Args:
        tables: list of table dicts carrying ``is_furniture``, ``page``,
            ``n_rows``, ``n_cols``, ``caption`` and ``source``.
        parts: dict mapping part label -> part dict (``page``, optional ``bbox``).

    Returns:
        The list of non-furniture tables, in input order.
    """
    data_tables = [t for t in tables if not t["is_furniture"]]

    by_page = {}
    for lab, v in parts.items():
        by_page.setdefault(v.get("page"), []).append((lab, v))

    for i, t in enumerate(data_tables):
        t["id"] = i
        cands = by_page.get(t["page"], [])
        if not cands:
            t["for_part"] = None
            continue
        with_geo = [(lab, v) for lab, v in cands if v.get("bbox")]
        if with_geo:
            # bbox is known truthy here, so no `or {}` guard is needed
            lab = max(with_geo, key=lambda kv: kv[1]["bbox"].get("t", 0))[0]
        else:
            lab = min(cands, key=lambda kv: kv[0])[0]
        t["for_part"] = lab
        parts[lab].setdefault("tables", []).append(
            {"id": i, "n_rows": t["n_rows"], "n_cols": t["n_cols"],
             "caption": t["caption"], "source": t["source"]})
    return data_tables
|
||||||
215
api/services/docling/template.py
Normal file
215
api/services/docling/template.py
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
template.py — assemble the editable first-pass structural template from the spike's three signal
|
||||||
|
sources (extract structured.json + bands.json + furniture.json) into ONE round-trippable JSON the
|
||||||
|
human reviewer verifies AND edits before stage-2 generates the final template.
|
||||||
|
|
||||||
|
UI principle (user, 2026-06-07): directional LIMITS are draggable LINES (1-DOF, easier to drag);
|
||||||
|
object FOOTPRINTS are BOXES. So:
|
||||||
|
* margins -> four axis-locked LINES: left/right (x), top/bottom (y)
|
||||||
|
* question/part bands -> horizontal LINES: start/end y
|
||||||
|
* furniture / figures / tables -> BOXES (an object's footprint)
|
||||||
|
|
||||||
|
Every editable element carries {source: "auto"|"human", confirmed: bool} — the AI-suggestion seam.
|
||||||
|
Stage-2 must consume only confirmed elements (or a template marked confirmed at the top level).
|
||||||
|
Coordinates are PDF points, BOTTOM-LEFT origin (units in meta); the app maps to its own canvas.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python template.py --structured S.json --bands B.json --furniture F.json --pdf P.pdf --out T.json
|
||||||
|
"""
|
||||||
|
import json, argparse, datetime
|
||||||
|
|
||||||
|
|
||||||
|
def _line(edge, axis, value, scope, page=None):
    """Build one axis-locked line record, marked as an unconfirmed auto suggestion.

    ``value`` is rounded to one decimal place. ``page`` is only included in the
    record when it is not None.
    """
    record = dict(edge=edge, axis=axis, value=round(value, 1), scope=scope,
                  source="auto", confirmed=False)
    if page is None:
        return record
    record["page"] = page
    return record
|
||||||
|
|
||||||
|
|
||||||
|
def _furn_kind(it):
    """Guess a display label for a furniture box from where its centre sits.

    The label is only a starting point (the reviewer can rename it). The
    thresholds are compared against the bbox centre; coordinates are BOTTOM-LEFT
    origin, so a large y means near the top of the page.
    """
    box = it["bbox"]
    mid_x = (box["l"] + box["r"]) / 2
    mid_y = (box["t"] + box["b"]) / 2

    if it["kind"] != "picture":
        # non-picture chrome: low on the page, high on the page, or anything else
        if mid_y < 90:
            return "footer"
        return "header_or_page_number" if mid_y > 760 else "chrome_text"

    # pictures: top-right corner first, then the bottom strip
    if mid_x > 430 and mid_y > 700:
        return "qr"
    return "barcode" if mid_y < 110 else "chrome_picture"
|
||||||
|
|
||||||
|
|
||||||
|
def synthesize_part_box(part_band, content_x_band):
    """Project a part band onto the content column to get its S5 box.

    The box is derived, not intrinsic: the horizontal extent comes from the
    document content margins (``x_left`` / ``x_right``) and the vertical extent
    from the part band (``y_start`` / ``y_end``). The part's original label box
    is kept separately by the caller as an anchor.

    Returns:
        ``{"l", "t", "r", "b", "coord_origin": "BOTTOMLEFT"}`` with coordinates
        rounded to one decimal, or None when ``content_x_band`` is empty or any
        of the four keys is missing.
    """
    if not content_x_band:
        return None
    try:
        left, right = content_x_band["x_left"], content_x_band["x_right"]
        top, bottom = part_band["y_start"], part_band["y_end"]
    except KeyError:
        return None
    box = {side: round(coord, 1)
           for side, coord in (("l", left), ("t", top), ("r", right), ("b", bottom))}
    box["coord_origin"] = "BOTTOMLEFT"
    return box
|
||||||
|
|
||||||
|
|
||||||
|
def build(structured, bands, furniture, pdf=None, page_roles=None):
    """Assemble the first-pass template dict from the three signal sources.

    Args:
        structured: extractor output; read for ``questions`` (each with ``parts``),
            ``tables``, ``board`` and ``paper_code``.
        bands: band output; ``bands["pages"]`` maps a page key to
            ``{"main": [...], "part": [...]}``.
        furniture: furniture output; read for ``content_margins``, ``items`` and
            ``n_pages``.
        pdf: value stored verbatim as ``meta.source_pdf``.
        page_roles: optional mapping of page -> role record (``role``, ``source``,
            ``confirmed``, ``margins_enabled``); looked up by both str and int key.

    Returns:
        dict with ``meta``, ``margins`` (list of line records) and ``pages``
        (page key -> bands, furniture, figures, tables and role info). Every
        generated element is tagged ``source="auto", confirmed=False``.
    """
    page_roles = page_roles or {}
    # label -> original label bbox (kept as an anchor next to the synthesized part box)
    part_bbox = {p["label"]: p.get("bbox")
                 for q in structured.get("questions", []) for p in q["parts"]}
    cm = furniture.get("content_margins") or {}
    xband = cm.get("content_x_band") or {}
    per_pg_m = cm.get("per_page") or {}

    def margins_on(pg):
        # margins default to enabled unless a role record for the page says otherwise
        r = page_roles.get(str(pg)) or page_roles.get(pg)
        return r.get("margins_enabled", True) if r else True

    # margins as axis-locked LINES — document-level left/right, per-page top/bottom. Per-page
    # top/bottom are omitted for pages with no content column (cover/blank) — the user's override.
    margins = []
    if "x_left" in xband:
        # NOTE(review): only x_left is checked; x_right is assumed to accompany it.
        margins.append(_line("left", "x", xband["x_left"], "document"))
        margins.append(_line("right", "x", xband["x_right"], "document"))
    for pg, m in sorted(per_pg_m.items(), key=lambda kv: int(kv[0])):
        if not margins_on(int(pg)):
            continue
        margins.append(_line("top", "y", m["top"], "page", int(pg)))
        margins.append(_line("bottom", "y", m["bottom"], "page", int(pg)))

    # furniture + figures as BOXES, grouped by page
    furn_pg, fig_pg = {}, {}
    for it in furniture.get("items", []):
        pg = it["page"]
        if it.get("furniture"):
            furn_pg.setdefault(pg, []).append(
                {"box": it["bbox"], "kind": _furn_kind(it), "docling_label": it["label"],
                 "source": "auto", "confirmed": False})
        elif it["kind"] == "picture":
            # non-furniture pictures are treated as figures; other non-furniture items are dropped
            fig_pg.setdefault(pg, []).append(
                {"box": it["bbox"], "source": "auto", "confirmed": False})

    # tables as BOXES, grouped by page (tables without a truthy page are skipped)
    tbl_pg = {}
    for t in structured.get("tables", []):
        if t.get("page"):
            tbl_pg.setdefault(t["page"], []).append(
                {"box": t.get("bbox"), "n_rows": t.get("n_rows"), "n_cols": t.get("n_cols"),
                 "table_source": t.get("source"), "source": "auto", "confirmed": False})

    # --- reconcile against recovered part labels -------------------------------------------
    # A part-label position is never furniture or a figure (the label wins), and a "figure" that
    # covers most of the content area is a Docling page-collapse artifact (the GPU sometimes flags
    # the whole page as one picture), not a real figure -> drop both. Fixes the Q1.7/Q1.9 clashes
    # and the full-page "figure" that was masking part labels.
    part_boxes_pg = {}
    for q in structured.get("questions", []):
        for p in q["parts"]:
            if p.get("bbox") and p.get("page"):
                part_boxes_pg.setdefault(p["page"], []).append(p["bbox"])

    def _inter(a, b):
        # axis-aligned overlap test with t >= b (bottom-left origin); touching edges count as overlap
        return not (a["r"] < b["l"] or b["r"] < a["l"] or a["t"] < b["b"] or b["t"] < a["b"])

    def _area(b):
        # inverted boxes clamp to zero area rather than going negative
        return max(0, b["r"] - b["l"]) * max(0, b["t"] - b["b"])

    for pg, items in list(furn_pg.items()):
        pls = part_boxes_pg.get(pg, [])
        furn_pg[pg] = [f for f in items if not (f.get("box") and any(_inter(f["box"], pl) for pl in pls))]
    for pg, items in list(fig_pg.items()):
        pls = part_boxes_pg.get(pg, [])
        m = per_pg_m.get(str(pg)) or per_pg_m.get(pg) or {}
        # NOTE(review): the per-page margin record is only read for top/bottom elsewhere in this
        # function; if it carries no left/right, this product is 0 and the 595 * 842 fallback is
        # always used, i.e. the threshold is relative to that fallback area — confirm intent.
        carea = ((m.get("right", 0) - m.get("left", 0)) * (m.get("top", 0) - m.get("bottom", 0))) or (595 * 842)
        fig_pg[pg] = [f for f in items if f.get("box")
                      and _area(f["box"]) <= 0.55 * carea            # not a full-page collapse
                      and not any(_inter(f["box"], pl) for pl in pls)]  # not clashing a part label

    pages = {}
    # NOTE(review): pages that appear only in tbl_pg are not part of all_pg, so their tables are
    # not emitted — confirm that every table page also has bands/furniture/figures/roles.
    all_pg = (set(bands["pages"]) | {str(p) for p in furn_pg} | {str(p) for p in fig_pg}
              | {str(p) for p in page_roles})
    for pgs in sorted(all_pg, key=int):
        pg = int(pgs)
        # band/role inputs may be keyed by str or int; the per-page box dicts are looked up by int
        pb = bands["pages"].get(pgs) or bands["pages"].get(pg) or {"main": [], "part": []}
        main = [{"question": m["question"], "y_start": m["y_start"], "y_end": m["y_end"],
                 "is_start": m.get("is_start", True),
                 "source": "auto", "confirmed": False} for m in pb["main"]]
        part = []
        for p in pb["part"]:
            part.append({
                "label": p["label"], "question": p["question"],
                "y_start": p["y_start"], "y_end": p["y_end"],
                "label_box": part_bbox.get(p["label"]),   # anchor, not the part extent
                "box": synthesize_part_box(p, xband),
                "source": "auto", "confirmed": False,
            })
        pr = page_roles.get(pgs) or page_roles.get(pg) or {}
        pages[pgs] = {
            "role": pr.get("role", "question"),
            "role_source": pr.get("source", "default"), "role_confirmed": pr.get("confirmed", False),
            "margins_enabled": pr.get("margins_enabled", True),   # human-overridable
            "main_bands": main, "part_bands": part,
            "furniture": furn_pg.get(pg, []), "figures": fig_pg.get(pg, []),
            "tables": tbl_pg.get(pg, []),
        }

    return {
        "meta": {
            "schema": "exam-template/first-pass/v1",
            "board": structured.get("board"), "paper_code": structured.get("paper_code"),
            "source_pdf": pdf, "n_pages": furniture.get("n_pages"),
            "coord_origin": "BOTTOMLEFT", "units": "pdf_points",
            # local wall-clock time without timezone information
            "generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
            "ui_principle": "directional limits = draggable axis-locked lines; "
                            "object footprints = boxes",
            "confirmed": False, "confirmed_by": None, "confirmed_at": None,
        },
        "margins": margins,
        "pages": pages,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: read the signal JSON files, build the template, write it out.

    Reads ``--structured``, ``--bands`` and ``--furniture`` (required) plus the
    optional ``--page-roles`` file (its ``"pages"`` mapping is used), writes the
    template JSON to ``--out`` and prints a one-line count summary.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--structured", required=True)
    ap.add_argument("--bands", required=True)
    ap.add_argument("--furniture", required=True)
    ap.add_argument("--page-roles", dest="page_roles", help="page_roles.py JSON (roles + margin override)")
    ap.add_argument("--pdf")
    ap.add_argument("--out", default="results/template.json")
    a = ap.parse_args()

    def _load(path):
        # close every input handle as soon as it has been parsed
        with open(path) as fh:
            return json.load(fh)

    roles = _load(a.page_roles)["pages"] if a.page_roles else {}
    t = build(_load(a.structured), _load(a.bands), _load(a.furniture), a.pdf, roles)
    # the context manager guarantees the output is flushed and closed before "wrote" is reported
    with open(a.out, "w") as fh:
        json.dump(t, fh, indent=2)

    page_recs = list(t["pages"].values())
    n_pages = len(page_recs)
    nm = sum(len(p["main_bands"]) for p in page_recs)
    npt = sum(len(p["part_bands"]) for p in page_recs)
    nf = sum(len(p["furniture"]) for p in page_recs)
    ng = sum(len(p["figures"]) for p in page_recs)
    print(f"template {t['meta']['paper_code']} ({t['meta']['board']}): {n_pages} pages, "
          f"{len(t['margins'])} margin-lines, {nm} main-bands, {npt} part-bands, "
          f"{nf} furniture-boxes, {ng} figure-boxes")
    print(f"-> wrote {a.out}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
222
api/services/docling/validate.py
Normal file
222
api/services/docling/validate.py
Normal file
@ -0,0 +1,222 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
validate.py — G6 validation/judge: a deterministic consistency pass over an extractor result.
|
||||||
|
|
||||||
|
NOT a gate. It never approves or rejects; it attaches confidence + flags so a HUMAN reviewer's
|
||||||
|
attention is routed to the parts most likely wrong. A clean paper -> all-green, skim; a flagged
|
||||||
|
paper -> the exact items to check, worst-first. Every value stays a *suggestion* a human confirms.
|
||||||
|
|
||||||
|
Checks (all deterministic, no GPU, ~free — run on every extraction):
|
||||||
|
C1 marks-sum vs official max — over-read (sum>max) = error; under (sum<max) = warn
|
||||||
|
C2 part marks plausibility — marks None / 0 / implausibly high
|
||||||
|
C3 top-level question sequence — gaps in 1..N (skipped when numbering was OCR-inferred '~')
|
||||||
|
C4 sub-part contiguity — within a question: a,b,c / .1,.2,.3 with no hole
|
||||||
|
C5 coverage — missed parts vs ground truth (when the result carries it)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python validate.py results/genreport/edexcel1f/ocr_struct_filled.json
|
||||||
|
python validate.py <structured.json> --out report.json
|
||||||
|
"""
|
||||||
|
import json, re, sys, argparse
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
IMPLAUSIBLE_PART_MARKS = 15 # a single sub-part above this is worth a human glance
|
||||||
|
|
||||||
|
|
||||||
|
def _qnum(q):
    """Return a top-level question id as an int, ignoring leading zeros.

    ``'01'`` -> 1 and ``'4'`` -> 4. Ids that start with ``'~'`` (OCR-inferred)
    or that are not purely digits yield None.
    """
    if q[:1] == "~":
        return None
    hit = re.match(r"^0*(\d+)$", q)
    if hit is None:
        return None
    return int(hit.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def _subkey(label, q):
    """Return the part's own suffix inside its question.

    The question id is removed from the front of the label when present, then
    leading dots and (after those) leading tildes are dropped:
    ``'01.2'`` -> ``'2'``, ``'4a'`` -> ``'a'``, ``'1bi'`` -> ``'bi'``.
    """
    suffix = label
    if label.startswith(q):
        suffix = label[len(q):]
    without_dots = suffix.lstrip(".")
    return without_dots.lstrip("~")
|
||||||
|
|
||||||
|
|
||||||
|
def validate(result):
    """Run the deterministic C1-C5 consistency checks over one extractor result.

    Never approves or rejects: it only attaches checks, flags and a per-part
    confidence so a human reviewer knows where to look first.

    Args:
        result: extractor result dict. Read keys: ``board``, ``paper_code``,
            ``questions`` (each with ``question`` and ``parts``; each part with
            ``label`` and optional ``marks``), and optionally ``stats.marks_check``,
            ``front_matter.max_marks`` and ``coverage``.

    Returns:
        dict with ``paper_code``, ``board``, ``summary``, ``flags`` (strings for
        every check whose status is not ``"ok"``), ``checks``, ``part_confidence``
        (label -> ``"high"``/``"medium"``/``"low"``) and ``question_sequence``.
    """
    board = result.get("board")
    code = result.get("paper_code")
    flags, checks = [], []
    parts = [(p["label"], q["question"], p) for q in result.get("questions", []) for p in q["parts"]]
    conf = {}      # label -> high/medium/low
    low = set()    # labels a check has implicated

    def add(cid, severity, status, detail):
        # record the check; anything that is not a plain "ok" also becomes a reviewer flag
        checks.append({"id": cid, "severity": severity, "status": status, "detail": detail})
        if status != "ok":
            flags.append(f"[{severity}] {cid}: {detail}")

    # ---- C1: marks sum vs official maximum -------------------------------------------------
    mc = result.get("stats", {}).get("marks_check")
    exp = (mc or {}).get("expected_max") or result.get("front_matter", {}).get("max_marks")
    msum = (mc or {}).get("sum")
    if msum is None:
        # no precomputed sum: add up every part that carries a mark
        msum = sum(p["marks"] for *_, p in parts if p.get("marks") is not None)
    if exp:
        if msum > exp:
            add("C1_marks_sum", "error", "over",
                f"marks sum {msum} EXCEEDS official max {exp} (+{msum-exp}) — an over-read; check the paper")
        elif msum < exp:
            add("C1_marks_sum", "warn", "under",
                f"marks sum {msum} below official max {exp} (-{exp-msum}) — missing parts or unread marks")
        else:
            add("C1_marks_sum", "info", "ok", f"marks sum {msum} == official max {exp}")
    else:
        add("C1_marks_sum", "info", "unknown", "no official max available to check the sum against")

    # ---- C2: per-part marks plausibility ---------------------------------------------------
    none_ct = zero_ct = 0
    for lab, q, p in parts:
        mk = p.get("marks")
        if mk is None:
            none_ct += 1
            low.add(lab)
        elif mk == 0:
            zero_ct += 1
            low.add(lab)
        elif mk > IMPLAUSIBLE_PART_MARKS:
            low.add(lab)
            add("C2_part_marks", "warn", "implausible",
                f"part {lab} has {mk} marks (> {IMPLAUSIBLE_PART_MARKS}) — verify it isn't a mis-read")
    if none_ct or zero_ct:
        add("C2_part_marks", "warn", "missing",
            f"{none_ct} part(s) with no mark, {zero_ct} with 0 marks — unread/garbled mark tokens")
    elif not any(c["id"] == "C2_part_marks" for c in checks):
        add("C2_part_marks", "info", "ok", "every part carries a plausible mark")

    # ---- C3: top-level question sequence + EXPECTED-question interpolation ------------------
    # If Q1, Q2 ... Q14 are recovered but 3-13 are not, the paper certainly HAS 3-13 — they were
    # just missed (e.g. a Docling page-collapse). We emit the full expected sequence with a per-Q
    # `recovered` flag so a live question-tree view can render the gaps as explicit "needs a second
    # pass" slots, and a targeted re-OCR knows exactly which questions to chase.
    qids = list(dict.fromkeys(q for _, q, _ in parts))   # unique ids, first-seen order
    nums = sorted({n for n in (_qnum(q) for q in qids) if n is not None})
    zero_pad = any(len(q) == 2 and q.startswith("0") for q in qids)  # AQA 'NN' vs Edexcel/OCR 'N'
    question_sequence = []
    if any(q.startswith("~") for q in qids):
        add("C3_question_seq", "info", "inferred",
            "question numbers were OCR-inferred ('~N') — sequence not checkable; treat labels as approximate")
    elif nums:
        # isolated high outliers (a content number mis-read as 'Q67' after Q1-10) are likely
        # spurious top-levels, not 50 missing questions — strip them off the top so the sequence
        # reflects the real paper, and flag them for review instead of flooding the tree with slots.
        core, suspect = nums[:], []
        while len(core) >= 2 and core[-1] - core[-2] > 4:
            suspect.insert(0, core.pop())
        hi = core[-1] if core else nums[-1]
        # NOTE(review): the range starts at the lowest recovered number, so questions missing
        # before it (e.g. Q1-Q2 when the first recovered is Q3) are not reported — confirm intent.
        gaps = [n for n in range(nums[0], hi + 1) if n not in core]
        question_sequence = [{"n": n, "label": (f"{n:02d}" if zero_pad else str(n)),
                              "recovered": n in core} for n in range(nums[0], hi + 1)]
        if suspect:
            add("C3_question_seq", "warn", "spurious",
                f"isolated high question number(s) {suspect} after a {nums[0]}-{hi} run — likely a "
                f"content number mis-read as a top-level question; review/remove")
        if gaps:
            add("C3_question_seq", "warn", "gap",
                f"top-level questions {gaps} missing between {nums[0]}-{hi} — expected but "
                f"unrecovered; surface as second-pass slots in the question tree")
        elif not suspect:
            add("C3_question_seq", "info", "ok", f"questions {nums[0]}-{hi} contiguous")

    # ---- C4: sub-part contiguity within each question --------------------------------------
    def child_key(k):
        """Immediate-child key: the whole leading digit run ('10' stays '10') or the first char."""
        m = re.match(r"\d+", k)
        return m.group(0) if m else k[0]

    def order(keys):
        """Report holes among a question's immediate-child keys. Handles .N and a/b/c."""
        dig = sorted(int(k) for k in keys if k.isdecimal())
        let = sorted(k[0] for k in keys if k[:1].isalpha())
        holes = []
        if dig:
            holes += [str(n) for n in range(dig[0], dig[-1] + 1) if n not in dig]
        if let:
            lo, hi = ord(let[0]), ord(let[-1])
            holes += [chr(c) for c in range(lo, hi + 1) if chr(c) not in let]
        return holes

    byq = defaultdict(list)
    for lab, q, p in parts:
        sk = _subkey(lab, q)
        if sk:
            byq[q].append(sk)
    seq_holes = {}
    for q, keys in byq.items():
        # Immediate children only. Multi-digit numbers are kept whole: taking just the first
        # character would fold '.10'/'.11' onto '.1' and hide real holes such as a missing '.1'.
        firsts = {child_key(k) for k in keys}
        h = order(firsts)
        if h:
            seq_holes[q] = h
    if seq_holes:
        add("C4_subpart_seq", "warn", "gap",
            "sub-part gaps: " + ", ".join(f"Q{q} missing {hs}" for q, hs in sorted(seq_holes.items())))
    else:
        add("C4_subpart_seq", "info", "ok", "sub-parts contiguous within every question")

    # ---- C5: coverage vs ground truth (when present) ---------------------------------------
    cov = result.get("coverage", {})
    if cov.get("coverage_pct") is not None:
        missed = cov.get("missed", [])
        if missed:
            add("C5_coverage", "warn", "missed",
                f"{cov['coverage_pct']}% vs GT ({cov['recovered']}/{cov['total']}); missed {missed[:10]}")
            low.update(missed)
        else:
            add("C5_coverage", "info", "ok", f"100% coverage vs GT ({cov['recovered']}/{cov['total']})")

    # ---- per-part confidence + paper summary -----------------------------------------------
    sum_mismatch = any(c["id"] == "C1_marks_sum" and c["status"] in ("over", "under") for c in checks)
    for lab, q, p in parts:
        if lab in low:
            conf[lab] = "low"
        elif sum_mismatch:
            conf[lab] = "medium"   # paper-level doubt taints every part a little
        else:
            conf[lab] = "high"
    severities = [c["severity"] for c in checks if c["status"] not in ("ok", "info", "unknown")]
    worst = "error" if "error" in severities else "warn" if "warn" in severities else "clean"

    return {
        "paper_code": code, "board": board,
        "summary": {
            "worst_severity": worst,
            "needs_priority_review": worst != "clean",
            "n_flags": len(flags),
            "marks_sum": msum, "official_max": exp,
            "parts_total": len(parts),
            "parts_low_conf": sum(1 for v in conf.values() if v == "low"),
            "questions_expected": len(question_sequence) or None,
            "questions_recovered": sum(1 for q in question_sequence if q["recovered"]) or None,
        },
        "flags": flags,
        "checks": checks,
        "part_confidence": conf,
        "question_sequence": question_sequence,   # full expected skeleton (recovered + missing slots)
    }
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: validate one structured result and print a reviewer summary.

    Reads the JSON file given as the positional ``structured`` argument, runs
    ``validate`` on it, prints the verdict/marks/question/flag lines, and writes
    the full report as JSON when ``--out`` is given.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("structured")
    ap.add_argument("--out")
    a = ap.parse_args()

    # close the input handle as soon as it has been parsed
    with open(a.structured) as fh:
        payload = json.load(fh)
    rep = validate(payload)
    s = rep["summary"]

    print(f"paper : {rep['paper_code']} ({rep['board']})")
    print(f"verdict : {s['worst_severity'].upper()} "
          f"{'-> PRIORITY REVIEW' if s['needs_priority_review'] else '-> all checks clean (still human-reviewable)'}")
    print(f"marks : {s['marks_sum']}/{s['official_max']} | parts {s['parts_total']} "
          f"({s['parts_low_conf']} low-confidence)")
    if s.get("questions_expected"):
        miss = [q["label"] for q in rep["question_sequence"] if not q["recovered"]]
        print(f"questions : {s['questions_recovered']}/{s['questions_expected']} recovered"
              + (f" | second-pass slots: {miss}" if miss else " (complete sequence)"))
    if rep["flags"]:
        print("flags:")
        for f in rep["flags"]:
            print(f" - {f}")
    else:
        print("flags : none")
    if a.out:
        # the context manager guarantees the report is flushed and closed before "wrote" is printed
        with open(a.out, "w") as fh:
            json.dump(rep, fh, indent=2)
        print(f"-> wrote {a.out}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@ -75,6 +75,14 @@ if [ "$RUN_INIT" = "true" ]; then
|
|||||||
}
|
}
|
||||||
print_success "GAIS data import completed"
|
print_success "GAIS data import completed"
|
||||||
;;
|
;;
|
||||||
|
"exam-corpus")
|
||||||
|
print_status "Seeding exam-paper corpus (manifest-gated; skips if none configured)..."
|
||||||
|
python3 main.py --mode exam-corpus || {
|
||||||
|
print_error "Exam corpus seed failed!"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
print_success "Exam corpus seed completed"
|
||||||
|
;;
|
||||||
"full")
|
"full")
|
||||||
print_status "Running full initialization..."
|
print_status "Running full initialization..."
|
||||||
python3 main.py --mode infra || exit 1
|
python3 main.py --mode infra || exit 1
|
||||||
|
|||||||
53
main.py
53
main.py
@ -323,6 +323,52 @@ def run_gais_data_mode():
|
|||||||
|
|
||||||
# Old clear_dev_redis_queue function removed - now handled by Redis Manager
|
# Old clear_dev_redis_queue function removed - now handled by Redis Manager
|
||||||
|
|
||||||
|
def run_exam_corpus_mode():
    """Seed the public exam-paper corpus from a manifest (optional, gated).

    Environment variables read here:
        EXAM_CORPUS_MANIFEST     path to the corpus manifest; nothing runs without it
        EXAM_CORPUS_DRY_RUN      passed to the loader as ``dry_run``
        EXAM_CORPUS_FORCE        passed to the loader as ``force``
        EXAM_CORPUS_BOARD/_SPEC  optional board / spec filters (empty -> None)
        EXAM_CORPUS_USER_SUBSET  passed to the loader as ``user_subset``
        EXAM_CORPUS_FIRST_SWEEP  passed to the loader as ``do_first_sweep``

    Returns:
        True when the manifest is unset or missing (logged as a warning, treated
        as "nothing to seed yet") or when the loader reports no errors; False
        when the loader reports errors or anything raises.

    Buckets are NOT created here — infra mode (buckets.py) owns provisioning.
    """
    logger.info("Running in exam-corpus seed mode")
    manifest = os.getenv("EXAM_CORPUS_MANIFEST")
    manifest_ready = bool(manifest) and os.path.exists(manifest)
    if not manifest_ready:
        logger.warning(
            f"exam-corpus: no manifest at EXAM_CORPUS_MANIFEST={manifest!r}; skipping (nothing to seed yet)"
        )
        return True

    try:
        # imported lazily so the other startup modes do not pay for the seeding module
        from run.initialization.seed_exam_corpus import load

        options = {
            "dry_run": _truthy_env("EXAM_CORPUS_DRY_RUN"),
            "force": _truthy_env("EXAM_CORPUS_FORCE"),
            "board_filter": os.getenv("EXAM_CORPUS_BOARD") or None,
            "spec_filter": os.getenv("EXAM_CORPUS_SPEC") or None,
            "user_subset": _truthy_env("EXAM_CORPUS_USER_SUBSET"),
            "do_first_sweep": _truthy_env("EXAM_CORPUS_FIRST_SWEEP"),
        }
        rep = load(manifest, **options)
        if not rep.errors:
            logger.info(
                f"exam-corpus seed ok: specs={rep.specs_upserted} papers={rep.papers_upserted} "
                f"uploaded={rep.files_uploaded}"
            )
            return True
        logger.error(f"exam-corpus seed completed with {len(rep.errors)} error(s)")
        return False
    except Exception as e:
        # top-level mode boundary: report the failure through the exit status instead of raising
        logger.error(f"exam-corpus seed failed: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
def run_development_mode():
|
def run_development_mode():
|
||||||
"""Run the server in development mode with auto-reload"""
|
"""Run the server in development mode with auto-reload"""
|
||||||
logger.info("Running in development mode")
|
logger.info("Running in development mode")
|
||||||
@ -411,7 +457,7 @@ Startup modes:
|
|||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--mode', '-m',
|
'--mode', '-m',
|
||||||
choices=['infra', 'seed', 'seed-test', 'gais-data', 'dev', 'prod'],
|
choices=['infra', 'seed', 'seed-test', 'gais-data', 'exam-corpus', 'dev', 'prod'],
|
||||||
default='dev',
|
default='dev',
|
||||||
help='Startup mode (default: dev)'
|
help='Startup mode (default: dev)'
|
||||||
)
|
)
|
||||||
@ -447,6 +493,11 @@ if __name__ == "__main__":
|
|||||||
success = run_gais_data_mode()
|
success = run_gais_data_mode()
|
||||||
sys.exit(0 if success else 1)
|
sys.exit(0 if success else 1)
|
||||||
|
|
||||||
|
elif args.mode == 'exam-corpus':
|
||||||
|
# Seed the public exam-paper corpus from a manifest (gated; skips if none configured)
|
||||||
|
success = run_exam_corpus_mode()
|
||||||
|
sys.exit(0 if success else 1)
|
||||||
|
|
||||||
elif args.mode == 'dev':
|
elif args.mode == 'dev':
|
||||||
# Run development server
|
# Run development server
|
||||||
run_development_mode()
|
run_development_mode()
|
||||||
|
|||||||
0
modules/services/docling/__init__.py
Normal file
0
modules/services/docling/__init__.py
Normal file
13
modules/services/docling/regions.py
Normal file
13
modules/services/docling/regions.py
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
"""Compatibility import path for S5 Docling response-region geometry."""
|
||||||
|
|
||||||
|
from api.services.docling.regions import (
|
||||||
|
RegionCandidate,
|
||||||
|
detect_response_regions_from_image,
|
||||||
|
detect_response_regions_from_pdf,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"RegionCandidate",
|
||||||
|
"detect_response_regions_from_image",
|
||||||
|
"detect_response_regions_from_pdf",
|
||||||
|
]
|
||||||
99
modules/upload_validation.py
Normal file
99
modules/upload_validation.py
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
"""Upload boundary validation shared by file-upload endpoints.
|
||||||
|
|
||||||
|
E3 hardening: keep user-facing upload routes from buffering arbitrary data and
|
||||||
|
from accepting arbitrary MIME/types into Supabase storage.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
|
from fastapi import HTTPException, UploadFile
|
||||||
|
|
||||||
|
# Conservative defaults: Classroom Copilot uploads are user documents/images.
# Exam scan uploads already have their own 50 MB PDF-only guard in routers.exam.batches.
# Hard per-upload size ceiling in bytes: 25 MiB unless CC_UPLOAD_MAX_BYTES is set.
# The environment variable is read once, when this module is imported.
MAX_UPLOAD_BYTES = int(os.getenv("CC_UPLOAD_MAX_BYTES", str(25 * 1024 * 1024)))
# Size requested from the upload on each read() call while streaming (1 MiB).
UPLOAD_CHUNK_BYTES = 1024 * 1024

# Default MIME allow-list. CC_UPLOAD_ALLOWED_MIME_TYPES, when set, replaces the whole built-in
# list with its comma-separated entries; entries are trimmed and lower-cased, blanks dropped.
ALLOWED_UPLOAD_MIME_TYPES = frozenset(
    mt.strip().lower()
    for mt in os.getenv(
        "CC_UPLOAD_ALLOWED_MIME_TYPES",
        ",".join(
            [
                "application/pdf",
                "image/png",
                "image/jpeg",
                "image/webp",
                "image/gif",
                "text/plain",
                "text/csv",
                "text/markdown",
                "application/msword",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/vnd.ms-powerpoint",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "application/vnd.ms-excel",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ]
        ),
    ).split(",")
    if mt.strip()
)

# Declared MIME types accepted by the PDF-only reader (read_pdf_upload_bytes).
_PDF_MIME_TYPES = {"application/pdf", "application/x-pdf"}
|
||||||
|
|
||||||
|
|
||||||
|
def allowed_upload_mime_types_csv() -> str:
    """Return the module allow-list as one alphabetically sorted, ", "-separated string.

    Sorting keeps the text stable between calls, which suits error messages and evidence.
    """
    ordered = sorted(ALLOWED_UPLOAD_MIME_TYPES)
    return ", ".join(ordered)
|
||||||
|
|
||||||
|
|
||||||
|
def _declared_mime(upload: UploadFile) -> str:
    """Return the client-declared content type, normalised.

    Parameters after the first ``;`` are dropped and the remainder is trimmed
    and lower-cased. A missing or empty content type becomes
    ``application/octet-stream``.
    """
    raw = upload.content_type or "application/octet-stream"
    base, _sep, _params = raw.partition(";")
    return base.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
def validate_upload_mime(upload: UploadFile, *, allowed_mime_types: Optional[Iterable[str]] = None) -> str:
    """Check the client-declared MIME type against an allow-list.

    Args:
        upload: the incoming upload; only its declared content type is inspected.
        allowed_mime_types: allow-list for this call. Any falsy value (None or an
            empty collection) falls back to ``ALLOWED_UPLOAD_MIME_TYPES``.
            Entries are compared case-insensitively.

    Returns:
        The normalised declared MIME type.

    Raises:
        HTTPException: status 415 when the declared type is not in the allow-list.
    """
    declared = _declared_mime(upload)
    source = allowed_mime_types or ALLOWED_UPLOAD_MIME_TYPES
    allowed = {entry.lower() for entry in source}
    if declared in allowed:
        return declared
    raise HTTPException(
        status_code=415,
        detail=(
            f"Unsupported upload type '{declared}'. Allowed MIME types: "
            f"{', '.join(sorted(allowed))}"
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def read_upload_bytes(
    upload: UploadFile,
    *,
    max_bytes: int = MAX_UPLOAD_BYTES,
    allowed_mime_types: Optional[Iterable[str]] = None,
) -> tuple[bytes, str]:
    """Check the declared MIME type, then read the upload under a size cap.

    The MIME check runs before any bytes are read. The body is consumed in
    UPLOAD_CHUNK_BYTES pieces so an oversized upload is rejected as soon as the
    running total passes ``max_bytes`` instead of being buffered in full.

    Returns:
        ``(content, mime_type)`` where ``mime_type`` is the normalised
        declared type.

    Raises:
        HTTPException: 415 from the MIME check, or 413 when the content is
            larger than ``max_bytes``.
    """
    mime_type = validate_upload_mime(upload, allowed_mime_types=allowed_mime_types)
    received: list[bytes] = []
    size_so_far = 0
    # An empty read marks end of stream.
    while piece := await upload.read(UPLOAD_CHUNK_BYTES):
        size_so_far += len(piece)
        if size_so_far > max_bytes:
            raise HTTPException(status_code=413, detail=f"Upload exceeds max size ({max_bytes} bytes)")
        received.append(piece)
    return b"".join(received), mime_type
|
||||||
|
|
||||||
|
|
||||||
|
async def read_pdf_upload_bytes(upload: UploadFile, *, max_bytes: int = MAX_UPLOAD_BYTES) -> bytes:
    """Read an upload that must be a PDF and return its bytes.

    The declared type is restricted to _PDF_MIME_TYPES and the size to
    ``max_bytes`` (both enforced by read_upload_bytes). The content must then
    be non-empty and begin with the ``%PDF-`` magic header.

    Raises:
        HTTPException: 400 for an empty body, 415 when the magic header is
            missing, plus whatever read_upload_bytes raises.
    """
    payload, _ = await read_upload_bytes(upload, max_bytes=max_bytes, allowed_mime_types=_PDF_MIME_TYPES)
    if len(payload) == 0:
        raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
    # Lightweight sniff only: the declared type alone is client-controlled.
    if payload[:5] != b"%PDF-":
        raise HTTPException(status_code=415, detail="Uploaded file is not a valid PDF")
    return payload
|
||||||
@ -80,3 +80,5 @@ Pillow
|
|||||||
psutil
|
psutil
|
||||||
PyPDF2
|
PyPDF2
|
||||||
PyMuPDF
|
PyMuPDF
|
||||||
|
# OpenCV answer-region geometry (S5-4)
|
||||||
|
opencv-python-headless
|
||||||
|
|||||||
@ -12,6 +12,7 @@ from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
|
|||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_upload_bytes
|
||||||
from modules.document_processor import DocumentProcessor
|
from modules.document_processor import DocumentProcessor
|
||||||
from modules.queue_system import (
|
from modules.queue_system import (
|
||||||
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
||||||
@ -36,6 +37,24 @@ DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600')) # 1 hou
|
|||||||
|
|
||||||
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
||||||
|
|
||||||
|
def _user_id_from_payload(payload: Dict[str, Any]) -> str:
|
||||||
|
user_id = payload.get('sub') or payload.get('user_id')
|
||||||
|
if not user_id:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid token payload")
|
||||||
|
return user_id
|
||||||
|
|
||||||
|
def _cabinet_visible_to_user(client: SupabaseServiceRoleClient, cabinet_id: str, user_id: str) -> bool:
    """Tell whether ``file_cabinets`` has a row with this id owned by this user.

    Used as an ownership gate before the service-role client reads file
    metadata for the cabinet.
    """
    lookup = client.supabase.table('file_cabinets').select('id')
    lookup = lookup.eq('id', cabinet_id).eq('user_id', user_id).limit(1)
    response = lookup.execute()
    return True if response.data else False
|
||||||
|
|
||||||
def _safe_filename(name: str) -> str:
|
def _safe_filename(name: str) -> str:
|
||||||
base = os.path.basename(name or 'file')
|
base = os.path.basename(name or 'file')
|
||||||
return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
|
return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
|
||||||
@ -70,13 +89,13 @@ async def upload_file(
|
|||||||
# Stage DB row to get file_id
|
# Stage DB row to get file_id
|
||||||
staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
|
staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
|
||||||
name = _safe_filename(path or file.filename)
|
name = _safe_filename(path or file.filename)
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
insert_res = client.supabase.table('files').insert({
|
insert_res = client.supabase.table('files').insert({
|
||||||
'cabinet_id': cabinet_id,
|
'cabinet_id': cabinet_id,
|
||||||
'name': name,
|
'name': name,
|
||||||
'path': staged_path,
|
'path': staged_path,
|
||||||
'bucket': bucket,
|
'bucket': bucket,
|
||||||
'mime_type': file.content_type,
|
'mime_type': mime_type,
|
||||||
'uploaded_by': user_id,
|
'uploaded_by': user_id,
|
||||||
'size_bytes': len(file_bytes),
|
'size_bytes': len(file_bytes),
|
||||||
'source': 'classroomcopilot-web'
|
'source': 'classroomcopilot-web'
|
||||||
@ -89,7 +108,7 @@ async def upload_file(
|
|||||||
# Final storage path: bucket/cabinet_id/file_id/file
|
# Final storage path: bucket/cabinet_id/file_id/file
|
||||||
final_storage_path = f"{cabinet_id}/{file_id}/{name}"
|
final_storage_path = f"{cabinet_id}/{file_id}/{name}"
|
||||||
try:
|
try:
|
||||||
storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
|
storage.upload_file(bucket, final_storage_path, file_bytes, mime_type, upsert=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# cleanup staged row
|
# cleanup staged row
|
||||||
client.supabase.table('files').delete().eq('id', file_id).execute()
|
client.supabase.table('files').delete().eq('id', file_id).execute()
|
||||||
@ -117,7 +136,10 @@ async def upload_file(
|
|||||||
|
|
||||||
@router.get("/files")
|
@router.get("/files")
|
||||||
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
|
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||||
|
user_id = _user_id_from_payload(payload)
|
||||||
client = SupabaseServiceRoleClient()
|
client = SupabaseServiceRoleClient()
|
||||||
|
if not _cabinet_visible_to_user(client, cabinet_id, user_id):
|
||||||
|
return []
|
||||||
res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
|
res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
|
||||||
return res.data
|
return res.data
|
||||||
|
|
||||||
|
|||||||
@ -19,6 +19,7 @@ from fastapi.responses import JSONResponse
|
|||||||
from modules.auth.supabase_bearer import SupabaseBearer
|
from modules.auth.supabase_bearer import SupabaseBearer
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_upload_bytes
|
||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
@ -26,6 +27,24 @@ auth = SupabaseBearer()
|
|||||||
|
|
||||||
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
||||||
|
|
||||||
|
def _user_id_from_payload(payload: Dict[str, Any]) -> str:
|
||||||
|
user_id = payload.get('sub') or payload.get('user_id')
|
||||||
|
if not user_id:
|
||||||
|
raise HTTPException(status_code=401, detail="Invalid token payload")
|
||||||
|
return user_id
|
||||||
|
|
||||||
|
def _cabinet_visible_to_user(client: SupabaseServiceRoleClient, cabinet_id: str, user_id: str) -> bool:
    """Report whether the cabinet exists and belongs to ``user_id``.

    Acts as the ownership check performed before service-role reads of the
    cabinet's file metadata.
    """
    cabinets = client.supabase.table('file_cabinets')
    matching = cabinets.select('id').eq('id', cabinet_id).eq('user_id', user_id)
    result = matching.limit(1).execute()
    if result.data:
        return True
    return False
|
||||||
|
|
||||||
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
|
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
|
||||||
"""Choose appropriate bucket based on scope - matches old system logic."""
|
"""Choose appropriate bucket based on scope - matches old system logic."""
|
||||||
scope = (scope or 'teacher').lower()
|
scope = (scope or 'teacher').lower()
|
||||||
@ -54,10 +73,9 @@ async def upload_file(
|
|||||||
if not user_id:
|
if not user_id:
|
||||||
raise HTTPException(status_code=401, detail="User ID required")
|
raise HTTPException(status_code=401, detail="User ID required")
|
||||||
|
|
||||||
# Read file content
|
# Validate MIME/type and read file content with a hard size limit.
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
file_size = len(file_bytes)
|
file_size = len(file_bytes)
|
||||||
mime_type = file.content_type or 'application/octet-stream'
|
|
||||||
filename = file.filename or path
|
filename = file.filename or path
|
||||||
|
|
||||||
logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}")
|
logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}")
|
||||||
@ -134,7 +152,10 @@ async def upload_file(
|
|||||||
@router.get("/files")
|
@router.get("/files")
|
||||||
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
|
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||||
"""List files in a cabinet."""
|
"""List files in a cabinet."""
|
||||||
|
user_id = _user_id_from_payload(payload)
|
||||||
client = SupabaseServiceRoleClient()
|
client = SupabaseServiceRoleClient()
|
||||||
|
if not _cabinet_visible_to_user(client, cabinet_id, user_id):
|
||||||
|
return []
|
||||||
res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
|
res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
|
||||||
return res.data
|
return res.data
|
||||||
|
|
||||||
|
|||||||
@ -323,7 +323,7 @@ async def get_class(
|
|||||||
# Enrollment requests (pending)
|
# Enrollment requests (pending)
|
||||||
reqs = (
|
reqs = (
|
||||||
sb.supabase.table("enrollment_requests")
|
sb.supabase.table("enrollment_requests")
|
||||||
.select("id, student_id, status, created_at")
|
.select("id, student_id, status, requested_at")
|
||||||
.eq("class_id", class_id)
|
.eq("class_id", class_id)
|
||||||
.eq("status", "pending")
|
.eq("status", "pending")
|
||||||
.execute()
|
.execute()
|
||||||
|
|||||||
@ -126,13 +126,28 @@ async def platform_stats(
|
|||||||
|
|
||||||
@router.post("/reset")
|
@router.post("/reset")
|
||||||
async def reset_environment(
|
async def reset_environment(
|
||||||
|
scope: str = "all",
|
||||||
_: dict = Depends(require_platform_admin),
|
_: dict = Depends(require_platform_admin),
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""DESTRUCTIVE: wipe all test data. Neo4j + Supabase. Platform admin only."""
|
"""DESTRUCTIVE: wipe test data. Platform admin only.
|
||||||
|
|
||||||
|
scope (query param):
|
||||||
|
- all : full wipe (Neo4j + Supabase data + auth users) AND the entire
|
||||||
|
exam-marker subsystem below.
|
||||||
|
- exam-corpus : ONLY the entire exam-marker subsystem, not just public papers:
|
||||||
|
public corpus/eb_* data, cc.examboards storage objects, exam
|
||||||
|
templates, template layouts, questions, boundaries, response
|
||||||
|
areas, marking batches, student submissions, and mark entries
|
||||||
|
(without touching schools/users).
|
||||||
|
- timetable : ONLY timetable/calendar materialization tables.
|
||||||
|
"""
|
||||||
|
if scope not in ("all", "exam-corpus", "timetable"):
|
||||||
|
raise HTTPException(status_code=400, detail="scope must be one of: all, exam-corpus, timetable")
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import functools
|
||||||
from run.initialization.reset_environment import reset as _reset
|
from run.initialization.reset_environment import reset as _reset
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
result = await loop.run_in_executor(None, _reset)
|
result = await loop.run_in_executor(None, functools.partial(_reset, scope))
|
||||||
return {"status": "ok", **result}
|
return {"status": "ok", **result}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -60,6 +60,11 @@ class QuestionPayload(BaseModel):
|
|||||||
# Drawn Part box geometry (73-exam-marker-regions.sql). Null for derived main questions.
|
# Drawn Part box geometry (73-exam-marker-regions.sql). Null for derived main questions.
|
||||||
bounds: Optional[Dict[str, Any]] = None # {x,y,w,h}
|
bounds: Optional[Dict[str, Any]] = None # {x,y,w,h}
|
||||||
page: Optional[int] = None
|
page: Optional[int] = None
|
||||||
|
# S5 AI/manual seam + provenance. Existing manual rows default to authoritative.
|
||||||
|
source: Literal["manual", "ai"] = "manual"
|
||||||
|
confirmed: bool = True
|
||||||
|
confidence: Optional[float] = Field(default=None, ge=0, le=1)
|
||||||
|
derivation: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class ResponseAreaPayload(BaseModel):
|
class ResponseAreaPayload(BaseModel):
|
||||||
@ -77,7 +82,10 @@ class ResponseAreaPayload(BaseModel):
|
|||||||
context_type: Optional[str] = None
|
context_type: Optional[str] = None
|
||||||
source: Literal["manual", "ai"] = "manual"
|
source: Literal["manual", "ai"] = "manual"
|
||||||
confirmed: bool = True
|
confirmed: bool = True
|
||||||
confidence: Optional[float] = None
|
confidence: Optional[float] = Field(default=None, ge=0, le=1)
|
||||||
|
# Only meaningful for kind='mark_area': part_marks|question_total|grader_box.
|
||||||
|
mark_subtype: Optional[Literal["part_marks", "question_total", "grader_box"]] = None
|
||||||
|
derivation: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
class BoundaryPayload(BaseModel):
|
class BoundaryPayload(BaseModel):
|
||||||
@ -89,6 +97,24 @@ class BoundaryPayload(BaseModel):
|
|||||||
bounds: Optional[Dict[str, Any]] = None
|
bounds: Optional[Dict[str, Any]] = None
|
||||||
source: Literal["manual", "ai"] = "manual"
|
source: Literal["manual", "ai"] = "manual"
|
||||||
confirmed: bool = True
|
confirmed: bool = True
|
||||||
|
confidence: Optional[float] = Field(default=None, ge=0, le=1)
|
||||||
|
derivation: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
# Per-page layout row of a template: page role, margins, and the same
# AI/manual provenance fields used by the other payloads in this module.
class TemplateLayoutPayload(BaseModel):
    # Row id; optional so new rows can be sent without one.
    id: Optional[str] = None
    # Page this row describes (required). NOTE(review): presumably zero-based — confirm against producers.
    page_index: int
    role: Optional[str] = None
    # Margin positions; units/coordinate space are not defined here — TODO confirm.
    margin_left: Optional[float] = None
    margin_right: Optional[float] = None
    margin_top: Optional[float] = None
    margin_bottom: Optional[float] = None
    margins_enabled: bool = True
    # Provenance: rows default to manual and confirmed.
    source: Literal["manual", "ai"] = "manual"
    confirmed: bool = True
    # Validated to lie within [0, 1] when provided.
    confidence: Optional[float] = Field(default=None, ge=0, le=1)
    derivation: Optional[str] = None
    # Free-form extra data; each instance gets its own empty dict by default.
    meta: Dict[str, Any] = Field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
class TemplateReplaceRequest(BaseModel):
|
class TemplateReplaceRequest(BaseModel):
|
||||||
@ -97,6 +123,7 @@ class TemplateReplaceRequest(BaseModel):
|
|||||||
questions: List[QuestionPayload] = Field(default_factory=list)
|
questions: List[QuestionPayload] = Field(default_factory=list)
|
||||||
response_areas: List[ResponseAreaPayload] = Field(default_factory=list)
|
response_areas: List[ResponseAreaPayload] = Field(default_factory=list)
|
||||||
boundaries: List[BoundaryPayload] = Field(default_factory=list)
|
boundaries: List[BoundaryPayload] = Field(default_factory=list)
|
||||||
|
layout: List[TemplateLayoutPayload] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
class PatchQuestionRequest(BaseModel):
|
class PatchQuestionRequest(BaseModel):
|
||||||
|
|||||||
@ -12,16 +12,24 @@ join keys (spec §2).
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import math
|
||||||
import os
|
import os
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, UploadFile
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request, UploadFile
|
||||||
from fastapi.responses import Response
|
from fastapi.responses import JSONResponse, Response
|
||||||
|
|
||||||
|
from api.services.docling import AutoMapError, auto_map
|
||||||
|
from api.services.docling import extract as docling_extract
|
||||||
|
from api.services.docling.regions import detect_response_regions_from_pdf
|
||||||
from modules.database.services.exam_projection import project_template, project_template_safe
|
from modules.database.services.exam_projection import project_template, project_template_safe
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_pdf_upload_bytes
|
||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code
|
from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code
|
||||||
from routers.exam.schemas import (
|
from routers.exam.schemas import (
|
||||||
@ -37,6 +45,8 @@ router = APIRouter()
|
|||||||
|
|
||||||
SOURCE_CABINET_NAME = "Exam Marker Template Sources"
|
SOURCE_CABINET_NAME = "Exam Marker Template Sources"
|
||||||
SOURCE_BUCKET_FALLBACK = "cc.users"
|
SOURCE_BUCKET_FALLBACK = "cc.users"
|
||||||
|
AUTO_MAP_JOB_PREFIX = "exam:auto-map"
|
||||||
|
_AUTO_MAP_JOB_STATUS: Dict[str, Dict[str, Any]] = {}
|
||||||
|
|
||||||
|
|
||||||
# ─── helpers ─────────────────────────────────────────────────────────────────
|
# ─── helpers ─────────────────────────────────────────────────────────────────
|
||||||
@ -128,6 +138,22 @@ def _lookup_exam_storage_loc(exam_id: str) -> Optional[str]:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _signed_url_value(result: Any) -> str:
|
||||||
|
"""Normalise supabase-py signed URL responses across v1/v2 shapes."""
|
||||||
|
if isinstance(result, str):
|
||||||
|
return result
|
||||||
|
if isinstance(result, dict):
|
||||||
|
value = result.get("signedURL") or result.get("signedUrl") or result.get("signed_url")
|
||||||
|
if value:
|
||||||
|
return str(value)
|
||||||
|
data = getattr(result, "data", None)
|
||||||
|
if isinstance(data, dict):
|
||||||
|
value = data.get("signedURL") or data.get("signedUrl") or data.get("signed_url")
|
||||||
|
if value:
|
||||||
|
return str(value)
|
||||||
|
raise ValueError("Storage service did not return a signed URL")
|
||||||
|
|
||||||
|
|
||||||
async def _parse_create_template_request(request: Request) -> tuple[CreateTemplateRequest, Optional[UploadFile]]:
|
async def _parse_create_template_request(request: Request) -> tuple[CreateTemplateRequest, Optional[UploadFile]]:
|
||||||
content_type = request.headers.get("content-type", "")
|
content_type = request.headers.get("content-type", "")
|
||||||
if "multipart/form-data" in content_type:
|
if "multipart/form-data" in content_type:
|
||||||
@ -156,11 +182,7 @@ async def _upload_template_source_file(
|
|||||||
institute_id: str,
|
institute_id: str,
|
||||||
upload: UploadFile,
|
upload: UploadFile,
|
||||||
) -> str:
|
) -> str:
|
||||||
file_bytes = await upload.read()
|
file_bytes = await read_pdf_upload_bytes(upload)
|
||||||
if not file_bytes:
|
|
||||||
raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
|
|
||||||
if upload.content_type and upload.content_type != "application/pdf":
|
|
||||||
raise HTTPException(status_code=400, detail="Uploaded file must be a PDF")
|
|
||||||
|
|
||||||
service = SupabaseServiceRoleClient()
|
service = SupabaseServiceRoleClient()
|
||||||
storage = StorageAdmin()
|
storage = StorageAdmin()
|
||||||
@ -224,6 +246,400 @@ async def _upload_template_source_file(
|
|||||||
return file_id
|
return file_id
|
||||||
|
|
||||||
|
|
||||||
|
def _job_key(job_id: str) -> str:
    """Build the Redis key for an auto-map job: ``<AUTO_MAP_JOB_PREFIX>:<job_id>``."""
    return "{}:{}".format(AUTO_MAP_JOB_PREFIX, job_id)
|
||||||
|
|
||||||
|
|
||||||
|
def _redis_client() -> Any:
    """Return a connected Redis client, or ``None`` when Redis is unusable.

    Best-effort by design: a missing ``redis`` package, a bad configuration
    value, or a failed ``ping`` all yield ``None`` so callers can fall back.

    Connection settings come from ``LOCAL_REDIS_URL`` / ``REDIS_URL`` when
    either is set, otherwise from ``REDIS_HOST``, ``REDIS_PORT``,
    ``REDIS_DB_DEV`` (then ``REDIS_DB``) and ``REDIS_PASSWORD``.
    """
    try:
        import redis
    except Exception:
        return None

    try:
        url = os.getenv("LOCAL_REDIS_URL") or os.getenv("REDIS_URL")
        if not url:
            client = redis.Redis(
                host=os.getenv("REDIS_HOST", "localhost"),
                port=int(os.getenv("REDIS_PORT", "6379")),
                db=int(os.getenv("REDIS_DB_DEV", os.getenv("REDIS_DB", "0"))),
                password=os.getenv("REDIS_PASSWORD") or None,
                decode_responses=True,
                socket_timeout=2,
            )
        else:
            client = redis.Redis.from_url(url, decode_responses=True, socket_timeout=2)
        # Verify the server is reachable before handing the client out.
        client.ping()
    except Exception:
        return None
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def _set_auto_map_status(job_id: str, payload: Dict[str, Any]) -> None:
    """Record the latest status of an auto-map job.

    The status is ``job_id`` plus an ``updated_at`` epoch second, overlaid
    with ``payload`` (payload keys win on collision). It is always stored in
    the in-process ``_AUTO_MAP_JOB_STATUS`` dict and, when Redis is reachable,
    also written there with a TTL taken from ``EXAM_AUTO_MAP_JOB_TTL``
    (default 3600 seconds). Redis failures are logged and not raised.
    """
    status: Dict[str, Any] = dict(job_id=job_id, updated_at=int(time.time()))
    status.update(payload)
    _AUTO_MAP_JOB_STATUS[job_id] = status

    client = _redis_client()
    if client is None:
        return
    try:
        ttl_seconds = int(os.getenv("EXAM_AUTO_MAP_JOB_TTL", "3600"))
        client.setex(_job_key(job_id), ttl_seconds, json.dumps(status))
    except Exception as exc:
        logger.warning(f"auto-map redis status write failed for {job_id}: {exc}")
|
||||||
|
|
||||||
|
|
||||||
|
def _get_auto_map_status(job_id: str) -> Optional[Dict[str, Any]]:
    """Look up the stored status of an auto-map job.

    Redis is consulted first when reachable. A missing key or any Redis or
    decode error (logged as a warning) falls through to the in-process
    ``_AUTO_MAP_JOB_STATUS`` dict. Returns ``None`` when the job is unknown
    to both.
    """
    client = _redis_client()
    if client is not None:
        try:
            stored = client.get(_job_key(job_id))
            if stored:
                return json.loads(stored)
        except Exception as exc:
            logger.warning(f"auto-map redis status read failed for {job_id}: {exc}")
    return _AUTO_MAP_JOB_STATUS.get(job_id)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_template_source(ctx: ExamContext, template: Dict[str, Any]) -> Tuple[str, str, bytes]:
    """Locate and download the source document behind a template.

    The location comes from the linked exam's storage location when
    ``template["exam_id"]`` is set, otherwise from the ``files`` row named by
    ``template["source_file_id"]``.

    Returns:
        ``(bucket, path, content_bytes)``.

    Raises:
        HTTPException: 404 with the same detail for every failure mode
            (no source configured, unparsable location, missing file row,
            failed download).
    """
    exam_id = template.get("exam_id")
    if exam_id:
        storage_loc = _lookup_exam_storage_loc(exam_id)
        if not storage_loc:
            raise HTTPException(status_code=404, detail="Template source not found")
        try:
            bucket, path = _parse_storage_loc(storage_loc)
        except ValueError:
            raise HTTPException(status_code=404, detail="Template source not found")
    else:
        source_file_id = template.get("source_file_id")
        if not source_file_id:
            raise HTTPException(status_code=404, detail="Template source not found")
        # Same scoped service-role exception as source-pdf: owner gate has already passed.
        lookup = (
            SupabaseServiceRoleClient().supabase.table("files")
            .select("bucket, path, mime_type, name")
            .eq("id", source_file_id)
            .limit(1)
            .execute()
        )
        file_row = _first(lookup)
        if not (file_row and file_row.get("bucket") and file_row.get("path")):
            raise HTTPException(status_code=404, detail="Template source not found")
        bucket, path = file_row["bucket"], file_row["path"]

    try:
        content = StorageAdmin().download_file(bucket, path)
    except Exception as exc:
        logger.warning(f"Template source download failed for template {template.get('id')}: {exc}")
        raise HTTPException(status_code=404, detail="Template source not found")
    return bucket, path, content
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_has_text_layer(pdf_bytes: bytes) -> bool:
    """Report whether the PDF has a text layer, per ``docling_extract.has_text_layer``.

    The bytes are spooled to a temporary ``.pdf`` file because the detector
    is given a filesystem path. The file is removed afterwards, including
    when writing it or running the detector fails.
    """
    fh = tempfile.NamedTemporaryFile(prefix="cc-auto-map-detect-", suffix=".pdf", delete=False)
    tmp = fh.name
    try:
        # Write inside the try so a failed write (e.g. disk full) cannot
        # leave the delete=False temp file behind.
        with fh:
            fh.write(pdf_bytes)
        return bool(docling_extract.has_text_layer(tmp))
    finally:
        try:
            os.unlink(tmp)
        except OSError:
            pass
|
||||||
|
|
||||||
|
|
||||||
|
# Canvas page width the frontend renders each PDF page at (app src/utils/exam-canvas/model.ts
|
||||||
|
# PAGE_WIDTH). All auto-map canvas coords are emitted in this 780-wide, proportional-height space.
|
||||||
|
CANVAS_PAGE_WIDTH = 780.0
|
||||||
|
# Response/answer-region detector (api/services/docling/regions.py) renders at 144 DPI = 2 px / PDF point.
|
||||||
|
REGIONS_PX_PER_PT = 2.0
|
||||||
|
|
||||||
|
|
||||||
|
def _read_page_geometry(pdf_path: str) -> List[Dict[str, float]]:
    """Open the PDF at *pdf_path* with PyMuPDF and return one geometry dict per page."""
    import fitz

    doc = fitz.open(pdf_path)
    pages: List[Dict[str, float]] = []
    page_top = 0.0
    try:
        for page in doc:
            media = page.mediabox
            crop = page.cropbox
            page_pt_w = float(crop.width or page.rect.width or 1.0)
            page_pt_h = float(crop.height or page.rect.height or 1.0)
            # Emit canvas coords in the FRONTEND render space: the app draws each page at
            # CANVAS_PAGE_WIDTH (app model.ts PAGE_WIDTH=780) with proportional height and stacks
            # pages by those heights. Previously rendered_w/h were left in PDF points (~595x842),
            # so every shape landed shrunk (~0.76x) and shifted up-left on the 780-wide canvas.
            rendered_w = CANVAS_PAGE_WIDTH
            # Mirror the app's canvas.height = Math.ceil(viewport.height) EXACTLY (pdfLoader.ts),
            # so page_top accumulates identically. Using the raw float drifts ~1px/page, compounding
            # to a visible upward shift on later pages of long papers (~36px over 40 pages).
            rendered_h = float(math.ceil(CANVAS_PAGE_WIDTH * page_pt_h / page_pt_w))
            pages.append({
                "media_x0": float(media.x0),
                "crop_x0": float(crop.x0),
                "crop_y0": float(crop.y0),
                "page_pt_w": page_pt_w,
                "page_pt_h": page_pt_h,
                "rendered_w": rendered_w,
                "rendered_h": rendered_h,
                "page_top": page_top,
            })
            page_top += rendered_h
    finally:
        doc.close()
    return pages


def _pdf_page_geometry(pdf_bytes: bytes) -> List[Dict[str, float]]:
    """Return per-page geometry for mapping PDF coordinates onto the canvas.

    Each entry holds the MediaBox/CropBox origin, the page size in points
    (from the CropBox, falling back to ``page.rect``), the rendered size in
    canvas units, and ``page_top``: the running sum of the rendered heights
    of all earlier pages.

    Returns an empty list, after logging a warning, when the PDF cannot be
    read; ``_page_geom`` then supplies A4 fallback geometry. The temporary
    file is removed in every case, including when writing it fails (that
    failure still propagates to the caller).
    """
    fh = tempfile.NamedTemporaryFile(prefix="cc-auto-map-geom-", suffix=".pdf", delete=False)
    tmp = fh.name
    try:
        # Write inside the try/finally so a failed write cannot leak the
        # delete=False temp file.
        with fh:
            fh.write(pdf_bytes)
        try:
            return _read_page_geometry(tmp)
        except Exception as exc:
            logger.warning(f"PDF geometry read failed; falling back to A4 page geometry: {exc}")
            return []
    finally:
        try:
            os.unlink(tmp)
        except OSError:
            pass
|
||||||
|
|
||||||
|
|
||||||
|
def _page_geom(pages: List[Dict[str, float]], page_number: int) -> Dict[str, float]:
|
||||||
|
if 1 <= page_number <= len(pages):
|
||||||
|
return pages[page_number - 1]
|
||||||
|
_fallback_h = float(math.ceil(CANVAS_PAGE_WIDTH * 842.0 / 595.0))
|
||||||
|
return {
|
||||||
|
"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0,
|
||||||
|
"page_pt_w": 595.0, "page_pt_h": 842.0,
|
||||||
|
"rendered_w": CANVAS_PAGE_WIDTH, "rendered_h": _fallback_h,
|
||||||
|
"page_top": (page_number - 1) * _fallback_h,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _box_to_canvas(box: Optional[Dict[str, Any]], page_number: int, pages: List[Dict[str, float]]) -> Optional[Dict[str, float]]:
|
||||||
|
if not box:
|
||||||
|
return None
|
||||||
|
g = _page_geom(pages, page_number)
|
||||||
|
if box.get("coord_origin") == "TOPLEFT" and {"x", "y", "w", "h"}.issubset(box):
|
||||||
|
# Scale the box into the 780-wide canvas space. px boxes (opencv/gemma regions) are in
|
||||||
|
# rendered-image px at REGIONS_PX_PER_PT px/point; TOPLEFT point boxes are 1 px/point.
|
||||||
|
px_per_pt = REGIONS_PX_PER_PT if box.get("unit") == "px" else 1.0
|
||||||
|
sx = g["rendered_w"] / (g["page_pt_w"] * px_per_pt)
|
||||||
|
sy = g["rendered_h"] / (g["page_pt_h"] * px_per_pt)
|
||||||
|
return {
|
||||||
|
"x": round(float(box["x"]) * sx, 2),
|
||||||
|
"y": round(g["page_top"] + float(box["y"]) * sy, 2),
|
||||||
|
"w": round(float(box["w"]) * sx, 2),
|
||||||
|
"h": round(float(box["h"]) * sy, 2),
|
||||||
|
}
|
||||||
|
if not {"l", "t", "r", "b"}.issubset(box):
|
||||||
|
return None
|
||||||
|
l, t, r, b = (float(box[k]) for k in ("l", "t", "r", "b"))
|
||||||
|
# Canvas pages are rendered from the PDF CropBox with page_left fixed at 0.
|
||||||
|
# Docling boxes are in PDF user-space coordinates, so subtract the CropBox
|
||||||
|
# origin instead of adding it; otherwise cropped PDFs shift right/down.
|
||||||
|
x = (l - g["crop_x0"]) / g["page_pt_w"] * g["rendered_w"]
|
||||||
|
y = g["page_top"] + (g["page_pt_h"] - (t - g["crop_y0"])) / g["page_pt_h"] * g["rendered_h"]
|
||||||
|
w = (r - l) / g["page_pt_w"] * g["rendered_w"]
|
||||||
|
h = (t - b) / g["page_pt_h"] * g["rendered_h"]
|
||||||
|
return {"x": round(x, 2), "y": round(y, 2), "w": round(w, 2), "h": round(h, 2)}
|
||||||
|
|
||||||
|
|
||||||
|
def _response_form_from_region_type(region_type: Any) -> Optional[str]:
|
||||||
|
return {
|
||||||
|
"answer_lines": "lines",
|
||||||
|
"answer_box": "answer-box",
|
||||||
|
"working_space": "working",
|
||||||
|
"lines": "lines",
|
||||||
|
"answer-box": "answer-box",
|
||||||
|
"working": "working",
|
||||||
|
}.get(str(region_type or ""))
|
||||||
|
|
||||||
|
|
||||||
|
def _y_to_canvas(y_value: float, page_number: int, pages: List[Dict[str, float]]) -> float:
    """Map a bottom-left-origin PDF y coordinate on a page to a stacked-canvas y.

    The CropBox origin is subtracted, the axis is flipped against the page
    height, the result is scaled to the rendered height and offset by the
    page's ``page_top``, then rounded to 2 decimals.
    """
    geom = _page_geom(pages, page_number)
    height_pt = geom["page_pt_h"]
    y_in_crop = float(y_value) - geom["crop_y0"]
    canvas_y = geom["page_top"] + (height_pt - y_in_crop) / height_pt * geom["rendered_h"]
    return round(canvas_y, 2)
|
||||||
|
|
||||||
|
|
||||||
|
def _ai_id(template_id: str, *parts: Any) -> str:
|
||||||
|
return str(uuid.uuid5(uuid.NAMESPACE_URL, "/".join(["cc-auto-map", template_id, *[str(p) for p in parts]])))
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_confidence(value: Any = None) -> float:
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
return max(0.0, min(1.0, float(value)))
|
||||||
|
return 0.75
|
||||||
|
|
||||||
|
|
||||||
|
def _margin_values(first_pass: Dict[str, Any], page_number: int) -> Dict[str, Optional[float]]:
|
||||||
|
vals: Dict[str, Optional[float]] = {"left": None, "right": None, "top": None, "bottom": None}
|
||||||
|
for m in first_pass.get("margins") or []:
|
||||||
|
edge = m.get("edge")
|
||||||
|
if edge not in vals:
|
||||||
|
continue
|
||||||
|
if m.get("scope") == "document" and edge in {"left", "right"}:
|
||||||
|
vals[edge] = m.get("value")
|
||||||
|
elif m.get("scope") == "page" and int(m.get("page") or -1) == page_number:
|
||||||
|
vals[edge] = m.get("value")
|
||||||
|
return vals
|
||||||
|
|
||||||
|
|
||||||
|
def _map_first_pass_to_rows(template_id: str, first_pass: Dict[str, Any], pdf_bytes: bytes, extra_regions: Optional[List[Dict[str, Any]]] = None) -> Dict[str, List[Dict[str, Any]]]:
    """Turn a first-pass auto-map result into insertable row dictionaries.

    Args:
        template_id: Template the rows belong to; also seeds every stable row id.
        first_pass: Auto-map output. Reads ``pages`` (keyed by page number, each
            with ``role``, ``main_bands``, ``part_bands``, ``furniture``,
            ``figures``, ``tables``), ``margins`` and ``meta.schema``.
        pdf_bytes: Source PDF, used only to obtain page geometry.
        extra_regions: Optional detected response regions (``page_index``,
            ``bbox``, ``region_type``, ``confidence``, ``detection_method``).

    Returns:
        A dict with the lists ``questions``, ``response_areas``, ``boundaries``
        and ``layout``. All rows are marked ``source="ai"``, ``confirmed=False``.

    Every ``question_id`` on a response area or boundary is guaranteed to match
    an emitted question: orphans are re-pointed at a fallback "Unassigned"
    container, which is inserted if missing.
    """
    pages_geom = _pdf_page_geometry(pdf_bytes)
    questions: List[Dict[str, Any]] = []
    response_areas: List[Dict[str, Any]] = []
    boundaries: List[Dict[str, Any]] = []
    layout: List[Dict[str, Any]] = []
    q_ids: Dict[str, str] = {}
    # Ids of every row already appended to `questions`; replaces repeated
    # linear scans of the list.
    emitted_question_ids: set[str] = set()
    first_part_by_page: Dict[int, str] = {}
    pages_obj = first_pass.get("pages") or {}
    page_keys = sorted(pages_obj, key=int)

    for page_key in page_keys:
        page_number = int(page_key)
        page_index = page_number - 1
        page = pages_obj[page_key]
        margins = _margin_values(first_pass, page_number)
        layout.append({
            "id": _ai_id(template_id, "layout", page_number),
            "template_id": template_id,
            "page_index": page_index,
            "role": page.get("role"),
            "margin_left": margins["left"],
            "margin_right": margins["right"],
            "margin_top": margins["top"],
            "margin_bottom": margins["bottom"],
            "margins_enabled": bool(page.get("margins_enabled", True)),
            "source": "ai",
            "confirmed": False,
            "confidence": 0.8,
            "derivation": "docling-page-layout",
            "meta": {"role_source": page.get("role_source"), "schema": first_pass.get("meta", {}).get("schema")},
        })

        for band in page.get("main_bands") or []:
            label = str(band.get("question") or "").strip()
            if not label:
                continue
            qid = q_ids.setdefault(label, _ai_id(template_id, "question", label))
            if qid not in emitted_question_ids:
                emitted_question_ids.add(qid)
                questions.append({
                    "id": qid,
                    "template_id": template_id,
                    "label": label,
                    "order": len(q_ids) - 1,
                    "max_marks": 0,
                    "is_container": True,
                    "source": "ai",
                    "confirmed": False,
                    "confidence": _safe_confidence(band.get("confidence")),
                    "derivation": "docling-main-band",
                })
            # Boundaries are emitted per band and per page, so a question that
            # continues on a later page still gets its start/end lines there.
            for edge, yv in (("start", band.get("y_start")), ("end", band.get("y_end"))):
                if yv is not None:
                    boundaries.append({
                        "id": _ai_id(template_id, "boundary", label, edge, page_number),
                        "template_id": template_id,
                        "question_id": qid,
                        "label": f"{label}:{edge}",
                        "page_index": page_index,
                        "y": _y_to_canvas(float(yv), page_number, pages_geom),
                        "bounds": None,
                        "source": "ai",
                        "confirmed": False,
                        "confidence": _safe_confidence(band.get("confidence")),
                        "derivation": "docling-main-band",
                    })

        for band in page.get("part_bands") or []:
            label = str(band.get("label") or "").strip()
            parent_label = str(band.get("question") or "").strip()
            if not label:
                continue
            # Key the cache by the label the id is actually derived from. Keying
            # by the raw (possibly empty) parent label made every part without a
            # `question` share the "" entry and inherit the first part's parent.
            parent_key = parent_label or label.split(".")[0]
            parent_id = q_ids.setdefault(parent_key, _ai_id(template_id, "question", parent_key))
            # NOTE(review): a container is only inserted when the band names its
            # parent explicitly; a part whose derived parent has no main band
            # keeps a parent_id with no emitted row - confirm this is intended.
            if parent_label and parent_id not in emitted_question_ids:
                emitted_question_ids.add(parent_id)
                questions.append({
                    "id": parent_id,
                    "template_id": template_id,
                    "label": parent_label,
                    "order": len(q_ids) - 1,
                    "max_marks": 0,
                    "is_container": True,
                    "source": "ai",
                    "confirmed": False,
                    "confidence": 0.7,
                    "derivation": "docling-inferred-main-question",
                })
            pid = _ai_id(template_id, "part", label)
            first_part_by_page.setdefault(page_index, pid)
            # B1 live-route papers can carry continuation bands for the same part label
            # on later pages. The UUID is intentionally stable per template+part label,
            # so only insert the first question row; later continuations still map
            # response/context regions through first_part_by_page.
            if pid in emitted_question_ids:
                continue
            bounds = None
            y1, y2 = band.get("y_start"), band.get("y_end")
            if margins["left"] is not None and margins["right"] is not None and y1 is not None and y2 is not None:
                top = max(float(y1), float(y2))
                bottom = min(float(y1), float(y2))
                bounds = _box_to_canvas({"l": margins["left"], "r": margins["right"], "t": top, "b": bottom, "coord_origin": "BOTTOMLEFT"}, page_number, pages_geom)
            # Fall back to the label's own box when margins or band extent are missing.
            bounds = bounds or _box_to_canvas(band.get("label_box"), page_number, pages_geom)
            emitted_question_ids.add(pid)
            questions.append({
                "id": pid,
                "template_id": template_id,
                "parent_id": parent_id,
                "label": label,
                "order": len(questions),
                "max_marks": 0,
                "is_container": False,
                "bounds": bounds,
                "page": page_number,
                "source": "ai",
                "confirmed": False,
                "confidence": _safe_confidence(band.get("confidence")),
                "derivation": "docling-part-band-x-margins",
            })

    default_qid = questions[0]["id"] if questions else _ai_id(template_id, "question", "auto")
    region_sources = (
        ("furniture", "furniture", None, "docling-furniture"),
        ("figures", "context", "figure", "docling-context-figure"),
        ("tables", "context", "data_table", "docling-table"),
    )
    for page_key in page_keys:
        page_number = int(page_key)
        page_index = page_number - 1
        page = pages_obj[page_key]
        # Page-level regions are owned by the first part seen on that page.
        owner_qid = first_part_by_page.get(page_index, default_qid)
        for collection, kind, context_type, derivation in region_sources:
            for idx, item in enumerate(page.get(collection) or []):
                bounds = _box_to_canvas(item.get("box"), page_number, pages_geom)
                if bounds:
                    row = {
                        "id": _ai_id(template_id, collection, page_number, idx),
                        "template_id": template_id,
                        "question_id": owner_qid,
                        "page": page_number,
                        "bounds": bounds,
                        "kind": kind,
                        "source": "ai",
                        "confirmed": False,
                        "confidence": 0.65,
                        "derivation": derivation,
                    }
                    if context_type:
                        row["context_type"] = context_type
                    response_areas.append(row)

    for idx, region in enumerate(extra_regions or []):
        page_index = int(region.get("page_index", 0))
        bounds = _box_to_canvas(region.get("bbox") or {}, page_index + 1, pages_geom)
        if bounds:
            response_form = _response_form_from_region_type(region.get("region_type"))
            if response_form:
                response_areas.append({
                    "id": _ai_id(template_id, "region", page_index, idx),
                    "template_id": template_id,
                    "question_id": first_part_by_page.get(page_index, default_qid),
                    "page": page_index + 1,
                    "bounds": bounds,
                    "kind": "response",
                    "response_form": response_form,
                    "source": "ai",
                    "confirmed": False,
                    "confidence": _safe_confidence(region.get("confidence")),
                    "derivation": region.get("detection_method") or "opencv-response-region",
                })

    # Integrity guard: every response_area/boundary question_id must reference an inserted question
    # (FK exam_response_areas/exam_boundaries -> exam_questions). On papers where band detection yields
    # few/no questions but opencv/gemma still emit regions, those regions point at the synthetic
    # default_qid which was never inserted. Ensure that fallback container question exists and reattach
    # any orphan child rows to it, so persistence can't violate the FK.
    orphans = [r for r in (response_areas + boundaries) if r.get("question_id") not in emitted_question_ids]
    if orphans:
        if default_qid not in emitted_question_ids:
            questions.insert(0, {
                "id": default_qid,
                "template_id": template_id,
                "label": "Unassigned",
                "order": 0,
                "max_marks": 0,
                "is_container": True,
                "source": "ai",
                "confirmed": False,
                "confidence": 0.5,
                "derivation": "auto-map-fallback-container",
            })
            emitted_question_ids.add(default_qid)
        for r in orphans:
            r["question_id"] = default_qid

    return {"questions": questions, "response_areas": response_areas, "boundaries": boundaries, "layout": layout}
|
||||||
|
|
||||||
|
|
||||||
|
def _dedupe_rows_by_id(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop rows whose ``id`` repeats an earlier row's, keeping the first one.

    Ids are compared by their ``str`` form. Rows with a missing or falsy ``id``
    are never treated as duplicates and are always kept. Input order is preserved.
    """
    kept: List[Dict[str, Any]] = []
    known_ids: set[str] = set()
    for candidate in rows:
        identifier = candidate.get("id")
        if not identifier:
            kept.append(candidate)
            continue
        token = str(identifier)
        if token not in known_ids:
            known_ids.add(token)
            kept.append(candidate)
    return kept
|
||||||
|
|
||||||
|
|
||||||
|
def _refresh_ai_rows(ctx: ExamContext, template_id: str, rows: Dict[str, List[Dict[str, Any]]]) -> None:
    """Replace a template's unconfirmed AI rows with a fresh set.

    First deletes, from each structural table, the rows for ``template_id``
    that have ``source == "ai"`` and ``confirmed == False``. Then inserts the
    de-duplicated rows from ``rows``; empty groups are skipped.
    """
    client = ctx.supabase

    # exam_questions is cleared last and filled first: response areas and
    # boundaries carry a question_id that points at it.
    delete_order = ("exam_response_areas", "exam_boundaries", "exam_template_layout", "exam_questions")
    for table_name in delete_order:
        stale = client.table(table_name).delete()
        stale = stale.eq("template_id", template_id).eq("source", "ai").eq("confirmed", False)
        stale.execute()

    insert_plan = (
        ("exam_questions", "questions"),
        ("exam_response_areas", "response_areas"),
        ("exam_boundaries", "boundaries"),
        ("exam_template_layout", "layout"),
    )
    for table_name, rows_key in insert_plan:
        payload = _dedupe_rows_by_id(rows.get(rows_key) or [])
        if not payload:
            continue
        client.table(table_name).insert(payload).execute()
|
||||||
|
|
||||||
|
|
||||||
|
def _run_auto_map_merge(ctx: ExamContext, template_id: str, pdf_bytes: bytes, source_label: str) -> Dict[str, List[Dict[str, Any]]]:
    """Run auto-map on a PDF and persist the resulting AI rows for a template.

    Steps: run ``auto_map`` on the bytes; best-effort detect extra response
    regions from a temporary copy of the PDF; convert everything to rows;
    replace the template's unconfirmed AI rows; copy ``paper_code``/``n_pages``
    from the first-pass metadata onto the template when present.

    Args:
        ctx: Exam context providing the Supabase client.
        template_id: Template whose AI rows are refreshed.
        pdf_bytes: Raw source PDF.
        source_label: Label passed to ``auto_map`` as ``source_pdf``.

    Returns:
        The row groups produced by ``_map_first_pass_to_rows``.

    Raises:
        Whatever ``auto_map``, row mapping or persistence raise. Failures in
        response-region detection are logged and do not abort the run.
    """
    first_pass = auto_map(pdf_bytes, source_pdf=source_label)

    extra_regions: List[Dict[str, Any]] = []
    tmp_path: Optional[str] = None
    try:
        with tempfile.NamedTemporaryFile(prefix="cc-auto-map-regions-", suffix=".pdf", delete=False) as fh:
            # Record the path before writing so a failed write still gets cleaned up.
            tmp_path = fh.name
            fh.write(pdf_bytes)
        # The handle is closed (and therefore flushed) before the detector
        # opens the file by path.
        extra_regions = detect_response_regions_from_pdf(tmp_path)
    except Exception as exc:
        # Deliberate best effort: region detection is optional enrichment.
        logger.info(f"auto-map response-region detection skipped for template {template_id}: {exc}")
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    rows = _map_first_pass_to_rows(template_id, first_pass, pdf_bytes, extra_regions)
    _refresh_ai_rows(ctx, template_id, rows)

    meta = first_pass.get("meta", {})
    updates = {"exam_code": meta.get("paper_code"), "page_count": meta.get("n_pages")}
    updates = {k: v for k, v in updates.items() if v is not None}
    # Skip the round trip when there is nothing to write; an empty update
    # payload carries no information.
    if updates:
        ctx.supabase.table("exam_templates").update(updates).eq("id", template_id).execute()
    return rows
|
||||||
|
|
||||||
|
|
||||||
|
def _run_auto_map_job(job_id: str, ctx: ExamContext, template_id: str, pdf_bytes: bytes, source_label: str) -> None:
    """Run an auto-map merge as a tracked job and record its outcome.

    The job status moves to ``running``, then to ``completed`` (with per-group
    row counts) or ``failed`` (with the error text). Exceptions are logged and
    recorded on the status; none are re-raised.
    """
    _set_auto_map_status(job_id, {"status": "running", "template_id": template_id})
    try:
        merged = _run_auto_map_merge(ctx, template_id, pdf_bytes, source_label)
        counts = {group: len(items) for group, items in merged.items()}
        outcome = {"status": "completed", "template_id": template_id, "counts": counts}
        _set_auto_map_status(job_id, outcome)
    except Exception as exc:
        logger.exception(f"auto-map job failed for template {template_id}: {exc}")
        failure = {"status": "failed", "template_id": template_id, "error": str(exc)}
        _set_auto_map_status(job_id, failure)
|
||||||
|
|
||||||
|
|
||||||
# ─── templates ───────────────────────────────────────────────────────────────
|
# ─── templates ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
@ -268,12 +684,13 @@ async def create_template(
|
|||||||
|
|
||||||
|
|
||||||
@router.get("/catalogue")
|
@router.get("/catalogue")
|
||||||
async def list_catalogue_papers() -> Dict[str, Any]:
|
async def list_catalogue_papers(
|
||||||
"""Lightweight exam-board paper catalogue for the create dialog."""
|
ctx: ExamContext = Depends(get_exam_context),
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
"""Lightweight authenticated exam-board metadata catalogue for the create dialog."""
|
||||||
try:
|
try:
|
||||||
sb = SupabaseServiceRoleClient().supabase
|
|
||||||
res = (
|
res = (
|
||||||
sb.table("eb_exams")
|
ctx.supabase.table("eb_exams")
|
||||||
.select("id, exam_code, spec_code, paper_code, tier, session, type_code, storage_loc")
|
.select("id, exam_code, spec_code, paper_code, tier, session, type_code, storage_loc")
|
||||||
.eq("type_code", "QP")
|
.eq("type_code", "QP")
|
||||||
.order("exam_code")
|
.order("exam_code")
|
||||||
@ -284,6 +701,50 @@ async def list_catalogue_papers() -> Dict[str, Any]:
|
|||||||
raise HTTPException(status_code=502, detail=f"Could not load catalogue papers: {exc}")
|
raise HTTPException(status_code=502, detail=f"Could not load catalogue papers: {exc}")
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/catalogue/{exam_id}/signed-url")
async def get_catalogue_paper_signed_url(
    exam_id: str,
    expires_in: int = 300,
    ctx: ExamContext = Depends(get_exam_context),
) -> Dict[str, Any]:
    """Issue a short-lived signed URL for a catalogue question paper.

    The ``eb_exams`` lookup runs through the caller's own client. Only the
    signing step uses service-role storage access, as a scoped backend
    exception; raw cc.examboards object reads remain denied by
    storage.objects RLS.

    ``expires_in`` is clamped to 60-3600 seconds (a falsy value means 300).
    Responds 404 when the paper is unknown, has no storage location, or is not
    stored in ``cc.examboards``; 502 on any other failure.
    """
    ttl = max(60, min(int(expires_in or 300), 3600))
    try:
        lookup = (
            ctx.supabase.table("eb_exams")
            .select("id, exam_code, storage_loc")
            .eq("id", exam_id)
            .eq("type_code", "QP")
            .limit(1)
            .execute()
        )
        row = _first(lookup)
        storage_loc = row.get("storage_loc") if row else None
        if not storage_loc:
            raise HTTPException(status_code=404, detail="Catalogue paper not found")
        try:
            bucket, path = _parse_storage_loc(storage_loc)
        except ValueError:
            raise HTTPException(status_code=404, detail="Catalogue paper not found")
        # Only the exam-board bucket may be signed through this endpoint.
        if bucket != "cc.examboards":
            raise HTTPException(status_code=404, detail="Catalogue paper not found")
        signing_result = StorageAdmin().create_signed_url(bucket, path, ttl)
        signed_url = _signed_url_value(signing_result)
        return {
            "exam_id": row["id"],
            "exam_code": row.get("exam_code"),
            "bucket": bucket,
            "path": path,
            "expires_in": ttl,
            "signed_url": signed_url,
        }
    except HTTPException:
        # Deliberate 404s pass through untouched.
        raise
    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"Could not sign catalogue paper URL: {exc}")
|
||||||
|
|
||||||
|
|
||||||
@router.get("/templates")
|
@router.get("/templates")
|
||||||
async def list_templates(
|
async def list_templates(
|
||||||
include_archived: bool = False,
|
include_archived: bool = False,
|
||||||
@ -315,11 +776,19 @@ async def get_template(
|
|||||||
boundaries = _rows(
|
boundaries = _rows(
|
||||||
ctx.supabase.table("exam_boundaries").select("*").eq("template_id", template_id).execute()
|
ctx.supabase.table("exam_boundaries").select("*").eq("template_id", template_id).execute()
|
||||||
)
|
)
|
||||||
|
layout = _rows(
|
||||||
|
ctx.supabase.table("exam_template_layout")
|
||||||
|
.select("*")
|
||||||
|
.eq("template_id", template_id)
|
||||||
|
.order("page_index")
|
||||||
|
.execute()
|
||||||
|
)
|
||||||
return {
|
return {
|
||||||
**template,
|
**template,
|
||||||
"questions": questions,
|
"questions": questions,
|
||||||
"response_areas": response_areas,
|
"response_areas": response_areas,
|
||||||
"boundaries": boundaries,
|
"boundaries": boundaries,
|
||||||
|
"layout": layout,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -331,48 +800,61 @@ async def get_template_source_pdf(
|
|||||||
template = _fetch_template_or_404(ctx, template_id)
|
template = _fetch_template_or_404(ctx, template_id)
|
||||||
_require_source_visibility_or_404(ctx, template)
|
_require_source_visibility_or_404(ctx, template)
|
||||||
|
|
||||||
bucket: Optional[str] = None
|
_, _, pdf_bytes = _resolve_template_source(ctx, template)
|
||||||
path: Optional[str] = None
|
|
||||||
|
|
||||||
if template.get("exam_id"):
|
|
||||||
storage_loc = _lookup_exam_storage_loc(template["exam_id"])
|
|
||||||
if not storage_loc:
|
|
||||||
raise HTTPException(status_code=404, detail="Template source not found")
|
|
||||||
try:
|
|
||||||
bucket, path = _parse_storage_loc(storage_loc)
|
|
||||||
except ValueError:
|
|
||||||
raise HTTPException(status_code=404, detail="Template source not found")
|
|
||||||
elif template.get("source_file_id"):
|
|
||||||
# Resolve the file row via service role (authz already done above: the caller proved they
|
|
||||||
# can see this template, and source_file_id is the template's own file). Reading `files`
|
|
||||||
# as-the-user trips a pre-existing broken RLS policy on cabinet_memberships
|
|
||||||
# (42P17 infinite recursion) — documented service-role exception, like the catalogue lookup.
|
|
||||||
file_row = _first(
|
|
||||||
SupabaseServiceRoleClient().supabase.table("files")
|
|
||||||
.select("bucket, path, mime_type, name")
|
|
||||||
.eq("id", template["source_file_id"])
|
|
||||||
.limit(1)
|
|
||||||
.execute()
|
|
||||||
)
|
|
||||||
if not file_row or not file_row.get("bucket") or not file_row.get("path"):
|
|
||||||
raise HTTPException(status_code=404, detail="Template source not found")
|
|
||||||
bucket = file_row["bucket"]
|
|
||||||
path = file_row["path"]
|
|
||||||
else:
|
|
||||||
raise HTTPException(status_code=404, detail="Template source not found")
|
|
||||||
|
|
||||||
if not bucket or not path:
|
|
||||||
raise HTTPException(status_code=404, detail="Template source not found")
|
|
||||||
|
|
||||||
try:
|
|
||||||
pdf_bytes = StorageAdmin().download_file(bucket, path)
|
|
||||||
except Exception as exc:
|
|
||||||
logger.warning(f"Template source download failed for template {template_id}: {exc}")
|
|
||||||
raise HTTPException(status_code=404, detail="Template source not found")
|
|
||||||
|
|
||||||
return Response(content=pdf_bytes, media_type="application/pdf")
|
return Response(content=pdf_bytes, media_type="application/pdf")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/templates/{template_id}/auto-map", response_model=None)
async def auto_map_template(
    template_id: str,
    background_tasks: BackgroundTasks,
    ctx: ExamContext = Depends(get_exam_context),
) -> Dict[str, Any] | JSONResponse:
    """Auto-map a template's source PDF into AI-suggested structure.

    Requires template ownership and source visibility, and is refused with 409
    once the template has recorded marks. When ``_pdf_has_text_layer`` reports
    a text layer the merge runs inline and the refreshed template is returned
    (a graph projection is scheduled afterwards). Otherwise, or when that check
    itself fails, the merge is queued as a background job and a 202 with the
    ``job_id`` is returned.

    Inline failures map to 422 for ``AutoMapError``/``ValueError`` and 502 for
    anything else.
    """
    template = _fetch_template_or_404(ctx, template_id)
    _require_owner(ctx, template)
    _require_source_visibility_or_404(ctx, template)
    if _template_has_recorded_marks(ctx, template_id):
        raise HTTPException(status_code=409, detail="Template has recorded marks; auto-map structural refresh is blocked.")

    bucket, path, pdf_bytes = _resolve_template_source(ctx, template)
    source_label = f"{bucket}/{path}"

    try:
        has_text_layer = _pdf_has_text_layer(pdf_bytes)
    except Exception as exc:
        logger.warning(f"auto-map text-layer detection failed for template {template_id}; falling back to OCR queue: {exc}")
        has_text_layer = False

    if has_text_layer:
        try:
            _run_auto_map_merge(ctx, template_id, pdf_bytes, source_label)
        except (AutoMapError, ValueError) as exc:
            raise HTTPException(status_code=422, detail=f"Auto-map failed: {exc}")
        except Exception as exc:
            logger.exception(f"auto-map failed for template {template_id}: {exc}")
            raise HTTPException(status_code=502, detail=f"Auto-map failed: {exc}")
        background_tasks.add_task(project_template_safe, template_id)
        return await get_template(template_id, ctx)

    # No usable text layer: hand the work to a background job the client can poll.
    job_id = str(uuid.uuid4())
    _set_auto_map_status(job_id, {"status": "queued", "template_id": template_id})
    background_tasks.add_task(_run_auto_map_job, job_id, ctx, template_id, pdf_bytes, source_label)
    return JSONResponse(status_code=202, content={"status": "accepted", "job_id": job_id})
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/templates/{template_id}/auto-map/{job_id}/status")
async def auto_map_status(
    template_id: str,
    job_id: str,
    ctx: ExamContext = Depends(get_exam_context),
) -> Dict[str, Any]:
    """Report the status of an auto-map job for a template the caller owns.

    Responds 404 when the job is unknown or was recorded for a different
    template. Once the job is ``completed`` the current template payload is
    attached under ``template``.
    """
    template = _fetch_template_or_404(ctx, template_id)
    _require_owner(ctx, template)

    status = _get_auto_map_status(job_id)
    belongs_to_template = bool(status) and status.get("template_id") == template_id
    if not belongs_to_template:
        raise HTTPException(status_code=404, detail="Auto-map job not found")

    # Copy so the stored status record is not mutated by the enrichment below.
    body = dict(status)
    if body.get("status") == "completed":
        body["template"] = await get_template(template_id, ctx)
    return body
|
||||||
|
|
||||||
|
|
||||||
@router.put("/templates/{template_id}")
|
@router.put("/templates/{template_id}")
|
||||||
async def replace_template(
|
async def replace_template(
|
||||||
template_id: str,
|
template_id: str,
|
||||||
@ -412,6 +894,7 @@ async def replace_template(
|
|||||||
# remove them first (we delete by template_id rather than rely on cascade for predictability).
|
# remove them first (we delete by template_id rather than rely on cascade for predictability).
|
||||||
sb.table("exam_response_areas").delete().eq("template_id", template_id).execute()
|
sb.table("exam_response_areas").delete().eq("template_id", template_id).execute()
|
||||||
sb.table("exam_boundaries").delete().eq("template_id", template_id).execute()
|
sb.table("exam_boundaries").delete().eq("template_id", template_id).execute()
|
||||||
|
sb.table("exam_template_layout").delete().eq("template_id", template_id).execute()
|
||||||
sb.table("exam_questions").delete().eq("template_id", template_id).execute()
|
sb.table("exam_questions").delete().eq("template_id", template_id).execute()
|
||||||
|
|
||||||
# Re-insert, preserving client-supplied UUIDs (Neo4j join keys, spec §2).
|
# Re-insert, preserving client-supplied UUIDs (Neo4j join keys, spec §2).
|
||||||
@ -431,6 +914,10 @@ async def replace_template(
|
|||||||
"spec_ref": q.spec_ref,
|
"spec_ref": q.spec_ref,
|
||||||
"bounds": q.bounds, # drawn Part box (73); null for derived main questions
|
"bounds": q.bounds, # drawn Part box (73); null for derived main questions
|
||||||
"page": q.page,
|
"page": q.page,
|
||||||
|
"source": q.source,
|
||||||
|
"confirmed": q.confirmed,
|
||||||
|
"confidence": q.confidence,
|
||||||
|
"derivation": q.derivation,
|
||||||
}
|
}
|
||||||
if q.id:
|
if q.id:
|
||||||
r["id"] = q.id
|
r["id"] = q.id
|
||||||
@ -451,6 +938,8 @@ async def replace_template(
|
|||||||
"source": ra.source,
|
"source": ra.source,
|
||||||
"confirmed": ra.confirmed,
|
"confirmed": ra.confirmed,
|
||||||
"confidence": ra.confidence,
|
"confidence": ra.confidence,
|
||||||
|
"mark_subtype": ra.mark_subtype,
|
||||||
|
"derivation": ra.derivation,
|
||||||
}
|
}
|
||||||
if ra.id:
|
if ra.id:
|
||||||
r["id"] = ra.id
|
r["id"] = ra.id
|
||||||
@ -469,15 +958,41 @@ async def replace_template(
|
|||||||
"bounds": b.bounds,
|
"bounds": b.bounds,
|
||||||
"source": b.source,
|
"source": b.source,
|
||||||
"confirmed": b.confirmed,
|
"confirmed": b.confirmed,
|
||||||
|
"confidence": b.confidence,
|
||||||
|
"derivation": b.derivation,
|
||||||
}
|
}
|
||||||
if b.id:
|
if b.id:
|
||||||
r["id"] = b.id
|
r["id"] = b.id
|
||||||
b_rows.append({k: v for k, v in r.items() if v is not None})
|
b_rows.append({k: v for k, v in r.items() if v is not None})
|
||||||
sb.table("exam_boundaries").insert(b_rows).execute()
|
sb.table("exam_boundaries").insert(b_rows).execute()
|
||||||
|
|
||||||
|
if body.layout:
|
||||||
|
layout_rows = []
|
||||||
|
for item in body.layout:
|
||||||
|
r = {
|
||||||
|
"template_id": template_id,
|
||||||
|
"page_index": item.page_index,
|
||||||
|
"role": item.role,
|
||||||
|
"margin_left": item.margin_left,
|
||||||
|
"margin_right": item.margin_right,
|
||||||
|
"margin_top": item.margin_top,
|
||||||
|
"margin_bottom": item.margin_bottom,
|
||||||
|
"margins_enabled": item.margins_enabled,
|
||||||
|
"source": item.source,
|
||||||
|
"confirmed": item.confirmed,
|
||||||
|
"confidence": item.confidence,
|
||||||
|
"derivation": item.derivation,
|
||||||
|
"meta": item.meta,
|
||||||
|
}
|
||||||
|
if item.id:
|
||||||
|
r["id"] = item.id
|
||||||
|
layout_rows.append({k: v for k, v in r.items() if v is not None})
|
||||||
|
sb.table("exam_template_layout").insert(layout_rows).execute()
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Exam template {template_id} replaced: {len(body.questions)} questions, "
|
f"Exam template {template_id} replaced: {len(body.questions)} questions, "
|
||||||
f"{len(body.response_areas)} regions, {len(body.boundaries)} boundaries"
|
f"{len(body.response_areas)} regions, {len(body.boundaries)} boundaries, "
|
||||||
|
f"{len(body.layout)} layout rows"
|
||||||
)
|
)
|
||||||
# R3.5.4: a successful save enqueues a graph projection into cc.public.exams. BackgroundTasks
|
# R3.5.4: a successful save enqueues a graph projection into cc.public.exams. BackgroundTasks
|
||||||
# is acceptable for Sprint 4 (durability via a real queue is a later step); failures are
|
# is acceptable for Sprint 4 (durability via a real queue is a later step); failures are
|
||||||
|
|||||||
@ -26,6 +26,7 @@ from fastapi.responses import JSONResponse
|
|||||||
from modules.auth.supabase_bearer import SupabaseBearer
|
from modules.auth.supabase_bearer import SupabaseBearer
|
||||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
from modules.database.supabase.utils.storage import StorageAdmin
|
from modules.database.supabase.utils.storage import StorageAdmin
|
||||||
|
from modules.upload_validation import read_upload_bytes
|
||||||
from modules.logger_tool import initialise_logger
|
from modules.logger_tool import initialise_logger
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
@ -59,10 +60,9 @@ async def upload_single_file(
|
|||||||
if not user_id:
|
if not user_id:
|
||||||
raise HTTPException(status_code=401, detail="User ID required")
|
raise HTTPException(status_code=401, detail="User ID required")
|
||||||
|
|
||||||
# Read file content
|
# Validate MIME/type and read file content with a hard size limit.
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
file_size = len(file_bytes)
|
file_size = len(file_bytes)
|
||||||
mime_type = file.content_type or 'application/octet-stream'
|
|
||||||
filename = file.filename or path
|
filename = file.filename or path
|
||||||
|
|
||||||
logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}")
|
logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}")
|
||||||
@ -234,10 +234,9 @@ async def upload_directory(
|
|||||||
# Process each file
|
# Process each file
|
||||||
for i, (file, relative_path) in enumerate(zip(files, relative_paths)):
|
for i, (file, relative_path) in enumerate(zip(files, relative_paths)):
|
||||||
try:
|
try:
|
||||||
# Read file content
|
# Validate MIME/type and read file content with a hard size limit.
|
||||||
file_bytes = await file.read()
|
file_bytes, mime_type = await read_upload_bytes(file)
|
||||||
file_size = len(file_bytes)
|
file_size = len(file_bytes)
|
||||||
mime_type = file.content_type or 'application/octet-stream'
|
|
||||||
filename = file.filename or f"file_{i}"
|
filename = file.filename or f"file_{i}"
|
||||||
|
|
||||||
total_size += file_size
|
total_size += file_size
|
||||||
@ -291,6 +290,8 @@ async def upload_directory(
|
|||||||
|
|
||||||
logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}")
|
logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}")
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to upload file {relative_path}: {e}")
|
logger.error(f"Failed to upload file {relative_path}: {e}")
|
||||||
# Continue with other files, don't fail entire upload
|
# Continue with other files, don't fail entire upload
|
||||||
|
|||||||
@ -46,7 +46,7 @@ def initialize_buckets() -> dict:
|
|||||||
file_size_limit=1000 * 1024 * 1024, # 1GB
|
file_size_limit=1000 * 1024 * 1024, # 1GB
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
# Exam Board files
|
# Exam Board files (admin-curated public exam corpus: QP/MS/insert/ER + specs)
|
||||||
{
|
{
|
||||||
"id": "cc.examboards",
|
"id": "cc.examboards",
|
||||||
"options": CreateBucketOptions(
|
"options": CreateBucketOptions(
|
||||||
@ -55,6 +55,34 @@ def initialize_buckets() -> dict:
|
|||||||
file_size_limit=1000 * 1024 * 1024, # 1GB
|
file_size_limit=1000 * 1024 * 1024, # 1GB
|
||||||
)
|
)
|
||||||
},
|
},
|
||||||
|
# ── Storage taxonomy bins (access scoped by RLS on bucket + leading path segment; RLS = D1) ──
|
||||||
|
# Platform-managed public/shared assets (readable by all authenticated users).
|
||||||
|
{
|
||||||
|
"id": "cc.public",
|
||||||
|
"options": CreateBucketOptions(
|
||||||
|
name="Classroom Copilot Public",
|
||||||
|
public=False,
|
||||||
|
file_size_limit=1000 * 1024 * 1024, # 1GB
|
||||||
|
)
|
||||||
|
},
|
||||||
|
# Institute-scoped operational assets: cc.institutes/{institute_id}/...
|
||||||
|
{
|
||||||
|
"id": "cc.institutes",
|
||||||
|
"options": CreateBucketOptions(
|
||||||
|
name="Classroom Copilot Institutes",
|
||||||
|
public=False,
|
||||||
|
file_size_limit=1000 * 1024 * 1024, # 1GB
|
||||||
|
)
|
||||||
|
},
|
||||||
|
# Platform-admin-only assets, seeds, intake/staging for unidentified papers.
|
||||||
|
{
|
||||||
|
"id": "cc.admin",
|
||||||
|
"options": CreateBucketOptions(
|
||||||
|
name="Classroom Copilot Admin",
|
||||||
|
public=False,
|
||||||
|
file_size_limit=1000 * 1024 * 1024, # 1GB
|
||||||
|
)
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
results = {}
|
results = {}
|
||||||
@ -81,11 +109,17 @@ def initialize_buckets() -> dict:
|
|||||||
logger.error(f"Failed to create bucket: {bucket['id']}")
|
logger.error(f"Failed to create bucket: {bucket['id']}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
results[bucket["id"]] = {
|
# Idempotent: an already-existing bucket is not a failure on re-run.
|
||||||
"status": "error",
|
if any(s in str(e).lower() for s in ("already exists", "duplicate", "resource already")):
|
||||||
"error": str(e)
|
results[bucket["id"]] = {"status": "exists", "result": str(e)}
|
||||||
}
|
success_count += 1
|
||||||
logger.error(f"Error creating bucket {bucket['id']}: {str(e)}")
|
logger.info(f"Bucket already exists (ok): {bucket['id']}")
|
||||||
|
else:
|
||||||
|
results[bucket["id"]] = {
|
||||||
|
"status": "error",
|
||||||
|
"error": str(e)
|
||||||
|
}
|
||||||
|
logger.error(f"Error creating bucket {bucket['id']}: {str(e)}")
|
||||||
|
|
||||||
# Determine overall success
|
# Determine overall success
|
||||||
if success_count == total_count:
|
if success_count == total_count:
|
||||||
|
|||||||
3
run/initialization/manifests/_corpus_store/.gitignore
vendored
Normal file
3
run/initialization/manifests/_corpus_store/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Persistent local corpus store — PDFs are NOT committed (re-downloadable from manifest).
|
||||||
|
*
|
||||||
|
!.gitignore
|
||||||
14576
run/initialization/manifests/exam-corpus.yaml
Normal file
14576
run/initialization/manifests/exam-corpus.yaml
Normal file
File diff suppressed because it is too large
Load Diff
501
run/initialization/manifests/generate_corpus_manifest.py
Normal file
501
run/initialization/manifests/generate_corpus_manifest.py
Normal file
@ -0,0 +1,501 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
generate_corpus_manifest.py — build the public exam-corpus manifest from OFFICIAL sources,
|
||||||
|
verifying every source URL is live before it is written.
|
||||||
|
|
||||||
|
Output: exam-corpus.yaml (consumed by run/initialization/seed_exam_corpus.py).
|
||||||
|
|
||||||
|
Sources (all official exam-board hosts; public past-paper PDFs):
|
||||||
|
AQA filestore.aqa.org.uk — fully templatable; enumerated + HEAD-verified here.
|
||||||
|
Edexcel qualifications.pearson.com — date suffix non-derivable; confirmed URLs embedded.
|
||||||
|
OCR www.ocr.org.uk/Images — opaque doc-id; confirmed URLs embedded.
|
||||||
|
|
||||||
|
Every URL is HEAD/GET-checked (200 + application/pdf) before inclusion, so the committed
|
||||||
|
manifest never carries a dead or wrong-cased link. Re-run to refresh as more sessions go public.
|
||||||
|
|
||||||
|
Conventions (locked — see ~/cc/ideas/2026-06-07-exam-paper-ingestion.md):
|
||||||
|
session = "YYYY-Mon" e.g. 2022-Jun
|
||||||
|
exam_code = BOARD-award-PAPER-SESSIONCOMPACT-ROLE e.g. AQA-8463-1H-2022JUN-QP
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import concurrent.futures as cf
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Root URL under which aqa_url() builds every AQA candidate link.
AQA_BASE = "https://filestore.aqa.org.uk/sample-papers-and-mark-schemes"
# Manifest role -> token used in the AQA filename (note ER is published as "WRE").
ROLE_TOKEN = {"QP": "QP", "MS": "MS", "ER": "WRE"}  # AQA filestore role tokens
# Session month code -> (URL folder name, display abbreviation used in "YYYY-Mon").
MONTHS = {"JUN": ("june", "Jun"), "NOV": ("november", "Nov")}
# Date stamped into every entry's provenance["fetched"] field.
FETCHED = "2026-06-07"
|
||||||
|
|
||||||
|
|
||||||
|
def head_ok(url: str, timeout: int = 20) -> bool:
    """Return True iff *url* resolves to a real PDF, following redirects.

    A URL counts as live when the final response status is 200 or 206 and its
    Content-Type header mentions "pdf". AQA soft-404s redirect to
    www.aqa.org.uk/req_path=... (text/html), so the content type is checked as well
    as the status.

    Despite the name, this sends a GET with a tiny ``Range`` header (stdlib urllib)
    rather than a HEAD request, so the whole PDF is never pulled just to verify it.

    Args:
        url: Candidate URL to probe.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        True for a 200/206 PDF response. False for anything else, including HTTP
        errors, network failures and malformed URLs (no ordinary exception escapes).
    """
    try:
        # Request() raises ValueError for a URL without a scheme, so it is built inside
        # the try: one malformed candidate must not abort a whole verification run.
        req = urllib.request.Request(
            url, headers={"Range": "bytes=0-3", "User-Agent": "cc-corpus/1.0"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as r:
            ctype = (r.headers.get("content-type") or "").lower()
            return r.status in (200, 206) and "pdf" in ctype
    except urllib.error.HTTPError as e:
        # A 206/200 PDF never lands here; 404/redirect-to-html will.
        ctype = (e.headers.get("content-type") or "").lower() if e.headers else ""
        return e.code in (200, 206) and "pdf" in ctype
    except Exception:
        # Deliberate best-effort: any other failure (DNS, timeout, bad URL) means "not live".
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────── AQA catalogue ───────────────────────────
|
||||||
|
# spec_code, subject, award, award_level, first_teach, [(filestore_papercode, paper_code, tier), ...]
|
||||||
|
def _gcse_single(award: str) -> List[Tuple[str, str, Optional[str]]]:
|
||||||
|
out = []
|
||||||
|
for paper in ("1", "2"):
|
||||||
|
for tier in ("F", "H"):
|
||||||
|
out.append((f"{award}{paper}{tier}", f"{award}/{paper}{tier}", tier))
|
||||||
|
return out
|
||||||
|
|
||||||
|
def _trilogy(award: str) -> List[Tuple[str, str, Optional[str]]]:
|
||||||
|
out = []
|
||||||
|
for subj in ("B", "C", "P"):
|
||||||
|
for paper in ("1", "2"):
|
||||||
|
for tier in ("F", "H"):
|
||||||
|
out.append((f"{award}{subj}{paper}{tier}", f"{award}/{subj}/{paper}{tier}", tier))
|
||||||
|
return out
|
||||||
|
|
||||||
|
def _alevel(award: str, papers=("1", "2", "3")) -> List[Tuple[str, str, Optional[str]]]:
|
||||||
|
return [(f"{award}{p}", f"{award}/{p}", None) for p in papers]
|
||||||
|
|
||||||
|
def _subj(award: str, papers, tiers=(None,)) -> List[Tuple[str, str, Optional[str]]]:
|
||||||
|
"""Generic GCSE/A-level builder. tiers=('F','H') for tiered subjects (Maths/Science),
|
||||||
|
tiers=(None,) for untiered (English/Geography/CS/Business/Psychology)."""
|
||||||
|
out = []
|
||||||
|
for p in papers:
|
||||||
|
for t in tiers:
|
||||||
|
tl = t or ""
|
||||||
|
out.append((f"{award}{p}{tl}", f"{award}/{p}{tl}", t))
|
||||||
|
return out
|
||||||
|
|
||||||
|
def _mfl(award: str) -> List[Tuple[str, str, Optional[str]]]:
|
||||||
|
"""AQA MFL: Listening/Reading/Writing papers, each Foundation/Higher (Speaking is teacher-conducted,
|
||||||
|
no public QP). Filestore code encodes skill+tier, e.g. 8658LH = French Listening Higher."""
|
||||||
|
out = []
|
||||||
|
for skill in ("L", "R", "W"):
|
||||||
|
for t in ("F", "H"):
|
||||||
|
out.append((f"{award}{skill}{t}", f"{award}/{skill}{t}", t))
|
||||||
|
return out
|
||||||
|
|
||||||
|
AQA_SPECS = [
|
||||||
|
# ── Sciences (round 1 — kept at full depth) ──────────────────────────────────────
|
||||||
|
("AQA-BIOL-8461", "BIOLOGY", "8461", "GCSE", "2016", _gcse_single("8461")),
|
||||||
|
("AQA-CHEM-8462", "CHEMISTRY", "8462", "GCSE", "2016", _gcse_single("8462")),
|
||||||
|
("AQA-PHYS-8463", "PHYSICS", "8463", "GCSE", "2016", _gcse_single("8463")),
|
||||||
|
("AQA-COMB-8464", "COMBINED SCIENCE TRILOGY", "8464", "GCSE", "2016", _trilogy("8464")),
|
||||||
|
("AQA-BIOL-7401", "BIOLOGY", "7401", "AS", "2015", _alevel("7401", ("1", "2"))),
|
||||||
|
("AQA-BIOL-7402", "BIOLOGY", "7402", "A-level", "2015", _alevel("7402")),
|
||||||
|
("AQA-CHEM-7404", "CHEMISTRY", "7404", "AS", "2015", _alevel("7404", ("1", "2"))),
|
||||||
|
("AQA-CHEM-7405", "CHEMISTRY", "7405", "A-level", "2015", _alevel("7405")),
|
||||||
|
("AQA-PHYS-7407", "PHYSICS", "7407", "AS", "2015", _alevel("7407", ("1", "2"))),
|
||||||
|
("AQA-PHYS-7408", "PHYSICS", "7408", "A-level", "2015", _alevel("7408")),
|
||||||
|
# ── Round 2 breadth — high-volume core (Maths, English) ───────────────────────────
|
||||||
|
("AQA-MATH-8300", "MATHEMATICS", "8300", "GCSE", "2015", _subj("8300", ("1", "2", "3"), ("F", "H"))),
|
||||||
|
("AQA-MATH-7357", "MATHEMATICS", "7357", "A-level", "2017", _alevel("7357", ("1", "2", "3"))),
|
||||||
|
("AQA-MATH-7356", "MATHEMATICS", "7356", "AS", "2017", _alevel("7356", ("1", "2"))),
|
||||||
|
("AQA-ENGL-8700", "ENGLISH LANGUAGE", "8700", "GCSE", "2015", _subj("8700", ("1", "2"))),
|
||||||
|
("AQA-ENGLIT-8702", "ENGLISH LITERATURE", "8702", "GCSE", "2015", _subj("8702", ("1", "2"))),
|
||||||
|
("AQA-ENGL-7702", "ENGLISH LANGUAGE", "7702", "A-level", "2015", _alevel("7702", ("1", "2"))),
|
||||||
|
("AQA-ENGLIT-7712", "ENGLISH LITERATURE A", "7712", "A-level", "2015", _alevel("7712", ("1", "2"))),
|
||||||
|
# ── Round 2 breadth — humanities / others ─────────────────────────────────────────
|
||||||
|
("AQA-GEOG-8035", "GEOGRAPHY", "8035", "GCSE", "2016", _subj("8035", ("1", "2", "3"))),
|
||||||
|
("AQA-GEOG-7037", "GEOGRAPHY", "7037", "A-level", "2016", _alevel("7037", ("1", "2"))),
|
||||||
|
("AQA-COMP-8525", "COMPUTER SCIENCE", "8525", "GCSE", "2020", _subj("8525", ("1", "2"))),
|
||||||
|
("AQA-COMP-7517", "COMPUTER SCIENCE", "7517", "A-level", "2015", _alevel("7517", ("1", "2"))),
|
||||||
|
("AQA-BUS-8132", "BUSINESS", "8132", "GCSE", "2017", _subj("8132", ("1", "2"))),
|
||||||
|
("AQA-BUS-7132", "BUSINESS", "7132", "A-level", "2015", _alevel("7132", ("1", "2", "3"))),
|
||||||
|
("AQA-PSYC-8182", "PSYCHOLOGY", "8182", "GCSE", "2017", _subj("8182", ("1", "2"))),
|
||||||
|
("AQA-PSYC-7182", "PSYCHOLOGY", "7182", "A-level", "2015", _alevel("7182", ("1", "2", "3"))),
|
||||||
|
# ── Round 2 breadth — modern foreign languages (Listening/Reading/Writing, F+H) ───
|
||||||
|
("AQA-FREN-8658", "FRENCH", "8658", "GCSE", "2016", _mfl("8658")),
|
||||||
|
("AQA-SPAN-8698", "SPANISH", "8698", "GCSE", "2016", _mfl("8698")),
|
||||||
|
("AQA-GERM-8668", "GERMAN", "8668", "GCSE", "2016", _mfl("8668")),
|
||||||
|
("AQA-FREN-7652", "FRENCH", "7652", "A-level", "2016", _alevel("7652", ("1", "2"))),
|
||||||
|
("AQA-SPAN-7692", "SPANISH", "7692", "A-level", "2016", _alevel("7692", ("1", "2"))),
|
||||||
|
("AQA-GERM-7662", "GERMAN", "7662", "A-level", "2016", _alevel("7662", ("1", "2"))),
|
||||||
|
]
|
||||||
|
# Sessions to enumerate, as "MONYY": aqa_url()/session_pretty() split them into a
# three-letter month code (a key of MONTHS) and a two-digit year.
AQA_SESSIONS = ["JUN18", "JUN19", "NOV20", "NOV21", "JUN22", "JUN23", "JUN24"]
# Document roles probed for every paper/session; each must be a key of ROLE_TOKEN.
AQA_ROLES = ["QP", "MS", "ER"]
|
||||||
|
|
||||||
|
|
||||||
|
def aqa_url(papercode: str, role: str, session: str) -> Tuple[str, str]:
    """Build the templated AQA filestore URL for one paper/role/session.

    ``session`` is "MONYY" (e.g. "JUN22"): the month code selects the folder name
    from MONTHS and the two-digit year is prefixed with "20".

    Returns:
        (url, filename), where filename is "AQA-<papercode>-<role token>-<session>.PDF".
    """
    month_code, two_digit_year = session[:3], session[3:]
    folder, _display = MONTHS[month_code]
    fname = f"AQA-{papercode}-{ROLE_TOKEN[role]}-{session}.PDF"
    full_year = "20" + two_digit_year
    return f"{AQA_BASE}/{full_year}/{folder}/{fname}", fname
|
||||||
|
|
||||||
|
|
||||||
|
def session_pretty(session: str) -> Tuple[str, str]:
    """Convert a "MONYY" session into its display and compact spellings.

    Returns:
        ("2022-Jun", "2022JUN") for "JUN22": the display session, and the year-first
        compact form used in exam_code (matching the locked exam_code convention
        and the Edexcel/OCR entries).
    """
    month_code = session[:3]   # "JUN" | "NOV"
    full_year = f"20{session[3:]}"   # "22" -> "2022"
    display_month = MONTHS[month_code][1]
    return f"{full_year}-{display_month}", f"{full_year}{month_code}"
|
||||||
|
|
||||||
|
|
||||||
|
def build_aqa() -> Dict[str, Any]:
    """Build the AQA section of the manifest from the templated catalogue.

    Enumerates every spec x paper x session x role combination from AQA_SPECS,
    AQA_SESSIONS and AQA_ROLES, probes each candidate URL concurrently with
    head_ok(), and keeps only the candidates that respond as live PDFs. Progress
    is reported on stderr.

    Returns:
        {"exam_board_code": "AQA", "specifications": [...]} with one entry per spec
        that has at least one live paper, in AQA_SPECS order; each spec's papers are
        sorted by exam_code.
    """
    # (spec_code, subject, award, paper_fc, paper_code, tier, role, session, url, fname)
    # tier (index 5) is the only optional field.
    candidates: List[Tuple[str, str, str, str, str, Optional[str], str, str, str, str]] = []
    for spec_code, subject, award, _level, _first_teach, papers in AQA_SPECS:
        for paper_fc, paper_code, tier in papers:
            for session in AQA_SESSIONS:
                for role in AQA_ROLES:
                    url, fname = aqa_url(paper_fc, role, session)
                    candidates.append((spec_code, subject, award, paper_fc, paper_code, tier,
                                       role, session, url, fname))

    print(f"[AQA] HEAD-verifying {len(candidates)} candidate URLs...", file=sys.stderr)
    # Verdicts are keyed by candidate index because futures complete out of order.
    live: Dict[int, bool] = {}
    with cf.ThreadPoolExecutor(max_workers=24) as ex:
        futs = {ex.submit(head_ok, c[8]): i for i, c in enumerate(candidates)}
        done = 0
        for fut in cf.as_completed(futs):
            i = futs[fut]
            live[i] = fut.result()
            done += 1
            if done % 60 == 0:
                print(f" ...{done}/{len(candidates)} ({sum(live.values())} live)", file=sys.stderr)

    specs: Dict[str, Dict[str, Any]] = {}
    for i, c in enumerate(candidates):
        if not live.get(i):
            continue
        spec_code, subject, award, paper_fc, paper_code, tier, role, session, url, fname = c
        sess_pretty, sess_compact = session_pretty(session)
        token = paper_fc[len(award):]  # "1H" / "P1H" / "1"
        exam_code = f"AQA-{award}-{token}-{sess_compact}-{role}"
        spec = specs.setdefault(spec_code, {"papers": []})
        spec["papers"].append({
            "exam_code": exam_code,
            "paper_code": paper_code,
            "tier": tier,
            "session": sess_pretty,
            "doc_type": role,
            "file": {
                "source": f"url:{url}",
                "original_name": fname,
                "provenance": {"source_url": url, "fetched": FETCHED,
                               "license": "AQA public past paper"},
            },
        })

    # Emit specs in catalogue order, skipping any spec with no live paper at all.
    spec_list = []
    for spec_code, subject, award, level, first_teach, _papers in AQA_SPECS:
        if spec_code not in specs:
            continue
        papers = sorted(specs[spec_code]["papers"], key=lambda p: p["exam_code"])
        spec_list.append({
            "spec_code": spec_code, "exam_board_code": "AQA", "subject_code": subject,
            "award_code": award, "award_level": level, "first_teach": first_teach,
            "papers": papers,
        })
        print(f"[AQA] {spec_code}: {len(papers)} live papers", file=sys.stderr)
    return {"exam_board_code": "AQA", "specifications": spec_list}
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────── Edexcel / OCR — confirmed direct URLs (re-verified at build) ───────────────
|
||||||
|
# These boards aren't templatable (Edexcel has a non-derivable date suffix; OCR uses opaque
|
||||||
|
# doc-ids), so confirmed URLs are listed as 6-tuples: (spec_code, paper_code, tier, session, role,
|
||||||
|
# url). exam_code is DERIVED (see _mk_exam_code) so it always matches the locked convention.
|
||||||
|
# Board code (as passed to build_board) -> leading segment of the derived exam_code.
EXAM_CODE_PREFIX = {"EDEXCEL": "EDX", "OCR": "OCR"}
|
||||||
|
|
||||||
|
def _ec_token(paper_code: str) -> str:
|
||||||
|
t = paper_code.split("/")[-1]
|
||||||
|
return str(int(t)) if t.isdigit() else t # "01"->"1", "1H"->"1H", "1CH"->"1CH", "11"->"11"
|
||||||
|
|
||||||
|
def _mk_exam_code(prefix: str, award: str, paper_code: str, session: str, role: str) -> str:
    """Derive an exam_code of the form PREFIX-award-TOKEN-YYYYMON-ROLE.

    ``session`` must be "YYYY-Mon" (exactly one "-"); the month is upper-cased and
    joined to the year. The paper token comes from _ec_token(paper_code).
    """
    year, month = session.split("-")
    token = _ec_token(paper_code)
    compact_session = f"{year}{month.upper()}"
    return f"{prefix}-{award}-{token}-{compact_session}-{role}"
|
||||||
|
|
||||||
|
# URL roots used to keep the confirmed-URL tables below readable.
_PE = "https://qualifications.pearson.com/content/dam/pdf"  # Pearson (Edexcel) PDF root
_EDX = f"{_PE}/GCSE/Science/2016"  # GCSE Science 2016 subtree under the Pearson root
_OCR = "https://www.ocr.org.uk/Images"  # OCR document root
|
||||||
|
|
||||||
|
EDEXCEL_SPECS = {
|
||||||
|
"EDX-BIOL-1BI0": ("BIOLOGY", "1BI0", "GCSE", "2016"),
|
||||||
|
"EDX-CHEM-1CH0": ("CHEMISTRY", "1CH0", "GCSE", "2016"),
|
||||||
|
"EDX-PHYS-1PH0": ("PHYSICS", "1PH0", "GCSE", "2016"),
|
||||||
|
"EDX-COMB-1SC0": ("COMBINED SCIENCE", "1SC0", "GCSE", "2016"),
|
||||||
|
"EDX-MATH-1MA1": ("MATHEMATICS", "1MA1", "GCSE", "2015"),
|
||||||
|
"EDX-ENGL-1EN0": ("ENGLISH LANGUAGE", "1EN0", "GCSE", "2015"),
|
||||||
|
"EDX-ENGLIT-1ET0": ("ENGLISH LITERATURE", "1ET0", "GCSE", "2015"),
|
||||||
|
"EDX-GEOG-1GA0": ("GEOGRAPHY A", "1GA0", "GCSE", "2016"),
|
||||||
|
"EDX-HIST-1HI0": ("HISTORY", "1HI0", "GCSE", "2016"),
|
||||||
|
"EDX-BUS-1BS0": ("BUSINESS", "1BS0", "GCSE", "2017"),
|
||||||
|
"EDX-COMP-1CP2": ("COMPUTER SCIENCE", "1CP2", "GCSE", "2020"),
|
||||||
|
"EDX-MATH-9MA0": ("MATHEMATICS", "9MA0", "A-level", "2017"),
|
||||||
|
"EDX-ENGL-9EN0": ("ENGLISH LANGUAGE", "9EN0", "A-level", "2015"),
|
||||||
|
"EDX-ENGLIT-9ET0": ("ENGLISH LITERATURE", "9ET0", "A-level", "2015"),
|
||||||
|
"EDX-GEOG-9GE0": ("GEOGRAPHY", "9GE0", "A-level", "2016"),
|
||||||
|
}
|
||||||
|
EDEXCEL_PAPERS = [
|
||||||
|
# ── Sciences (round 1) ──
|
||||||
|
("EDX-BIOL-1BI0", "1BI0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-1h-que-20240511.pdf"),
|
||||||
|
("EDX-BIOL-1BI0", "1BI0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2f-que-20230610.pdf"),
|
||||||
|
("EDX-BIOL-1BI0", "1BI0/2H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2h-que-20230610.pdf"),
|
||||||
|
("EDX-BIOL-1BI0", "1BI0/1F", "F", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1f-rms-20230824.pdf"),
|
||||||
|
("EDX-BIOL-1BI0", "1BI0/1H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1h-rms-20240822.pdf"),
|
||||||
|
("EDX-BIOL-1BI0", "1BI0/1H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1bi0-1h-rms-20220825.pdf"),
|
||||||
|
("EDX-CHEM-1CH0", "1CH0/1F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1f-que-20230523.pdf"),
|
||||||
|
("EDX-CHEM-1CH0", "1CH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1h-que-20240518.pdf"),
|
||||||
|
("EDX-CHEM-1CH0", "1CH0/2H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1ch0-2h-rms-20240822.pdf"),
|
||||||
|
("EDX-PHYS-1PH0", "1PH0/1H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20230526.pdf"),
|
||||||
|
("EDX-PHYS-1PH0", "1PH0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-2f-que-20230617.pdf"),
|
||||||
|
("EDX-PHYS-1PH0", "1PH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20240523.pdf"),
|
||||||
|
("EDX-PHYS-1PH0", "1PH0/2H", "H", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1ph0-2h-rms-20230824.pdf"),
|
||||||
|
("EDX-PHYS-1PH0", "1PH0/2H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1ph0-2h-rms-20220825.pdf"),
|
||||||
|
("EDX-COMB-1SC0", "1SC0/1CH", None, "2023-Jun", "MS", f"{_EDX}/Exam-materials/1sc0-1ch-rms-20230824.pdf"),
|
||||||
|
# ── Maths 1MA1 (round 2) ──
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-que-20230520.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-rms-20230824.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1F", "F", "2023-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-rms-20230824.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1F", "F", "2024-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-que-20240517.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1H", "H", "2024-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-que-20240517.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1F", "F", "2024-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-rms-20240822.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Nov", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-rms-20240111.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/1H", "H", "2022-Jun", "MS", f"{_PE}/GCSE/mathematics/2015/exam-materials/1ma1-1h-rms-20220825.pdf"),
|
||||||
|
("EDX-MATH-1MA1", "1MA1/3H", "H", "2022-Jun", "MS", f"{_PE}/GCSE/mathematics/2015/exam-materials/1ma1-3h-rms-20220825.pdf"),
|
||||||
|
# ── English Language 1EN0 / Literature 1ET0 (round 2) ──
|
||||||
|
("EDX-ENGL-1EN0", "1EN0/01", None, "2024-Jun", "QP", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-que-20240524.pdf"),
|
||||||
|
("EDX-ENGL-1EN0", "1EN0/01", None, "2023-Nov", "QP", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-que-20231108.pdf"),
|
||||||
|
("EDX-ENGL-1EN0", "1EN0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-rms-20240822.pdf"),
|
||||||
|
("EDX-ENGL-1EN0", "1EN0/02", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-02-rms-20240822.pdf"),
|
||||||
|
("EDX-ENGL-1EN0", "1EN0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-rms-20230824.pdf"),
|
||||||
|
("EDX-ENGL-1EN0", "1EN0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-02-rms-20230824.pdf"),
|
||||||
|
("EDX-ENGLIT-1ET0", "1ET0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-que-20230518.pdf"),
|
||||||
|
("EDX-ENGLIT-1ET0", "1ET0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-02-que-20230525.pdf"),
|
||||||
|
("EDX-ENGLIT-1ET0", "1ET0/02", None, "2024-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-02-que-20240521.pdf"),
|
||||||
|
("EDX-ENGLIT-1ET0", "1ET0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-rms-20230824.pdf"),
|
||||||
|
("EDX-ENGLIT-1ET0", "1ET0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-rms-20240822.pdf"),
|
||||||
|
# ── A-level Maths 9MA0 / English 9EN0 / 9ET0 (round 2) ──
|
||||||
|
("EDX-MATH-9MA0", "9MA0/01", None, "2023-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-01-que-20230607.pdf"),
|
||||||
|
("EDX-MATH-9MA0", "9MA0/31", None, "2023-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-31-que-20230621.pdf"),
|
||||||
|
("EDX-MATH-9MA0", "9MA0/02", None, "2024-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-02-que-20240612.pdf"),
|
||||||
|
("EDX-MATH-9MA0", "9MA0/31", None, "2023-Jun", "MS", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-31-rms-20230817.pdf"),
|
||||||
|
("EDX-MATH-9MA0", "9MA0/01", None, "2024-Jun", "MS", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-01-rms-20240815.pdf"),
|
||||||
|
("EDX-ENGL-9EN0", "9EN0/01", None, "2024-Jun", "MS", f"{_PE}/A-Level/English-Language/2015/Exam-materials/9en0-01-rms-20240815.pdf"),
|
||||||
|
("EDX-ENGL-9EN0", "9EN0/02", None, "2024-Jun", "MS", f"{_PE}/A-Level/English-Language/2015/Exam-materials/9en0-02-rms-20240815.pdf"),
|
||||||
|
("EDX-ENGLIT-9ET0", "9ET0/01", None, "2024-Jun", "QP", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-01-que-20240525.pdf"),
|
||||||
|
("EDX-ENGLIT-9ET0", "9ET0/01", None, "2023-Jun", "MS", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-01-rms-20230817.pdf"),
|
||||||
|
("EDX-ENGLIT-9ET0", "9ET0/03", None, "2023-Jun", "MS", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-03-rms-20230817.pdf"),
|
||||||
|
# ── Humanities (round 2) ──
|
||||||
|
("EDX-GEOG-1GA0", "1GA0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-que-20230523.pdf"),
|
||||||
|
("EDX-GEOG-1GA0", "1GA0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-rms-20230824.pdf"),
|
||||||
|
("EDX-GEOG-1GA0", "1GA0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-02-que-20230610.pdf"),
|
||||||
|
("EDX-GEOG-1GA0", "1GA0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-02-rms-20230824.pdf"),
|
||||||
|
("EDX-GEOG-1GA0", "1GA0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-rms-20240822.pdf"),
|
||||||
|
("EDX-GEOG-1GA0", "1GA0/03", None, "2024-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-03-que-20240615.pdf"),
|
||||||
|
("EDX-HIST-1HI0", "1HI0/10", None, "2023-Jun", "QP", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-10-que-20230519.pdf"),
|
||||||
|
("EDX-HIST-1HI0", "1HI0/10", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-10-rms-20230824.pdf"),
|
||||||
|
("EDX-HIST-1HI0", "1HI0/12", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-12-rms-20230824.pdf"),
|
||||||
|
("EDX-HIST-1HI0", "1HI0/13", None, "2024-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-13-rms-20240822.pdf"),
|
||||||
|
("EDX-HIST-1HI0", "1HI0/33", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-33-rms-20230824.pdf"),
|
||||||
|
("EDX-BUS-1BS0", "1BS0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-01-que-20230519.pdf"),
|
||||||
|
("EDX-BUS-1BS0", "1BS0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-que-20230613.pdf"),
|
||||||
|
("EDX-BUS-1BS0", "1BS0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-rms-20230824.pdf"),
|
||||||
|
("EDX-BUS-1BS0", "1BS0/02", None, "2024-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-que-20240606.pdf"),
|
||||||
|
("EDX-BUS-1BS0", "1BS0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-01-rms-20240822.pdf"),
|
||||||
|
("EDX-COMP-1CP2", "1CP2/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-que-20230520.pdf"),
|
||||||
|
("EDX-COMP-1CP2", "1CP2/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-rms-20230824.pdf"),
|
||||||
|
("EDX-COMP-1CP2", "1CP2/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-02-que-20230526.pdf"),
|
||||||
|
("EDX-COMP-1CP2", "1CP2/01", None, "2024-Jun", "QP", f"{_PE}/GCSE/Computer-Science/2020/Exam-materials/1cp2-01-que-20240702.pdf"),
|
||||||
|
("EDX-COMP-1CP2", "1CP2/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-rms-20240822.pdf"),
|
||||||
|
("EDX-GEOG-9GE0", "9GE0/01", None, "2023-Jun", "QP", f"{_PE}/A-Level/Geography/2016/Exam-materials/9ge0-01-que-20230518.pdf"),
|
||||||
|
]
|
||||||
|
|
||||||
|
OCR_SPECS = {
|
||||||
|
"OCR-BIOL-J247": ("BIOLOGY", "J247", "GCSE", "2016"),
|
||||||
|
"OCR-CHEM-J248": ("CHEMISTRY", "J248", "GCSE", "2016"),
|
||||||
|
"OCR-PHYS-J249": ("PHYSICS", "J249", "GCSE", "2016"),
|
||||||
|
"OCR-COMB-J250": ("COMBINED SCIENCE", "J250", "GCSE", "2016"),
|
||||||
|
"OCR-MATH-J560": ("MATHEMATICS", "J560", "GCSE", "2015"),
|
||||||
|
"OCR-ENGL-J351": ("ENGLISH LANGUAGE", "J351", "GCSE", "2015"),
|
||||||
|
"OCR-ENGLIT-J352": ("ENGLISH LITERATURE", "J352", "GCSE", "2015"),
|
||||||
|
"OCR-COMP-J277": ("COMPUTER SCIENCE", "J277", "GCSE", "2020"),
|
||||||
|
"OCR-GEOG-J383": ("GEOGRAPHY A", "J383", "GCSE", "2016"),
|
||||||
|
"OCR-BUS-J204": ("BUSINESS", "J204", "GCSE", "2017"),
|
||||||
|
"OCR-HIST-J411": ("HISTORY B (SHP)", "J411", "GCSE", "2016"),
|
||||||
|
"OCR-MATH-H240": ("MATHEMATICS A", "H240", "A-level", "2017"),
|
||||||
|
"OCR-ENGLIT-H472": ("ENGLISH LITERATURE", "H472", "A-level", "2015"),
|
||||||
|
"OCR-ENGL-H470": ("ENGLISH LANGUAGE", "H470", "A-level", "2015"),
|
||||||
|
}
|
||||||
|
OCR_PAPERS = [
|
||||||
|
# ── Sciences (round 1) ──
|
||||||
|
("OCR-BIOL-J247", "J247/01", "F", "2024-Jun", "QP", f"{_OCR}/727713-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-BIOL-J247", "J247/01", "F", "2024-Jun", "MS", f"{_OCR}/727745-mark-scheme-paper-1.pdf"),
|
||||||
|
("OCR-BIOL-J247", "J247/03", "H", "2024-Jun", "QP", f"{_OCR}/727715-question-paper-paper-3.pdf"),
|
||||||
|
("OCR-BIOL-J247", "J247/03", "H", "2024-Jun", "MS", f"{_OCR}/727747-mark-scheme-paper-3.pdf"),
|
||||||
|
("OCR-BIOL-J247", "J247/01", "F", "2023-Jun", "QP", f"{_OCR}/704945-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-BIOL-J247", "J247/03", "H", "2023-Jun", "MS", f"{_OCR}/704979-mark-scheme-paper-3.pdf"),
|
||||||
|
("OCR-BIOL-J247", "J247/03", "H", "2022-Jun", "QP", f"{_OCR}/678031-question-paper-paper-3.pdf"),
|
||||||
|
("OCR-BIOL-J247", "J247/01", "F", "2022-Jun", "MS", f"{_OCR}/678076-mark-scheme-paper-1.pdf"),
|
||||||
|
("OCR-CHEM-J248", "J248/01", "F", "2024-Jun", "QP", f"{_OCR}/727718-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-CHEM-J248", "J248/03", "H", "2024-Jun", "MS", f"{_OCR}/727751-mark-scheme-paper-3.pdf"),
|
||||||
|
("OCR-CHEM-J248", "J248/01", "F", "2023-Jun", "QP", f"{_OCR}/704950-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-CHEM-J248", "J248/03", "H", "2022-Jun", "QP", f"{_OCR}/678036-question-paper-paper-3.pdf"),
|
||||||
|
("OCR-PHYS-J249", "J249/01", "F", "2024-Jun", "QP", f"{_OCR}/727724-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-PHYS-J249", "J249/03", "H", "2024-Jun", "MS", f"{_OCR}/727755-mark-scheme-paper-3.pdf"),
|
||||||
|
("OCR-PHYS-J249", "J249/01", "F", "2023-Jun", "QP", f"{_OCR}/704956-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-PHYS-J249", "J249/03", "H", "2022-Jun", "MS", f"{_OCR}/678086-mark-scheme-paper-3.pdf"),
|
||||||
|
("OCR-COMB-J250", "J250/01", "F", "2024-Jun", "QP", f"{_OCR}/727730-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-COMB-J250", "J250/07", "H", "2024-Jun", "MS", f"{_OCR}/727763-mark-scheme-paper-7.pdf"),
|
||||||
|
# ── Maths J560 (round 2) ──
|
||||||
|
("OCR-MATH-J560", "J560/01", "F", "2024-Jun", "QP", f"{_OCR}/727817-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/01", "F", "2024-Jun", "MS", f"{_OCR}/727824-mark-scheme-paper-1.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/04", "H", "2024-Jun", "QP", f"{_OCR}/727820-question-paper-paper-4.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/04", "H", "2024-Jun", "MS", f"{_OCR}/727827-mark-scheme-paper-4.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/01", "F", "2023-Jun", "QP", f"{_OCR}/705050-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/01", "F", "2023-Jun", "MS", f"{_OCR}/705057-mark-scheme-paper-1.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/04", "H", "2023-Jun", "QP", f"{_OCR}/705053-question-paper-paper-4.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/04", "H", "2023-Jun", "MS", f"{_OCR}/705060-mark-scheme-paper-4.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/01", "F", "2022-Jun", "QP", f"{_OCR}/678149-question-paper-paper-1.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/01", "F", "2022-Jun", "MS", f"{_OCR}/678156-mark-scheme-paper-1.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/04", "H", "2022-Jun", "QP", f"{_OCR}/678152-question-paper-paper-4.pdf"),
|
||||||
|
("OCR-MATH-J560", "J560/04", "H", "2022-Jun", "MS", f"{_OCR}/678159-mark-scheme-paper-4.pdf"),
|
||||||
|
# ── English Language J351 / Literature J352 (round 2) ──
|
||||||
|
("OCR-ENGL-J351", "J351/01", None, "2024-Jun", "QP", f"{_OCR}/727556-question-paper-communicating-information-and-ideas.pdf"),
|
||||||
|
("OCR-ENGL-J351", "J351/01", None, "2024-Jun", "MS", f"{_OCR}/727658-mark-scheme-communication-information-and-ideas.pdf"),
|
||||||
|
("OCR-ENGL-J351", "J351/02", None, "2024-Jun", "QP", f"{_OCR}/727558-question-paper-exploring-effects-and-impact.pdf"),
|
||||||
|
("OCR-ENGL-J351", "J351/02", None, "2024-Jun", "MS", f"{_OCR}/727659-mark-scheme-exploring-effects-and-impact.pdf"),
|
||||||
|
("OCR-ENGL-J351", "J351/01", None, "2023-Jun", "QP", f"{_OCR}/704782-question-paper-communicating-information-and-ideas.pdf"),
|
||||||
|
("OCR-ENGL-J351", "J351/01", None, "2023-Jun", "MS", f"{_OCR}/704888-mark-scheme-communication-information-and-ideas.pdf"),
|
||||||
|
("OCR-ENGL-J351", "J351/01", None, "2022-Jun", "QP", f"{_OCR}/677852-question-paper-communicating-information-and-ideas.pdf"),
|
||||||
|
("OCR-ENGL-J351", "J351/01", None, "2022-Jun", "MS", f"{_OCR}/677967-mark-scheme-communication-information-and-ideas.pdf"),
|
||||||
|
("OCR-ENGLIT-J352", "J352/01", None, "2024-Jun", "QP", f"{_OCR}/727830-question-paper-exploring-modern-and-literary-heritage-texts.pdf"),
|
||||||
|
("OCR-ENGLIT-J352", "J352/01", None, "2024-Jun", "MS", f"{_OCR}/727832-mark-scheme-exploring-modern-and-literary-heritage-texts.pdf"),
|
||||||
|
("OCR-ENGLIT-J352", "J352/02", None, "2024-Jun", "QP", f"{_OCR}/727831-question-paper-exploring-poetry-and-shakespeare.pdf"),
|
||||||
|
("OCR-ENGLIT-J352", "J352/02", None, "2024-Jun", "MS", f"{_OCR}/727833-mark-scheme-exploring-poetry-and-shakespeare.pdf"),
|
||||||
|
("OCR-ENGLIT-J352", "J352/01", None, "2023-Jun", "QP", f"{_OCR}/705069-question-paper-exploring-modern-and-literary-heritage-texts.pdf"),
|
||||||
|
("OCR-ENGLIT-J352", "J352/01", None, "2023-Jun", "MS", f"{_OCR}/705075-mark-scheme-exploring-modern-and-literary-heritage-texts.pdf"),
|
||||||
|
# ── A-level Maths H240 / English Lit H472 / Lang H470 (round 2) ──
|
||||||
|
("OCR-MATH-H240", "H240/01", None, "2024-Jun", "QP", f"{_OCR}/726654-question-paper-pure-mathematics.pdf"),
|
||||||
|
("OCR-MATH-H240", "H240/01", None, "2024-Jun", "MS", f"{_OCR}/726795-mark-scheme-pure-mathematics.pdf"),
|
||||||
|
("OCR-MATH-H240", "H240/02", None, "2024-Jun", "QP", f"{_OCR}/726656-question-paper-pure-mathematics-and-statistics.pdf"),
|
||||||
|
("OCR-MATH-H240", "H240/02", None, "2024-Jun", "MS", f"{_OCR}/726796-mark-scheme-pure-mathematics-and-statistics.pdf"),
|
||||||
|
("OCR-MATH-H240", "H240/01", None, "2023-Jun", "QP", f"{_OCR}/703866-question-paper-pure-mathematics.pdf"),
|
||||||
|
("OCR-MATH-H240", "H240/01", None, "2023-Jun", "MS", f"{_OCR}/704008-mark-scheme-pure-mathematics.pdf"),
|
||||||
|
("OCR-MATH-H240", "H240/01", None, "2022-Jun", "QP", f"{_OCR}/676845-question-paper-pure-mathematics.pdf"),
|
||||||
|
("OCR-MATH-H240", "H240/01", None, "2022-Jun", "MS", f"{_OCR}/677005-mark-scheme-pure-mathematics.pdf"),
|
||||||
|
("OCR-ENGLIT-H472", "H472/01", None, "2024-Jun", "QP", f"{_OCR}/726602-question-paper-drama-and-poetry-pre-1900.pdf"),
|
||||||
|
("OCR-ENGLIT-H472", "H472/01", None, "2024-Jun", "MS", f"{_OCR}/726762-mark-scheme-drama-and-poetry-pre-1900.pdf"),
|
||||||
|
("OCR-ENGLIT-H472", "H472/01", None, "2023-Jun", "QP", f"{_OCR}/703813-question-paper-drama-and-poetry-pre-1900.pdf"),
|
||||||
|
("OCR-ENGLIT-H472", "H472/01", None, "2023-Jun", "MS", f"{_OCR}/703974-mark-scheme-drama-and-poetry-pre-1900.pdf"),
|
||||||
|
("OCR-ENGLIT-H472", "H472/01", None, "2022-Jun", "QP", f"{_OCR}/676783-question-paper-drama-and-poetry-pre-1900.pdf"),
|
||||||
|
("OCR-ENGLIT-H472", "H472/01", None, "2022-Jun", "MS", f"{_OCR}/676965-mark-scheme-drama-and-poetry-pre-1900.pdf"),
|
||||||
|
("OCR-ENGL-H470", "H470/01", None, "2024-Jun", "QP", f"{_OCR}/726595-question-paper-exploring-language.pdf"),
|
||||||
|
("OCR-ENGL-H470", "H470/01", None, "2024-Jun", "MS", f"{_OCR}/726764-mark-scheme-exploring-language.pdf"),
|
||||||
|
("OCR-ENGL-H470", "H470/01", None, "2023-Jun", "QP", f"{_OCR}/703806-question-paper-exploring-language.pdf"),
|
||||||
|
("OCR-ENGL-H470", "H470/01", None, "2023-Jun", "MS", f"{_OCR}/703976-mark-scheme-exploring-language.pdf"),
|
||||||
|
("OCR-ENGL-H470", "H470/01", None, "2022-Jun", "QP", f"{_OCR}/676772-question-paper-exploring-language.pdf"),
|
||||||
|
("OCR-ENGL-H470", "H470/01", None, "2022-Jun", "MS", f"{_OCR}/676967-mark-scheme-exploring-language.pdf"),
|
||||||
|
# ── Humanities (round 2) ──
|
||||||
|
("OCR-COMP-J277", "J277/01", None, "2024-Jun", "QP", f"{_OCR}/727534-question-paper-computer-systems.pdf"),
|
||||||
|
("OCR-COMP-J277", "J277/01", None, "2024-Jun", "MS", f"{_OCR}/727652-mark-scheme-computer-systems.pdf"),
|
||||||
|
("OCR-COMP-J277", "J277/02", None, "2024-Jun", "QP", f"{_OCR}/727535-question-paper-computational-thinking-algorithms-and-programming.pdf"),
|
||||||
|
("OCR-COMP-J277", "J277/02", None, "2024-Jun", "MS", f"{_OCR}/727653-mark-scheme-computational-thinking-algorithms-and-programming.pdf"),
|
||||||
|
("OCR-GEOG-J383", "J383/01", None, "2024-Jun", "QP", f"{_OCR}/727564-question-paper-living-in-the-uk-today.pdf"),
|
||||||
|
("OCR-GEOG-J383", "J383/01", None, "2024-Jun", "MS", f"{_OCR}/727661-mark-scheme-living-in-the-uk-today.pdf"),
|
||||||
|
("OCR-GEOG-J383", "J383/02", None, "2024-Jun", "QP", f"{_OCR}/727566-question-paper-the-world-around-us.pdf"),
|
||||||
|
("OCR-GEOG-J383", "J383/02", None, "2024-Jun", "MS", f"{_OCR}/727662-mark-scheme-the-world-around-us.pdf"),
|
||||||
|
("OCR-BUS-J204", "J204/01", None, "2024-Jun", "QP", f"{_OCR}/727519-question-paper-business-1-business-activity-marketing-and-people.pdf"),
|
||||||
|
("OCR-BUS-J204", "J204/01", None, "2024-Jun", "MS", f"{_OCR}/727634-mark-scheme-business-1-business-activity-marketing-and-people.pdf"),
|
||||||
|
("OCR-BUS-J204", "J204/02", None, "2024-Jun", "QP", f"{_OCR}/727520-question-paper-business-2-operations-finance-and-influences-on-business.pdf"),
|
||||||
|
("OCR-BUS-J204", "J204/02", None, "2024-Jun", "MS", f"{_OCR}/727635-mark-scheme-business-2-operations-finance-and-influences-on-business.pdf"),
|
||||||
|
("OCR-BUS-J204", "J204/01", None, "2023-Jun", "QP", f"{_OCR}/704745-question-paper-business-1-business-activity-marketing-and-people.pdf"),
|
||||||
|
("OCR-BUS-J204", "J204/01", None, "2023-Jun", "MS", f"{_OCR}/704864-mark-scheme-business-1-business-activity-marketing-and-people.pdf"),
|
||||||
|
("OCR-HIST-J411", "J411/11", None, "2024-Jun", "QP", f"{_OCR}/727590-question-paper-the-people-s-health-c.1250-to-present-with-the-norman-conquest-1065-1087.pdf"),
|
||||||
|
("OCR-HIST-J411", "J411/11", None, "2024-Jun", "MS", f"{_OCR}/727678-mark-scheme-the-people-s-health-c.1250-to-present-with-the-norman-conquest-1065-1087.pdf"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def build_board(board_code: str, specs_meta: Dict, papers: List) -> Dict[str, Any]:
    """Assemble one board's manifest section from its confirmed paper URLs.

    Every URL in *papers* is re-checked concurrently with ``head_ok``. Rows whose
    URL is no longer live are reported on stderr and left out. The remaining rows
    are grouped by specification, and only specifications that still have at
    least one live paper appear in the result.

    Args:
        board_code: Board identifier; also the key into ``EXAM_CODE_PREFIX``.
        specs_meta: Maps ``spec_code`` to ``(subject, award, level, first_teach)``.
        papers: Rows of ``(spec_code, paper_code, tier, session, role, url)``.

    Returns:
        ``{"exam_board_code": ..., "specifications": [...]}`` where each
        specification lists its live papers sorted by ``exam_code``.
    """
    code_prefix = EXAM_CODE_PREFIX[board_code]
    print(f"[{board_code}] re-verifying {len(papers)} confirmed URLs...", file=sys.stderr)

    # Probe every URL in parallel and remember the verdict by row index.
    live: Dict[int, bool] = {}
    with cf.ThreadPoolExecutor(max_workers=24) as pool:
        pending = {}
        for idx, row in enumerate(papers):
            pending[pool.submit(head_ok, row[5])] = idx
        for finished in cf.as_completed(pending):
            live[pending[finished]] = finished.result()

    # Group the surviving rows by specification, in input order.
    by_spec: Dict[str, List[Dict[str, Any]]] = {}
    for idx, row in enumerate(papers):
        spec_code, paper_code, tier, session, role, url = row
        if not live.get(idx):
            print(f" DROP (not live): {url}", file=sys.stderr)
            continue
        award = specs_meta[spec_code][1]
        provenance = {
            "source_url": url,
            "fetched": FETCHED,
            "license": f"{board_code} public past paper",
        }
        entry = {
            "exam_code": _mk_exam_code(code_prefix, award, paper_code, session, role),
            "paper_code": paper_code,
            "tier": tier,
            "session": session,
            "doc_type": role,
            "file": {
                "source": f"url:{url}",
                "original_name": os.path.basename(url),
                "provenance": provenance,
            },
        }
        if spec_code not in by_spec:
            by_spec[spec_code] = []
        by_spec[spec_code].append(entry)

    # Emit specifications in specs_meta order, skipping those left with no papers.
    spec_list = []
    for spec_code, meta in specs_meta.items():
        subject, award, level, first_teach = meta
        if spec_code not in by_spec:
            continue
        live_papers = by_spec[spec_code]
        spec_list.append({
            "spec_code": spec_code,
            "exam_board_code": board_code,
            "subject_code": subject,
            "award_code": award,
            "award_level": level,
            "first_teach": first_teach,
            "papers": sorted(live_papers, key=lambda paper: paper["exam_code"]),
        })
        print(f"[{board_code}] {spec_code}: {len(live_papers)} live papers", file=sys.stderr)

    return {"exam_board_code": board_code, "specifications": spec_list}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Build every board section and write ``exam-corpus.yaml`` next to this script.

    The manifest carries a version, the default bucket, a provenance section and
    the board sections from ``build_aqa`` and ``build_board``. A one-line summary
    of how many specs and papers were written goes to stderr.
    """
    out_path = os.path.join(os.path.dirname(__file__), "exam-corpus.yaml")

    boards = [
        build_aqa(),
        build_board("EDEXCEL", EDEXCEL_SPECS, EDEXCEL_PAPERS),
        build_board("OCR", OCR_SPECS, OCR_PAPERS),
    ]

    # Totals for the closing summary line.
    n_specs = 0
    n_papers = 0
    for board in boards:
        specifications = board["specifications"]
        n_specs += len(specifications)
        for spec in specifications:
            n_papers += len(spec["papers"])

    license_posture = (
        "Public exam-board past papers downloaded from each board's own "
        "official site (AQA filestore, Pearson DAM, OCR Images). Stored in "
        "the private dev cc.examboards bucket for internal exam-marker dev/test. "
        "Each item records its source_url. Review redistribution rights before "
        "any public exposure."
    )
    sources = {
        "AQA": "https://filestore.aqa.org.uk/sample-papers-and-mark-schemes/",
        "EDEXCEL": "https://qualifications.pearson.com/en/support/support-topics/exams/past-papers.html",
        "OCR": "https://www.ocr.org.uk/qualifications/past-paper-finder/",
    }
    provenance = {
        "collected_by": "kcar",
        "collected_at": FETCHED,
        "license_posture": license_posture,
        "sources": sources,
    }

    # Key order is significant: the dump below keeps insertion order (sort_keys=False).
    manifest = {
        "version": 1,
        "defaults": {"bucket": "cc.examboards"},
        "provenance": provenance,
        # Optional: uncomment + set on dev .94 to exercise user-side flows / first-sweep.
        # "test_subset": {"user_email": "teacher@kevlarai.test", "papers": 2},
        # "system_identity": {"user_email": "teacher@kevlarai.test"},
        "boards": boards,
    }

    with open(out_path, "w") as fh:
        yaml.safe_dump(manifest, fh, sort_keys=False, default_flow_style=False, width=120)

    summary = f"\nWROTE {out_path}: {n_specs} specs, {n_papers} papers across {len(boards)} boards"
    print(summary, file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@ -5,6 +5,7 @@ Clears:
|
|||||||
- Neo4j: drops ALL databases except system, neo4j (including gaisdata, cc.users.*, cc.institutes.*)
|
- Neo4j: drops ALL databases except system, neo4j (including gaisdata, cc.users.*, cc.institutes.*)
|
||||||
- Supabase: deletes ALL data tables except gais_local_authorities and gais_schools
|
- Supabase: deletes ALL data tables except gais_local_authorities and gais_schools
|
||||||
- Supabase: deletes all auth users except kcar, then re-seeds kcar profile state
|
- Supabase: deletes all auth users except kcar, then re-seeds kcar profile state
|
||||||
|
- Granular scopes can clear exam corpus, timetable data, or --user-subset seed copies
|
||||||
|
|
||||||
Safe invariants (never touched):
|
Safe invariants (never touched):
|
||||||
- kcar auth account
|
- kcar auth account
|
||||||
@ -82,6 +83,45 @@ SUPABASE_TABLES_TO_CLEAR = [
|
|||||||
"admin_profiles",
|
"admin_profiles",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Exam-marker subsystem tables, FK child-first. scope="exam-corpus" is deliberately
|
||||||
|
# broader than "public papers": it wipes public corpus eb_* rows, templates, layouts,
|
||||||
|
# questions, boundaries, response areas, marking batches, student submissions, and mark
|
||||||
|
# entries. NOT in the list above — the previous full reset() never cleared exam data
|
||||||
|
# or storage at all; the granular scopes below fold it in.
|
||||||
|
EXAM_CORPUS_TABLES = [
|
||||||
|
"mark_entries",
|
||||||
|
"student_submissions",
|
||||||
|
"marking_batches",
|
||||||
|
"exam_response_areas",
|
||||||
|
"exam_boundaries",
|
||||||
|
"exam_template_layout",
|
||||||
|
"exam_questions",
|
||||||
|
"exam_templates",
|
||||||
|
"eb_exams",
|
||||||
|
"eb_specifications",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Timetable / calendar materialization subset (for scope='timetable').
|
||||||
|
TIMETABLE_TABLES = [
|
||||||
|
"lesson_deliveries",
|
||||||
|
"lesson_collaborators",
|
||||||
|
"taught_lessons",
|
||||||
|
"academic_periods",
|
||||||
|
"academic_days",
|
||||||
|
"academic_weeks",
|
||||||
|
"academic_term_breaks",
|
||||||
|
"academic_terms",
|
||||||
|
"academic_years",
|
||||||
|
"teacher_timetable_slots",
|
||||||
|
"teacher_timetables",
|
||||||
|
"school_timetables",
|
||||||
|
"planned_lessons",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Bucket whose objects scope="exam-corpus" clears for the whole exam-marker subsystem
|
||||||
|
# (Storage API — protect_delete blocks raw SQL).
|
||||||
|
EXAM_STORAGE_BUCKET = "cc.examboards"
|
||||||
|
|
||||||
|
|
||||||
def _sb_headers():
|
def _sb_headers():
|
||||||
url = os.environ["SUPABASE_URL"]
|
url = os.environ["SUPABASE_URL"]
|
||||||
@ -94,6 +134,28 @@ def _sb_headers():
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Substrings that mark a Supabase URL as a production target. A destructive reset
# against a URL containing any of them is refused unless RESET_ALLOW_PROD is set
# to an affirmative value (project rule: ".94 only; .156 human-gated").
PROD_TARGET_MARKERS = ("192.168.0.156", "supabase.classroomcopilot")


def _assert_reset_allowed(url: str, scope: str) -> None:
    """Refuse a destructive reset aimed at a production-looking Supabase URL.

    The URL is lower-cased and searched for each entry of ``PROD_TARGET_MARKERS``.
    A match is fatal unless the ``RESET_ALLOW_PROD`` environment variable is
    ``1``, ``true`` or ``yes`` (case-insensitive, surrounding whitespace ignored).

    Args:
        url: Supabase URL the reset would act on; ``None`` or empty never matches.
        scope: Requested reset scope, used only in the error message.

    Raises:
        RuntimeError: The target looks like production and no override is set.
    """
    target = (url or "").lower()

    for marker in PROD_TARGET_MARKERS:
        if marker in target:
            break
    else:
        # No production marker present: nothing to guard against.
        return

    opt_in = os.environ.get("RESET_ALLOW_PROD", "").strip().lower()
    if opt_in in ("1", "true", "yes"):
        return

    raise RuntimeError(
        f"refusing destructive reset (scope={scope}) against production-looking target {target!r}; "
        "this is human-gated — set RESET_ALLOW_PROD=1 to override."
    )
|
||||||
|
|
||||||
|
|
||||||
# ─── Neo4j helpers ────────────────────────────────────────────────────────────
|
# ─── Neo4j helpers ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def _neo4j_drop_all_non_system() -> Dict[str, List[str]]:
|
def _neo4j_drop_all_non_system() -> Dict[str, List[str]]:
|
||||||
@ -146,13 +208,133 @@ def _supabase_delete_auth_user(url: str, headers: dict, uid: str):
|
|||||||
logger.warning(f" Delete auth user {uid}: {r.status_code} {r.text[:80]}")
|
logger.warning(f" Delete auth user {uid}: {r.status_code} {r.text[:80]}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Granular helpers ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _clear_tables(url: str, headers: dict, tables: List[str]) -> "tuple[List[str], List[str]]":
    """Clear each table in the given order and report the outcome per table.

    A table counts as cleared when ``_sb_clear_table`` returns 200 or 204; any
    other result puts it in the failed list. Processing continues after a failure.

    Returns:
        ``(cleared, failed)`` table-name lists, each in input order.
    """
    ok_statuses = (200, 204)
    cleared: List[str] = []
    failed: List[str] = []
    for name in tables:
        status = _sb_clear_table(url, headers, name)
        if status not in ok_statuses:
            failed.append(name)
            continue
        cleared.append(name)
        logger.info(f" ✓ {name}")
    return cleared, failed
|
||||||
|
|
||||||
|
|
||||||
|
def _group_storage_locs(locs: List[str]) -> "tuple[Dict[str, List[str]], List[str]]":
    """Split ``bucket/object/path`` locations into per-bucket object paths.

    Returns ``(by_bucket, skipped)``. Duplicate locations are collapsed (first-seen
    order kept) so an object is neither requested nor counted twice. Values that
    lack a bucket or an object path are returned in ``skipped``.
    """
    by_bucket: Dict[str, List[str]] = {}
    skipped: List[str] = []
    seen = set()
    for loc in locs:
        if loc in seen:
            continue
        seen.add(loc)
        bucket, _, path = loc.partition("/")
        if not bucket or not path:
            skipped.append(loc)
            continue
        by_bucket.setdefault(bucket, []).append(path)
    return by_bucket, skipped


def _clear_exam_storage() -> Dict[str, Any]:
    """Remove the storage objects referenced by eb_exams / eb_specifications.

    Reads every non-empty ``storage_loc`` from both tables, so it must run BEFORE
    those rows are cleared. Each value is split at the first ``/`` into bucket and
    object path, and the objects are removed through the Storage API in chunks of
    100 per bucket. Every step is best-effort: an import failure, a failed read or
    a failed chunk is logged as a warning and does not abort the reset.

    Returns:
        ``{"removed": int, "buckets": [...], "skipped": int}`` on the normal path.
        ``removed`` counts the object paths in chunks whose remove call did not
        raise. ``skipped`` counts malformed ``storage_loc`` values.
        ``{"removed": 0, "error": str}`` when the Supabase helpers cannot be imported.
    """
    try:
        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
        from modules.database.supabase.utils.storage import StorageAdmin
    except Exception as exc:
        logger.warning(f" exam storage clear skipped (import): {exc}")
        return {"removed": 0, "error": str(exc)}

    sb = SupabaseServiceRoleClient().supabase
    storage = StorageAdmin()

    locs: List[str] = []
    for table in ("eb_exams", "eb_specifications"):
        try:
            rows = sb.table(table).select("storage_loc").execute().data or []
            locs += [r["storage_loc"] for r in rows if r.get("storage_loc")]
        except Exception as exc:
            logger.warning(f" storage_loc gather {table}: {exc}")

    by_bucket, skipped = _group_storage_locs(locs)
    if skipped:
        # Previously these were dropped without a trace, stranding their objects silently.
        logger.warning(f" storage_loc skipped (no bucket/object path): {skipped[:10]} ({len(skipped)} total)")

    removed = 0
    for bucket, paths in by_bucket.items():
        for start in range(0, len(paths), 100):
            chunk = paths[start:start + 100]
            try:
                storage.client.supabase.storage.from_(bucket).remove(chunk)
                removed += len(chunk)
            except Exception as exc:
                logger.warning(f" storage remove {bucket}: {exc}")

    logger.info(f" exam storage removed {removed} objects from {list(by_bucket)}")
    return {"removed": removed, "buckets": list(by_bucket), "skipped": len(skipped)}
|
||||||
|
|
||||||
|
|
||||||
|
def _clear_user_subset_files() -> Dict[str, Any]:
    """Delete the files rows and storage objects left behind by ``--user-subset`` seeding.

    The work is delegated to ``_delete_user_subset_files`` from
    ``run.initialization.seed_exam_corpus``, called with ``exam_codes=None`` and a
    fresh ``LoadReport``, so reset and ``--unseed`` share one implementation.
    Per the original author's note, that helper targets only rows marked
    bucket='cc.users', source='exam-corpus-seed', path LIKE 'exam-marker/%'
    (not verifiable from this module).

    Returns:
        ``files_rows_deleted``, ``storage_objects_removed`` and ``errors`` taken
        from the report. If the helpers cannot be imported, zeros and the import
        error text are returned instead.
    """
    try:
        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
        from modules.database.supabase.utils.storage import StorageAdmin
        from run.initialization.seed_exam_corpus import LoadReport, _delete_user_subset_files
    except Exception as exc:
        logger.warning(f" user-subset clear skipped (import): {exc}")
        return {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": [str(exc)]}

    report = LoadReport()
    service_client = SupabaseServiceRoleClient()
    storage_admin = StorageAdmin()
    _delete_user_subset_files(service_client, storage_admin, exam_codes=None, rep=report)

    summary: Dict[str, Any] = {}
    summary["files_rows_deleted"] = report.unseed_user_files
    summary["storage_objects_removed"] = report.unseed_objects
    summary["errors"] = report.errors
    return summary
|
||||||
|
|
||||||
|
|
||||||
# ─── Main reset ───────────────────────────────────────────────────────────────
|
# ─── Main reset ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def reset() -> Dict[str, Any]:
|
def reset(scope: str = "all") -> Dict[str, Any]:
|
||||||
|
"""Destructive reset. scope ∈ {all, exam-corpus, timetable, user-subset}.
|
||||||
|
|
||||||
|
- all : full wipe (Neo4j + Supabase data + auth users) AND the entire
|
||||||
|
exam-marker subsystem listed below, including --user-subset copies.
|
||||||
|
- exam-corpus : ONLY the entire exam-marker subsystem, not just public papers:
|
||||||
|
public corpus/eb_* data, cc.examboards storage objects, exam
|
||||||
|
templates, template layouts, questions, boundaries, response
|
||||||
|
areas, marking batches, student submissions, mark entries, and
|
||||||
|
--user-subset cc.users copies.
|
||||||
|
- timetable : ONLY timetable/calendar materialization tables.
|
||||||
|
- user-subset : ONLY files rows and cc.users storage objects created by
|
||||||
|
seed_exam_corpus.py --user-subset.
|
||||||
|
"""
|
||||||
|
scope = (scope or "all").lower()
|
||||||
|
if scope not in ("all", "exam-corpus", "timetable", "user-subset"):
|
||||||
|
raise ValueError(f"invalid scope {scope!r} (want all|exam-corpus|timetable|user-subset)")
|
||||||
|
url, headers = _sb_headers()
|
||||||
|
_assert_reset_allowed(url, scope)
|
||||||
|
|
||||||
|
if scope == "exam-corpus":
|
||||||
|
logger.info("RESET (scope=exam-corpus) — entire exam-marker subsystem: public corpus/eb_* data, cc.examboards storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries, and --user-subset copies")
|
||||||
|
user_subset = _clear_user_subset_files()
|
||||||
|
storage = _clear_exam_storage()
|
||||||
|
cleared, failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
|
||||||
|
return {"scope": scope, "user_subset": user_subset, "exam_storage": storage, "tables_cleared": cleared, "tables_failed": failed}
|
||||||
|
|
||||||
|
if scope == "timetable":
|
||||||
|
logger.info("RESET (scope=timetable) — timetable/calendar tables")
|
||||||
|
cleared, failed = _clear_tables(url, headers, TIMETABLE_TABLES)
|
||||||
|
return {"scope": scope, "tables_cleared": cleared, "tables_failed": failed}
|
||||||
|
|
||||||
|
if scope == "user-subset":
|
||||||
|
logger.info("RESET (scope=user-subset) — --user-subset cc.users storage objects and files rows")
|
||||||
|
user_subset = _clear_user_subset_files()
|
||||||
|
return {"scope": scope, "user_subset": user_subset}
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("RESET ENVIRONMENT — full destructive wipe starting")
|
logger.info("RESET ENVIRONMENT — full destructive wipe starting")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
results: Dict[str, Any] = {}
|
results: Dict[str, Any] = {"scope": scope}
|
||||||
|
|
||||||
# ── 1. Neo4j: drop everything except system + neo4j ──────────────────────
|
# ── 1. Neo4j: drop everything except system + neo4j ──────────────────────
|
||||||
logger.info("\n[Neo4j] Dropping all non-system databases...")
|
logger.info("\n[Neo4j] Dropping all non-system databases...")
|
||||||
@ -161,6 +343,9 @@ def reset() -> Dict[str, Any]:
|
|||||||
results["neo4j"] = {"dropped": dropped}
|
results["neo4j"] = {"dropped": dropped}
|
||||||
|
|
||||||
# ── 2. Supabase: clear all data tables (GAIS preserved) ──────────────────
|
# ── 2. Supabase: clear all data tables (GAIS preserved) ──────────────────
|
||||||
|
# First remove --user-subset cc.users storage objects (+ their files rows) via the
|
||||||
|
# Storage API, so the generic files-table clear below doesn't strand orphaned objects.
|
||||||
|
results["user_subset"] = _clear_user_subset_files()
|
||||||
logger.info("\n[Supabase] Clearing data tables (preserving gais_*)...")
|
logger.info("\n[Supabase] Clearing data tables (preserving gais_*)...")
|
||||||
url, headers = _sb_headers()
|
url, headers = _sb_headers()
|
||||||
cleared, failed = [], []
|
cleared, failed = [], []
|
||||||
@ -213,11 +398,25 @@ def reset() -> Dict[str, Any]:
|
|||||||
)
|
)
|
||||||
logger.info(" kcar → admin_profiles restored ✓")
|
logger.info(" kcar → admin_profiles restored ✓")
|
||||||
|
|
||||||
|
# ── 5. Exam-marker subsystem: storage objects (Storage API) + all exam tables ──
|
||||||
|
# This is the same destructive surface as scope="exam-corpus": public corpus/eb_*
|
||||||
|
# rows, cc.examboards storage, templates/layout/questions/boundaries/response
|
||||||
|
# areas, marking batches, submissions, and mark entries. (The legacy full reset
|
||||||
|
# cleared neither exam tables nor storage — folded in here.)
|
||||||
|
logger.info("\n[Supabase] Clearing entire exam-marker subsystem (public corpus, storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries)...")
|
||||||
|
exam_storage = _clear_exam_storage()
|
||||||
|
exam_cleared, exam_failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
|
||||||
|
|
||||||
results["supabase"] = {
|
results["supabase"] = {
|
||||||
"tables_cleared": cleared,
|
"tables_cleared": cleared,
|
||||||
"tables_failed": failed,
|
"tables_failed": failed,
|
||||||
"deleted_users": deleted_emails,
|
"deleted_users": deleted_emails,
|
||||||
}
|
}
|
||||||
|
results["exam"] = {
|
||||||
|
"storage": exam_storage,
|
||||||
|
"tables_cleared": exam_cleared,
|
||||||
|
"tables_failed": exam_failed,
|
||||||
|
}
|
||||||
|
|
||||||
logger.info("\n" + "=" * 60)
|
logger.info("\n" + "=" * 60)
|
||||||
logger.info("RESET COMPLETE")
|
logger.info("RESET COMPLETE")
|
||||||
|
|||||||
@ -1,15 +1,20 @@
|
|||||||
"""
|
"""
|
||||||
seed_curriculum.py — Create curriculum data: exam board specifications and exams.
|
seed_curriculum.py — DEPRECATED hardcoded curriculum/exam seeder.
|
||||||
|
|
||||||
Seeds eb_specifications and eb_exams tables with realistic UK exam board data
|
⚠️ SUPERSEDED (2026-06-07) by the manifest-driven corpus loader:
|
||||||
(AQA, Edexcel, OCR) for Physics, Maths, and Computer Science across both schools.
|
run/initialization/seed_exam_corpus.py (+ manifests/exam-corpus.yaml)
|
||||||
|
|
||||||
Also seeds curriculum_topics in Neo4j for the school databases.
|
The exam-board parts of this file (eb_specifications / eb_exams) are now seeded from a
|
||||||
|
verified, provenance-bearing manifest with real uploaded PDFs — not the hardcoded rows
|
||||||
|
below. This module also had a storage_loc inconsistency the overhaul standardises away:
|
||||||
|
exam-board files belong in the `cc.examboards` bucket at the canonical path
|
||||||
|
`cc.examboards/{board}/{subject}/{award}/{paper}/{session}/{role}.pdf`, NOT under
|
||||||
|
`cc.public.snapshots/curriculum/...` (the placeholder rows below still show the old path).
|
||||||
|
|
||||||
Tables: eb_specifications, eb_exams
|
KEEP ONLY for the Neo4j `curriculum_topics` seed (step [3]) which has no replacement yet.
|
||||||
Neo4j: curriculum topic nodes in school databases
|
Do NOT use the eb_specifications/eb_exams blocks for new work — use seed_exam_corpus.py.
|
||||||
|
|
||||||
Run inside ccapi container:
|
Run (Neo4j curriculum topics only is the supported remaining use):
|
||||||
python3 -c "from run.initialization.seed_curriculum import seed; seed()"
|
python3 -c "from run.initialization.seed_curriculum import seed; seed()"
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
|
|||||||
867
run/initialization/seed_exam_corpus.py
Normal file
867
run/initialization/seed_exam_corpus.py
Normal file
@ -0,0 +1,867 @@
|
|||||||
|
"""
|
||||||
|
seed_exam_corpus.py — manifest-driven loader for the public exam-paper corpus.
|
||||||
|
|
||||||
|
SCOPE (separate from infra): assumes storage buckets already exist (provisioned by
|
||||||
|
run/initialization/buckets.py during infra init). This loader UPLOADS papers and
|
||||||
|
SEEDS the catalogue; it does NOT create buckets.
|
||||||
|
|
||||||
|
Pipeline per manifest item:
|
||||||
|
validate -> resolve source bytes (local path | url:, cached) -> upload file to
|
||||||
|
cc.examboards (canonical path, skip-if-exists unless --force) -> upsert
|
||||||
|
eb_specifications / eb_exams (catalogue) -> (optional, --user-subset) copy a subset
|
||||||
|
into a test user's exam space so user-side flows are testable -> (optional,
|
||||||
|
--first-sweep) run the docling/auto-map first pass to gather structure.
|
||||||
|
|
||||||
|
Manifest template: ~/cc/specs/exam-corpus-manifest.example.yaml
|
||||||
|
|
||||||
|
Catalogue columns (real — verified against volumes/db/cc/61-core-schema.sql):
|
||||||
|
eb_specifications(spec_code UNIQUE, exam_board_code, award_code, subject_code,
|
||||||
|
first_teach, spec_ver, storage_loc, doc_type CHECK(pdf|json|...),
|
||||||
|
doc_details jsonb, docling_docs jsonb)
|
||||||
|
eb_exams(exam_code UNIQUE, spec_code FK, paper_code, tier, session, type_code,
|
||||||
|
storage_loc, doc_type CHECK(pdf|json|...), doc_details jsonb, docling_docs jsonb)
|
||||||
|
|
||||||
|
IMPORTANT schema note: the QP/MS/INSERT/ER *document role* is stored in `type_code`
|
||||||
|
(the `/catalogue` endpoint filters `type_code == 'QP'`). The `doc_type` column is the
|
||||||
|
*file format* and is CHECK-constrained to {pdf,json,md,html,txt,doctags} — so it is
|
||||||
|
always 'pdf' here. (The manifest field is named `doc_type` for the role; the loader
|
||||||
|
maps manifest.doc_type -> DB.type_code and sets DB.doc_type = 'pdf'.)
|
||||||
|
|
||||||
|
Locked conventions (see ~/cc/ideas/2026-06-07-exam-paper-ingestion.md):
|
||||||
|
session = "YYYY-Mon" e.g. "2022-Jun", "2021-Nov"
|
||||||
|
exam_code = "{BOARD}-{award}-{paper_safe}-{SESSIONCOMPACT}-{ROLE}" e.g. AQA-8463-1H-2022JUN-QP
|
||||||
|
spec path = cc.examboards/{board}/{subject}/{award}/spec/{spec_ver}.pdf
|
||||||
|
paper path = cc.examboards/{board}/{subject}/{award}/{paper_safe}/{session}/{role}.pdf
|
||||||
|
|
||||||
|
Run inside the api container (env: SUPABASE_URL + SERVICE_ROLE_KEY for dev .94), e.g.:
|
||||||
|
python3 -m run.initialization.seed_exam_corpus --manifest /path/exam-corpus.yaml --dry-run
|
||||||
|
python3 -m run.initialization.seed_exam_corpus --manifest ... --board AQA
|
||||||
|
python3 -m run.initialization.seed_exam_corpus --manifest ... --first-sweep
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import yaml # PyYAML
|
||||||
|
|
||||||
|
from modules.logger_tool import initialise_logger
|
||||||
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||||
|
from modules.database.supabase.utils.storage import StorageAdmin, StorageError
|
||||||
|
|
||||||
|
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), "default", True)
|
||||||
|
|
||||||
|
EXAM_BUCKET = "cc.examboards"
|
||||||
|
# Manifest `doc_type` carries the document ROLE (stored in eb_exams.type_code).
|
||||||
|
DOC_ROLES = {"QP", "MS", "INSERT", "ER", "SPECIMEN", "GRADE_BOUNDARIES", "DATA_SHEET"}
|
||||||
|
TIERS = {"H", "F", None}
|
||||||
|
# Default working dir for cached url: downloads (override with --cache-dir / EXAM_CORPUS_CACHE).
|
||||||
|
DEFAULT_CACHE_DIR = os.getenv("EXAM_CORPUS_CACHE", "/tmp/exam-corpus-cache")
|
||||||
|
# Persistent, mountable local store laid out exactly like the bucket (download once, seed many,
|
||||||
|
# offline-repeatable). Override with --store-dir / EXAM_CORPUS_STORE. Distinct from --cache-dir,
|
||||||
|
# which is a throwaway url hash-cache.
|
||||||
|
DEFAULT_STORE_DIR = os.getenv(
|
||||||
|
"EXAM_CORPUS_STORE",
|
||||||
|
os.path.join(os.path.dirname(os.path.abspath(__file__)), "manifests", "_corpus_store"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── canonical storage paths ───────────────────────────────
|
||||||
|
def _lc(s: str) -> str:
    """Normalise a value for use as a storage-path segment.

    Strips surrounding whitespace, lower-cases, and turns each remaining space
    into a hyphen. ``None`` and the empty string both yield ``""``.
    """
    if not s:
        return ""
    trimmed = s.strip()
    return trimmed.lower().replace(" ", "-")
|
||||||
|
|
||||||
|
def _paper_safe(paper_code: str) -> str:
    """Turn a paper code into a single storage-path segment.

    Everything up to the first ``/`` (the award prefix) is dropped and the
    remaining segments are joined with hyphens, so combined-science sub-papers
    keep distinct paths. A code without ``/`` is returned normalised as-is.

        "8463/1H"   -> "1h"
        "8464/B/1H" -> "b-1h"   (Trilogy: subject letter + paper + tier)
        "7408/1"    -> "1"
    """
    normalised = _lc(paper_code)
    award_prefix, slash, remainder = normalised.partition("/")
    if not slash:
        return award_prefix
    return remainder.replace("/", "-")
|
||||||
|
|
||||||
|
def spec_storage_loc(board: str, subject: str, award: str, spec_ver: str) -> str:
    """Return the canonical storage location of a specification PDF.

    Layout: ``{EXAM_BUCKET}/{board}/{subject}/{award}/spec/{spec_ver}.pdf`` with
    every variable segment normalised by ``_lc``. A falsy ``spec_ver`` falls back
    to the file name ``spec.pdf``.

    Example: ``cc.examboards/aqa/physics/8463/spec/1.1.pdf``
    """
    file_name = _lc(spec_ver or 'spec') + ".pdf"
    segments = (EXAM_BUCKET, _lc(board), _lc(subject), _lc(award), "spec", file_name)
    return "/".join(segments)
|
||||||
|
|
||||||
|
def paper_storage_loc(board: str, subject: str, award: str, paper_code: str, session: str, doc_role: str) -> str:
    """Return the canonical storage location of one exam-paper PDF.

    Layout: ``{EXAM_BUCKET}/{board}/{subject}/{award}/{paper}/{session}/{role}.pdf``
    where ``paper`` comes from ``_paper_safe`` and the other variable segments
    are normalised by ``_lc``.

    Example: ``cc.examboards/aqa/physics/8463/1h/2022-jun/qp.pdf``
    """
    file_name = _lc(doc_role) + ".pdf"
    segments = (
        EXAM_BUCKET,
        _lc(board),
        _lc(subject),
        _lc(award),
        _paper_safe(paper_code),
        _lc(session),
        file_name,
    )
    return "/".join(segments)
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── report ───────────────────────────────
|
||||||
|
@dataclass
class LoadReport:
    """Running counters and error messages collected during one loader run.

    All counters start at zero and ``errors`` starts as a fresh empty list per
    instance. ``as_dict`` exposes the same values as a plain dictionary.
    """

    # Catalogue upserts.
    specs_upserted: int = 0
    papers_upserted: int = 0
    # Bucket uploads.
    files_uploaded: int = 0
    files_skipped: int = 0
    files_failed: int = 0
    # Optional user-subset copies and first-sweep results.
    user_copies: int = 0
    swept: int = 0
    sweep_failed: int = 0
    # Local-store population.
    downloaded: int = 0
    download_cached: int = 0
    # Unseed tallies.
    unseed_objects: int = 0
    unseed_user_files: int = 0
    unseed_exams: int = 0
    unseed_specs: int = 0
    unseed_templates: int = 0
    errors: List[str] = field(default_factory=list)

    def as_dict(self) -> Dict[str, Any]:
        """Return every counter plus ``errors`` as a dict in a fixed key order.

        The ``errors`` value is the report's own list, not a copy.
        """
        # Output order is deliberate and differs from the field declaration order.
        ordered_keys = (
            "specs_upserted",
            "papers_upserted",
            "downloaded",
            "download_cached",
            "unseed_objects",
            "unseed_user_files",
            "unseed_exams",
            "unseed_specs",
            "unseed_templates",
            "files_uploaded",
            "files_skipped",
            "files_failed",
            "user_copies",
            "swept",
            "sweep_failed",
            "errors",
        )
        return {key: getattr(self, key) for key in ordered_keys}
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── validation ───────────────────────────────
|
||||||
|
def validate_manifest(m: Dict[str, Any]) -> List[str]:
    """Check a loaded corpus manifest and return every problem found.

    Nothing is raised for bad content: each problem becomes one message so the
    caller can report them all at once. Checks performed:

    * every board has ``exam_board_code``;
    * every spec has a unique ``spec_code`` plus ``award_code`` and ``subject_code``;
    * every paper has a unique ``exam_code``, a ``doc_type`` in ``DOC_ROLES``, a
      ``tier`` in ``TIERS``, a ``paper_code``, a ``session`` and a ``file.source``
      that is either a ``url:`` reference or an existing local path.

    Uniqueness of ``spec_code`` and ``exam_code`` is enforced across the whole
    manifest, not per board.

    Args:
        m: Parsed manifest mapping. ``None`` (an empty YAML document) and
            sections that are present but null are treated as empty.

    Returns:
        Human-readable error messages; an empty list means the manifest is valid.
    """
    errs: List[str] = []
    seen_specs, seen_exams = set(), set()
    # YAML loads an empty key (``boards:``) as None, and ``.get(key, [])`` only
    # covers a *missing* key — so fall back with ``or`` to avoid iterating None.
    for board in (m or {}).get("boards") or []:
        if not board.get("exam_board_code"):
            errs.append("board missing exam_board_code")
        for spec in board.get("specifications") or []:
            sc = spec.get("spec_code")
            if not sc or sc in seen_specs:
                errs.append(f"spec_code missing/duplicate: {sc!r}")
            seen_specs.add(sc)
            for field_name in ("award_code", "subject_code"):
                if not spec.get(field_name):
                    errs.append(f"{sc}: missing {field_name}")
            for p in spec.get("papers") or []:
                ec = p.get("exam_code")
                if not ec or ec in seen_exams:
                    errs.append(f"exam_code missing/duplicate: {ec!r}")
                seen_exams.add(ec)
                if p.get("doc_type") not in DOC_ROLES:
                    errs.append(f"{ec}: bad doc_type/role {p.get('doc_type')!r} (want one of {sorted(DOC_ROLES)})")
                if p.get("tier") not in TIERS:
                    errs.append(f"{ec}: bad tier {p.get('tier')!r} (want H|F|null)")
                if not p.get("paper_code"):
                    errs.append(f"{ec}: missing paper_code")
                if not p.get("session"):
                    errs.append(f"{ec}: missing session")
                src = (p.get("file") or {}).get("source")
                if not src:
                    errs.append(f"{ec}: missing file.source")
                elif not isinstance(src, str):
                    # Report instead of crashing on ``src.startswith`` below.
                    errs.append(f"{ec}: file.source must be a string, got {type(src).__name__}")
                elif not src.startswith("url:") and not os.path.exists(src):
                    errs.append(f"{ec}: local source not found: {src}")
    return errs
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── source resolution (local | url:, cached) ───────────────────────────────
|
||||||
|
def _resolve_source_bytes(source: str, *, cache_dir: str) -> bytes:
    """Return the raw bytes behind a manifest ``file.source`` value.

    * ``url:https://...`` is fetched over HTTP (60 s timeout, redirects followed)
      and cached as ``{cache_dir}/{sha1(url)}.pdf``; a non-empty cached file is
      returned without any network access.
    * Anything else is treated as a local path and read from disk.

    Raises:
        ValueError: The download is empty, or neither its content-type mentions
            ``pdf`` nor its body starts with ``%PDF``.
        requests.HTTPError: The server answered with an error status.
        OSError: A local source, or the cache file, cannot be read or written.
    """
    scheme = "url:"
    if not source.startswith(scheme):
        with open(source, "rb") as local_file:
            return local_file.read()

    url = source[len(scheme):]
    os.makedirs(cache_dir, exist_ok=True)
    digest = hashlib.sha1(url.encode("utf-8")).hexdigest()
    cache_path = os.path.join(cache_dir, f"{digest}.pdf")

    cache_hit = os.path.exists(cache_path) and os.path.getsize(cache_path) > 0
    if cache_hit:
        with open(cache_path, "rb") as cached_file:
            return cached_file.read()

    logger.info(f"[fetch] {url}")
    response = requests.get(url, timeout=60, allow_redirects=True)
    response.raise_for_status()
    payload = response.content
    ctype = response.headers.get("content-type", "")
    if not payload:
        raise ValueError(f"empty download: {url}")
    # Either signal is enough: a PDF content-type or the %PDF magic bytes.
    declared_pdf = "pdf" in ctype.lower()
    has_pdf_magic = payload[:5].startswith(b"%PDF")
    if not (declared_pdf or has_pdf_magic):
        raise ValueError(f"not a PDF (content-type={ctype!r}): {url}")

    # Write to a sibling temp file, then rename, so a partial download is never
    # left under the final cache name.
    partial_path = cache_path + ".part"
    with open(partial_path, "wb") as out:
        out.write(payload)
    os.replace(partial_path, cache_path)
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────── persistent local store (download-once, seed-many) ───────────────────────
|
||||||
|
def _store_path(store_dir: str, storage_loc: str) -> str:
    """Map a bucket storage location to its path inside the local store.

    The leading bucket segment (everything up to the first ``/``) is dropped and
    the rest is joined onto ``store_dir``, so the store mirrors the bucket layout:

        'cc.examboards/aqa/physics/8463/1h/2022-jun/qp.pdf'
            -> {store_dir}/aqa/physics/8463/1h/2022-jun/qp.pdf
    """
    slash = storage_loc.find("/")
    relative = storage_loc[slash + 1:] if slash != -1 else ""
    return os.path.join(store_dir, relative)
|
||||||
|
|
||||||
|
def _item_bytes(source: str, storage_loc: str, *, store_dir: Optional[str], cache_dir: str,
                populate: bool = True, rep: Optional[LoadReport] = None) -> bytes:
    """Return the bytes for one manifest item, preferring the persistent local store.

    When store_dir is set and already holds a non-empty file for storage_loc, that
    file is returned (rep.download_cached is bumped). Otherwise the source is
    resolved via _resolve_source_bytes and, if store_dir is set and populate is
    true, written into the store at its canonical path (rep.downloaded is bumped).
    """
    store_file = _store_path(store_dir, storage_loc) if store_dir else None

    if store_file is not None and os.path.exists(store_file) and os.path.getsize(store_file) > 0:
        if rep is not None:
            rep.download_cached += 1
        with open(store_file, "rb") as stored:
            return stored.read()

    payload = _resolve_source_bytes(source, cache_dir=cache_dir)
    if store_file is None or not populate:
        return payload

    # Temp file + rename keeps a half-written file from ever appearing in the store.
    os.makedirs(os.path.dirname(store_file), exist_ok=True)
    partial = store_file + ".part"
    with open(partial, "wb") as out:
        out.write(payload)
    os.replace(partial, store_file)
    if rep is not None:
        rep.downloaded += 1
    return payload
|
||||||
|
|
||||||
|
def download_corpus(m: Dict[str, Any], *, store_dir: str, board_filter: Optional[str],
                    spec_filter: Optional[str], cache_dir: str, rep: LoadReport) -> None:
    """--download-only: populate the persistent local store from the manifest. No DB/bucket writes.

    A later run with the same --store-dir (e.g. mounted into the container) seeds offline from it.

    Every spec file and paper is handled independently: any failure (bad source,
    network error, or a manifest entry missing a key needed to build its storage
    location) is appended to rep.errors and the walk continues with the next item.
    This path does not validate the manifest first, so the storage-location
    computation is deliberately inside the per-item try.
    """
    for board in m.get("boards") or []:
        if board_filter and board.get("exam_board_code") != board_filter:
            continue
        for spec in board.get("specifications") or []:
            if spec_filter and spec.get("spec_code") != spec_filter:
                continue

            sf = spec.get("spec_file")
            if sf and sf.get("source"):
                try:
                    sloc = spec_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                            spec.get("award_code", ""), spec.get("spec_ver", ""))
                    _item_bytes(sf["source"], sloc, store_dir=store_dir, cache_dir=cache_dir, rep=rep)
                except Exception as exc:
                    rep.errors.append(f"download spec {spec.get('spec_code')}: {exc}")

            for p in spec.get("papers") or []:
                try:
                    ploc = paper_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                             spec.get("award_code", ""), p["paper_code"], p["session"],
                                             p["doc_type"])
                    _item_bytes(p["file"]["source"], ploc, store_dir=store_dir, cache_dir=cache_dir, rep=rep)
                except Exception as exc:
                    rep.errors.append(f"download {p.get('exam_code')}: {exc}")

    logger.info(f"download-only done: downloaded={rep.downloaded} already_in_store={rep.download_cached} "
                f"errors={len(rep.errors)} store={store_dir}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── storage upload (skip-if-exists + sha256) ───────────────────────────────
|
||||||
|
def _split_loc(storage_loc: str) -> Tuple[str, str]:
    """Split '<bucket>/<object path>' at the first '/' into (bucket, path).

    A value without any '/' yields (value, '').
    """
    pieces = storage_loc.split("/", 1)
    bucket = pieces[0]
    object_path = pieces[1] if len(pieces) == 2 else ""
    return bucket, object_path
|
||||||
|
|
||||||
|
def _object_exists(storage: StorageAdmin, bucket: str, path: str) -> bool:
    """Existence check by listing the object's parent folder (Supabase storage has no stat).

    The listing is narrowed with the `search` option and walked page by page, so an
    object is still found when its folder holds more entries than a single page
    (the Storage list call returns only the first 100 entries by default).

    Returns False when the object is not listed OR when the listing itself fails
    (the failure is logged); callers treat that as "not there" and upload.
    """
    parent, _, name = path.rpartition("/")
    page_size = 100
    offset = 0
    try:
        folder = storage.client.supabase.storage.from_(bucket)
        while True:
            page = folder.list(parent, {"limit": page_size, "offset": offset, "search": name}) or []
            if any(item.get("name") == name for item in page):
                return True
            if len(page) < page_size:
                # Short (or empty) page: nothing further to look at.
                return False
            offset += page_size
    except Exception as exc:
        logger.warning(f"[exists?] list failed for {bucket}/{parent}: {exc}")
        return False
|
||||||
|
|
||||||
|
def upload_file(storage: StorageAdmin, storage_loc: str, data: bytes, *, force: bool, rep: LoadReport) -> str:
    """Upload PDF bytes to storage_loc and return the sha256 hex digest of the bytes.

    Without --force an object that already exists is left alone (rep.files_skipped);
    otherwise the bytes are uploaded with upsert=True (rep.files_uploaded). A
    StorageError during upload is logged and recorded in rep (files_failed, errors)
    rather than raised. The digest is returned in every case so the catalogue
    upsert can still record the checksum.
    """
    digest = hashlib.sha256(data).hexdigest()
    bucket, _, object_path = storage_loc.partition("/")

    already_present = (not force) and _object_exists(storage, bucket, object_path)
    if already_present:
        logger.info(f"[upload] skip-exists {storage_loc} (sha256={digest[:12]})")
        rep.files_skipped += 1
        return digest

    try:
        storage.upload_file(bucket, object_path, data, "application/pdf", upsert=True)
    except StorageError as exc:
        logger.error(f"[upload] FAILED {storage_loc}: {exc}")
        rep.files_failed += 1
        rep.errors.append(f"upload {storage_loc}: {exc}")
    else:
        logger.info(f"[upload] {storage_loc} ({len(data)} bytes, sha256={digest[:12]}) force={force}")
        rep.files_uploaded += 1
    return digest
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── catalogue upserts ───────────────────────────────
|
||||||
|
def upsert_specification(client: SupabaseServiceRoleClient, spec: Dict[str, Any],
                         storage_loc: Optional[str], sha: Optional[str], rep: LoadReport) -> None:
    """Upsert one eb_specifications row (conflict key: spec_code) from a manifest spec.

    doc_details carries award_level / provenance / original_name / sha256, with
    None-valued entries left out. Success bumps rep.specs_upserted; a failing
    upsert is logged and appended to rep.errors instead of being raised.
    """
    spec_file = spec.get("spec_file") or {}
    detail_pairs = (
        ("award_level", spec.get("award_level")),
        ("provenance", spec_file.get("provenance")),
        ("original_name", spec_file.get("original_name")),
        ("sha256", sha),
    )
    details = {}
    for field, value in detail_pairs:
        if value is not None:
            details[field] = value

    spec_code = spec["spec_code"]
    record = {
        "spec_code": spec_code,
        "exam_board_code": spec["exam_board_code"],
        "award_code": spec.get("award_code"),
        "subject_code": spec.get("subject_code"),
        "first_teach": spec.get("first_teach"),
        "spec_ver": spec.get("spec_ver"),
        "storage_loc": storage_loc,
        # File format (CHECK-constrained); the document role lives on eb_exams.type_code.
        "doc_type": "pdf",
        "doc_details": details,
    }

    try:
        client.supabase.table("eb_specifications").upsert(record, on_conflict="spec_code").execute()
        logger.info(f"[spec] upsert {spec_code}")
        rep.specs_upserted += 1
    except Exception as exc:
        logger.error(f"[spec] FAILED {spec_code}: {exc}")
        rep.errors.append(f"spec {spec_code}: {exc}")
|
||||||
|
|
||||||
|
def upsert_paper(client: SupabaseServiceRoleClient, spec_code: str, p: Dict[str, Any],
                 storage_loc: str, sha: Optional[str], rep: LoadReport) -> None:
    """Upsert one eb_exams row (conflict key: exam_code) from a manifest paper entry.

    The manifest's doc_type (QP|MS|INSERT|ER...) is the document ROLE and is stored
    as type_code; the row's own doc_type column holds the file format ("pdf").
    doc_details mirrors the role and carries original_name / provenance / sha256,
    omitting None values. Success bumps rep.papers_upserted; a failing upsert is
    logged and appended to rep.errors instead of being raised.
    """
    file_info = p.get("file") or {}
    role = p["doc_type"]

    detail_pairs = (
        ("doc_role", role),  # mirror of type_code for clarity
        ("original_name", file_info.get("original_name")),
        ("provenance", file_info.get("provenance")),
        ("sha256", sha),
    )
    details = {}
    for field, value in detail_pairs:
        if value is not None:
            details[field] = value

    exam_code = p["exam_code"]
    record = {
        "exam_code": exam_code,
        "spec_code": spec_code,
        "paper_code": p.get("paper_code"),
        "tier": p.get("tier"),
        "session": p.get("session"),
        "type_code": role,   # ROLE goes here (QP/MS/INSERT/ER)
        "doc_type": "pdf",   # file format (CHECK-constrained)
        "storage_loc": storage_loc,
        "doc_details": details,
    }

    try:
        client.supabase.table("eb_exams").upsert(record, on_conflict="exam_code").execute()
        logger.info(f"[paper] upsert {exam_code} type_code={role}")
        rep.papers_upserted += 1
    except Exception as exc:
        logger.error(f"[paper] FAILED {exam_code}: {exc}")
        rep.errors.append(f"paper {exam_code}: {exc}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── user-side test subset ───────────────────────────────
|
||||||
|
def _resolve_test_user(client: SupabaseServiceRoleClient, cfg: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Work out (user_id, institute_id) for the user-side subset.

    Explicit cfg values win. Otherwise user_id is looked up in `profiles` by
    cfg['user_email'], and institute_id is taken from the user's first
    `institute_memberships` row. Returns None (after logging a warning) when
    either id cannot be determined.
    """
    def _first_value(table: str, column: str, match_column: str, match_value: Any) -> Any:
        # First matching row's `column`, or None when nothing matches.
        res = client.supabase.table(table).select(column).eq(match_column, match_value).limit(1).execute()
        found = getattr(res, "data", None) or []
        return found[0][column] if found else None

    user_id = cfg.get("user_id")
    if not user_id and cfg.get("user_email"):
        user_id = _first_value("profiles", "id", "email", cfg["user_email"])
    if not user_id:
        logger.warning("[user-subset] no test user resolvable (set test_subset.user_id or user_email); skipping")
        return None

    institute_id = cfg.get("institute_id") or _first_value(
        "institute_memberships", "institute_id", "profile_id", user_id)
    if not institute_id:
        logger.warning(f"[user-subset] no institute for user {user_id}; skipping")
        return None

    return user_id, institute_id
|
||||||
|
|
||||||
|
def copy_user_test_subset(client: SupabaseServiceRoleClient, storage: StorageAdmin,
                          m: Dict[str, Any], rep: LoadReport) -> None:
    """Copy a small subset of admin papers into a test user's exam space so user-side flows
    (upload-as-exam / promote-from-cabinet / mark) are testable.

    Driven by an optional manifest `test_subset:` block:
        test_subset:
          user_id: <uuid>          # or user_email: <email>
          institute_id: <uuid>     # optional; discovered from membership if omitted
          papers: 2                # how many QP papers to copy (default 2)
    Degrades gracefully (logs + skips) if no test user is resolvable on this env.

    Per paper: download from the admin bucket, upload under
    cc.users/exam-marker/<institute>/<cabinet>/<new uuid>/, then upsert a `files`
    row (source='exam-corpus-seed'). A failing download or upload skips that paper
    with a warning. A failing `files` upsert is recorded in rep.errors and the
    just-uploaded object is removed best-effort, so no row-less object is left behind.

    NOTE: every call generates fresh file ids, so repeated runs add further copies.
    """
    import uuid as _uuid

    cfg = m.get("test_subset") or {}
    resolved = _resolve_test_user(client, cfg)
    if not resolved:
        return
    user_id, institute_id = resolved

    # `papers: null` falls back to the default; a negative count would make the
    # slice below drop papers from the END of the list instead of limiting it.
    raw_limit = cfg.get("papers")
    limit = 2 if raw_limit is None else max(0, int(raw_limit))

    # Gather candidate QP papers (admin corpus already uploaded to cc.examboards).
    candidates: List[Tuple[str, Dict[str, Any], Dict[str, Any]]] = []
    for board in m.get("boards", []):
        for spec in board.get("specifications", []):
            for p in spec.get("papers", []):
                if p.get("doc_type") == "QP":
                    candidates.append((board["exam_board_code"], spec, p))
    candidates = candidates[:limit]
    if not candidates:
        logger.info("[user-subset] no QP papers to copy")
        return

    # Ensure a cabinet for the user.
    cab_name = "Exam Marker Template Sources"
    res = client.supabase.table("file_cabinets").select("id").eq("user_id", user_id).eq("name", cab_name).limit(1).execute()
    rows = getattr(res, "data", None) or []
    if rows:
        cabinet_id = rows[0]["id"]
    else:
        ins = client.supabase.table("file_cabinets").insert({"user_id": user_id, "name": cab_name}).execute()
        cabinet_id = (getattr(ins, "data", None) or [{}])[0].get("id")
    if not cabinet_id:
        logger.warning("[user-subset] could not ensure cabinet; skipping")
        return

    for board_code, spec, p in candidates:
        src_loc = paper_storage_loc(board_code, spec.get("subject_code", ""), spec.get("award_code", ""),
                                    p["paper_code"], p["session"], p["doc_type"])
        sbucket, spath = _split_loc(src_loc)
        try:
            data = storage.download_file(sbucket, spath)
        except Exception as exc:
            logger.warning(f"[user-subset] source missing {src_loc}: {exc}; skipping {p['exam_code']}")
            continue

        file_id = str(_uuid.uuid4())
        safe_name = f"{p['exam_code']}.pdf"
        dst_bucket = "cc.users"
        dst_path = f"exam-marker/{institute_id}/{cabinet_id}/{file_id}/{safe_name}"
        try:
            storage.upload_file(dst_bucket, dst_path, data, "application/pdf", upsert=True)
        except Exception as exc:
            logger.warning(f"[user-subset] copy upload failed {dst_path}: {exc}")
            continue

        try:
            client.supabase.table("files").upsert({
                "id": file_id, "cabinet_id": cabinet_id, "name": safe_name, "path": dst_path,
                "bucket": dst_bucket, "mime_type": "application/pdf", "uploaded_by": user_id,
                "size_bytes": len(data), "source": "exam-corpus-seed", "is_directory": False,
                "relative_path": safe_name, "processing_status": "uploaded",
            }).execute()
        except Exception as exc:
            logger.error(f"[user-subset] files row failed for {dst_bucket}/{dst_path}: {exc}")
            rep.errors.append(f"user-subset files row {p['exam_code']}: {exc}")
            # Without its files row the uploaded object could not be located again
            # for cleanup, so take it back out now (best-effort).
            try:
                storage.client.supabase.storage.from_(dst_bucket).remove([dst_path])
            except Exception as cleanup_exc:
                logger.warning(f"[user-subset] could not remove orphaned object {dst_path}: {cleanup_exc}")
            continue

        logger.info(f"[user-subset] copied {p['exam_code']} -> {dst_bucket}/{dst_path}")
        rep.user_copies += 1
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── first sweep (docling auto-map) ───────────────────────────────
|
||||||
|
def _resolve_system_identity(client: SupabaseServiceRoleClient, m: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Work out the (teacher/user id, institute id) pair that owns first-sweep templates.

    Configuration comes from the manifest's `system_identity` block, falling back
    to `test_subset`. The user id is teacher_id, else user_id, else a `profiles`
    lookup by user_email; the institute id is institute_id, else the user's first
    `institute_memberships` row. Returns None (after a warning) if either is missing.
    """
    def _first_value(table: str, column: str, match_column: str, match_value: Any) -> Any:
        # First matching row's `column`, or None when nothing matches.
        res = client.supabase.table(table).select(column).eq(match_column, match_value).limit(1).execute()
        found = getattr(res, "data", None) or []
        return found[0][column] if found else None

    cfg = m.get("system_identity") or m.get("test_subset") or {}

    user_id = cfg.get("teacher_id") or cfg.get("user_id")
    if not user_id and cfg.get("user_email"):
        user_id = _first_value("profiles", "id", "email", cfg["user_email"])

    institute_id = cfg.get("institute_id")
    if user_id and not institute_id:
        institute_id = _first_value("institute_memberships", "institute_id", "profile_id", user_id)

    if user_id and institute_id:
        return user_id, institute_id

    logger.warning("[first-sweep] no system identity (set system_identity.teacher_id+institute_id); skipping sweep")
    return None
|
||||||
|
|
||||||
|
def _sweep_unique_by_id(rows: Optional[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    """Keep the first row for each non-None 'id'; rows without an id are all kept."""
    seen_ids: set = set()
    kept: List[Dict[str, Any]] = []
    for r in rows or []:
        rid = r.get("id")
        if rid is not None:
            if rid in seen_ids:
                continue
            seen_ids.add(rid)
        kept.append(r)
    return kept


def _sweep_ensure_template(sb: Any, spec: Dict[str, Any], p: Dict[str, Any],
                           teacher_id: str, institute_id: str) -> Optional[str]:
    """Return the id of the template for (exam_code, teacher), inserting a draft one if absent.

    Returns None when the insert response carries no id.
    """
    tpl = sb.table("exam_templates").select("id").eq("exam_code", p["exam_code"]).eq("teacher_id", teacher_id).limit(1).execute()
    tpl_rows = getattr(tpl, "data", None) or []
    if tpl_rows:
        return tpl_rows[0]["id"]

    # Resolve the seeded eb_exams row (id) for the template join; None if not seeded.
    ex = sb.table("eb_exams").select("id, exam_code").eq("exam_code", p["exam_code"]).limit(1).execute()
    ex_rows = getattr(ex, "data", None) or []
    exam_id = ex_rows[0]["id"] if ex_rows else None

    new_tpl = sb.table("exam_templates").insert({
        "exam_id": exam_id, "exam_code": p["exam_code"], "institute_id": institute_id,
        "teacher_id": teacher_id, "title": f"{p['exam_code']} (auto-map seed)",
        "subject": spec.get("subject_code"), "status": "draft",
    }).execute()
    return (getattr(new_tpl, "data", None) or [{}])[0].get("id")


def _sweep_persist(sb: Any, template_id: str, rows: Dict[str, Any], first_pass: Dict[str, Any]) -> None:
    """Replace the template's derived child rows with `rows` and record the page count.

    Seed templates are system-owned with no human edits to preserve, so ALL child
    rows for the template are cleared (not just ai/unconfirmed) and id-deduped
    payloads are re-inserted: idempotent across re-runs and robust to the
    deterministic uuid5 ids the mapper can repeat within a batch.
    """
    # Children are cleared before exam_questions; questions are inserted first.
    for table in ("exam_response_areas", "exam_boundaries", "exam_template_layout", "exam_questions"):
        sb.table(table).delete().eq("template_id", template_id).execute()
    for table, key in (("exam_questions", "questions"), ("exam_response_areas", "response_areas"),
                       ("exam_boundaries", "boundaries"), ("exam_template_layout", "layout")):
        payload = _sweep_unique_by_id(rows.get(key))
        if payload:
            sb.table(table).insert(payload).execute()

    # Only issue the update when there is something to write.
    page_count = (first_pass.get("meta") or {}).get("n_pages")
    if page_count is not None:
        sb.table("exam_templates").update({"page_count": page_count}).eq("id", template_id).execute()


def first_sweep(client: SupabaseServiceRoleClient, storage: StorageAdmin,
                m: Dict[str, Any], board_filter: Optional[str], spec_filter: Optional[str],
                cache_dir: str, rep: LoadReport) -> None:
    """Run the docling/auto_map first pass over seeded QP papers and persist the resulting
    template structure (questions/response areas/boundaries/layout) via the same mapping the
    /auto-map endpoint uses. System-owned exam_templates are created per QP paper.

    Requires a resolvable `system_identity` (teacher_id/user_email + institute_id) on this env.

    Papers are processed independently: a missing source or an unobtainable template
    id skips the paper with a warning; an auto-map failure or a database failure
    while ensuring/persisting the template bumps rep.sweep_failed and the sweep moves
    on to the next paper (database failures are also appended to rep.errors).
    `cache_dir` is accepted for signature compatibility and is not used here.
    """
    identity = _resolve_system_identity(client, m)
    if not identity:
        return
    teacher_id, institute_id = identity

    # Import the auto-map mapping helpers lazily (pulls fastapi/router only when sweeping).
    try:
        from api.services.docling import auto_map, AutoMapError
        from routers.exam.templates import _map_first_pass_to_rows
    except Exception as exc:
        logger.error(f"[first-sweep] could not import auto-map pipeline: {exc}")
        rep.errors.append(f"first-sweep import: {exc}")
        return

    sb = client.supabase
    for board in m.get("boards", []):
        if board_filter and board.get("exam_board_code") != board_filter:
            continue
        for spec in board.get("specifications", []):
            if spec_filter and spec.get("spec_code") != spec_filter:
                continue
            for p in spec.get("papers", []):
                if p.get("doc_type") != "QP":
                    continue

                loc = paper_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                        spec.get("award_code", ""), p["paper_code"], p["session"], p["doc_type"])
                bkt, path = _split_loc(loc)
                try:
                    pdf_bytes = storage.download_file(bkt, path)
                except Exception as exc:
                    logger.warning(f"[first-sweep] source missing {loc}: {exc}; skipping {p['exam_code']}")
                    continue

                # Ensure a system-owned template for this paper (idempotent on exam_code+teacher).
                try:
                    template_id = _sweep_ensure_template(sb, spec, p, teacher_id, institute_id)
                except Exception as exc:
                    logger.exception(f"[first-sweep] template lookup/insert failed for {p['exam_code']}: {exc}")
                    rep.sweep_failed += 1
                    rep.errors.append(f"first-sweep template {p['exam_code']}: {exc}")
                    continue
                if not template_id:
                    logger.warning(f"[first-sweep] could not ensure template for {p['exam_code']}; skipping")
                    continue

                try:
                    first_pass = auto_map(pdf_bytes, source_pdf=loc)
                    rows = _map_first_pass_to_rows(template_id, first_pass, pdf_bytes)
                except (AutoMapError, ValueError) as exc:
                    logger.warning(f"[first-sweep] auto-map failed for {p['exam_code']}: {exc}")
                    rep.sweep_failed += 1
                    continue
                except Exception as exc:
                    logger.exception(f"[first-sweep] unexpected error for {p['exam_code']}: {exc}")
                    rep.sweep_failed += 1
                    continue

                try:
                    _sweep_persist(sb, template_id, rows, first_pass)
                except Exception as exc:
                    # A re-run clears and rewrites this template's rows, so a partial
                    # write here is repaired by sweeping the paper again.
                    logger.exception(f"[first-sweep] persisting rows failed for {p['exam_code']}: {exc}")
                    rep.sweep_failed += 1
                    rep.errors.append(f"first-sweep persist {p['exam_code']}: {exc}")
                    continue

                logger.info(f"[first-sweep] swept {p['exam_code']} -> template {template_id} "
                            f"(q={len(rows.get('questions', []))} ra={len(rows.get('response_areas', []))})")
                rep.swept += 1
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── unseed (inverse of the loader) ───────────────────────────────
|
||||||
|
def _chunks(seq: List[Any], n: int = 100):
    """Lazily yield consecutive slices of seq holding at most n items each."""
    starts = range(0, len(seq), n)
    yield from (seq[lo:lo + n] for lo in starts)
|
||||||
|
|
||||||
|
def _storage_remove(storage: StorageAdmin, bucket: str, paths: List[str]) -> None:
    """Remove object paths from a bucket through the Supabase Storage API.

    The python client treats missing objects as a successful no-op, which is useful
    for unseed idempotency. An error reported on the result (either as an `error`
    attribute or as an 'error' key of a dict result) is raised as StorageError, so
    callers can avoid deleting the matching DB rows while storage may still exist.
    """
    outcome = storage.client.supabase.storage.from_(bucket).remove(paths)

    attr_error = getattr(outcome, "error", None)
    if attr_error:
        raise StorageError(str(attr_error))

    if isinstance(outcome, dict):
        if outcome.get("error"):
            raise StorageError(str(outcome["error"]))
|
||||||
|
|
||||||
|
def _delete_user_subset_files(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
                              exam_codes: Optional[List[str]], rep: LoadReport) -> None:
    """Delete --user-subset files from cc.users storage, then their files rows.

    User-subset seeding writes rows with source='exam-corpus-seed', bucket='cc.users',
    and paths under exam-marker/. Storage must be removed before the files rows: the
    files GC trigger also tries to delete storage when rows are deleted, so removing
    objects first avoids trigger failures and keeps this operation idempotent.

    exam_codes=None means remove all user-subset seed rows (used by unscoped unseed
    even if the eb_* rows were already removed by a prior partial run). A list limits
    the match to rows named '<exam_code>.pdf'; an empty list matches nothing.

    If any storage removal fails, the failure is recorded in rep.errors and the
    function returns WITHOUT deleting files rows.
    """
    sb = client.supabase

    def _fetch_seed_rows(names: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        # Seed rows in cc.users under exam-marker/, optionally limited to given names.
        query = sb.table("files").select("id, bucket, path, name, source")
        query = query.eq("bucket", "cc.users").eq("source", "exam-corpus-seed")
        query = query.like("path", "exam-marker/%")
        if names is not None:
            query = query.in_("name", names)
        return getattr(query.execute(), "data", None) or []

    matched: List[Dict[str, Any]] = []
    if exam_codes is None:
        matched.extend(_fetch_seed_rows())
    elif exam_codes:
        wanted_names = [f"{code}.pdf" for code in exam_codes if code]
        for batch in _chunks(wanted_names, 100):
            matched.extend(_fetch_seed_rows(batch))

    # Index rows by id and collect each distinct object path once, grouped by bucket.
    ids_to_rows: Dict[str, Dict[str, Any]] = {}
    object_paths: Dict[str, List[str]] = {}
    already_listed: set = set()
    for record in matched:
        record_id = record.get("id")
        bkt = record.get("bucket")
        obj_path = record.get("path")
        if record_id:
            ids_to_rows[str(record_id)] = record
        if bkt != "cc.users" or not isinstance(obj_path, str) or not obj_path.startswith("exam-marker/"):
            continue
        marker = (bkt, obj_path)
        if marker in already_listed:
            continue
        already_listed.add(marker)
        object_paths.setdefault(bkt, []).append(obj_path)

    row_ids = list(ids_to_rows)
    if not (row_ids or object_paths):
        logger.info("[unseed] no user-subset cc.users files to remove")
        return

    # Storage first; bail out on the first failure so rows keep pointing at live objects.
    for bkt, obj_list in object_paths.items():
        for batch in _chunks(obj_list, 100):
            try:
                _storage_remove(storage, bkt, batch)
                rep.unseed_objects += len(batch)
            except Exception as exc:
                logger.warning(f"[unseed] user-subset storage remove failed ({bkt}, {len(batch)} objs): {exc}")
                rep.errors.append(f"user-subset storage remove {bkt}: {exc}")
                return

    for batch in _chunks(row_ids, 100):
        try:
            sb.table("files").delete().in_("id", batch).execute()
            rep.unseed_user_files += len(batch)
        except Exception as exc:
            logger.warning(f"[unseed] user-subset files delete failed: {exc}")
            rep.errors.append(f"user-subset files delete: {exc}")
|
||||||
|
|
||||||
|
def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
           board_filter: Optional[str], spec_filter: Optional[str],
           drop_specs: bool = True, drop_seed_templates: bool = True, rep: LoadReport) -> None:
    """Inverse of the loader: remove the seeded public corpus, scoped by --board/--spec (or all).

    Deletes (in FK-safe order): cc.examboards storage objects (via the Storage API, since the
    protect_delete trigger blocks direct SQL deletes), first-sweep exam_templates created by the
    seed (title '... (auto-map seed)', cascades children), eb_exams rows, then eb_specifications.

    Failure handling:
      * Every failed step is logged AND appended to rep.errors.
      * Storage removal goes through _storage_remove, so an error reported on the
        result counts as a failure too.
      * Catalogue rows whose storage object could not be removed are KEPT (as are the
        specifications those exams belong to), so the objects stay discoverable and a
        later re-run can finish the job.
    """
    sb = client.supabase
    scoped = bool(board_filter or spec_filter)

    q = sb.table("eb_specifications").select("spec_code, storage_loc, exam_board_code")
    if board_filter:
        q = q.eq("exam_board_code", board_filter)
    if spec_filter:
        q = q.eq("spec_code", spec_filter)
    specs = getattr(q.execute(), "data", None) or []
    spec_codes = [s["spec_code"] for s in specs]
    if not spec_codes:
        if not scoped:
            _delete_user_subset_files(client, storage, exam_codes=None, rep=rep)
        logger.info("[unseed] no matching specifications; nothing to do")
        return

    exams: List[Dict[str, Any]] = []
    for chunk in _chunks(spec_codes):
        # spec_code is selected so kept exams can be mapped back to their specification.
        res = sb.table("eb_exams").select("id, exam_code, storage_loc, spec_code").in_("spec_code", chunk).execute()
        exams.extend(getattr(res, "data", None) or [])

    # 1) User-subset storage/rows. Storage is removed before files rows so trg_files_gc has
    #    nothing left to collect when rows are deleted.
    user_subset_exam_codes = None if not scoped else [
        e.get("exam_code") for e in exams if e.get("exam_code")
    ]
    _delete_user_subset_files(client, storage, exam_codes=user_subset_exam_codes, rep=rep)

    # 2) Storage objects (Storage API; batch-remove per bucket). Specs may carry a spec PDF too.
    by_bucket: Dict[str, List[str]] = {}
    for row in exams + specs:
        loc = row.get("storage_loc")
        if not loc or "/" not in loc:
            continue
        bkt, _, path = loc.partition("/")
        by_bucket.setdefault(bkt, []).append(path)

    failed_locs: set = set()  # '<bucket>/<path>' of objects that may still exist
    for bkt, paths in by_bucket.items():
        for chunk in _chunks(paths, 100):
            try:
                _storage_remove(storage, bkt, chunk)
                rep.unseed_objects += len(chunk)
            except Exception as exc:
                logger.warning(f"[unseed] storage remove failed ({bkt}, {len(chunk)} objs): {exc}")
                rep.errors.append(f"storage remove {bkt}: {exc}")
                failed_locs.update(f"{bkt}/{p}" for p in chunk)

    # 3) First-sweep templates created by the seed (cascades questions/regions/boundaries/layout).
    if drop_seed_templates and exams:
        exam_codes = [e["exam_code"] for e in exams if e.get("exam_code")]
        for chunk in _chunks(exam_codes, 100):
            try:
                res = sb.table("exam_templates").delete(count="exact") \
                    .in_("exam_code", chunk).like("title", "%(auto-map seed)%").execute()
                rep.unseed_templates += getattr(res, "count", None) or len(getattr(res, "data", []) or [])
            except Exception as exc:
                logger.warning(f"[unseed] template delete failed: {exc}")
                rep.errors.append(f"template delete: {exc}")

    # 4) Catalogue rows: eb_exams (by id), then eb_specifications (by spec_code).
    #    Rows whose storage object could not be removed are left in place.
    kept_spec_codes: set = set()
    exam_ids: List[Any] = []
    for e in exams:
        if e.get("storage_loc") in failed_locs:
            kept_spec_codes.add(e.get("spec_code"))
            continue
        exam_ids.append(e["id"])
    for chunk in _chunks(exam_ids, 100):
        try:
            sb.table("eb_exams").delete().in_("id", chunk).execute()
            rep.unseed_exams += len(chunk)
        except Exception as exc:
            logger.warning(f"[unseed] eb_exams delete failed: {exc}")
            rep.errors.append(f"eb_exams delete: {exc}")

    if drop_specs:
        removable_specs = [
            s["spec_code"] for s in specs
            if s["spec_code"] not in kept_spec_codes and s.get("storage_loc") not in failed_locs
        ]
        for chunk in _chunks(removable_specs, 100):
            try:
                sb.table("eb_specifications").delete().in_("spec_code", chunk).execute()
                rep.unseed_specs += len(chunk)
            except Exception as exc:
                logger.warning(f"[unseed] eb_specifications delete failed: {exc}")
                rep.errors.append(f"eb_specifications delete: {exc}")

    if failed_locs:
        logger.warning(f"[unseed] kept catalogue rows for {len(failed_locs)} storage object(s) that could not "
                       f"be removed; re-run unseed once storage is reachable")

    logger.info(f"unseed done: storage_objects={rep.unseed_objects} user_files={rep.unseed_user_files} "
                f"templates={rep.unseed_templates} exams={rep.unseed_exams} specs={rep.unseed_specs}")
|
||||||
|
|
||||||
|
|
||||||
|
# ─────────────────────────────── orchestration ───────────────────────────────
|
||||||
|
def _dry_run_check_locations(m: Any, board_filter: Optional[str], spec_filter: Optional[str],
                             rep: LoadReport) -> None:
    """Dry-run walk: compute every storage location, recording failures instead of raising.

    A dry run continues past validation errors, so entries may lack the keys the
    location helpers need; each such entry is reported in rep.errors.
    """
    try:
        for board in m.get("boards", []):
            if board_filter and board.get("exam_board_code") != board_filter:
                continue
            for spec in board.get("specifications", []):
                if spec_filter and spec.get("spec_code") != spec_filter:
                    continue
                sf = spec.get("spec_file")
                if sf and sf.get("source"):
                    try:
                        spec_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                         spec.get("award_code", ""), spec.get("spec_ver", ""))
                    except Exception as exc:
                        rep.errors.append(f"spec-loc {spec.get('spec_code')}: {exc!r}")
                for p in spec.get("papers", []):
                    try:
                        paper_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                          spec.get("award_code", ""), p["paper_code"], p["session"],
                                          p["doc_type"])
                    except Exception as exc:
                        rep.errors.append(f"paper-loc {p.get('exam_code')}: {exc!r}")
    except Exception as exc:
        # Structurally broken manifest (e.g. not a mapping): report rather than crash.
        rep.errors.append(f"dry-run walk aborted: {exc!r}")


def load(manifest_path: str, *, dry_run: bool, force: bool, board_filter: Optional[str],
         spec_filter: Optional[str], user_subset: bool, do_first_sweep: bool,
         cache_dir: str = DEFAULT_CACHE_DIR, store_dir: Optional[str] = None) -> LoadReport:
    """Seed the corpus described by a YAML manifest and return the run's LoadReport.

    Steps: read + validate the manifest; for each board/spec passing the filters,
    upload the optional specification PDF and upsert its eb_specifications row, then
    upload each paper and upsert its eb_exams row; finally run the optional
    user-subset copy and first sweep.

    Validation errors are logged (first 40) and stored on the report. A real run
    stops there; a dry run carries on to check that storage locations can be
    computed, and never creates clients or writes anything.

    File failures (download or upload) are recorded on the report and do not stop
    the catalogue upsert for that item, which then stores no checksum.
    """
    # The manifest is read as UTF-8 regardless of the process locale.
    with open(manifest_path, encoding="utf-8") as f:
        m = yaml.safe_load(f)
    rep = LoadReport()

    errs = validate_manifest(m)
    if errs:
        rep.errors = list(errs)
        logger.error(f"manifest validation failed: {len(errs)} error(s)")
        for e in errs[:40]:
            logger.error(f"  - {e}")
        if not dry_run:
            return rep

    if dry_run:
        _dry_run_check_locations(m, board_filter, spec_filter, rep)
    else:
        client = SupabaseServiceRoleClient()
        storage = StorageAdmin()

        for board in m.get("boards", []):
            if board_filter and board.get("exam_board_code") != board_filter:
                continue
            for spec in board.get("specifications", []):
                if spec_filter and spec.get("spec_code") != spec_filter:
                    continue

                # Specification document (optional).
                sloc = None
                spec_sha = None
                sf = spec.get("spec_file")
                if sf and sf.get("source"):
                    sloc = spec_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                            spec.get("award_code", ""), spec.get("spec_ver", ""))
                    try:
                        spec_sha = upload_file(storage, sloc,
                                               _item_bytes(sf["source"], sloc, store_dir=store_dir,
                                                           cache_dir=cache_dir, rep=rep),
                                               force=force, rep=rep)
                    except Exception as exc:
                        logger.error(f"[spec-file] {spec.get('spec_code')}: {exc}")
                        rep.files_failed += 1
                        rep.errors.append(f"spec-file {spec.get('spec_code')}: {exc}")
                upsert_specification(client, spec, sloc, spec_sha, rep)

                # Papers.
                for p in spec.get("papers", []):
                    ploc = paper_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                             spec.get("award_code", ""), p["paper_code"], p["session"],
                                             p["doc_type"])
                    psha = None
                    try:
                        psha = upload_file(storage, ploc,
                                           _item_bytes(p["file"]["source"], ploc, store_dir=store_dir,
                                                       cache_dir=cache_dir, rep=rep),
                                           force=force, rep=rep)
                    except Exception as exc:
                        logger.error(f"[paper-file] {p.get('exam_code')}: {exc}")
                        rep.files_failed += 1
                        rep.errors.append(f"paper-file {p.get('exam_code')}: {exc}")
                    upsert_paper(client, spec["spec_code"], p, ploc, psha, rep)

        if user_subset:
            copy_user_test_subset(client, storage, m, rep)
        if do_first_sweep:
            first_sweep(client, storage, m, board_filter, spec_filter, cache_dir, rep)

    logger.info(f"corpus load done: specs={rep.specs_upserted} papers={rep.papers_upserted} "
                f"uploaded={rep.files_uploaded} skipped={rep.files_skipped} failed={rep.files_failed} "
                f"user_copies={rep.user_copies} swept={rep.swept} errors={len(rep.errors)}")
    return rep
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: seed, download, or unseed the exam-paper corpus.

    Modes, checked in this order:

    * ``--unseed``: call ``unseed`` scoped by ``--board`` / ``--spec``.
    * ``--download-only``: read the manifest and call ``download_corpus``.
    * default: call ``load`` with the parsed options.

    Every mode prints the resulting report (``rep.as_dict()``) as indented
    JSON on stdout. Invalid flag combinations are rejected through
    ``ArgumentParser.error``, which prints usage and exits with status 2.
    """
    import json

    ap = argparse.ArgumentParser(description="Seed (or unseed) the public exam-paper corpus from a manifest.")
    ap.add_argument("--manifest", help="corpus manifest (required except for --unseed)")
    ap.add_argument("--dry-run", action="store_true", help="validate + report, no writes")
    ap.add_argument("--force", action="store_true", help="re-upload/overwrite existing storage objects")
    ap.add_argument("--board", default=None, help="only this exam_board_code")
    ap.add_argument("--spec", default=None, help="only this spec_code")
    ap.add_argument("--user-subset", action="store_true", help="also seed a user-side test subset")
    ap.add_argument("--first-sweep", action="store_true", help="run docling/auto-map first pass on seeded papers")
    ap.add_argument("--cache-dir", default=DEFAULT_CACHE_DIR, help="throwaway url-hash cache dir")
    ap.add_argument("--store-dir", default=DEFAULT_STORE_DIR,
                    help="persistent, bucket-shaped local store (download-once, seed-many)")
    ap.add_argument("--no-store", action="store_true",
                    help="ignore the local store; always fetch from source (don't read/populate the store)")
    ap.add_argument("--download-only", action="store_true",
                    help="populate the local store from the manifest; no DB/bucket writes")
    ap.add_argument("--unseed", action="store_true",
                    help="INVERSE: remove seeded eb_*/storage/first-sweep templates (scoped by --board/--spec)")
    a = ap.parse_args()
    store_dir = None if a.no_store else a.store_dir

    if a.unseed:
        # unseed() is not given a dry-run flag, so "--unseed --dry-run" would
        # delete for real; refuse instead of silently ignoring --dry-run.
        if a.dry_run:
            ap.error("--dry-run is not supported with --unseed (unseed always deletes)")
        rep = LoadReport()
        unseed(SupabaseServiceRoleClient(), StorageAdmin(),
               board_filter=a.board, spec_filter=a.spec, rep=rep)
        print(json.dumps(rep.as_dict(), indent=2))
        return

    if not a.manifest:
        ap.error("--manifest is required unless --unseed is given")

    if a.download_only:
        # --download-only exists to populate the store, so --no-store contradicts it.
        if a.no_store:
            ap.error("--download-only populates the local store and cannot be combined with --no-store")
        # Explicit encoding: do not let the locale decide how the manifest is decoded.
        with open(a.manifest, encoding="utf-8") as f:
            m = yaml.safe_load(f)
        rep = LoadReport()
        download_corpus(m, store_dir=a.store_dir, board_filter=a.board, spec_filter=a.spec,
                        cache_dir=a.cache_dir, rep=rep)
        print(json.dumps(rep.as_dict(), indent=2))
        return

    rep = load(a.manifest, dry_run=a.dry_run, force=a.force, board_filter=a.board, spec_filter=a.spec,
               user_subset=a.user_subset, do_first_sweep=a.first_sweep, cache_dir=a.cache_dir,
               store_dir=store_dir)
    print(json.dumps(rep.as_dict(), indent=2))


if __name__ == "__main__":
    main()
|
||||||
53
tests/test_docling_auto_map.py
Normal file
53
tests/test_docling_auto_map.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from api.services.docling import FIRST_PASS_SCHEMA, auto_map
|
||||||
|
|
||||||
|
|
||||||
|
# Root of the docling spike checkout; DOCLING_SPIKE_ROOT overrides the default path.
SPIKE_ROOT = Path(os.getenv("DOCLING_SPIKE_ROOT", "/home/kcar/dev/docling-exam-spike"))
# Sample paper and the template stored for it under the spike root.
PHYSICS_PDF = SPIKE_ROOT.joinpath("samples", "AQA-Physics-Paper-1H-2022-with-qr.pdf")
PHYSICS_TEMPLATE = SPIKE_ROOT.joinpath("results", "template", "physics.json")
# Sample used by the fast-path (no cache) test.
BORN_DIGITAL_PDF = SPIKE_ROOT.joinpath("samples", "physics-p1h-2022-qp.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not (PHYSICS_PDF.exists() and PHYSICS_TEMPLATE.exists()), reason="spike corpus not present")
def test_auto_map_matches_spike_physics_template_shape():
    """auto_map on the physics sample yields the same shape as the stored template."""
    reference = json.loads(PHYSICS_TEMPLATE.read_text())
    produced = auto_map(PHYSICS_PDF.read_bytes(), spike_root=SPIKE_ROOT)

    assert produced["meta"]["schema"] == FIRST_PASS_SCHEMA
    assert produced["meta"]["schema"] == reference["meta"]["schema"]
    assert set(produced) == set(reference)
    assert produced["meta"]["board"] == reference["meta"]["board"]
    assert produced["meta"]["paper_code"] == reference["meta"]["paper_code"]
    assert len(produced["margins"]) == len(reference["margins"])
    assert set(produced["pages"]) == set(reference["pages"])

    got_page = produced["pages"]["2"]
    want_page = reference["pages"]["2"]
    assert got_page["role"] == want_page["role"]

    # The produced band may carry extra keys, but never fewer than the reference.
    first_band = got_page["part_bands"][0]
    assert set(want_page["part_bands"][0]) <= set(first_band)
    assert first_band["box"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not BORN_DIGITAL_PDF.exists(), reason="born-digital spike PDF not present")
def test_auto_map_fast_path_without_cache_produces_first_pass_template():
    """With prefer_cache=False, auto_map still returns a populated first-pass template."""
    pdf_bytes = BORN_DIGITAL_PDF.read_bytes()
    result = auto_map(
        pdf_bytes,
        source_pdf="samples/physics-p1h-2022-qp.pdf",
        spike_root=SPIKE_ROOT,
        prefer_cache=False,
    )

    meta = result["meta"]
    assert meta["schema"] == FIRST_PASS_SCHEMA
    assert meta["board"] == "aqa"
    assert meta["paper_code"] == "8463/1"
    assert meta["source_pdf"] == "samples/physics-p1h-2022-qp.pdf"
    assert result["margins"]
    assert result["pages"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_rejects_empty_pdf_bytes():
    """auto_map raises ValueError when handed zero bytes."""
    no_bytes = b""
    with pytest.raises(ValueError):
        auto_map(no_bytes)
|
||||||
81
tests/test_docling_extract.py
Normal file
81
tests/test_docling_extract.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
from api.services.docling.extract import aqa_questions_rapid
|
||||||
|
|
||||||
|
|
||||||
|
def _text(raw, page, l, t, r=120, b=None):
|
||||||
|
return {
|
||||||
|
"text": raw,
|
||||||
|
"prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A noisy label text "02].3" is reported under the key "02.3"."""
    page_file = tmp_path / "p1.json"
    page_file.write_text(
        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )

    pattern = str(tmp_path / "p*.json")
    parts = aqa_questions_rapid(pattern)

    assert "02.3" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """With only "07.2" labelled, a "07.1" entry is produced on page 2 at l == 49."""
    page_payloads = {
        "p1.json": '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}',
        "p2.json": '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}',
    }
    for filename, payload in page_payloads.items():
        (tmp_path / filename).write_text(payload)

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    inferred = parts["07.1"]
    assert inferred["page"] == 2
    assert inferred["bbox"]["l"] == 49
    assert "07.2" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """Labels 09.0 and 10.0 appear even though only 07, 08, 11, 12, 13 are in the input."""
    import json

    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
    texts.extend(
        _text(n + " Which statement is correct?", 1, 49, 780 - idx * 40)
        for idx, n in enumerate(["07", "08", "11", "12", "13"])
    )
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    expected_labels = ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]
    for label in expected_labels:
        assert label in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A label written with a circled digit ("01.③") is reported as "01.3"."""
    import json

    item = _text("01.③", 1, 49, 515, r=102, b=500)
    page_file = tmp_path / "p1.json"
    page_file.write_text(json.dumps({"texts": [item]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "01.3" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """Given 05.2, 05.3 and 05.5, the result also contains 05.1 and 05.4."""
    import json

    page_one = [
        _text("05.2 Some question text", 1, 49, 700),
        _text("05.3 Middle question text", 1, 49, 620),
    ]
    page_two = [_text("05.5 Later question text", 2, 49, 740)]
    for filename, items in (("p1.json", page_one), ("p2.json", page_two)):
        (tmp_path / filename).write_text(json.dumps({"texts": items}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "05.1" in parts
    assert "05.4" in parts
    assert "05.5" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A bare "03" label with no part suffix is reported as "03.0"."""
    import json

    payload = {"texts": [_text("03", 1, 49, 775)]}
    (tmp_path / "p1.json").write_text(json.dumps(payload))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "03.0" in parts
|
||||||
83
tests/test_docling_regions.py
Normal file
83
tests/test_docling_regions.py
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from PIL import Image, ImageDraw
|
||||||
|
|
||||||
|
from api.services.docling import extract
|
||||||
|
from api.services.docling.regions import detect_response_regions_from_image
|
||||||
|
|
||||||
|
|
||||||
|
def test_detects_grouped_answer_lines() -> None:
    """Three horizontal rules are reported as one unconfirmed answer_lines region."""
    page = Image.new("RGB", (900, 1200), "white")
    pen = ImageDraw.Draw(page)
    for y in (420, 470, 520):
        pen.line((160, y, 760, y), fill="black", width=3)

    found = detect_response_regions_from_image(page, page_index=2)
    line_regions = [
        candidate.to_mapper_dict()
        for candidate in found
        if candidate.region_type == "answer_lines"
    ]
    assert line_regions

    best = line_regions[0]
    assert best["kind"] == "response"
    assert best["source"] == "ai"
    assert best["confirmed"] is False
    assert best["page_index"] == 2
    assert best["line_count"] == 3

    bbox = best["bbox"]
    assert bbox["coord_origin"] == "TOPLEFT"
    assert bbox["w"] > 550
    assert bbox["h"] > 80
|
||||||
|
|
||||||
|
|
||||||
|
def test_detects_answer_box() -> None:
    """A drawn rectangle outline is reported as an answer_box of matching size."""
    page = Image.new("RGB", (900, 1200), "white")
    ImageDraw.Draw(page).rectangle((140, 300, 780, 520), outline="black", width=3)

    found = detect_response_regions_from_image(page, page_index=0)
    boxes = [
        candidate.to_mapper_dict()
        for candidate in found
        if candidate.region_type == "answer_box"
    ]
    assert boxes

    first_bbox = boxes[0]["bbox"]
    assert first_bbox["w"] > 600
    assert first_bbox["h"] > 200
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_response_region_taxonomy_for_lines_and_boxes():
    """One page with rules and a rectangle yields both answer_lines and answer_box types."""
    page = Image.new("RGB", (800, 1000), "white")
    pen = ImageDraw.Draw(page)
    for y in (220, 260, 300):
        pen.line((120, y, 680, y), fill="black", width=2)
    pen.rectangle((140, 520, 660, 640), outline="black", width=3)

    detected = detect_response_regions_from_image(page, min_confidence=0.1)
    seen_types = {region.region_type for region in detected}

    assert "answer_lines" in seen_types
    assert "answer_box" in seen_types
|
||||||
|
|
||||||
|
|
||||||
|
def test_attach_detected_response_regions_normalizes_taxonomy_and_uses_pdf_y(monkeypatch, tmp_path):
    """One stubbed "answer-box" candidate is attached to part 01.2 as type "answer_box"."""
    pdf = tmp_path / "paper.pdf"
    pdf.write_bytes(b"%PDF test placeholder")

    def _part(top, bottom):
        # Fresh dict (and fresh regions list) per part.
        return {"q": "01", "page": 1, "bbox": {"l": 50, "r": 80, "t": top, "b": bottom}, "regions": []}

    parts = {"01.1": _part(700, 680), "01.2": _part(500, 480)}

    def fake_detect(path, min_confidence=0.32):
        candidate = {
            "page_index": 0,
            "region_type": "answer-box",
            "confidence": 0.77,
            "bbox": {"x": 100, "y": 335, "w": 500, "h": 40},
            "detection_method": "test",
            "meta": {"page_height_px": 1000, "page_height_pdf": 800},
        }
        return [candidate]

    monkeypatch.setattr(extract.region_mod, "detect_response_regions_from_pdf", fake_detect)

    attached, candidates = extract.attach_detected_response_regions(parts, str(pdf))

    assert attached == 1
    assert len(candidates) == 1
    assert parts["01.1"]["regions"] == []

    region = parts["01.2"]["regions"][0]
    assert region["type"] == "answer_box"
    assert region["source"] == "opencv"
|
||||||
55
tests/test_docling_template.py
Normal file
55
tests/test_docling_template.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
from api.services.docling.template import build, synthesize_part_box
|
||||||
|
|
||||||
|
|
||||||
|
def test_synthesize_part_box_uses_content_margins_and_band_y():
    """The box takes l/r from the content x-band and t/b from the part band, to one decimal."""
    band = {"label": "01.1", "y_start": 712.34, "y_end": 601.27}
    x_band = {"x_left": 54.04, "x_right": 521.96}

    expected_box = {
        "l": 54.0,
        "t": 712.3,
        "r": 522.0,
        "b": 601.3,
        "coord_origin": "BOTTOMLEFT",
    }
    assert synthesize_part_box(band, x_band) == expected_box
|
||||||
|
|
||||||
|
|
||||||
|
def test_synthesize_part_box_returns_none_without_margin_contract():
    """No box is produced when the x-band is empty or the band lacks y_end."""
    without_x_band = synthesize_part_box({"y_start": 700, "y_end": 650}, {})
    assert without_x_band is None

    without_y_end = synthesize_part_box({"y_start": 700}, {"x_left": 50, "x_right": 520})
    assert without_y_end is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_carries_label_box_as_anchor_and_single_synthesized_part_box():
    """build() keeps the part's own bbox as label_box and synthesizes box from margins and band."""
    label_bbox = {"l": 40, "t": 720, "r": 70, "b": 705}
    part_entry = {"label": "01.1", "page": 2, "bbox": label_bbox}
    structured = {
        "board": "aqa",
        "paper_code": "8463/1",
        "questions": [{"question": "01", "parts": [part_entry]}],
    }

    page_bands = {
        "main": [{"question": "01", "y_start": 730, "y_end": 0, "is_start": True}],
        "part": [{"label": "01.1", "question": "01", "y_start": 720, "y_end": 610}],
    }
    bands = {"pages": {"2": page_bands}}

    furniture = {
        "n_pages": 2,
        "content_margins": {
            "content_x_band": {"x_left": 55, "x_right": 515},
            "per_page": {"2": {"top": 760, "bottom": 40, "left": 55, "right": 515}},
        },
        "items": [],
    }

    template = build(structured, bands, furniture)
    part = template["pages"]["2"]["part_bands"][0]

    assert part["label_box"] == {"l": 40, "t": 720, "r": 70, "b": 705}
    assert part["box"] == {"l": 55, "t": 720, "r": 515, "b": 610, "coord_origin": "BOTTOMLEFT"}
|
||||||
@ -143,6 +143,9 @@ class _FakeStorageAdmin:
|
|||||||
def download_file(self, bucket_id, file_path):
|
def download_file(self, bucket_id, file_path):
|
||||||
return b"%PDF-1.7 fake"
|
return b"%PDF-1.7 fake"
|
||||||
|
|
||||||
|
def create_signed_url(self, bucket_id, file_path, expires_in=3600):
    """Return a fake signed-URL payload that echoes bucket, path and expiry."""
    url = f"https://storage.test/{bucket_id}/{file_path}?token=fake&expires_in={expires_in}"
    return {"signedURL": url}
|
||||||
|
|
||||||
|
|
||||||
class _FakeServiceRoleClient:
|
class _FakeServiceRoleClient:
|
||||||
def __init__(self, store):
|
def __init__(self, store):
|
||||||
@ -171,6 +174,65 @@ def test_requires_auth_when_not_overridden():
|
|||||||
assert resp.status_code in (401, 403) # unauthenticated, not processed
|
assert resp.status_code in (401, 403) # unauthenticated, not processed
|
||||||
|
|
||||||
|
|
||||||
|
def test_catalogue_requires_auth_when_not_overridden():
    """Without the auth override, GET /catalogue answers 401 or 403."""
    bare_app = FastAPI()
    bare_app.include_router(router, prefix="/api/exam")

    anonymous = TestClient(bare_app)
    resp = anonymous.get("/api/exam/catalogue")

    assert resp.status_code in (401, 403)
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_catalogue_papers_uses_as_user_metadata():
    """Of a QP row and an MS row in eb_exams, only the QP row is listed."""
    question_paper = {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.examboards/aqa/p.pdf"}
    mark_scheme = {"id": "e2", "exam_code": "AQA-MS", "type_code": "MS", "storage_loc": "cc.examboards/aqa/ms.pdf"}
    store = {"eb_exams": [question_paper, mark_scheme]}

    client, _ = make_client(store=store)
    resp = client.get("/api/exam/catalogue")

    assert resp.status_code == 200
    listed_ids = [paper["id"] for paper in resp.json()["papers"]]
    assert listed_ids == ["e1"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_catalogue_signed_url_requires_auth_and_signs_examboard_pdf(monkeypatch):
    """The signed-url endpoint splits storage_loc into bucket/path and echoes expires_in."""
    exam_row = {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.examboards/aqa/physics/qp.pdf"}
    client, _ = make_client(store={"eb_exams": [exam_row]})
    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)

    resp = client.get("/api/exam/catalogue/e1/signed-url?expires_in=120")

    assert resp.status_code == 200
    body = resp.json()
    assert body["bucket"] == "cc.examboards"
    assert body["path"] == "aqa/physics/qp.pdf"
    assert body["expires_in"] == 120
    assert "token=fake" in body["signed_url"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_catalogue_signed_url_rejects_non_examboard_storage(monkeypatch):
    """A QP row stored under cc.public gets a 404 from the signed-url endpoint."""
    exam_row = {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.public/aqa/physics/qp.pdf"}
    client, _ = make_client(store={"eb_exams": [exam_row]})
    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)

    resp = client.get("/api/exam/catalogue/e1/signed-url")

    assert resp.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_catalogue_signed_url_rejects_non_catalogue_doc_type(monkeypatch):
    """An MS row gets a 404 from the signed-url endpoint even in cc.examboards."""
    exam_row = {"id": "e1", "exam_code": "AQA-MS", "type_code": "MS", "storage_loc": "cc.examboards/aqa/physics/ms.pdf"}
    client, _ = make_client(store={"eb_exams": [exam_row]})
    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)

    resp = client.get("/api/exam/catalogue/e1/signed-url")

    assert resp.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
def test_create_template_sets_owner_and_institute():
|
def test_create_template_sets_owner_and_institute():
|
||||||
client, store = make_client()
|
client, store = make_client()
|
||||||
resp = client.post("/api/exam/templates", json={"title": "AQA Physics 1H", "subject": "Physics"})
|
resp = client.post("/api/exam/templates", json={"title": "AQA Physics 1H", "subject": "Physics"})
|
||||||
@ -255,12 +317,14 @@ def test_get_template_bundles_children():
|
|||||||
"exam_questions": [{"id": "q1", "template_id": "t1", "label": "01", "order": 0}],
|
"exam_questions": [{"id": "q1", "template_id": "t1", "label": "01", "order": 0}],
|
||||||
"exam_response_areas": [{"id": "r1", "template_id": "t1", "question_id": "q1", "page": 1}],
|
"exam_response_areas": [{"id": "r1", "template_id": "t1", "question_id": "q1", "page": 1}],
|
||||||
"exam_boundaries": [{"id": "b1", "template_id": "t1", "page_index": 0, "y": 10}],
|
"exam_boundaries": [{"id": "b1", "template_id": "t1", "page_index": 0, "y": 10}],
|
||||||
|
"exam_template_layout": [{"id": "l1", "template_id": "t1", "page_index": 0, "role": "question_page"}],
|
||||||
}
|
}
|
||||||
client, _ = make_client(store=store)
|
client, _ = make_client(store=store)
|
||||||
body = client.get("/api/exam/templates/t1").json()
|
body = client.get("/api/exam/templates/t1").json()
|
||||||
assert len(body["questions"]) == 1
|
assert len(body["questions"]) == 1
|
||||||
assert len(body["response_areas"]) == 1
|
assert len(body["response_areas"]) == 1
|
||||||
assert len(body["boundaries"]) == 1
|
assert len(body["boundaries"]) == 1
|
||||||
|
assert body["layout"] == [{"id": "l1", "template_id": "t1", "page_index": 0, "role": "question_page"}]
|
||||||
|
|
||||||
|
|
||||||
def test_get_other_institute_template_is_404():
|
def test_get_other_institute_template_is_404():
|
||||||
@ -315,6 +379,42 @@ def test_put_persists_region_kinds_and_part_geometry():
|
|||||||
assert ras["c1"]["context_type"] == "data_table"
|
assert ras["c1"]["context_type"] == "data_table"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_put_round_trips_s5_layout_and_provenance_fields():
    """PUT stores source/confidence/derivation fields and returns the layout rows."""
    template_row = {"id": "t1", "title": "p", "status": "draft", "institute_id": INST_A, "teacher_id": TEACHER}
    client, store = make_client(store={"exam_templates": [template_row]})

    question = {
        "id": "q1", "label": "01", "order": 0, "max_marks": 4, "source": "ai",
        "confirmed": False, "confidence": 0.82, "derivation": "docling:heading",
    }
    response_area = {
        "id": "m1", "question_id": "q1", "page": 1, "bounds": {"x": 1}, "kind": "mark_area",
        "mark_subtype": "grader_box", "source": "ai", "confirmed": False, "confidence": 0.71,
        "derivation": "detected-explicit-grader-box",
    }
    boundary = {
        "id": "b1", "question_id": "q1", "page_index": 0, "y": 99, "source": "ai",
        "confirmed": False, "confidence": 0.66, "derivation": "bbox-gap",
    }
    layout_row = {
        "id": "l1", "page_index": 0, "role": "question_page", "margin_left": 12.5,
        "margins_enabled": False, "source": "ai", "confirmed": False, "confidence": 0.93,
        "derivation": "docling-page-layout", "meta": {"columns": 2},
    }
    payload = {
        "questions": [question],
        "response_areas": [response_area],
        "boundaries": [boundary],
        "layout": [layout_row],
    }

    resp = client.put("/api/exam/templates/t1", json=payload)
    assert resp.status_code == 200

    # Persisted rows.
    stored_question = store["exam_questions"][0]
    assert stored_question["source"] == "ai"
    assert stored_question["confidence"] == 0.82
    stored_area = store["exam_response_areas"][0]
    assert stored_area["mark_subtype"] == "grader_box"
    assert stored_area["derivation"] == "detected-explicit-grader-box"
    assert store["exam_boundaries"][0]["confidence"] == 0.66
    assert store["exam_template_layout"][0]["meta"] == {"columns": 2}

    # Response body.
    returned_layout = resp.json()["layout"][0]
    assert returned_layout["id"] == "l1"
    assert returned_layout["margins_enabled"] is False
|
||||||
|
|
||||||
def test_put_replace_clears_previous_children():
|
def test_put_replace_clears_previous_children():
|
||||||
store = {
|
store = {
|
||||||
"exam_templates": [{"id": "t1", "title": "p", "status": "draft", "institute_id": INST_A, "teacher_id": TEACHER}],
|
"exam_templates": [{"id": "t1", "title": "p", "status": "draft", "institute_id": INST_A, "teacher_id": TEACHER}],
|
||||||
@ -443,3 +543,179 @@ def test_neo4j_sync_non_owner_403():
|
|||||||
def test_neo4j_sync_404():
|
def test_neo4j_sync_404():
|
||||||
client, _ = make_client(store={"exam_templates": []})
|
client, _ = make_client(store={"exam_templates": []})
|
||||||
assert client.post("/api/exam/templates/does-not-exist/neo4j-sync").status_code == 404
|
assert client.post("/api/exam/templates/does-not-exist/neo4j-sync").status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
# ─── S5 auto-map endpoint ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _first_pass_template():
|
||||||
|
return {
|
||||||
|
"meta": {"schema": "exam-template/first-pass/v1", "paper_code": "8463/1", "n_pages": 1},
|
||||||
|
"margins": [
|
||||||
|
{"edge": "left", "axis": "x", "value": 50, "scope": "document", "source": "auto", "confirmed": False},
|
||||||
|
{"edge": "right", "axis": "x", "value": 550, "scope": "document", "source": "auto", "confirmed": False},
|
||||||
|
{"edge": "top", "axis": "y", "value": 780, "scope": "page", "page": 1, "source": "auto", "confirmed": False},
|
||||||
|
{"edge": "bottom", "axis": "y", "value": 60, "scope": "page", "page": 1, "source": "auto", "confirmed": False},
|
||||||
|
],
|
||||||
|
"pages": {
|
||||||
|
"1": {
|
||||||
|
"role": "question", "role_source": "auto", "margins_enabled": True,
|
||||||
|
"main_bands": [{"question": "01", "y_start": 780, "y_end": 60, "source": "auto", "confirmed": False}],
|
||||||
|
"part_bands": [{"label": "01.1", "question": "01", "y_start": 700, "y_end": 500, "label_box": {"l": 50, "t": 700, "r": 90, "b": 680, "coord_origin": "BOTTOMLEFT"}, "source": "auto", "confirmed": False}],
|
||||||
|
"furniture": [], "figures": [], "tables": [],
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_auto_map(monkeypatch, store, *, fast=True):
    """Stub the storage, PDF and auto-map helpers on templates_mod and reset job status.

    ``fast`` is the value the stubbed ``_pdf_has_text_layer`` returns.
    """
    def _single_page_geometry(_pdf):
        # Built per call so callers never share one geometry dict.
        return [{"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 0.0}]

    replacements = (
        ("StorageAdmin", _FakeStorageAdmin),
        ("SupabaseServiceRoleClient", lambda: _FakeServiceRoleClient(store)),
        ("_pdf_has_text_layer", lambda _pdf: fast),
        ("auto_map", lambda *_a, **_k: _first_pass_template()),
        ("detect_response_regions_from_pdf", lambda *_a, **_k: []),
        ("_pdf_page_geometry", _single_page_geometry),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(templates_mod, attr_name, stub)

    templates_mod._AUTO_MAP_JOB_STATUS.clear()
|
||||||
|
|
||||||
|
|
||||||
|
def _template_with_source(owner=TEACHER):
    """Return a store with draft template t1 owned by ``owner`` and its source file f1."""
    template_row = {
        "id": "t1", "title": "p", "status": "draft", "institute_id": INST_A,
        "teacher_id": owner, "source_file_id": "f1",
    }
    file_row = {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/paper.pdf", "name": "paper.pdf"}
    return {"exam_templates": [template_row], "files": [file_row]}
|
||||||
|
|
||||||
|
|
||||||
|
def test_box_to_canvas_uses_cropbox_as_page_origin():
    """A box at the crop-box top-left maps to canvas x=0 and y equal to page_top."""
    page_geometry = {
        "media_x0": 0.0, "crop_x0": 100.0, "crop_y0": 200.0,
        "page_pt_w": 400.0, "page_pt_h": 600.0,
        "rendered_w": 400.0, "rendered_h": 600.0,
        "page_top": 25.0,
    }
    box = {"l": 100.0, "t": 800.0, "r": 180.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}

    canvas_rect = templates_mod._box_to_canvas(box, 1, [page_geometry])

    assert canvas_rect == {"x": 0.0, "y": 25.0, "w": 80.0, "h": 40.0}
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_deduplicates_continued_part_labels(monkeypatch):
    """A part label repeated on a second page yields a single question row."""
    def _two_page_geometry(_pdf):
        return [
            {"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": top}
            for top in (0.0, 800.0)
        ]

    monkeypatch.setattr(templates_mod, "_pdf_page_geometry", _two_page_geometry)

    first_pass = _first_pass_template()
    first_pass["meta"]["n_pages"] = 2
    continued_band = {
        "label": "01.1", "question": "01", "y_start": 760, "y_end": 600,
        "label_box": {"l": 50, "t": 760, "r": 90, "b": 740, "coord_origin": "BOTTOMLEFT"},
        "source": "auto", "confirmed": False,
    }
    first_pass["pages"]["2"] = {
        "role": "question", "role_source": "auto", "margins_enabled": True,
        "main_bands": [],
        "part_bands": [continued_band],
        "furniture": [], "figures": [], "tables": [],
    }

    rows = templates_mod._map_first_pass_to_rows("t1", first_pass, b"%PDF", [])

    question_ids = [q["id"] for q in rows["questions"]]
    assert len(question_ids) == len(set(question_ids))
    labels = [q["label"] for q in rows["questions"]]
    assert labels.count("01.1") == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_response_region_types_are_mapped_to_response_form_enum(monkeypatch):
    """answer_lines, answer_box and working_space become lines, answer-box and working."""
    def _single_page_geometry(_pdf):
        return [{"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 0.0}]

    monkeypatch.setattr(templates_mod, "_pdf_page_geometry", _single_page_geometry)
    first_pass = _first_pass_template()

    # (top, bottom, region_type) for three stacked regions on page 0.
    specs = (
        (700, 680, "answer_lines"),
        (650, 620, "answer_box"),
        (600, 560, "working_space"),
    )
    regions = [
        {"page_index": 0, "bbox": {"l": 50, "t": top, "r": 100, "b": bottom, "coord_origin": "BOTTOMLEFT"}, "region_type": region_type, "confidence": 0.9}
        for top, bottom, region_type in specs
    ]

    rows = templates_mod._map_first_pass_to_rows("t1", first_pass, b"%PDF", regions)

    forms = [
        area.get("response_form")
        for area in rows["response_areas"]
        if area.get("derivation") == "opencv-response-region"
    ]
    assert forms == ["lines", "answer-box", "working"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_fast_path_merges_ai_rows_and_returns_detail(monkeypatch):
    """On the fast path the POST returns 200 with detail and stores unconfirmed AI rows."""
    client, store = make_client(store=_template_with_source())
    _patch_auto_map(monkeypatch, store, fast=True)

    resp = client.post("/api/exam/templates/t1/auto-map")

    assert resp.status_code == 200
    body = resp.json()
    assert body["exam_code"] == "8463/1"
    assert body["layout"] and body["layout"][0]["source"] == "ai"

    def _is_unconfirmed_ai_part(question):
        return question["label"] == "01.1" and question["source"] == "ai" and question["confirmed"] is False

    assert any(_is_unconfirmed_ai_part(q) for q in store["exam_questions"])
    assert store["exam_boundaries"] and store["exam_boundaries"][0]["derivation"] == "docling-main-band"
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_deduplicates_repeated_response_area_ids(monkeypatch):
    """Two identical detected regions do not produce duplicate response-area ids."""
    client, store = make_client(store=_template_with_source())
    _patch_auto_map(monkeypatch, store, fast=True)

    region = {
        "page_index": 0,
        "bbox": {"l": 50, "t": 700, "r": 100, "b": 680, "coord_origin": "BOTTOMLEFT"},
        "region_type": "answer_lines",
        "confidence": 0.9,
    }
    # Installed after _patch_auto_map so it replaces that helper's empty-list stub.
    monkeypatch.setattr(
        templates_mod,
        "detect_response_regions_from_pdf",
        lambda *_a, **_k: [region, dict(region)],
    )

    resp = client.post("/api/exam/templates/t1/auto-map")

    assert resp.status_code == 200
    area_ids = [area["id"] for area in store["exam_response_areas"]]
    assert len(area_ids) == len(set(area_ids))
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_preserves_manual_and_confirmed_rows_on_rerun(monkeypatch):
    """A rerun keeps manual and confirmed-AI questions and drops unconfirmed AI ones."""
    existing_questions = [
        {"id": "manual", "template_id": "t1", "label": "manual", "order": 0, "source": "manual", "confirmed": True},
        {"id": "accepted-ai", "template_id": "t1", "label": "accepted", "order": 1, "source": "ai", "confirmed": True},
        {"id": "old-ai", "template_id": "t1", "label": "old", "order": 2, "source": "ai", "confirmed": False},
    ]
    store = _template_with_source()
    store.update({
        "exam_questions": existing_questions,
        "exam_response_areas": [], "exam_boundaries": [], "exam_template_layout": [],
    })
    client, store = make_client(store=store)
    _patch_auto_map(monkeypatch, store, fast=True)

    resp = client.post("/api/exam/templates/t1/auto-map")
    assert resp.status_code == 200

    remaining_ids = {q["id"] for q in store["exam_questions"]}
    assert {"manual", "accepted-ai"}.issubset(remaining_ids)
    assert "old-ai" not in remaining_ids
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_non_owner_is_403_before_download(monkeypatch):
    """A non-owner gets 403, and StorageAdmin is never constructed on the way."""
    client, store = make_client(
        user_id=TEACHER,
        institute_ids=(INST_A,),
        store=_template_with_source(owner=OTHER_TEACHER),
    )

    def _no_download(*_a, **_k):
        raise AssertionError("download should not run before owner gate")

    # Any attempt to build a storage client fails the test.
    monkeypatch.setattr(templates_mod, "StorageAdmin", _no_download)

    resp = client.post("/api/exam/templates/t1/auto-map")

    assert resp.status_code == 403
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_owner_lost_institute_membership_is_404_before_download(monkeypatch):
    """The owner, called with only ``INST_B`` membership, gets 404 from auto-map.

    ``StorageAdmin`` is replaced with a tripwire that raises, so a 404 response
    also shows the storage client was never constructed.
    """

    def _tripwire(*_a, **_k):
        raise AssertionError("download should not run before visibility gate")

    template_store = _template_with_source(owner=TEACHER)
    client, _ = make_client(user_id=TEACHER, institute_ids=(INST_B,), store=template_store)
    monkeypatch.setattr(templates_mod, "StorageAdmin", _tripwire)

    response = client.post("/api/exam/templates/t1/auto-map")

    assert response.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_blocks_when_marks_recorded(monkeypatch):
    """Auto-map answers 409 when a marking batch with a mark entry exists for the template."""
    batches = [{"id": "b1", "template_id": "t1"}]
    entries = [{"id": "m1", "batch_id": "b1"}]

    store = _template_with_source()
    store.update({"marking_batches": batches, "mark_entries": entries})
    client, store = make_client(store=store)
    _patch_auto_map(monkeypatch, store, fast=True)

    response = client.post("/api/exam/templates/t1/auto-map")

    assert response.status_code == 409
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_map_ocr_returns_job_id_and_status_completes(monkeypatch):
    """With ``fast=False`` patching, auto-map returns 202 plus a job id whose status is completed.

    The follow-up status request must report ``completed``, at least two
    questions in ``counts`` and a non-empty template layout.
    """
    client, store = make_client(store=_template_with_source())
    _patch_auto_map(monkeypatch, store, fast=False)

    kickoff = client.post("/api/exam/templates/t1/auto-map")
    assert kickoff.status_code == 202

    job_id = kickoff.json()["job_id"]
    polled = client.get(f"/api/exam/templates/t1/auto-map/{job_id}/status")
    assert polled.status_code == 200

    payload = polled.json()
    assert payload["status"] == "completed"
    assert payload["counts"]["questions"] >= 2
    assert payload["template"]["layout"]
|
||||||
|
|||||||
103
tests/test_files_idor.py
Normal file
103
tests/test_files_idor.py
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import routers.database.files.files as files_router
|
||||||
|
import routers.database.files.files_simplified as files_simplified_router
|
||||||
|
|
||||||
|
|
||||||
|
ROUTERS = [files_router, files_simplified_router]
|
||||||
|
|
||||||
|
USER_A = "00000000-0000-0000-0000-000000000001"
|
||||||
|
USER_B = "00000000-0000-0000-0000-000000000002"
|
||||||
|
CAB_A = "10000000-0000-0000-0000-000000000001"
|
||||||
|
CAB_B = "10000000-0000-0000-0000-000000000002"
|
||||||
|
|
||||||
|
|
||||||
|
class FakeQuery:
    """Minimal in-memory stand-in for a chained table query.

    Holds its own copy of the rows. ``eq`` narrows them; ``select`` and
    ``limit`` are accepted but have no effect; ``execute`` returns an object
    exposing the surviving rows as ``.data``.
    """

    def __init__(self, rows):
        # Copy so that filtering never mutates the list owned by the caller.
        self.rows = [*rows]

    def select(self, *_args, **_kwargs):
        """Accept any column selection and keep the chain going."""
        return self

    def eq(self, key, value):
        """Keep only rows whose ``key`` equals ``value`` (missing keys read as None)."""
        survivors = []
        for row in self.rows:
            if row.get(key) == value:
                survivors.append(row)
        self.rows = survivors
        return self

    def limit(self, _n):
        """Accept a row limit without applying it."""
        return self

    def execute(self):
        """Return a namespace whose ``data`` attribute is the current row list."""
        outcome = SimpleNamespace()
        outcome.data = self.rows
        return outcome
|
||||||
|
|
||||||
|
|
||||||
|
class FakeSupabase:
    """Fake client whose ``table`` method serves rows from a dict keyed by table name."""

    def __init__(self, store):
        self.store = store

    def table(self, name):
        """Return a ``FakeQuery`` over the named table's rows (empty if the table is unknown)."""
        table_rows = self.store.get(name, [])
        return FakeQuery(table_rows)
|
||||||
|
|
||||||
|
|
||||||
|
class FakeServiceRoleClient:
    """Wrapper exposing a ``FakeSupabase`` built from ``store`` as ``.supabase``."""

    def __init__(self, store):
        fake_backend = FakeSupabase(store)
        self.supabase = fake_backend
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("router_module", ROUTERS)
def test_list_files_hides_unowned_unshared_cabinet(monkeypatch, router_module):
    """Listing another user's cabinet (no memberships at all) returns an empty list."""
    cabinets = [
        {"id": CAB_A, "user_id": USER_A},
        {"id": CAB_B, "user_id": USER_B},
    ]
    files = [
        {"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A},
        {"id": "file-b", "cabinet_id": CAB_B, "uploaded_by": USER_B},
    ]
    store = {"file_cabinets": cabinets, "cabinet_memberships": [], "files": files}

    def _fake_client():
        return FakeServiceRoleClient(store)

    monkeypatch.setattr(router_module, "SupabaseServiceRoleClient", _fake_client)

    listing = router_module.list_files(CAB_B, {"sub": USER_A})

    assert listing == []
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("router_module", ROUTERS)
def test_list_files_allows_own_cabinet(monkeypatch, router_module):
    """The cabinet owner receives the file rows stored for their own cabinet."""
    store = {
        "file_cabinets": [{"id": CAB_A, "user_id": USER_A}],
        "cabinet_memberships": [],
        "files": [{"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A}],
    }

    def _fake_client():
        return FakeServiceRoleClient(store)

    monkeypatch.setattr(router_module, "SupabaseServiceRoleClient", _fake_client)

    listing = router_module.list_files(CAB_A, {"sub": USER_A})

    # Compare against a fresh literal rather than the stored row object.
    expected = [{"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A}]
    assert listing == expected
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("router_module", ROUTERS)
def test_list_files_denies_non_owner_even_with_cabinet_membership(monkeypatch, router_module):
    """A viewer membership row alone does not expose another user's cabinet files."""
    viewer_membership = {"cabinet_id": CAB_B, "profile_id": USER_A, "role": "viewer"}
    store = {
        "file_cabinets": [{"id": CAB_B, "user_id": USER_B}],
        "cabinet_memberships": [viewer_membership],
        "files": [{"id": "file-b", "cabinet_id": CAB_B, "uploaded_by": USER_B}],
    }

    def _fake_client():
        return FakeServiceRoleClient(store)

    monkeypatch.setattr(router_module, "SupabaseServiceRoleClient", _fake_client)

    listing = router_module.list_files(CAB_B, {"sub": USER_A})

    assert listing == []
|
||||||
51
tests/test_reset_environment_user_subset.py
Normal file
51
tests/test_reset_environment_user_subset.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
from run.initialization import reset_environment
|
||||||
|
|
||||||
|
|
||||||
|
def test_reset_user_subset_scope_only_runs_user_subset_cleanup(monkeypatch):
    """``reset(scope="user-subset")`` calls the guard once and only the user-subset cleanup.

    The table, Neo4j and exam-storage cleanup helpers are replaced with a stub
    that raises, so any call to them fails the test.
    """
    calls = []

    def _headers_stub():
        return ("http://192.168.0.94:8000", {"Authorization": "Bearer redacted"})

    def _guard_stub(url, scope):
        calls.append(("guard", url, scope))

    def _cleanup_stub():
        return {"files_rows_deleted": 2, "storage_objects_removed": 2, "errors": []}

    def fail_if_called(*_args, **_kwargs):
        raise AssertionError("reset(scope='user-subset') must not clear unrelated tables or databases")

    monkeypatch.setattr(reset_environment, "_sb_headers", _headers_stub)
    monkeypatch.setattr(reset_environment, "_assert_reset_allowed", _guard_stub)
    monkeypatch.setattr(reset_environment, "_clear_user_subset_files", _cleanup_stub)
    for forbidden in ("_clear_tables", "_neo4j_drop_all_non_system", "_clear_exam_storage"):
        monkeypatch.setattr(reset_environment, forbidden, fail_if_called)

    result = reset_environment.reset(scope="user-subset")

    assert calls == [("guard", "http://192.168.0.94:8000", "user-subset")]
    expected = {
        "scope": "user-subset",
        "user_subset": {"files_rows_deleted": 2, "storage_objects_removed": 2, "errors": []},
    }
    assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_reset_accepts_case_insensitive_user_subset_scope(monkeypatch):
    """An upper-case ``USER-SUBSET`` scope is reported back as ``user-subset``."""

    def _headers_stub():
        return ("http://192.168.0.94:8000", {})

    def _guard_stub(*_args, **_kwargs):
        return None

    def _cleanup_stub():
        return {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": []}

    monkeypatch.setattr(reset_environment, "_sb_headers", _headers_stub)
    monkeypatch.setattr(reset_environment, "_assert_reset_allowed", _guard_stub)
    monkeypatch.setattr(reset_environment, "_clear_user_subset_files", _cleanup_stub)

    result = reset_environment.reset(scope="USER-SUBSET")

    expected = {
        "scope": "user-subset",
        "user_subset": {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": []},
    }
    assert result == expected
|
||||||
171
tests/test_seed_exam_corpus_unseed.py
Normal file
171
tests/test_seed_exam_corpus_unseed.py
Normal file
@ -0,0 +1,171 @@
|
|||||||
|
from run.initialization.seed_exam_corpus import LoadReport, _delete_user_subset_files
|
||||||
|
|
||||||
|
|
||||||
|
class _Result:
    """Query result holder exposing the matched rows as ``.data``."""

    def __init__(self, data=None):
        # Any falsy payload (None, empty list, ...) becomes a fresh empty list.
        self.data = data if data else []
|
||||||
|
|
||||||
|
|
||||||
|
class _FilesQuery:
    """Chainable fake query over ``db.rows`` supporting eq / like / in_ filters.

    ``execute`` returns the matching rows; when the operation is ``"delete"``
    it also logs the deleted ids on ``db.ops`` and removes the rows from ``db``.
    """

    def __init__(self, db, op="select"):
        self.db = db
        self.op = op
        self.filters = []  # ("eq" | "like", key, value) triples
        self.in_filters = []  # (key, set_of_allowed_values) pairs

    def select(self, *_args, **_kwargs):
        """Accept any column selection and keep chaining."""
        return self

    def delete(self, *_args, **_kwargs):
        """Switch the pending operation to a delete."""
        self.op = "delete"
        return self

    def eq(self, key, value):
        """Record an equality filter."""
        self.filters.append(("eq", key, value))
        return self

    def like(self, key, pattern):
        """Record a LIKE filter; only trailing-``%`` prefix patterns are supported."""
        self.filters.append(("like", key, pattern))
        return self

    def in_(self, key, values):
        """Record a membership filter over ``values``."""
        self.in_filters.append((key, set(values)))
        return self

    def _matches(self, row):
        """Return True when ``row`` satisfies every recorded filter."""
        for kind, key, value in self.filters:
            actual = row.get(key)
            if kind == "eq":
                if actual != value:
                    return False
            elif kind == "like":
                assert value.endswith("%")
                prefix = value[:-1]
                if not (isinstance(actual, str) and actual.startswith(prefix)):
                    return False
        return all(row.get(key) in allowed for key, allowed in self.in_filters)

    def execute(self):
        """Run the query against ``db`` and return a ``_Result`` of the matching rows."""
        hits, misses = [], []
        for row in self.db.rows:
            (hits if self._matches(row) else misses).append(row)
        if self.op == "delete":
            self.db.ops.append(("delete", [row["id"] for row in hits]))
            self.db.rows = misses
        return _Result(hits)
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeDb:
    """In-memory ``files`` table plus a log of the delete operations run against it."""

    def __init__(self, rows):
        self.ops = []
        # Own copy of the row list; queries rebind it on delete.
        self.rows = [*rows]

    def table(self, name):
        """Return a fresh query bound to this db; only the ``files`` table exists."""
        assert name == "files"
        query = _FilesQuery(self)
        return query
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeStorageBucket:
    """Fake bucket handle that logs ``remove`` calls on its parent storage fake."""

    def __init__(self, storage, bucket):
        self.storage = storage
        self.bucket = bucket

    def remove(self, paths):
        """Log the removal, then raise, return an error dict, or return ``[]``.

        The outcome is driven by the parent's ``fail`` and ``result_error``
        settings; the call is logged before either failure mode triggers.
        """
        parent = self.storage
        parent.ops.append(("remove", self.bucket, [*paths]))
        if parent.fail:
            raise RuntimeError("storage unavailable")
        failure = parent.result_error
        return {"error": failure} if failure else []
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeStorageRoot:
    """Fake storage root whose ``from_`` hands out bucket handles tied to one storage fake."""

    def __init__(self, storage):
        self.storage = storage

    def from_(self, bucket):
        """Return a bucket handle for ``bucket`` sharing this root's storage fake."""
        handle = _FakeStorageBucket(self.storage, bucket)
        return handle
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeStorage:
    """Storage fake exposing ``client.supabase.storage`` and recording every operation.

    ``fail`` makes bucket removals raise; ``result_error`` makes them return an
    error payload instead.
    """

    def __init__(self, fail=False, result_error=None):
        self.fail = fail
        self.result_error = result_error
        self.ops = []
        # Build the nested attribute chain from throwaway classes:
        # client -> supabase -> storage -> _FakeStorageRoot(self).
        supabase_cls = type("SB", (), {"storage": _FakeStorageRoot(self)})
        client_cls = type("Client", (), {"supabase": supabase_cls()})
        self.client = client_cls()
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeClient:
    """Thin wrapper that exposes the given fake database as ``.supabase``."""

    def __init__(self, db):
        # The database fake is stored as-is, not copied.
        self.supabase = db
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_user_subset_storage_before_files_rows_for_scoped_exams():
    """Scoped to exam code ``A``, only the seeded exam-marker row for A is removed.

    Rows for another exam, a manually sourced row and a row outside the
    ``exam-marker/`` path must all survive.
    """
    rows = [
        {
            "id": "f1",
            "bucket": "cc.users",
            "path": "exam-marker/i/c/f1/A.pdf",
            "name": "A.pdf",
            "source": "exam-corpus-seed",
        },
        {
            "id": "f2",
            "bucket": "cc.users",
            "path": "exam-marker/i/c/f2/B.pdf",
            "name": "B.pdf",
            "source": "exam-corpus-seed",
        },
        {
            "id": "f3",
            "bucket": "cc.users",
            "path": "exam-marker/i/c/f3/A.pdf",
            "name": "A.pdf",
            "source": "manual",
        },
        {
            "id": "f4",
            "bucket": "cc.users",
            "path": "other/f4/A.pdf",
            "name": "A.pdf",
            "source": "exam-corpus-seed",
        },
    ]
    db = _FakeDb(rows)
    storage = _FakeStorage()
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)

    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
    assert db.ops == [("delete", ["f1"])]
    surviving_ids = [entry["id"] for entry in db.rows]
    assert surviving_ids == ["f2", "f3", "f4"]
    assert rep.unseed_objects == 1
    assert rep.unseed_user_files == 1
    assert rep.errors == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_user_subset_keeps_files_rows_when_storage_remove_fails():
    """When the storage removal raises, no row is deleted and an error is reported."""
    seeded_row = {
        "id": "f1",
        "bucket": "cc.users",
        "path": "exam-marker/i/c/f1/A.pdf",
        "name": "A.pdf",
        "source": "exam-corpus-seed",
    }
    db = _FakeDb([seeded_row])
    storage = _FakeStorage(fail=True)
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)

    # The removal was attempted, but nothing was deleted from the table.
    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
    assert db.ops == []
    surviving_ids = [entry["id"] for entry in db.rows]
    assert surviving_ids == ["f1"]
    assert rep.unseed_objects == 0
    assert rep.unseed_user_files == 0
    assert rep.errors
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_user_subset_keeps_files_rows_when_storage_remove_returns_error():
    """When the storage removal returns an error payload, no row is deleted and an error is reported."""
    seeded_row = {
        "id": "f1",
        "bucket": "cc.users",
        "path": "exam-marker/i/c/f1/A.pdf",
        "name": "A.pdf",
        "source": "exam-corpus-seed",
    }
    db = _FakeDb([seeded_row])
    storage = _FakeStorage(result_error="permission denied")
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)

    # The removal was attempted, but nothing was deleted from the table.
    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
    assert db.ops == []
    surviving_ids = [entry["id"] for entry in db.rows]
    assert surviving_ids == ["f1"]
    assert rep.unseed_objects == 0
    assert rep.unseed_user_files == 0
    assert rep.errors
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_user_subset_unscoped_cleans_all_seeded_exam_marker_rows():
    """With ``exam_codes=None`` both seeded exam-marker rows and their objects are removed."""
    rows = [
        {
            "id": "f1",
            "bucket": "cc.users",
            "path": "exam-marker/i/c/f1/A.pdf",
            "name": "A.pdf",
            "source": "exam-corpus-seed",
        },
        {
            "id": "f2",
            "bucket": "cc.users",
            "path": "exam-marker/i/c/f2/B.pdf",
            "name": "B.pdf",
            "source": "exam-corpus-seed",
        },
    ]
    db = _FakeDb(rows)
    storage = _FakeStorage()
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=None, rep=rep)

    expected_removal = ("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf", "exam-marker/i/c/f2/B.pdf"])
    assert storage.ops == [expected_removal]
    assert db.ops == [("delete", ["f1", "f2"])]
    assert db.rows == []
    assert rep.unseed_objects == 2
    assert rep.unseed_user_files == 2
|
||||||
54
tests/test_upload_validation.py
Normal file
54
tests/test_upload_validation.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import asyncio
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi import HTTPException
|
||||||
|
|
||||||
|
from modules.upload_validation import MAX_UPLOAD_BYTES, read_pdf_upload_bytes, read_upload_bytes
|
||||||
|
|
||||||
|
|
||||||
|
class FakeUpload:
    """In-memory upload double with an async, cursor-based ``read``."""

    def __init__(self, data: bytes, content_type: str, filename: str = "file.bin"):
        self.filename = filename
        self.content_type = content_type
        self._data = data
        self._pos = 0  # read cursor into ``_data``

    async def read(self, size: int = -1) -> bytes:
        """Return up to ``size`` bytes from the cursor; ``None`` or negative reads the rest.

        Returns ``b""`` once the cursor has reached the end of the data.
        """
        remaining = len(self._data) - self._pos
        if remaining <= 0:
            return b""
        if size is None or size < 0:
            size = remaining
        start = self._pos
        chunk = self._data[start : start + size]
        self._pos = start + len(chunk)
        return chunk
|
||||||
|
|
||||||
|
|
||||||
|
def run(coro):
    """Drive ``coro`` to completion with ``asyncio.run`` and return its result."""
    outcome = asyncio.run(coro)
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
def test_valid_pdf_upload_passes_and_returns_mime():
    """A PDF-typed upload starting with the ``%PDF-`` marker is returned with its MIME type."""
    upload = FakeUpload(b"%PDF-1.7\n", "application/pdf")

    payload, detected_mime = run(read_upload_bytes(upload))

    assert payload.startswith(b"%PDF-")
    assert detected_mime == "application/pdf"
|
||||||
|
|
||||||
|
|
||||||
|
def test_disallowed_mime_rejected_with_415():
    """An ``application/x-python`` upload is rejected with HTTP 415."""
    upload = FakeUpload(b"print(1)", "application/x-python")

    with pytest.raises(HTTPException) as excinfo:
        run(read_upload_bytes(upload))

    error = excinfo.value
    assert error.status_code == 415
    assert "Unsupported upload type" in error.detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_oversize_upload_rejected_with_413():
    """An upload one byte over ``MAX_UPLOAD_BYTES`` is rejected with HTTP 413."""
    oversized = FakeUpload(b"x" * (MAX_UPLOAD_BYTES + 1), "text/plain")

    with pytest.raises(HTTPException) as excinfo:
        run(read_upload_bytes(oversized))

    error = excinfo.value
    assert error.status_code == 413
    assert "exceeds max size" in error.detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_pdf_helper_rejects_spoofed_pdf_mime():
    """Content declared as ``application/pdf`` but not starting like a PDF is rejected with 415."""
    spoofed = FakeUpload(b"not a pdf", "application/pdf")

    with pytest.raises(HTTPException) as excinfo:
        run(read_pdf_upload_bytes(spoofed))

    error = excinfo.value
    assert error.status_code == 415
    assert "not a valid PDF" in error.detail
|
||||||
Loading…
x
Reference in New Issue
Block a user