Compare commits
16 Commits
feature/ex
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6c73174829 | ||
|
|
5434a5bf21 | ||
|
|
44ccba2151 | ||
|
|
e83873e822 | ||
| 150b915282 | |||
| 76e11b0b06 | |||
| 52d1ece212 | |||
|
|
69d9c46abe | ||
|
|
34fc7edd68 | ||
| c69451fba2 | |||
| e98fed661f | |||
|
|
a6753d092f | ||
| 7f7e843563 | |||
| 7819e6e346 | |||
| 5da108df13 | |||
|
|
25d02aedeb |
5
api/services/docling/.gitignore
vendored
Normal file
5
api/services/docling/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
# B1 image-only eval corpus + pipeline outputs: fetched/generated at runtime, never committed.
|
||||
# Exam-board PDFs are third-party copyright (served only via signed URLs); results/ are reproducible.
|
||||
/samples/b1/
|
||||
/results/b1_rapid/
|
||||
/results/final/
|
||||
@ -40,6 +40,10 @@ try:
|
||||
from . import tables as tbl_mod
|
||||
except ImportError: # pragma: no cover - CLI execution
|
||||
import tables as tbl_mod
|
||||
try:
|
||||
from . import regions as region_mod
|
||||
except ImportError: # pragma: no cover - CLI execution
|
||||
import regions as region_mod
|
||||
|
||||
# ----------------------------------------------------------------- line model
|
||||
Line = namedtuple("Line", "text page bbox") # bbox is None for text-only sources
|
||||
@ -245,6 +249,11 @@ def extract_front_matter(lines, board, code):
|
||||
# ====================================================================== AQA
|
||||
# --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
|
||||
PART_RE = re.compile(r"^(\d{2})\.(\d)$") # 01.2
|
||||
# OCR sometimes inserts bracket/noise glyphs inside boxed labels (e.g. "02].3").
|
||||
# Normalise only tight margin-column candidates before matching; body decimals
|
||||
# remain protected by the label-column gate below.
|
||||
AQA_LABEL_NOISE = re.compile(r"[^0-9.]+")
|
||||
AQA_CIRCLED_DIGITS = str.maketrans({"①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5", "⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9"})
|
||||
NUM_RE = re.compile(r"^(\d{2})$") # 08
|
||||
DIG_RE = re.compile(r"^(\d)$") # 4
|
||||
# A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
|
||||
@ -275,21 +284,47 @@ def _rapid_pages(rapid_glob):
|
||||
yield pg, json.load(open(fn))
|
||||
|
||||
|
||||
def _clean_aqa_label(raw):
    """Normalise a raw OCR token into a candidate AQA label string.

    The token is stripped, circled digits are mapped to plain digits via
    ``AQA_CIRCLED_DIGITS`` and spaces are removed. Tokens that still contain an
    ASCII letter are returned as-is; otherwise every run matched by
    ``AQA_LABEL_NOISE`` is deleted.
    """
    text = raw if raw else ""
    squeezed = text.strip().translate(AQA_CIRCLED_DIGITS).replace(" ", "")
    # Letter-bearing tokens (prose/option artifacts such as "36.7Q") must not be
    # scrubbed into something that looks like a label; glued label+prose cases are
    # handled from the raw text elsewhere (PART_PREFIX, under the label-column gate).
    has_letters = re.search(r"[A-Za-z]", squeezed) is not None
    return squeezed if has_letters else AQA_LABEL_NOISE.sub("", squeezed)
|
||||
|
||||
|
||||
def _synthetic_label_bbox(page_lines, fallback):
    """Build a best-effort bbox for an AQA label that OCR did not report.

    ``page_lines`` is an iterable of ``(bbox, text)`` pairs for one page. Among the
    boxes whose left edge lies in 90..170 and whose ``t`` is at least ``FOOTER_T``,
    the one with the greatest ``t`` supplies the vertical extent; the horizontal
    extent is fixed at l=48 / r=104. With no such box, a copy of ``fallback`` is
    returned when it is truthy, otherwise a fixed default box.
    """
    candidates = []
    for box, _text in page_lines:
        in_body_column = 90 <= box.get("l", 999) <= 170
        if in_body_column and box.get("t", 0) >= FOOTER_T:
            candidates.append(box)

    if not candidates:
        if fallback:
            return dict(fallback)
        return {"l": 48.0, "r": 104.0, "t": 780.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}

    highest = max(candidates, key=lambda box: box.get("t", 0))
    return {
        "l": 48.0,
        "r": 104.0,
        "t": round(highest["t"], 1),
        "b": round(highest["b"], 1),
        "coord_origin": highest.get("coord_origin", "BOTTOMLEFT"),
    }
|
||||
|
||||
|
||||
def aqa_questions_rapid(rapid_glob):
|
||||
"""Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
|
||||
* GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
|
||||
* A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
|
||||
* A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
|
||||
parts = {}
|
||||
page_lines = defaultdict(list) # page -> [(bbox, raw)] for deterministic inference
|
||||
mcq_cands = [] # (page, NN, bbox) bare top-level candidates, in order
|
||||
top_cands = {} # NN -> (page, bbox) explicit top-level question headers
|
||||
for pg, d in _rapid_pages(rapid_glob):
|
||||
margin = []
|
||||
for t in d.get("texts", []):
|
||||
raw = (t.get("text") or "").strip()
|
||||
s = raw.replace(" ", "")
|
||||
s = _clean_aqa_label(raw)
|
||||
prov = t.get("prov") or []
|
||||
bb = prov[0].get("bbox") if prov else None
|
||||
if bb is None or bb["l"] > 140:
|
||||
if bb is None:
|
||||
continue
|
||||
page_lines[pg].append((bb, raw))
|
||||
if bb["l"] > 140:
|
||||
continue
|
||||
margin.append((bb, s))
|
||||
m = PART_RE.match(s)
|
||||
@ -307,21 +342,67 @@ def aqa_questions_rapid(rapid_glob):
|
||||
nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
|
||||
digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
|
||||
for nbb, nn in nums:
|
||||
top_cands.setdefault(nn, (pg, nbb))
|
||||
ny = (nbb["t"] + nbb["b"]) / 2
|
||||
for dbb, dd in digs:
|
||||
dy = (dbb["t"] + dbb["b"]) / 2
|
||||
if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
|
||||
parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
|
||||
# Section B: walk MCQ candidates in reading order, accept the next number in sequence only
|
||||
structured_q = {int(lab.split(".")[0]) for lab in parts}
|
||||
# Before Section-B handling, trim isolated high structured labels when a real MCQ run starts
|
||||
# immediately after the core structured section. This prevents OCR option text such as "36.7Q"
|
||||
# from moving the MCQ start from Q07 to Q37.
|
||||
q_nums = sorted({int(lab.split(".")[0]) for lab in parts if not lab.endswith(".0")})
|
||||
core_q = q_nums[:]
|
||||
while len(core_q) >= 2 and core_q[-1] - core_q[-2] > 2:
|
||||
core_q.pop()
|
||||
mcq_nums = {int(nn) for _, nn, _ in mcq_cands}
|
||||
if core_q and any(max(core_q) < n <= max(core_q) + 3 for n in mcq_nums):
|
||||
core_set = set(core_q)
|
||||
for lab in list(parts):
|
||||
if int(lab.split(".")[0]) not in core_set and not lab.endswith(".0"):
|
||||
parts.pop(lab, None)
|
||||
|
||||
# Infer an OCR-dropped leading .1 part when later structured parts for the same question are
|
||||
# present. AQA papers overwhelmingly start structured questions at .1; this fixes pages where
|
||||
# RapidOCR reads the prose but misses the small boxed label, without changing schemas or model paths.
|
||||
by_q = defaultdict(list)
|
||||
for lab, v in parts.items():
|
||||
q, sub = lab.split(".")
|
||||
if sub != "0":
|
||||
by_q[q].append((int(sub), v))
|
||||
for q, vals in list(by_q.items()):
|
||||
if f"{q}.1" not in parts:
|
||||
first_sub, first_v = min(vals, key=lambda x: (x[0], x[1].get("page") or 999))
|
||||
if first_sub > 1 and first_v.get("page"):
|
||||
pg = int(first_v["page"])
|
||||
parts[f"{q}.1"] = {"page": pg, "bbox": _synthetic_label_bbox(page_lines.get(pg, []), first_v.get("bbox"))}
|
||||
subs = sorted(int(sub) for lab in parts for qq, sub in [lab.split(".")] if qq == q and sub != "0")
|
||||
# Fill only one-step internal OCR gaps with support on both sides; do not expand a lone
|
||||
# false high subpart into a whole run of synthetic labels.
|
||||
if len(subs) >= 3:
|
||||
for prev_sub, next_sub in zip(subs, subs[1:]):
|
||||
if next_sub - prev_sub == 2:
|
||||
missing = prev_sub + 1
|
||||
anchor = parts[f"{q}.{next_sub}"]
|
||||
parts[f"{q}.{missing}"] = {"page": anchor.get("page"), "bbox": dict(anchor.get("bbox") or {})}
|
||||
|
||||
# Preserve explicit one-part structured questions seen as a bare top-level header (for example
|
||||
# GCSE Combined Chemistry 03 with no decimal sub-label) without converting ordinary question
|
||||
# headers that already have .1/.2 children into extra .0 parts.
|
||||
present_q = {lab.split(".")[0] for lab in parts}
|
||||
for q, (pg, bb) in top_cands.items():
|
||||
if q not in present_q:
|
||||
parts.setdefault(f"{q}.0", {"page": pg, "bbox": bb})
|
||||
|
||||
# Section B: walk MCQ candidates in reading order, accepting a tight increasing sequence.
|
||||
structured_q = set(core_q or [int(lab.split(".")[0]) for lab in parts])
|
||||
expect = (max(structured_q) + 1) if structured_q else 1
|
||||
mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0))) # page, then top-down
|
||||
cand = {} # nn -> (page, bbox), first occurrence in reading order
|
||||
for pg, nn, bb in mcq_cands:
|
||||
cand.setdefault(int(nn), (pg, bb))
|
||||
# Walk the sequence: take the exact expected number when present; only jump a small gap
|
||||
# (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't
|
||||
# truncate the section. Out-of-window noise (misread "60") never enters.
|
||||
# Take exact expected numbers; for tiny OCR gaps before the next real MCQ candidate, emit
|
||||
# deterministic placeholders so a single garbled number does not end Section B recovery.
|
||||
seq = []
|
||||
while True:
|
||||
if expect in cand and expect not in structured_q:
|
||||
@ -330,7 +411,10 @@ def aqa_questions_rapid(rapid_glob):
|
||||
continue
|
||||
nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
|
||||
if nxt:
|
||||
expect = min(nxt)
|
||||
jump_to = min(nxt)
|
||||
for missing in range(expect, jump_to):
|
||||
seq.append((missing, cand[jump_to]))
|
||||
expect = jump_to
|
||||
continue
|
||||
break
|
||||
# Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
|
||||
@ -521,6 +605,11 @@ def docling_regions(doc):
|
||||
return regions
|
||||
|
||||
|
||||
def _norm_region_type(kind):
|
||||
kind = (kind or "answer_lines").strip().lower().replace("-", "_")
|
||||
return kind if kind in {"answer_lines", "answer_box", "working_space"} else "working_space"
|
||||
|
||||
|
||||
def merge_gemma(parts, gemma_dir):
|
||||
"""Attach gemma4:e4b answer_regions (#3) to parts by for_part; gap-fill missing marks."""
|
||||
n_reg = n_fill = 0
|
||||
@ -529,8 +618,9 @@ def merge_gemma(parts, gemma_dir):
|
||||
for r in d.get("answer_regions", []):
|
||||
lab = _norm_label(r.get("for_part", ""))
|
||||
if lab in parts:
|
||||
parts[lab]["regions"].append({"type": r.get("kind", "answer_lines"),
|
||||
"source": "gemma"})
|
||||
parts[lab]["regions"].append({"type": _norm_region_type(r.get("kind", "answer_lines")),
|
||||
"source": "gemma",
|
||||
**({"bbox": r.get("bbox")} if r.get("bbox") else {})})
|
||||
n_reg += 1
|
||||
for qp in d.get("question_parts", []):
|
||||
lab = _norm_label(qp.get("label", ""))
|
||||
@ -548,6 +638,70 @@ def _norm_label(s):
|
||||
return s
|
||||
|
||||
|
||||
|
||||
def attach_detected_response_regions(parts, pdf_path):
    """Attach detector response-region candidates to the closest part on the same page.

    Runs ``region_mod.detect_response_regions_from_pdf`` on ``pdf_path`` and, for each
    candidate, scores every part on that page that has both a page and a bbox: the score
    is the vertical distance between the label's mid-line and the region centre, plus a
    fixed penalty of 120 when the region centre exceeds the label's ``t`` by more than 18.
    The best-scoring part receives a region record (normalised type, ``"opencv"`` source,
    confidence, bbox, detection method and, when present, line_count) appended to its
    ``regions`` list. This keeps the answer_lines / answer_box / working_space taxonomy
    and leaves the mapper schema unchanged. Candidate coordinates from regions.py are
    rendered-page TOPLEFT px.

    Returns ``(attached_count, candidates)``, or ``(0, [])`` when ``pdf_path`` is empty
    or missing, or when the detector raises.
    """
    if not pdf_path or not os.path.exists(pdf_path):
        return 0, []

    try:
        candidates = region_mod.detect_response_regions_from_pdf(pdf_path, min_confidence=0.32)
    except RuntimeError as exc:
        print(f"response-regions : unavailable ({exc})")
        return 0, []
    except Exception as exc:
        print(f"response-regions : failed ({exc})")
        return 0, []

    # Index the parts that can be matched geometrically: they need a page and a bbox.
    by_page = defaultdict(list)
    for label, info in parts.items():
        if info.get("page") is None or not info.get("bbox"):
            continue
        by_page[int(info["page"])].append((label, info))

    attached = 0
    for cand in candidates:
        # regions.py page_index is zero-based; extraction/template parts are one-based.
        page_no = int(cand.get("page_index", 0)) + 1
        page_parts = by_page.get(page_no) or []
        if not page_parts:
            continue

        rb = cand.get("bbox") or {}
        meta = cand.get("meta") or {}
        center_top_px = float(rb.get("y", 0)) + float(rb.get("h", 0)) / 2
        page_height_px = float(meta.get("page_height_px") or 0)
        page_height_pdf = float(meta.get("page_height_pdf") or 0)
        if page_height_px > 0 and page_height_pdf > 0:
            # Flip the top-left pixel offset into a bottom-up coordinate scaled to the
            # PDF page height. Assumes part bboxes use a bottom-left origin — TODO confirm.
            region_y_pdf = (1.0 - center_top_px / page_height_px) * page_height_pdf
        else:
            # NOTE(review): without page heights the negated pixel offset is compared
            # directly against part coordinates — confirm this fallback is intended.
            region_y_pdf = -center_top_px

        best_lab = None
        best_score = 1e9
        for label, info in page_parts:
            box = info.get("bbox") or {}
            label_top = float(box.get("t", 0))
            label_mid = (label_top + float(box.get("b", 0))) / 2
            # Prefer the nearest label above/near the response area; the penalty keeps
            # previous-part assignment stable when a region sits between two labels.
            if region_y_pdf <= label_top + 18:
                penalty = 0
            else:
                penalty = 120
            score = abs(label_mid - region_y_pdf) + penalty
            if score < best_score:
                best_lab, best_score = label, score

        if not best_lab:
            continue

        bucket = parts[best_lab].setdefault("regions", [])
        record = {
            "type": _norm_region_type(cand.get("region_type")),
            "source": "opencv",
            "confidence": cand.get("confidence"),
            "bbox": rb,
            "detection_method": cand.get("detection_method"),
        }
        line_count = cand.get("line_count")
        if line_count is not None:
            record["line_count"] = line_count
        bucket.append(record)
        attached += 1

    return attached, candidates
|
||||
|
||||
def extract_tables(parts, doc, granite="off", pdf=None, cache_glob=None):
|
||||
"""Selective table-cell extraction (PLAN.md §B): standard TableFormer grids always; Granite
|
||||
<otsl> on router-flagged pages when granite!='off'. Returns (data_tables, all_tables).
|
||||
@ -626,7 +780,7 @@ GT_PARTS_PHYSICS = ["01.1","01.2","01.3","01.4","02.1","02.2","02.3","02.4","03.
|
||||
"10.1","10.2","10.3","11.1","11.2","11.3","11.4"]
|
||||
|
||||
# official paper maxima — the strongest grammar sanity check (marks_sum should match)
|
||||
EXPECTED_MAX = {"8463": 100, "7408": 85, "8461": 100, "1MA1": 80, "H556": 70}
|
||||
EXPECTED_MAX = {"8463": 100, "7408": 85, "7402": 91, "7405": 105, "8461": 100, "8462": 100, "8464": 70, "1MA1": 80, "H556": 70}
|
||||
|
||||
|
||||
def expected_max(code):
|
||||
@ -666,6 +820,7 @@ def main():
|
||||
ap.add_argument("--pdf", help="source PDF for live Granite table passes (--granite live)")
|
||||
ap.add_argument("--rapid", help="AQA RapidOCR per-page glob (the v1 95% path)")
|
||||
ap.add_argument("--gemma", help="gemma sweep dir with p*.json answer_regions")
|
||||
ap.add_argument("--response-regions", dest="response_regions_pdf", help="PDF to scan with deterministic response-region detector and attach to parts")
|
||||
ap.add_argument("--marks-fill", dest="marks_fill",
|
||||
help="gemma_marks.py fills JSON: fill marks=None parts (Edexcel/OCR (N)/[N] gap-fill)")
|
||||
ap.add_argument("--granite", default="off", choices=["off", "cached", "live"],
|
||||
@ -673,6 +828,7 @@ def main():
|
||||
ap.add_argument("--granite-cache", default="results/VLM_granite_p*.doctags",
|
||||
help="glob of cached *.doctags for --granite cached / live fallback")
|
||||
ap.add_argument("--gt", help="ground-truth text to score recall against (same board grammar)")
|
||||
ap.add_argument("--expected-max", type=int, help="authoritative paper max marks for OCR eval harnesses when front matter/code OCR is missing")
|
||||
ap.add_argument("--board", default="auto", choices=["auto", "aqa", "edexcel", "ocr"])
|
||||
ap.add_argument("--out", default="results/structured.json")
|
||||
a = ap.parse_args()
|
||||
@ -751,6 +907,11 @@ def main():
|
||||
n_reg = n_fill = 0
|
||||
if a.gemma and os.path.isdir(a.gemma):
|
||||
n_reg, n_fill = merge_gemma(parts, a.gemma)
|
||||
n_cv_regions = 0
|
||||
cv_region_candidates = []
|
||||
response_pdf = a.response_regions_pdf or a.pdf or a.ocr
|
||||
if response_pdf:
|
||||
n_cv_regions, cv_region_candidates = attach_detected_response_regions(parts, response_pdf)
|
||||
n_marks_fill = 0
|
||||
if a.marks_fill and os.path.exists(a.marks_fill):
|
||||
fills = json.load(open(a.marks_fill)).get("fills", {})
|
||||
@ -758,6 +919,20 @@ def main():
|
||||
if lab in parts and parts[lab].get("marks") is None:
|
||||
parts[lab]["marks"] = int(mk); n_marks_fill += 1
|
||||
|
||||
exp_max_override = a.expected_max
|
||||
# Targeted marks gap-fill: if OCR recovered all but one mark and the authoritative
|
||||
# paper max leaves a small plausible residual, attach that residual to the lone
|
||||
# missing part. This keeps the deterministic label backbone and only fills the
|
||||
# narrow low-confidence gap instead of using gemma/full extraction as source of truth.
|
||||
n_residual_marks_fill = 0
|
||||
if exp_max_override:
|
||||
missing_labs = [lab for lab, part in parts.items() if part.get("marks") is None]
|
||||
known_sum = sum(part["marks"] for part in parts.values() if part.get("marks") is not None)
|
||||
residual = exp_max_override - known_sum
|
||||
if len(missing_labs) == 1 and 1 <= residual <= 9:
|
||||
parts[missing_labs[0]]["marks"] = residual
|
||||
n_residual_marks_fill = 1
|
||||
|
||||
questions = build_questions(parts)
|
||||
|
||||
# --- coverage ------------------------------------------------------------------------
|
||||
@ -774,7 +949,7 @@ def main():
|
||||
|
||||
marks_known = sum(1 for v in parts.values() if v.get("marks") is not None)
|
||||
marks_sum = sum(v["marks"] for v in parts.values() if v.get("marks") is not None)
|
||||
exp_max = expected_max(code) or fm.get("max_marks") # code-based, else front-matter total
|
||||
exp_max = exp_max_override or expected_max(code) or fm.get("max_marks") # harness override, code-based, else front-matter total
|
||||
marks_check = (None if exp_max is None else
|
||||
{"sum": marks_sum, "expected_max": exp_max,
|
||||
"pct": round(marks_sum / exp_max * 100, 1)})
|
||||
@ -791,6 +966,9 @@ def main():
|
||||
"marks_check": marks_check,
|
||||
"gemma_answer_regions": n_reg, "gemma_marks_filled": n_fill,
|
||||
"gemma_marks_gapfilled": n_marks_fill,
|
||||
"residual_marks_gapfilled": n_residual_marks_fill,
|
||||
"opencv_answer_regions": n_cv_regions,
|
||||
"opencv_answer_region_candidates": len(cv_region_candidates),
|
||||
"n_data_tables": len(data_tables),
|
||||
"n_furniture_tables": sum(1 for t in all_tables if t["is_furniture"]),
|
||||
"table_sources": {s: sum(1 for t in data_tables if t["source"] == s)
|
||||
@ -810,7 +988,10 @@ def main():
|
||||
print(f"marks : {marks_known}/{len(parts)} parts known (sum {marks_sum}){mc}"
|
||||
+ (f"; +{n_mark_geo} by geometry" if n_mark_geo else ""))
|
||||
print(f"gemma regions : {n_reg} answer_regions, {n_fill} marks gap-filled"
|
||||
+ (f"; +{n_marks_fill} marks via --marks-fill" if n_marks_fill else ""))
|
||||
+ (f"; +{n_marks_fill} marks via --marks-fill" if n_marks_fill else "")
|
||||
+ (f"; +{n_residual_marks_fill} residual marks gap-fill" if n_residual_marks_fill else ""))
|
||||
if response_pdf:
|
||||
print(f"opencv regions : {n_cv_regions} attached / {len(cv_region_candidates)} candidates")
|
||||
print(f"tables : {len(data_tables)} data table(s) "
|
||||
f"{result['stats']['table_sources']} on pages {tbl_pages}; "
|
||||
f"{result['stats']['n_furniture_tables']} furniture filtered; {n_tbl} parts flagged")
|
||||
|
||||
@ -59,6 +59,61 @@ GEOMETRY = [
|
||||
extract=["--docling", "results/genreport/ocrh556/ocr.json", "--board", "ocr",
|
||||
"--marks-fill", "results/genreport/ocrh556/marks_fill.json"]),
|
||||
]
|
||||
|
||||
def _b1_paper(paper, title, level, storage_loc, expected_max):
    """Build one B1 image-only corpus entry, deriving every slug-based path from ``paper``.

    ``paper`` is the sample stem (e.g. ``"aqa-biology-7402-1-2023jun"``); the slug, the
    ground-truth key, the local PDF path and the RapidOCR cache paths are all computed
    from it so they cannot drift apart between entries.
    """
    slug = "b1-" + paper
    cache_dir = "results/b1_rapid/" + slug
    return dict(
        slug=slug,
        title=title,
        board="aqa",
        level=level,
        path="B1 image-only OCR (RapidOCR margin-pass)",
        storage_loc=storage_loc,
        pdf="samples/b1/" + paper + ".pdf",
        docling=cache_dir + "/merged.json",
        rapid=cache_dir + "/p*.json",
        gt_key=slug,
        expected_max=expected_max,
    )


# Sprint B1 image-only OCR eval corpus: (sample stem, title, level, storage location, paper max).
B1_GEOMETRY = [
    _b1_paper("aqa-biology-7402-1-2023jun",
              "AQA A-level Biology 7402/1 2023 Jun (image-only OCR baseline)",
              "A-level", "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf", 91),
    _b1_paper("aqa-chemistry-7405-1-2022jun",
              "AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)",
              "A-level", "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf", 105),
    _b1_paper("aqa-physics-7408-1-2022jun",
              "AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)",
              "A-level", "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf", 85),
    _b1_paper("aqa-biology-8461-1h-2022jun",
              "AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)",
              "GCSE", "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf", 100),
    _b1_paper("aqa-chemistry-8462-1h-2022jun",
              "AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)",
              "GCSE", "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf", 100),
    _b1_paper("aqa-combined-8464-b1h-2022jun",
              "AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)",
              "GCSE", "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf", 70),
    _b1_paper("aqa-combined-8464-c1h-2022jun",
              "AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun "
              "(image-only OCR baseline; 8465 not present in dev catalogue)",
              "GCSE", "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf", 70),
]
|
||||
|
||||
GT_LABELS_PATH = "fixtures/b1_gt_labels.json"
|
||||
|
||||
FAST = [
|
||||
dict(slug="aqa-physics-7408-fast", title="AQA A-level Physics 7408/1 (born-digital)", board="aqa",
|
||||
level="A-level", pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
|
||||
@ -95,16 +150,68 @@ def jload(p):
|
||||
return {}
|
||||
|
||||
|
||||
def stats_from(struct, val):
|
||||
|
||||
def load_gt_labels():
    """Load the ground-truth label fixtures from ``GT_LABELS_PATH``.

    Best-effort: returns ``{}`` when the file is missing or unreadable, is not valid
    JSON, or does not hold a JSON object at the top level, so callers can always use
    ``.get`` on the result.
    """
    try:
        # Context manager closes the handle even when parsing fails.
        with open(GT_LABELS_PATH, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: malformed JSON or bad encoding.
        return {}
    return data if isinstance(data, dict) else {}
|
||||
|
||||
|
||||
def part_labels(struct):
    """Return every non-empty part label in ``struct``, in question/part order.

    Missing or ``None`` ``questions`` / ``parts`` entries are treated as empty, and
    parts without a truthy ``label`` are skipped.
    """
    return [
        part["label"]
        for question in (struct.get("questions") or [])
        for part in (question.get("parts") or [])
        if part.get("label")
    ]
|
||||
|
||||
|
||||
def coverage_against_labels(struct, labels):
    """Score the labels recovered in ``struct`` against a ground-truth label list.

    Returns ``None`` when ``labels`` is empty. Otherwise returns a dict with the
    percentage of ground-truth labels found (one decimal place), the recovered and
    total counts, the sorted list of missed labels, and the fixture path the labels
    are loaded from.
    """
    if not labels:
        return None
    expected = set(labels)
    recovered = expected & set(part_labels(struct))
    return {"coverage_pct": round(len(recovered) / len(expected) * 100, 1),
            "recovered": len(recovered), "total": len(expected),
            "missed": sorted(expected - recovered),
            # Report the configured fixture path rather than a second hard-coded copy.
            "source": GT_LABELS_PATH}
|
||||
|
||||
|
||||
def answer_region_count(struct):
    """Count answer regions in ``struct``: top-level ``regions`` plus every part's ``regions``.

    Missing or ``None`` ``regions`` / ``questions`` / ``parts`` entries count as empty.
    """
    top_level = len(struct.get("regions") or [])
    per_part = sum(
        len(part.get("regions") or [])
        for question in (struct.get("questions") or [])
        for part in (question.get("parts") or [])
    )
    return top_level + per_part
|
||||
|
||||
|
||||
def ensure_rapid_cache(p):
    """Make sure the cached OCR output for paper ``p`` exists.

    Returns ``True`` when ``p["docling"]`` is already on disk. If it is not and the
    source PDF ``p["pdf"]`` is missing too, prints a warning and returns ``False``.
    Otherwise hands off to ``run`` with ``scripts/rapid_pass.py`` and returns whatever
    ``run`` returns (presumably a success flag — verify against ``run``).
    """
    if os.path.exists(p["docling"]):
        return True

    source_pdf = p["pdf"]
    if os.path.exists(source_pdf):
        command = ["scripts/rapid_pass.py", source_pdf, "b1_rapid/" + p["slug"]]
        return run(command)

    print(f" ! missing source PDF for {p['slug']}: {p['pdf']} (storage_loc={p.get('storage_loc')})")
    return False
|
||||
|
||||
def stats_from(struct, val, gt_labels=None):
|
||||
st = struct.get("stats", {}) or {}
|
||||
mc = st.get("marks_check") or {}
|
||||
cov = struct.get("coverage", {}) or {}
|
||||
cov = coverage_against_labels(struct, gt_labels) if gt_labels else (struct.get("coverage", {}) or {})
|
||||
return {
|
||||
"board": struct.get("board"), "paper_code": struct.get("paper_code"),
|
||||
"n_questions": st.get("n_questions"), "n_parts": st.get("n_parts"),
|
||||
"marks_sum": mc.get("sum"), "official_max": mc.get("expected_max"),
|
||||
"marks_pct": mc.get("pct"),
|
||||
"coverage_pct": cov.get("coverage_pct"), "coverage_missed": cov.get("missed", []),
|
||||
"coverage_pct": cov.get("coverage_pct"), "coverage_recovered": cov.get("recovered"),
|
||||
"coverage_total": cov.get("total"), "coverage_source": cov.get("source"),
|
||||
"coverage_missed": cov.get("missed", []), "answer_regions": answer_region_count(struct),
|
||||
"opencv_answer_regions": st.get("opencv_answer_regions"),
|
||||
"opencv_answer_region_candidates": st.get("opencv_answer_region_candidates"),
|
||||
"residual_marks_gapfilled": st.get("residual_marks_gapfilled"),
|
||||
"validate_verdict": (val.get("summary") or {}).get("worst_severity"),
|
||||
"validate_flags": val.get("flags", []),
|
||||
"questions_expected": (val.get("summary") or {}).get("questions_expected"),
|
||||
@ -113,12 +220,19 @@ def stats_from(struct, val):
|
||||
}
|
||||
|
||||
|
||||
def do_geometry(p, overlays):
|
||||
def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False):
|
||||
d = os.path.join(FINAL, p["slug"]); os.makedirs(d, exist_ok=True)
|
||||
S, F, B, R, T, V = (os.path.join(d, f) for f in
|
||||
("structured.json", "furniture.json", "bands.json", "page_roles.json",
|
||||
"template.json", "validate.json"))
|
||||
ex = ["extract.py"] + p["extract"] + ["--out", S]
|
||||
if prepare_ocr and not ensure_rapid_cache(p):
|
||||
raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}")
|
||||
extract_args = p.get("extract") or ["--docling", p["docling"], "--rapid", p["rapid"], "--board", p.get("board", "aqa")]
|
||||
ex = ["extract.py"] + extract_args + ["--out", S]
|
||||
if p.get("pdf"):
|
||||
ex += ["--response-regions", p["pdf"]]
|
||||
if p.get("expected_max"):
|
||||
ex += ["--expected-max", str(p["expected_max"])]
|
||||
if p.get("gt"):
|
||||
ex += ["--gt", p["gt"]]
|
||||
run(ex)
|
||||
@ -138,7 +252,7 @@ def do_geometry(p, overlays):
|
||||
odbg = os.path.join(d, "overlays", "debug")
|
||||
run(["scripts/overlay.py", S, p["pdf"], "--docling", p["docling"], "--bands", B,
|
||||
"--furniture", F, "--pages", "1,2,3,4,5", "--dpi", "120", "--out", odbg])
|
||||
return stats_from(jload(S), jload(V)), d
|
||||
return stats_from(jload(S), jload(V), gt_labels), d
|
||||
|
||||
|
||||
def do_fast(p):
|
||||
@ -164,6 +278,9 @@ def per_paper_report(p, s, d, kind):
|
||||
+ (f" (missed {s['coverage_missed'][:8]})" if s.get('coverage_missed') else "")
|
||||
if s['coverage_pct'] is not None else "- **coverage vs GT:** n/a",
|
||||
f"- **G6 verdict:** {s['validate_verdict']}",
|
||||
f"- **answer-region count:** {s.get('answer_regions')}",
|
||||
f"- **opencv response regions:** {s.get('opencv_answer_regions')} attached / "
|
||||
f"{s.get('opencv_answer_region_candidates')} candidates",
|
||||
]
|
||||
if s["validate_flags"]:
|
||||
lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]
|
||||
@ -178,21 +295,28 @@ def per_paper_report(p, s, d, kind):
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--no-overlays", action="store_true")
|
||||
ap.add_argument("--b1-only", action="store_true", help="run only the Sprint B1 image-only OCR eval corpus")
|
||||
ap.add_argument("--prepare-ocr", action="store_true", help="populate missing B1 RapidOCR caches via dsync before running")
|
||||
a = ap.parse_args()
|
||||
os.makedirs(FINAL, exist_ok=True)
|
||||
catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
|
||||
"papers": []}
|
||||
total_imgs = 0
|
||||
|
||||
for p in GEOMETRY:
|
||||
gt_fixtures = load_gt_labels()
|
||||
geometry = B1_GEOMETRY if a.b1_only else GEOMETRY
|
||||
fast = [] if a.b1_only else FAST
|
||||
|
||||
for p in geometry:
|
||||
print(f"[geometry] {p['slug']}")
|
||||
s, d = do_geometry(p, not a.no_overlays)
|
||||
gt_labels = (gt_fixtures.get(p.get("gt_key") or p["slug"], {}) or {}).get("labels")
|
||||
s, d = do_geometry(p, not a.no_overlays, gt_labels=gt_labels, prepare_ocr=a.prepare_ocr)
|
||||
n = per_paper_report(p, s, d, p["path"])
|
||||
total_imgs += n
|
||||
catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
|
||||
"kind": "geometry", "path": p["path"], "dir": d,
|
||||
"overlay_images": n, **s})
|
||||
for p in FAST:
|
||||
for p in fast:
|
||||
print(f"[fast] {p['slug']}")
|
||||
s, d = do_fast(p)
|
||||
per_paper_report(p, s, d, "born-digital fast-path")
|
||||
@ -214,13 +338,13 @@ def write_index(catalog, total_imgs):
|
||||
"`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
|
||||
"Machine catalog: `catalog.json`.", "",
|
||||
"## Image-only / OCR-path (with geometry + overlays)", "",
|
||||
"| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 | Images |",
|
||||
"|---|---|---|---|---|---|---|"]
|
||||
"| Paper | Board / level | Q/parts | Marks/max | Coverage | Answer regions | G6 | Images |",
|
||||
"|---|---|---|---|---|---|---|---|"]
|
||||
for p in g:
|
||||
cov = f"{p['coverage_pct']}%" if p['coverage_pct'] is not None else "n/a"
|
||||
L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
|
||||
f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
|
||||
f"({p['marks_pct']}%) | {cov} | {p['validate_verdict']} | "
|
||||
f"({p['marks_pct']}%) | {cov} | {p.get('answer_regions')} | {p['validate_verdict']} | "
|
||||
f"{p['overlay_images']} |")
|
||||
L += ["", "## Born-digital fast-path (CPU, no geometry)", "",
|
||||
"| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
|
||||
|
||||
356
api/services/docling/fixtures/b1_gt_labels.json
Normal file
356
api/services/docling/fixtures/b1_gt_labels.json
Normal file
@ -0,0 +1,356 @@
|
||||
{
|
||||
"b1-aqa-biology-7402-1-2023jun": {
|
||||
"source_pdf": "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf",
|
||||
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||
"board_detected": "aqa",
|
||||
"paper_code_detected": "7402/1",
|
||||
"labels": [
|
||||
"01.1",
|
||||
"01.2",
|
||||
"01.3",
|
||||
"02.1",
|
||||
"02.2",
|
||||
"02.3",
|
||||
"03.1",
|
||||
"03.2",
|
||||
"03.3",
|
||||
"03.4",
|
||||
"03.5",
|
||||
"04.1",
|
||||
"04.2",
|
||||
"04.3",
|
||||
"05.1",
|
||||
"05.2",
|
||||
"05.3",
|
||||
"05.4",
|
||||
"05.5",
|
||||
"06.1",
|
||||
"06.2",
|
||||
"06.3",
|
||||
"06.4",
|
||||
"07.1",
|
||||
"07.2",
|
||||
"89.6",
|
||||
"08.1",
|
||||
"08.2",
|
||||
"08.3",
|
||||
"08.4",
|
||||
"09.1",
|
||||
"09.2",
|
||||
"09.3",
|
||||
"09.4",
|
||||
"09.5",
|
||||
"09.6",
|
||||
"10.1",
|
||||
"10.2",
|
||||
"10.3"
|
||||
]
|
||||
},
|
||||
"b1-aqa-chemistry-7405-1-2022jun": {
|
||||
"source_pdf": "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
|
||||
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||
"board_detected": "aqa",
|
||||
"paper_code_detected": "7405/1",
|
||||
"labels": [
|
||||
"01.1",
|
||||
"01.2",
|
||||
"01.3",
|
||||
"01.4",
|
||||
"01.5",
|
||||
"01.6",
|
||||
"02.1",
|
||||
"02.2",
|
||||
"02.3",
|
||||
"02.4",
|
||||
"02.5",
|
||||
"03.1",
|
||||
"03.2",
|
||||
"03.3",
|
||||
"03.4",
|
||||
"03.5",
|
||||
"04.1",
|
||||
"04.2",
|
||||
"04.3",
|
||||
"04.4",
|
||||
"04.5",
|
||||
"05.1",
|
||||
"05.2",
|
||||
"05.3",
|
||||
"05.4",
|
||||
"05.5",
|
||||
"05.6",
|
||||
"05.7",
|
||||
"06.1",
|
||||
"06.2",
|
||||
"06.3",
|
||||
"06.4",
|
||||
"06.5",
|
||||
"06.6",
|
||||
"06.7",
|
||||
"07.1",
|
||||
"07.2",
|
||||
"07.3",
|
||||
"07.4",
|
||||
"07.5",
|
||||
"07.6",
|
||||
"07.7",
|
||||
"08.1",
|
||||
"08.2",
|
||||
"08.3",
|
||||
"08.4",
|
||||
"08.5"
|
||||
]
|
||||
},
|
||||
"b1-aqa-physics-7408-1-2022jun": {
|
||||
"source_pdf": "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
|
||||
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||
"board_detected": "aqa",
|
||||
"paper_code_detected": "7408/1",
|
||||
"labels": [
|
||||
"01.1",
|
||||
"01.2",
|
||||
"01.3",
|
||||
"01.4",
|
||||
"01.5",
|
||||
"02.1",
|
||||
"02.2",
|
||||
"02.3",
|
||||
"02.4",
|
||||
"03.1",
|
||||
"03.2",
|
||||
"03.3",
|
||||
"03.4",
|
||||
"03.5",
|
||||
"04.1",
|
||||
"04.2",
|
||||
"04.3",
|
||||
"04.4",
|
||||
"04.5",
|
||||
"05.1",
|
||||
"05.2",
|
||||
"05.3",
|
||||
"05.4",
|
||||
"05.5",
|
||||
"05.6",
|
||||
"06.1",
|
||||
"06.2",
|
||||
"06.3",
|
||||
"07.0",
|
||||
"08.0",
|
||||
"09.0",
|
||||
"10.0",
|
||||
"11.0",
|
||||
"12.0",
|
||||
"13.0",
|
||||
"14.0",
|
||||
"15.0",
|
||||
"16.0",
|
||||
"17.0",
|
||||
"18.0",
|
||||
"19.0",
|
||||
"20.0",
|
||||
"21.0",
|
||||
"22.0",
|
||||
"23.0",
|
||||
"24.0",
|
||||
"25.0",
|
||||
"26.0",
|
||||
"27.0",
|
||||
"28.0",
|
||||
"29.0",
|
||||
"30.0",
|
||||
"31.0"
|
||||
]
|
||||
},
|
||||
"b1-aqa-biology-8461-1h-2022jun": {
|
||||
"source_pdf": "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
|
||||
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||
"board_detected": "aqa",
|
||||
"paper_code_detected": "8461/1",
|
||||
"labels": [
|
||||
"01.1",
|
||||
"01.2",
|
||||
"01.3",
|
||||
"01.4",
|
||||
"01.5",
|
||||
"01.6",
|
||||
"01.7",
|
||||
"01.8",
|
||||
"01.9",
|
||||
"02.1",
|
||||
"02.2",
|
||||
"02.3",
|
||||
"02.4",
|
||||
"02.5",
|
||||
"02.6",
|
||||
"03.1",
|
||||
"03.2",
|
||||
"03.3",
|
||||
"03.4",
|
||||
"03.5",
|
||||
"04.1",
|
||||
"04.2",
|
||||
"04.3",
|
||||
"04.4",
|
||||
"04.5",
|
||||
"05.1",
|
||||
"05.2",
|
||||
"05.3",
|
||||
"05.4",
|
||||
"05.5",
|
||||
"06.1",
|
||||
"06.2",
|
||||
"06.3",
|
||||
"06.4",
|
||||
"06.5",
|
||||
"07.1",
|
||||
"07.2",
|
||||
"07.3",
|
||||
"07.4",
|
||||
"07.5",
|
||||
"07.6",
|
||||
"07.7",
|
||||
"07.8"
|
||||
]
|
||||
},
|
||||
"b1-aqa-chemistry-8462-1h-2022jun": {
|
||||
"source_pdf": "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
|
||||
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||
"board_detected": "aqa",
|
||||
"paper_code_detected": "8462/1",
|
||||
"labels": [
|
||||
"01.1",
|
||||
"01.2",
|
||||
"01.3",
|
||||
"01.4",
|
||||
"01.5",
|
||||
"01.6",
|
||||
"01.7",
|
||||
"02.1",
|
||||
"02.2",
|
||||
"02.3",
|
||||
"02.4",
|
||||
"02.5",
|
||||
"02.6",
|
||||
"03.1",
|
||||
"03.2",
|
||||
"03.3",
|
||||
"03.4",
|
||||
"03.5",
|
||||
"04.1",
|
||||
"04.2",
|
||||
"04.3",
|
||||
"04.4",
|
||||
"04.5",
|
||||
"04.6",
|
||||
"04.7",
|
||||
"05.1",
|
||||
"05.2",
|
||||
"05.3",
|
||||
"05.4",
|
||||
"05.5",
|
||||
"06.1",
|
||||
"06.2",
|
||||
"06.3",
|
||||
"06.4",
|
||||
"06.5",
|
||||
"06.6",
|
||||
"07.1",
|
||||
"07.2",
|
||||
"07.3",
|
||||
"07.4",
|
||||
"07.5",
|
||||
"07.6",
|
||||
"08.1",
|
||||
"08.2",
|
||||
"08.3",
|
||||
"08.4",
|
||||
"08.5"
|
||||
]
|
||||
},
|
||||
"b1-aqa-combined-8464-b1h-2022jun": {
|
||||
"source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
|
||||
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||
"board_detected": "aqa",
|
||||
"paper_code_detected": null,
|
||||
"labels": [
|
||||
"01.1",
|
||||
"01.2",
|
||||
"01.3",
|
||||
"01.4",
|
||||
"01.5",
|
||||
"01.6",
|
||||
"01.7",
|
||||
"01.8",
|
||||
"02.1",
|
||||
"02.2",
|
||||
"02.3",
|
||||
"02.4",
|
||||
"02.5",
|
||||
"02.6",
|
||||
"02.7",
|
||||
"03.1",
|
||||
"03.2",
|
||||
"03.3",
|
||||
"03.4",
|
||||
"03.5",
|
||||
"03.6",
|
||||
"03.7",
|
||||
"04.1",
|
||||
"04.2",
|
||||
"04.3",
|
||||
"04.4",
|
||||
"04.5",
|
||||
"05.1",
|
||||
"05.2",
|
||||
"05.3",
|
||||
"05.4",
|
||||
"05.5",
|
||||
"05.6",
|
||||
"06.1",
|
||||
"06.2",
|
||||
"06.3"
|
||||
]
|
||||
},
|
||||
"b1-aqa-combined-8464-c1h-2022jun": {
|
||||
"source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
|
||||
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||
"board_detected": "aqa",
|
||||
"paper_code_detected": null,
|
||||
"labels": [
|
||||
"01.1",
|
||||
"01.2",
|
||||
"01.3",
|
||||
"01.4",
|
||||
"01.5",
|
||||
"02.1",
|
||||
"02.2",
|
||||
"02.3",
|
||||
"02.4",
|
||||
"02.5",
|
||||
"03.0",
|
||||
"04.1",
|
||||
"04.2",
|
||||
"04.3",
|
||||
"04.4",
|
||||
"04.5",
|
||||
"04.6",
|
||||
"04.7",
|
||||
"05.1",
|
||||
"05.2",
|
||||
"05.3",
|
||||
"05.4",
|
||||
"06.1",
|
||||
"06.2",
|
||||
"06.3",
|
||||
"06.4",
|
||||
"06.5",
|
||||
"07.1",
|
||||
"07.2",
|
||||
"07.3",
|
||||
"07.4",
|
||||
"07.5",
|
||||
"07.6"
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -162,7 +162,16 @@ def detect_response_regions_from_pdf(
|
||||
page_index=page_index,
|
||||
min_confidence=min_confidence,
|
||||
)
|
||||
candidates.extend(candidate.to_mapper_dict() for candidate in page_candidates)
|
||||
for candidate in page_candidates:
|
||||
item = candidate.to_mapper_dict()
|
||||
item.setdefault("meta", {}).update({
|
||||
"page_width_px": pix.width,
|
||||
"page_height_px": pix.height,
|
||||
"page_width_pdf": float(doc[page_index].rect.width),
|
||||
"page_height_pdf": float(doc[page_index].rect.height),
|
||||
"render_dpi": dpi,
|
||||
})
|
||||
candidates.append(item)
|
||||
return candidates
|
||||
finally:
|
||||
doc.close()
|
||||
@ -280,7 +289,7 @@ def _detect_answer_lines(binary: np.ndarray, *, page_index: int, width: int, hei
|
||||
span_ratio = box_w / max(width, 1)
|
||||
count_bonus = min(0.2, max(0, line_count - 1) * 0.05)
|
||||
confidence = min(0.92, 0.42 + span_ratio * 0.35 + count_bonus)
|
||||
region_type = "answer_lines" if line_count > 1 else "working_space"
|
||||
region_type = "answer_lines"
|
||||
candidates.append(
|
||||
RegionCandidate(
|
||||
page_index=page_index,
|
||||
@ -351,6 +360,7 @@ def _detect_answer_boxes(binary: np.ndarray, *, page_index: int, width: int, hei
|
||||
if rectangularity < 0.03:
|
||||
continue
|
||||
confidence = min(0.88, 0.46 + min(0.24, w / width * 0.24) + min(0.18, h / height * 0.5))
|
||||
region_type = "working_space" if (h > height * 0.12 and rectangularity < 0.18) else "answer_box"
|
||||
padded_x = max(0, x - 2)
|
||||
padded_y = max(0, y - 2)
|
||||
padded_right = min(width, x + w + 2)
|
||||
@ -362,7 +372,7 @@ def _detect_answer_boxes(binary: np.ndarray, *, page_index: int, width: int, hei
|
||||
y=padded_y,
|
||||
w=padded_right - padded_x,
|
||||
h=padded_bottom - padded_y,
|
||||
region_type="answer_box",
|
||||
region_type=region_type,
|
||||
confidence=confidence,
|
||||
detection_method="opencv_contour_box",
|
||||
meta={"rectangularity": round(float(rectangularity), 3)},
|
||||
|
||||
87
api/services/docling/scripts/fetch_b1_corpus.py
Normal file
87
api/services/docling/scripts/fetch_b1_corpus.py
Normal file
@ -0,0 +1,87 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Populate the gitignored B1 image-only eval corpus from the .94 exam-board store.
|
||||
|
||||
The B1 eval papers are NOT committed (third-party copyright; served only via signed URLs).
|
||||
This script downloads each B1_GEOMETRY paper's `storage_loc` object from cc.examboards via the
|
||||
Storage API into its local `pdf` path (under samples/b1/), so finalize.py --b1-only and the
|
||||
B1-2/B1-3 generalization work can run against a real corpus.
|
||||
|
||||
Run from api/services/docling/ inside the cc-api-dev container (SUPABASE_URL/SERVICE_ROLE_KEY in env):
|
||||
python3 scripts/fetch_b1_corpus.py # fetch all B1 papers (skip existing)
|
||||
python3 scripts/fetch_b1_corpus.py --force # re-download
|
||||
python3 scripts/fetch_b1_corpus.py --only b1-aqa-physics-7408-1-2022jun
|
||||
python3 scripts/fetch_b1_corpus.py --list # show what would be fetched, no download
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Import the canonical B1 corpus definition (slug, storage_loc, local pdf path) from finalize.
|
||||
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||
_DOCLING_DIR = os.path.dirname(_HERE)
|
||||
sys.path.insert(0, _DOCLING_DIR)
|
||||
from finalize import B1_GEOMETRY # noqa: E402
|
||||
|
||||
|
||||
def _split_storage_loc(storage_loc: str) -> tuple[str, str]:
    """Split a ``bucket/object/path`` storage location into ``(bucket, path)``.

    'cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf' -> ('cc.examboards', 'aqa/.../qp.pdf').

    Only the first ``/`` separates bucket from object path.

    Raises:
        ValueError: if the bucket or the object path is empty.
    """
    bucket, _, path = storage_loc.partition("/")
    if not bucket or not path:
        raise ValueError(f"malformed storage_loc: {storage_loc!r}")
    return bucket, path
|
||||
|
||||
|
||||
def _entries(only: str | None):
    """Yield ``(slug, storage_loc, pdf)`` for each fetchable B1_GEOMETRY paper.

    Entries lacking a ``storage_loc`` or a ``pdf`` value are skipped. When *only*
    is given (truthy), entries whose ``slug`` differs from it are skipped too.
    """
    for p in B1_GEOMETRY:
        loc = p.get("storage_loc")
        pdf = p.get("pdf")
        if not loc or not pdf:
            continue
        if only and p.get("slug") != only:
            continue
        yield p["slug"], loc, pdf
|
||||
|
||||
|
||||
def main() -> int:
    """Fetch (or just list) the B1 corpus PDFs and return the process exit status.

    Returns:
        1 when no B1_GEOMETRY entry matches the selection, otherwise 0.
    """
    ap = argparse.ArgumentParser(description="Fetch the B1 image-only eval corpus from .94 cc.examboards")
    ap.add_argument("--force", action="store_true", help="re-download even if the local file exists")
    ap.add_argument("--only", help="fetch a single paper by slug")
    ap.add_argument("--list", action="store_true", help="list what would be fetched and exit")
    args = ap.parse_args()

    todo = list(_entries(args.only))
    if not todo:
        print("no matching B1 papers", file=sys.stderr)
        return 1

    if args.list:
        for slug, loc, pdf in todo:
            print(f"{slug}\t{loc}\t-> {pdf}")
        return 0

    # Imported after the --list early return, so listing never touches the storage module.
    from modules.database.supabase.utils.storage import StorageAdmin
    storage = StorageAdmin()

    ok = skipped = 0
    for slug, loc, pdf in todo:
        # Relative pdf paths are resolved against the docling service directory.
        dest = os.path.join(_DOCLING_DIR, pdf) if not os.path.isabs(pdf) else pdf
        if os.path.exists(dest) and not args.force:
            print(f"[skip] {slug} (exists)")
            skipped += 1
            continue
        bucket, path = _split_storage_loc(loc)
        data = storage.download_file(bucket, path)
        os.makedirs(os.path.dirname(dest), exist_ok=True)
        # Write to a sibling temp file and rename into place: an interrupted write must not
        # leave a truncated PDF at dest, because later runs would skip it as "exists".
        tmp = f"{dest}.part"
        with open(tmp, "wb") as fh:
            fh.write(data)
        os.replace(tmp, dest)
        print(f"[ok] {slug} <- {bucket}/{path} ({len(data)} bytes)")
        ok += 1

    print(f"fetched {ok}, skipped {skipped}, of {len(todo)}")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
32
api/services/docling/scripts/make_b1_gt.py
Normal file
32
api/services/docling/scripts/make_b1_gt.py
Normal file
@ -0,0 +1,32 @@
|
||||
# Build fixtures/b1_gt_labels.json: for each B1 AQA paper, parse the PDF text layer with the
# existing extract.py grammar and record the resulting part labels as the ground-truth set.
import json
import sys
from pathlib import Path

base = Path('/app/api/services/docling')
sys.path.insert(0, str(base))
import extract

# (slug, local PDF path relative to base, storage location recorded as source_pdf)
papers = [
    ('b1-aqa-biology-7402-1-2023jun', 'samples/b1/aqa-biology-7402-1-2023jun.pdf', 'cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf'),
    ('b1-aqa-chemistry-7405-1-2022jun', 'samples/b1/aqa-chemistry-7405-1-2022jun.pdf', 'cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf'),
    ('b1-aqa-physics-7408-1-2022jun', 'samples/b1/aqa-physics-7408-1-2022jun.pdf', 'cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf'),
    ('b1-aqa-biology-8461-1h-2022jun', 'samples/b1/aqa-biology-8461-1h-2022jun.pdf', 'cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf'),
    ('b1-aqa-chemistry-8462-1h-2022jun', 'samples/b1/aqa-chemistry-8462-1h-2022jun.pdf', 'cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf'),
    ('b1-aqa-combined-8464-b1h-2022jun', 'samples/b1/aqa-combined-8464-b1h-2022jun.pdf', 'cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf'),
    ('b1-aqa-combined-8464-c1h-2022jun', 'samples/b1/aqa-combined-8464-c1h-2022jun.pdf', 'cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf'),
]

out = {}
for slug, rel, storage in papers:
    lines = extract.lines_from_pdftext(str(base / rel))
    board, code = extract.detect_board(lines)
    # Every paper in this list is expected to be AQA; stop rather than write a wrong fixture.
    if board != 'aqa':
        raise RuntimeError(f'{slug}: expected AQA board, detected {board!r} ({code!r})')
    parts = extract.parse_text_by_board(lines, board)
    labels = list(parts)
    out[slug] = {
        'source_pdf': storage,
        'source_method': 'AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.',
        'board_detected': board,
        'paper_code_detected': code,
        'labels': labels,
    }
    print(slug, board, code, len(labels), labels[:5], labels[-5:])

Path(base / 'fixtures').mkdir(exist_ok=True)
Path(base / 'fixtures/b1_gt_labels.json').write_text(json.dumps(out, indent=2) + "\n")
|
||||
69
api/services/docling/scripts/rapid_pass.py
Normal file
69
api/services/docling/scripts/rapid_pass.py
Normal file
@ -0,0 +1,69 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
rapid_pass.py — generalise the proven AQA "RapidOCR margin-pass" (95.2% on the image-only
|
||||
8463 paper) to any AQA paper. Born-digital AQA QPs ship a text layer, so we force RapidOCR
|
||||
over the *rendered* page (`force_ocr:true`) to simulate the image-only redistribution case
|
||||
and recover the boxed `NN.M` question numbers Tesseract shatters.
|
||||
|
||||
For each page it writes results/<outdir>/p{N}.json (a full per-page DoclingDocument, the
|
||||
shape extract.py's aqa_questions_rapid expects) and a merged.json (for board / front-matter
|
||||
detection). All GPU work is serialised + OOM-resilient through dsync.
|
||||
|
||||
Usage:
|
||||
python scripts/rapid_pass.py samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf rapid_7408
|
||||
python scripts/rapid_pass.py <pdf> <outdir-slug> [first_page] [last_page]
|
||||
"""
|
||||
import os, sys, json, subprocess, re
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
import dsync
|
||||
|
||||
OPTS = {"ocr_engine": "rapidocr", "force_ocr": True}
|
||||
|
||||
|
||||
def npages(pdf):
    """Return the page count that the external ``pdfinfo`` command reports for *pdf*.

    Parses the integer that follows ``Pages:`` in the command's output.
    """
    out = subprocess.check_output(["pdfinfo", pdf]).decode()
    return int(out.split("Pages:")[1].split()[0])
|
||||
|
||||
|
||||
def main():
    """Run the forced-RapidOCR pass over a page range and write per-page + merged JSON.

    argv: <pdf> <outdir-slug> [first_page] [last_page]. Output goes to results/<slug>/:
    one p{N}.json per converted page and a merged.json accumulating texts/tables/pictures/pages.
    Pages whose p{N}.json already exists are loaded from disk instead of being re-converted.
    """
    pdf = sys.argv[1]
    slug = sys.argv[2]
    # The slug becomes a directory under results/; refuse absolute paths, "..", or odd characters.
    if os.path.isabs(slug) or ".." in slug.split(os.sep) or not re.fullmatch(r"[A-Za-z0-9._/-]+", slug):
        raise SystemExit(f"unsafe output slug: {slug!r}")
    n = npages(pdf)
    first = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    last = min(int(sys.argv[4]), n) if len(sys.argv) > 4 else n
    if first > n or first > last:
        print(f"requested page range {first}-{last} is outside PDF ({n} pages); nothing to do")
        return
    outdir = os.path.join("results", slug)
    os.makedirs(outdir, exist_ok=True)

    r = dsync._redis()
    print(f"redis: {'connected' if r else 'NO CACHE'} pdf={pdf} pages {first}-{last}/{n}")
    merged = {"texts": [], "tables": [], "pictures": [], "pages": {}, "_failed_pages": []}
    for pg in range(first, last + 1):
        page_path = os.path.join(outdir, f"p{pg}.json")
        if os.path.exists(page_path):
            with open(page_path) as fh:
                doc = json.load(fh)
            # Key must be the string 'texts'; the bare name raised NameError on every cache hit.
            print(f" p{pg}: file cache HIT ({len(doc.get('texts', []))} texts)")
        else:
            doc = dsync.convert_page(pdf, pg, OPTS, r=r)
            if not doc:
                merged["_failed_pages"].append(pg)
                print(f" p{pg}: FAILED")
                continue
            with open(page_path, "w") as fh:
                json.dump(doc, fh)
        for k in ("texts", "tables", "pictures"):
            merged[k].extend(doc.get(k, []))
        merged["pages"].update(doc.get("pages", {}))
        # Count text items whose first provenance bbox has a left edge at or below 140.
        nmarg = sum(1 for t in doc.get("texts", [])
                    if (t.get("prov") or [{}])[0].get("bbox", {}).get("l", 999) <= 140)
        print(f" p{pg}: {len(doc.get('texts', []))} texts ({nmarg} left-margin)")
    with open(os.path.join(outdir, "merged.json"), "w") as fh:
        json.dump(merged, fh)
    print(f"-> {outdir}/ ({last-first+1-len(merged['_failed_pages'])} pages, "
          f"failed={merged['_failed_pages']})")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
99
modules/upload_validation.py
Normal file
99
modules/upload_validation.py
Normal file
@ -0,0 +1,99 @@
|
||||
"""Upload boundary validation shared by file-upload endpoints.
|
||||
|
||||
E3 hardening: keep user-facing upload routes from buffering arbitrary data and
|
||||
from accepting arbitrary MIME/types into Supabase storage.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Iterable, Optional
|
||||
|
||||
from fastapi import HTTPException, UploadFile
|
||||
|
||||
# Conservative defaults: Classroom Copilot uploads are user documents/images.
|
||||
# Exam scan uploads already have their own 50 MB PDF-only guard in routers.exam.batches.
|
||||
# Hard ceiling on a single upload; defaults to 25 MiB, overridable via CC_UPLOAD_MAX_BYTES.
MAX_UPLOAD_BYTES = int(os.getenv("CC_UPLOAD_MAX_BYTES", str(25 * 1024 * 1024)))
# Size of each read() issued while streaming an upload into memory.
UPLOAD_CHUNK_BYTES = 1024 * 1024

# Allow-list of client-declared MIME types. CC_UPLOAD_ALLOWED_MIME_TYPES (comma-separated)
# replaces the defaults below; entries are trimmed and lower-cased, blank entries dropped.
ALLOWED_UPLOAD_MIME_TYPES = frozenset(
    mt.strip().lower()
    for mt in os.getenv(
        "CC_UPLOAD_ALLOWED_MIME_TYPES",
        ",".join(
            [
                "application/pdf",
                "image/png",
                "image/jpeg",
                "image/webp",
                "image/gif",
                "text/plain",
                "text/csv",
                "text/markdown",
                "application/msword",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/vnd.ms-powerpoint",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "application/vnd.ms-excel",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            ]
        ),
    ).split(",")
    if mt.strip()
)

# Declared types accepted by the PDF-only reader.
_PDF_MIME_TYPES = {"application/pdf", "application/x-pdf"}
|
||||
|
||||
|
||||
def allowed_upload_mime_types_csv() -> str:
    """Stable display string for evidence/errors without leaking config internals.

    Returns the allowed MIME types sorted and joined with ``", "``.
    """
    return ", ".join(sorted(ALLOWED_UPLOAD_MIME_TYPES))
|
||||
|
||||
|
||||
def _declared_mime(upload: UploadFile) -> str:
    """Return the client-declared content type, normalised.

    Parameters after ``;`` (e.g. charset) are dropped and the result is trimmed and
    lower-cased. A missing/empty content type becomes ``application/octet-stream``.
    """
    return (upload.content_type or "application/octet-stream").split(";", 1)[0].strip().lower()
|
||||
|
||||
|
||||
def validate_upload_mime(upload: UploadFile, *, allowed_mime_types: Optional[Iterable[str]] = None) -> str:
    """Validate client-declared upload MIME/type and return its normalised value.

    Args:
        upload: the incoming upload whose declared content type is checked.
        allowed_mime_types: allow-list to check against (compared case-insensitively);
            when omitted or empty, ALLOWED_UPLOAD_MIME_TYPES is used.

    Raises:
        HTTPException: 415 when the declared type is not in the allow-list.
    """
    declared = _declared_mime(upload)
    allowed = {mt.lower() for mt in (allowed_mime_types or ALLOWED_UPLOAD_MIME_TYPES)}
    if declared not in allowed:
        raise HTTPException(
            status_code=415,
            detail=(
                f"Unsupported upload type '{declared}'. Allowed MIME types: "
                f"{', '.join(sorted(allowed))}"
            ),
        )
    return declared
|
||||
|
||||
|
||||
async def read_upload_bytes(
    upload: UploadFile,
    *,
    max_bytes: int = MAX_UPLOAD_BYTES,
    allowed_mime_types: Optional[Iterable[str]] = None,
) -> tuple[bytes, str]:
    """Validate MIME and read an UploadFile with a hard size ceiling.

    The declared type is validated first; the body is then read in
    UPLOAD_CHUNK_BYTES pieces so the limit is enforced while reading rather than
    after the whole body has been buffered.

    Returns:
        ``(data, mime_type)`` where mime_type is the normalised declared type.

    Raises:
        HTTPException: 415 for a disallowed type, 413 once more than *max_bytes* were read.
    """
    mime_type = validate_upload_mime(upload, allowed_mime_types=allowed_mime_types)
    chunks: list[bytes] = []
    total = 0
    while True:
        chunk = await upload.read(UPLOAD_CHUNK_BYTES)
        if not chunk:
            break
        total += len(chunk)
        # Reject before keeping the chunk that crosses the limit.
        if total > max_bytes:
            raise HTTPException(status_code=413, detail=f"Upload exceeds max size ({max_bytes} bytes)")
        chunks.append(chunk)
    return b"".join(chunks), mime_type
|
||||
|
||||
|
||||
async def read_pdf_upload_bytes(upload: UploadFile, *, max_bytes: int = MAX_UPLOAD_BYTES) -> bytes:
    """Read a PDF-only upload with size and lightweight magic-header validation.

    Raises:
        HTTPException: 415 if the declared type is not a PDF type or the data does not
            start with ``%PDF-``; 413 if larger than *max_bytes*; 400 if empty.
    """
    data, _mime_type = await read_upload_bytes(upload, max_bytes=max_bytes, allowed_mime_types=_PDF_MIME_TYPES)
    if not data:
        raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
    # The declared type is client-controlled, so also check the file signature.
    if not data.startswith(b"%PDF-"):
        raise HTTPException(status_code=415, detail="Uploaded file is not a valid PDF")
    return data
|
||||
@ -12,6 +12,7 @@ from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
|
||||
from modules.logger_tool import initialise_logger
|
||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||
from modules.database.supabase.utils.storage import StorageAdmin
|
||||
from modules.upload_validation import read_upload_bytes
|
||||
from modules.document_processor import DocumentProcessor
|
||||
from modules.queue_system import (
|
||||
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
||||
@ -36,6 +37,24 @@ DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600')) # 1 hou
|
||||
|
||||
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
||||
|
||||
def _user_id_from_payload(payload: Dict[str, Any]) -> str:
    """Return the user id from a token payload (``sub``, falling back to ``user_id``).

    Raises:
        HTTPException: 401 when neither key holds a truthy value.
    """
    user_id = payload.get('sub') or payload.get('user_id')
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid token payload")
    return user_id
|
||||
|
||||
def _cabinet_visible_to_user(client: SupabaseServiceRoleClient, cabinet_id: str, user_id: str) -> bool:
    """Require cabinet ownership before service-role reads file metadata.

    Returns True when ``file_cabinets`` has a row whose ``id`` is *cabinet_id* and
    whose ``user_id`` is *user_id*; False otherwise.
    """
    owned = (
        client.supabase.table('file_cabinets')
        .select('id')
        .eq('id', cabinet_id)
        .eq('user_id', user_id)
        .limit(1)
        .execute()
    )
    return bool(owned.data)
|
||||
|
||||
def _safe_filename(name: str) -> str:
    """Return a storage-safe file name derived from *name*.

    Directory components are dropped and every run of characters outside
    ``[A-Za-z0-9._-]`` becomes a single ``_``. A missing name, or one that reduces to
    nothing but dots (``""``, ``"."``, ``".."``), yields ``"file"`` so the result is
    always a usable, non-traversing path segment.
    """
    base = os.path.basename(name or 'file')
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "_", base)
    # "dir/" has an empty basename and ".." survives the regex; neither is a valid object name.
    if not cleaned.strip("."):
        return 'file'
    return cleaned
|
||||
@ -70,13 +89,13 @@ async def upload_file(
|
||||
# Stage DB row to get file_id
|
||||
staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
|
||||
name = _safe_filename(path or file.filename)
|
||||
file_bytes = await file.read()
|
||||
file_bytes, mime_type = await read_upload_bytes(file)
|
||||
insert_res = client.supabase.table('files').insert({
|
||||
'cabinet_id': cabinet_id,
|
||||
'name': name,
|
||||
'path': staged_path,
|
||||
'bucket': bucket,
|
||||
'mime_type': file.content_type,
|
||||
'mime_type': mime_type,
|
||||
'uploaded_by': user_id,
|
||||
'size_bytes': len(file_bytes),
|
||||
'source': 'classroomcopilot-web'
|
||||
@ -89,7 +108,7 @@ async def upload_file(
|
||||
# Final storage path: bucket/cabinet_id/file_id/file
|
||||
final_storage_path = f"{cabinet_id}/{file_id}/{name}"
|
||||
try:
|
||||
storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
|
||||
storage.upload_file(bucket, final_storage_path, file_bytes, mime_type, upsert=True)
|
||||
except Exception as e:
|
||||
# cleanup staged row
|
||||
client.supabase.table('files').delete().eq('id', file_id).execute()
|
||||
@ -117,7 +136,10 @@ async def upload_file(
|
||||
|
||||
@router.get("/files")
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
    """List the file rows of a cabinet owned by the authenticated user.

    Returns an empty list when the caller does not own *cabinet_id*; the ownership
    check runs before the service-role query on ``files``.
    """
    user_id = _user_id_from_payload(payload)
    client = SupabaseServiceRoleClient()
    if not _cabinet_visible_to_user(client, cabinet_id, user_id):
        return []
    res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
    return res.data
|
||||
|
||||
|
||||
@ -19,6 +19,7 @@ from fastapi.responses import JSONResponse
|
||||
from modules.auth.supabase_bearer import SupabaseBearer
|
||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||
from modules.database.supabase.utils.storage import StorageAdmin
|
||||
from modules.upload_validation import read_upload_bytes
|
||||
from modules.logger_tool import initialise_logger
|
||||
|
||||
router = APIRouter()
|
||||
@ -26,6 +27,24 @@ auth = SupabaseBearer()
|
||||
|
||||
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
||||
|
||||
def _user_id_from_payload(payload: Dict[str, Any]) -> str:
    """Return the user id from a token payload (``sub``, falling back to ``user_id``).

    Raises:
        HTTPException: 401 when neither key holds a truthy value.
    """
    user_id = payload.get('sub') or payload.get('user_id')
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid token payload")
    return user_id
|
||||
|
||||
def _cabinet_visible_to_user(client: SupabaseServiceRoleClient, cabinet_id: str, user_id: str) -> bool:
    """Require cabinet ownership before service-role reads file metadata.

    Returns True when ``file_cabinets`` has a row whose ``id`` is *cabinet_id* and
    whose ``user_id`` is *user_id*; False otherwise.
    """
    owned = (
        client.supabase.table('file_cabinets')
        .select('id')
        .eq('id', cabinet_id)
        .eq('user_id', user_id)
        .limit(1)
        .execute()
    )
    return bool(owned.data)
|
||||
|
||||
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
|
||||
"""Choose appropriate bucket based on scope - matches old system logic."""
|
||||
scope = (scope or 'teacher').lower()
|
||||
@ -54,10 +73,9 @@ async def upload_file(
|
||||
if not user_id:
|
||||
raise HTTPException(status_code=401, detail="User ID required")
|
||||
|
||||
# Read file content
|
||||
file_bytes = await file.read()
|
||||
# Validate MIME/type and read file content with a hard size limit.
|
||||
file_bytes, mime_type = await read_upload_bytes(file)
|
||||
file_size = len(file_bytes)
|
||||
mime_type = file.content_type or 'application/octet-stream'
|
||||
filename = file.filename or path
|
||||
|
||||
logger.info(f"📤 Simplified upload: {filename} ({file_size} bytes) for user {user_id}")
|
||||
@ -134,7 +152,10 @@ async def upload_file(
|
||||
@router.get("/files")
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
    """List files in a cabinet.

    Returns an empty list when the authenticated user does not own *cabinet_id*; the
    ownership check runs before the service-role query on ``files``.
    """
    user_id = _user_id_from_payload(payload)
    client = SupabaseServiceRoleClient()
    if not _cabinet_visible_to_user(client, cabinet_id, user_id):
        return []
    res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
    return res.data
|
||||
|
||||
|
||||
@ -132,9 +132,13 @@ async def reset_environment(
|
||||
"""DESTRUCTIVE: wipe test data. Platform admin only.
|
||||
|
||||
scope (query param):
|
||||
- all : full wipe (Neo4j + Supabase data + auth users) AND exam subsystem + storage.
|
||||
- exam-corpus : ONLY the exam corpus — eb_*/exam_* tables + cc.examboards storage objects
|
||||
(load/unload the public corpus without touching schools/users).
|
||||
- all : full wipe (Neo4j + Supabase data + auth users) AND the entire
|
||||
exam-marker subsystem below.
|
||||
- exam-corpus : ONLY the entire exam-marker subsystem, not just public papers:
|
||||
public corpus/eb_* data, cc.examboards storage objects, exam
|
||||
templates, template layouts, questions, boundaries, response
|
||||
areas, marking batches, student submissions, and mark entries
|
||||
(without touching schools/users).
|
||||
- timetable : ONLY timetable/calendar materialization tables.
|
||||
"""
|
||||
if scope not in ("all", "exam-corpus", "timetable"):
|
||||
|
||||
@ -13,6 +13,7 @@ join keys (spec §2).
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import math
|
||||
import os
|
||||
import tempfile
|
||||
import time
|
||||
@ -28,6 +29,7 @@ from api.services.docling.regions import detect_response_regions_from_pdf
|
||||
from modules.database.services.exam_projection import project_template, project_template_safe
|
||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||
from modules.database.supabase.utils.storage import StorageAdmin
|
||||
from modules.upload_validation import read_pdf_upload_bytes
|
||||
from modules.logger_tool import initialise_logger
|
||||
from routers.exam.dependencies import ExamContext, get_exam_context, lookup_exam_code
|
||||
from routers.exam.schemas import (
|
||||
@ -136,6 +138,22 @@ def _lookup_exam_storage_loc(exam_id: str) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _signed_url_value(result: Any) -> str:
    """Normalise supabase-py signed URL responses across v1/v2 shapes.

    Accepts a plain string, a dict, or an object whose ``data`` attribute is a dict.
    Dicts are searched for ``signedURL``, ``signedUrl`` then ``signed_url``; the first
    truthy value is returned as ``str``.

    Raises:
        ValueError: when no signed URL can be found in *result*.
    """
    if isinstance(result, str):
        return result
    # Look in the response itself first, then in its .data payload.
    for container in (result, getattr(result, "data", None)):
        if not isinstance(container, dict):
            continue
        for key in ("signedURL", "signedUrl", "signed_url"):
            value = container.get(key)
            if value:
                return str(value)
    raise ValueError("Storage service did not return a signed URL")
|
||||
|
||||
|
||||
async def _parse_create_template_request(request: Request) -> tuple[CreateTemplateRequest, Optional[UploadFile]]:
|
||||
content_type = request.headers.get("content-type", "")
|
||||
if "multipart/form-data" in content_type:
|
||||
@ -164,11 +182,7 @@ async def _upload_template_source_file(
|
||||
institute_id: str,
|
||||
upload: UploadFile,
|
||||
) -> str:
|
||||
file_bytes = await upload.read()
|
||||
if not file_bytes:
|
||||
raise HTTPException(status_code=400, detail="Uploaded PDF is empty")
|
||||
if upload.content_type and upload.content_type != "application/pdf":
|
||||
raise HTTPException(status_code=400, detail="Uploaded file must be a PDF")
|
||||
file_bytes = await read_pdf_upload_bytes(upload)
|
||||
|
||||
service = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
@ -329,6 +343,13 @@ def _pdf_has_text_layer(pdf_bytes: bytes) -> bool:
|
||||
pass
|
||||
|
||||
|
||||
# Canvas page width the frontend renders each PDF page at (app src/utils/exam-canvas/model.ts
|
||||
# PAGE_WIDTH). All auto-map canvas coords are emitted in this 780-wide, proportional-height space.
|
||||
CANVAS_PAGE_WIDTH = 780.0
|
||||
# Response/answer-region detector (api/services/docling/regions.py) renders at 144 DPI = 2 px / PDF point.
|
||||
REGIONS_PX_PER_PT = 2.0
|
||||
|
||||
|
||||
def _pdf_page_geometry(pdf_bytes: bytes) -> List[Dict[str, float]]:
|
||||
with tempfile.NamedTemporaryFile(prefix="cc-auto-map-geom-", suffix=".pdf", delete=False) as fh:
|
||||
fh.write(pdf_bytes)
|
||||
@ -342,14 +363,23 @@ def _pdf_page_geometry(pdf_bytes: bytes) -> List[Dict[str, float]]:
|
||||
for page in doc:
|
||||
media = page.mediabox
|
||||
crop = page.cropbox
|
||||
rendered_w = float(crop.width or page.rect.width or 595.0)
|
||||
rendered_h = float(crop.height or page.rect.height or 842.0)
|
||||
page_pt_w = float(crop.width or page.rect.width or 1.0)
|
||||
page_pt_h = float(crop.height or page.rect.height or 1.0)
|
||||
# Emit canvas coords in the FRONTEND render space: the app draws each page at
|
||||
# CANVAS_PAGE_WIDTH (app model.ts PAGE_WIDTH=780) with proportional height and stacks
|
||||
# pages by those heights. Previously rendered_w/h were left in PDF points (~595x842),
|
||||
# so every shape landed shrunk (~0.76x) and shifted up-left on the 780-wide canvas.
|
||||
rendered_w = CANVAS_PAGE_WIDTH
|
||||
# Mirror the app's canvas.height = Math.ceil(viewport.height) EXACTLY (pdfLoader.ts),
|
||||
# so page_top accumulates identically. Using the raw float drifts ~1px/page, compounding
|
||||
# to a visible upward shift on later pages of long papers (~36px over 40 pages).
|
||||
rendered_h = float(math.ceil(CANVAS_PAGE_WIDTH * page_pt_h / page_pt_w))
|
||||
pages.append({
|
||||
"media_x0": float(media.x0),
|
||||
"crop_x0": float(crop.x0),
|
||||
"crop_y0": float(crop.y0),
|
||||
"page_pt_w": float(crop.width or page.rect.width or 1),
|
||||
"page_pt_h": float(crop.height or page.rect.height or 1),
|
||||
"page_pt_w": page_pt_w,
|
||||
"page_pt_h": page_pt_h,
|
||||
"rendered_w": rendered_w,
|
||||
"rendered_h": rendered_h,
|
||||
"page_top": page_top,
|
||||
@ -371,11 +401,12 @@ def _pdf_page_geometry(pdf_bytes: bytes) -> List[Dict[str, float]]:
|
||||
def _page_geom(pages: List[Dict[str, float]], page_number: int) -> Dict[str, float]:
|
||||
if 1 <= page_number <= len(pages):
|
||||
return pages[page_number - 1]
|
||||
_fallback_h = float(math.ceil(CANVAS_PAGE_WIDTH * 842.0 / 595.0))
|
||||
return {
|
||||
"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0,
|
||||
"page_pt_w": 595.0, "page_pt_h": 842.0,
|
||||
"rendered_w": 595.0, "rendered_h": 842.0,
|
||||
"page_top": (page_number - 1) * 842.0,
|
||||
"rendered_w": CANVAS_PAGE_WIDTH, "rendered_h": _fallback_h,
|
||||
"page_top": (page_number - 1) * _fallback_h,
|
||||
}
|
||||
|
||||
|
||||
@ -384,12 +415,16 @@ def _box_to_canvas(box: Optional[Dict[str, Any]], page_number: int, pages: List[
|
||||
return None
|
||||
g = _page_geom(pages, page_number)
|
||||
if box.get("coord_origin") == "TOPLEFT" and {"x", "y", "w", "h"}.issubset(box):
|
||||
scale = 0.5 if box.get("unit") == "px" else 1.0
|
||||
# Scale the box into the 780-wide canvas space. px boxes (opencv/gemma regions) are in
|
||||
# rendered-image px at REGIONS_PX_PER_PT px/point; TOPLEFT point boxes are 1 px/point.
|
||||
px_per_pt = REGIONS_PX_PER_PT if box.get("unit") == "px" else 1.0
|
||||
sx = g["rendered_w"] / (g["page_pt_w"] * px_per_pt)
|
||||
sy = g["rendered_h"] / (g["page_pt_h"] * px_per_pt)
|
||||
return {
|
||||
"x": round(float(box["x"]) * scale, 2),
|
||||
"y": round(g["page_top"] + float(box["y"]) * scale, 2),
|
||||
"w": round(float(box["w"]) * scale, 2),
|
||||
"h": round(float(box["h"]) * scale, 2),
|
||||
"x": round(float(box["x"]) * sx, 2),
|
||||
"y": round(g["page_top"] + float(box["y"]) * sy, 2),
|
||||
"w": round(float(box["w"]) * sx, 2),
|
||||
"h": round(float(box["h"]) * sy, 2),
|
||||
}
|
||||
if not {"l", "t", "r", "b"}.issubset(box):
|
||||
return None
|
||||
@ -494,6 +529,12 @@ def _map_first_pass_to_rows(template_id: str, first_pass: Dict[str, Any], pdf_by
|
||||
questions.append({"id": parent_id, "template_id": template_id, "label": parent_label, "order": len(q_ids) - 1, "max_marks": 0, "is_container": True, "source": "ai", "confirmed": False, "confidence": 0.7, "derivation": "docling-inferred-main-question"})
|
||||
pid = _ai_id(template_id, "part", label)
|
||||
first_part_by_page.setdefault(page_index, pid)
|
||||
# B1 live-route papers can carry continuation bands for the same part label
|
||||
# on later pages. The UUID is intentionally stable per template+part label,
|
||||
# so only insert the first question row; later continuations still map
|
||||
# response/context regions through first_part_by_page.
|
||||
if any(q["id"] == pid for q in questions):
|
||||
continue
|
||||
bounds = None
|
||||
y1, y2 = band.get("y_start"), band.get("y_end")
|
||||
if margins["left"] is not None and margins["right"] is not None and y1 is not None and y2 is not None:
|
||||
@ -521,15 +562,47 @@ def _map_first_pass_to_rows(template_id: str, first_pass: Dict[str, Any], pdf_by
|
||||
response_form = _response_form_from_region_type(region.get("region_type"))
|
||||
if response_form:
|
||||
response_areas.append({"id": _ai_id(template_id, "region", page_index, idx), "template_id": template_id, "question_id": first_part_by_page.get(page_index, default_qid), "page": page_index + 1, "bounds": bounds, "kind": "response", "response_form": response_form, "source": "ai", "confirmed": False, "confidence": _safe_confidence(region.get("confidence")), "derivation": region.get("detection_method") or "opencv-response-region"})
|
||||
# Integrity guard: every response_area/boundary question_id must reference an inserted question
|
||||
# (FK exam_response_areas/exam_boundaries -> exam_questions). On papers where band detection yields
|
||||
# few/no questions but opencv/gemma still emit regions, those regions point at the synthetic
|
||||
# default_qid which was never inserted. Ensure that fallback container question exists and reattach
|
||||
# any orphan child rows to it, so persistence can't violate the FK.
|
||||
qid_set = {q["id"] for q in questions}
|
||||
orphans = [r for r in (response_areas + boundaries) if r.get("question_id") not in qid_set]
|
||||
if orphans:
|
||||
if default_qid not in qid_set:
|
||||
questions.insert(0, {"id": default_qid, "template_id": template_id, "label": "Unassigned",
|
||||
"order": 0, "max_marks": 0, "is_container": True, "source": "ai",
|
||||
"confirmed": False, "confidence": 0.5,
|
||||
"derivation": "auto-map-fallback-container"})
|
||||
qid_set.add(default_qid)
|
||||
for r in orphans:
|
||||
r["question_id"] = default_qid
|
||||
|
||||
return {"questions": questions, "response_areas": response_areas, "boundaries": boundaries, "layout": layout}
|
||||
|
||||
|
||||
def _dedupe_rows_by_id(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
||||
"""Preserve first occurrence of stable AI row ids emitted by noisy OCR detectors."""
|
||||
out: List[Dict[str, Any]] = []
|
||||
seen: set[str] = set()
|
||||
for row in rows:
|
||||
row_id = row.get("id")
|
||||
if row_id:
|
||||
key = str(row_id)
|
||||
if key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
out.append(row)
|
||||
return out
|
||||
|
||||
|
||||
def _refresh_ai_rows(ctx: ExamContext, template_id: str, rows: Dict[str, List[Dict[str, Any]]]) -> None:
    """Replace a template's unconfirmed AI-generated rows with a fresh set.

    First deletes every row for ``template_id`` with ``source == "ai"`` and
    ``confirmed == False`` from the four exam tables, then inserts the new
    payloads taken from ``rows``.

    Args:
        ctx: Exam context; its ``supabase`` client performs all table calls.
        template_id: Template whose AI rows are being refreshed.
        rows: Mapping with optional ``questions``, ``response_areas``,
            ``boundaries`` and ``layout`` lists of row dicts.
    """
    sb = ctx.supabase
    # Delete with exam_questions last: response areas and boundaries reference questions.
    for table in ("exam_response_areas", "exam_boundaries", "exam_template_layout", "exam_questions"):
        sb.table(table).delete().eq("template_id", template_id).eq("source", "ai").eq("confirmed", False).execute()
    # Insert in the opposite direction so questions exist before rows that point at them.
    for table, key in (("exam_questions", "questions"), ("exam_response_areas", "response_areas"), ("exam_boundaries", "boundaries"), ("exam_template_layout", "layout")):
        # De-duplicate by id before inserting; the earlier raw
        # `rows.get(key) or []` assignment was dead code and is removed.
        payload = _dedupe_rows_by_id(rows.get(key) or [])
        if payload:
            sb.table(table).insert(payload).execute()
|
||||
|
||||
@ -611,12 +684,13 @@ async def create_template(
|
||||
|
||||
|
||||
@router.get("/catalogue")
|
||||
async def list_catalogue_papers() -> Dict[str, Any]:
|
||||
"""Lightweight exam-board paper catalogue for the create dialog."""
|
||||
async def list_catalogue_papers(
|
||||
ctx: ExamContext = Depends(get_exam_context),
|
||||
) -> Dict[str, Any]:
|
||||
"""Lightweight authenticated exam-board metadata catalogue for the create dialog."""
|
||||
try:
|
||||
sb = SupabaseServiceRoleClient().supabase
|
||||
res = (
|
||||
sb.table("eb_exams")
|
||||
ctx.supabase.table("eb_exams")
|
||||
.select("id, exam_code, spec_code, paper_code, tier, session, type_code, storage_loc")
|
||||
.eq("type_code", "QP")
|
||||
.order("exam_code")
|
||||
@ -627,6 +701,50 @@ async def list_catalogue_papers() -> Dict[str, Any]:
|
||||
raise HTTPException(status_code=502, detail=f"Could not load catalogue papers: {exc}")
|
||||
|
||||
|
||||
@router.get("/catalogue/{exam_id}/signed-url")
async def get_catalogue_paper_signed_url(
    exam_id: str,
    expires_in: int = 300,
    ctx: ExamContext = Depends(get_exam_context),
) -> Dict[str, Any]:
    """Issue a time-limited signed URL for one question-paper PDF in the catalogue.

    The paper is looked up through ``ctx.supabase`` and must be a ``QP`` row whose
    ``storage_loc`` parses to the ``cc.examboards`` bucket; a missing row, an
    unparseable location or any other bucket is reported as 404. ``expires_in`` is
    clamped to 60..3600 seconds. Any other failure during lookup or signing
    surfaces as 502.

    Signing goes through ``StorageAdmin``; the original note describes this as a
    service-role exception scoped to signing only, with raw cc.examboards object
    reads still denied by storage.objects RLS.
    """
    ttl_seconds = min(max(int(expires_in or 300), 60), 3600)
    try:
        lookup = (
            ctx.supabase.table("eb_exams")
            .select("id, exam_code, storage_loc")
            .eq("id", exam_id)
            .eq("type_code", "QP")
            .limit(1)
        )
        row = _first(lookup.execute())
        storage_loc = row.get("storage_loc") if row else None
        if not storage_loc:
            raise HTTPException(status_code=404, detail="Catalogue paper not found")

        try:
            bucket, path = _parse_storage_loc(storage_loc)
        except ValueError:
            # An unparseable location is handled like a foreign bucket below.
            bucket = path = None
        if bucket != "cc.examboards":
            raise HTTPException(status_code=404, detail="Catalogue paper not found")

        signing_result = StorageAdmin().create_signed_url(bucket, path, ttl_seconds)
        return {
            "exam_id": row["id"],
            "exam_code": row.get("exam_code"),
            "bucket": bucket,
            "path": path,
            "expires_in": ttl_seconds,
            "signed_url": _signed_url_value(signing_result),
        }
    except HTTPException:
        # Deliberate 404s pass through untouched.
        raise
    except Exception as exc:
        raise HTTPException(status_code=502, detail=f"Could not sign catalogue paper URL: {exc}")
|
||||
|
||||
|
||||
@router.get("/templates")
|
||||
async def list_templates(
|
||||
include_archived: bool = False,
|
||||
|
||||
@ -26,6 +26,7 @@ from fastapi.responses import JSONResponse
|
||||
from modules.auth.supabase_bearer import SupabaseBearer
|
||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||
from modules.database.supabase.utils.storage import StorageAdmin
|
||||
from modules.upload_validation import read_upload_bytes
|
||||
from modules.logger_tool import initialise_logger
|
||||
|
||||
router = APIRouter()
|
||||
@ -59,10 +60,9 @@ async def upload_single_file(
|
||||
if not user_id:
|
||||
raise HTTPException(status_code=401, detail="User ID required")
|
||||
|
||||
# Read file content
|
||||
file_bytes = await file.read()
|
||||
# Validate MIME/type and read file content with a hard size limit.
|
||||
file_bytes, mime_type = await read_upload_bytes(file)
|
||||
file_size = len(file_bytes)
|
||||
mime_type = file.content_type or 'application/octet-stream'
|
||||
filename = file.filename or path
|
||||
|
||||
logger.info(f"📤 Simple upload: {filename} ({file_size} bytes) for user {user_id}")
|
||||
@ -234,10 +234,9 @@ async def upload_directory(
|
||||
# Process each file
|
||||
for i, (file, relative_path) in enumerate(zip(files, relative_paths)):
|
||||
try:
|
||||
# Read file content
|
||||
file_bytes = await file.read()
|
||||
# Validate MIME/type and read file content with a hard size limit.
|
||||
file_bytes, mime_type = await read_upload_bytes(file)
|
||||
file_size = len(file_bytes)
|
||||
mime_type = file.content_type or 'application/octet-stream'
|
||||
filename = file.filename or f"file_{i}"
|
||||
|
||||
total_size += file_size
|
||||
@ -291,6 +290,8 @@ async def upload_directory(
|
||||
|
||||
logger.info(f"📄 Uploaded file {i+1}/{len(files)}: {relative_path}")
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to upload file {relative_path}: {e}")
|
||||
# Continue with other files, don't fail entire upload
|
||||
|
||||
@ -5,6 +5,7 @@ Clears:
|
||||
- Neo4j: drops ALL databases except system, neo4j (including gaisdata, cc.users.*, cc.institutes.*)
|
||||
- Supabase: deletes ALL data tables except gais_local_authorities and gais_schools
|
||||
- Supabase: deletes all auth users except kcar, then re-seeds kcar profile state
|
||||
- Granular scopes can clear exam corpus, timetable data, or --user-subset seed copies
|
||||
|
||||
Safe invariants (never touched):
|
||||
- kcar auth account
|
||||
@ -82,8 +83,11 @@ SUPABASE_TABLES_TO_CLEAR = [
|
||||
"admin_profiles",
|
||||
]
|
||||
|
||||
# Exam subsystem tables, FK child-first. NOT in the list above — the previous full reset()
|
||||
# never cleared exam data or storage at all; the granular scopes below fold it in.
|
||||
# Exam-marker subsystem tables, FK child-first. scope="exam-corpus" is deliberately
|
||||
# broader than "public papers": it wipes public corpus eb_* rows, templates, layouts,
|
||||
# questions, boundaries, response areas, marking batches, student submissions, and mark
|
||||
# entries. NOT in the list above — the previous full reset() never cleared exam data
|
||||
# or storage at all; the granular scopes below fold it in.
|
||||
EXAM_CORPUS_TABLES = [
|
||||
"mark_entries",
|
||||
"student_submissions",
|
||||
@ -114,7 +118,8 @@ TIMETABLE_TABLES = [
|
||||
"planned_lessons",
|
||||
]
|
||||
|
||||
# Buckets whose objects the exam-corpus reset clears (Storage API — protect_delete blocks raw SQL).
|
||||
# Bucket whose objects scope="exam-corpus" clears for the whole exam-marker subsystem
|
||||
# (Storage API — protect_delete blocks raw SQL).
|
||||
EXAM_STORAGE_BUCKET = "cc.examboards"
|
||||
|
||||
|
||||
@ -129,6 +134,28 @@ def _sb_headers():
|
||||
}
|
||||
|
||||
|
||||
# Markers that identify a production Supabase target. Destructive reset against any of these is
|
||||
# refused by default (project rule: ".94 only; .156 human-gated") — set RESET_ALLOW_PROD=1 to override.
|
||||
PROD_TARGET_MARKERS = ("192.168.0.156", "supabase.classroomcopilot")
|
||||
|
||||
|
||||
def _assert_reset_allowed(url: str, scope: str) -> None:
|
||||
"""Default-deny destructive reset against a production-looking Supabase target.
|
||||
|
||||
The /admin/reset route and this module both act on os.environ['SUPABASE_URL']; without this guard
|
||||
a platform-admin call on a prod-deployed API would wipe prod data + exam corpus + storage. We refuse
|
||||
when the target matches a known prod marker unless an explicit RESET_ALLOW_PROD opt-in is set.
|
||||
"""
|
||||
target = (url or "").lower()
|
||||
looks_prod = any(m in target for m in PROD_TARGET_MARKERS)
|
||||
override = os.environ.get("RESET_ALLOW_PROD", "").strip().lower() in ("1", "true", "yes")
|
||||
if looks_prod and not override:
|
||||
raise RuntimeError(
|
||||
f"refusing destructive reset (scope={scope}) against production-looking target {target!r}; "
|
||||
f"this is human-gated — set RESET_ALLOW_PROD=1 to override."
|
||||
)
|
||||
|
||||
|
||||
# ─── Neo4j helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
def _neo4j_drop_all_non_system() -> Dict[str, List[str]]:
|
||||
@ -195,8 +222,13 @@ def _clear_tables(url: str, headers: dict, tables: List[str]) -> "tuple[List[str
|
||||
|
||||
|
||||
def _clear_exam_storage() -> Dict[str, Any]:
|
||||
"""Remove cc.examboards objects via the Storage API (protect_delete blocks raw SQL deletes).
|
||||
Gathers storage_loc from eb_exams/eb_specifications BEFORE the rows are cleared."""
|
||||
"""Remove cc.examboards objects for the exam-marker subsystem.
|
||||
|
||||
scope="exam-corpus" is not limited to public-paper metadata: it also removes the
|
||||
storage objects that back exam board corpus files and any downstream exam-marker
|
||||
artifacts referenced from eb_exams/eb_specifications. Gathers storage_loc from
|
||||
eb_exams/eb_specifications BEFORE the rows are cleared.
|
||||
"""
|
||||
try:
|
||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||
from modules.database.supabase.utils.storage import StorageAdmin
|
||||
@ -230,31 +262,75 @@ def _clear_exam_storage() -> Dict[str, Any]:
|
||||
return {"removed": removed, "buckets": list(by_bucket)}
|
||||
|
||||
|
||||
def _clear_user_subset_files() -> Dict[str, Any]:
    """Delete the files rows and cc.users storage objects left by ``--user-subset`` seeding.

    Delegates to ``seed_exam_corpus._delete_user_subset_files`` with
    ``exam_codes=None`` so the reset path and ``--unseed`` share one
    implementation. Per the original note, that helper only targets rows marked by
    the seeder: bucket='cc.users', source='exam-corpus-seed', path LIKE 'exam-marker/%'.

    Returns:
        A summary dict with ``files_rows_deleted``, ``storage_objects_removed`` and
        ``errors``. If the required modules cannot be imported, the counts are zero
        and ``errors`` holds the import failure.
    """
    try:
        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
        from modules.database.supabase.utils.storage import StorageAdmin
        from run.initialization.seed_exam_corpus import LoadReport, _delete_user_subset_files
    except Exception as exc:
        # Best effort: a missing dependency skips this step instead of aborting the reset.
        logger.warning(f" user-subset clear skipped (import): {exc}")
        return {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": [str(exc)]}

    report = LoadReport()
    service_client = SupabaseServiceRoleClient()
    storage_admin = StorageAdmin()
    _delete_user_subset_files(service_client, storage_admin, exam_codes=None, rep=report)

    return dict(
        files_rows_deleted=report.unseed_user_files,
        storage_objects_removed=report.unseed_objects,
        errors=report.errors,
    )
|
||||
|
||||
|
||||
# ─── Main reset ───────────────────────────────────────────────────────────────
|
||||
|
||||
def reset(scope: str = "all") -> Dict[str, Any]:
|
||||
"""Destructive reset. scope ∈ {all, exam-corpus, timetable}.
|
||||
"""Destructive reset. scope ∈ {all, exam-corpus, timetable, user-subset}.
|
||||
|
||||
- all : full wipe (Neo4j + Supabase data + auth users) AND the exam subsystem + storage.
|
||||
- exam-corpus : ONLY eb_*/exam_* tables + cc.examboards storage objects (load/unload the corpus).
|
||||
- all : full wipe (Neo4j + Supabase data + auth users) AND the entire
|
||||
exam-marker subsystem listed below, including --user-subset copies.
|
||||
- exam-corpus : ONLY the entire exam-marker subsystem, not just public papers:
|
||||
public corpus/eb_* data, cc.examboards storage objects, exam
|
||||
templates, template layouts, questions, boundaries, response
|
||||
areas, marking batches, student submissions, mark entries, and
|
||||
--user-subset cc.users copies.
|
||||
- timetable : ONLY timetable/calendar materialization tables.
|
||||
- user-subset : ONLY files rows and cc.users storage objects created by
|
||||
seed_exam_corpus.py --user-subset.
|
||||
"""
|
||||
scope = (scope or "all").lower()
|
||||
if scope not in ("all", "exam-corpus", "timetable"):
|
||||
raise ValueError(f"invalid scope {scope!r} (want all|exam-corpus|timetable)")
|
||||
if scope not in ("all", "exam-corpus", "timetable", "user-subset"):
|
||||
raise ValueError(f"invalid scope {scope!r} (want all|exam-corpus|timetable|user-subset)")
|
||||
url, headers = _sb_headers()
|
||||
_assert_reset_allowed(url, scope)
|
||||
|
||||
if scope == "exam-corpus":
|
||||
logger.info("RESET (scope=exam-corpus) — exam tables + cc.examboards storage")
|
||||
logger.info("RESET (scope=exam-corpus) — entire exam-marker subsystem: public corpus/eb_* data, cc.examboards storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries, and --user-subset copies")
|
||||
user_subset = _clear_user_subset_files()
|
||||
storage = _clear_exam_storage()
|
||||
cleared, failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
|
||||
return {"scope": scope, "exam_storage": storage, "tables_cleared": cleared, "tables_failed": failed}
|
||||
return {"scope": scope, "user_subset": user_subset, "exam_storage": storage, "tables_cleared": cleared, "tables_failed": failed}
|
||||
|
||||
if scope == "timetable":
|
||||
logger.info("RESET (scope=timetable) — timetable/calendar tables")
|
||||
cleared, failed = _clear_tables(url, headers, TIMETABLE_TABLES)
|
||||
return {"scope": scope, "tables_cleared": cleared, "tables_failed": failed}
|
||||
|
||||
if scope == "user-subset":
|
||||
logger.info("RESET (scope=user-subset) — --user-subset cc.users storage objects and files rows")
|
||||
user_subset = _clear_user_subset_files()
|
||||
return {"scope": scope, "user_subset": user_subset}
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("RESET ENVIRONMENT — full destructive wipe starting")
|
||||
logger.info("=" * 60)
|
||||
@ -267,6 +343,9 @@ def reset(scope: str = "all") -> Dict[str, Any]:
|
||||
results["neo4j"] = {"dropped": dropped}
|
||||
|
||||
# ── 2. Supabase: clear all data tables (GAIS preserved) ──────────────────
|
||||
# First remove --user-subset cc.users storage objects (+ their files rows) via the
|
||||
# Storage API, so the generic files-table clear below doesn't strand orphaned objects.
|
||||
results["user_subset"] = _clear_user_subset_files()
|
||||
logger.info("\n[Supabase] Clearing data tables (preserving gais_*)...")
|
||||
url, headers = _sb_headers()
|
||||
cleared, failed = [], []
|
||||
@ -319,9 +398,12 @@ def reset(scope: str = "all") -> Dict[str, Any]:
|
||||
)
|
||||
logger.info(" kcar → admin_profiles restored ✓")
|
||||
|
||||
# ── 5. Exam subsystem: storage objects (Storage API) + exam tables ───────────
|
||||
# (The legacy full reset cleared neither exam tables nor storage — folded in here.)
|
||||
logger.info("\n[Supabase] Clearing exam subsystem (storage + eb_*/exam_* tables)...")
|
||||
# ── 5. Exam-marker subsystem: storage objects (Storage API) + all exam tables ──
|
||||
# This is the same destructive surface as scope="exam-corpus": public corpus/eb_*
|
||||
# rows, cc.examboards storage, templates/layout/questions/boundaries/response
|
||||
# areas, marking batches, submissions, and mark entries. (The legacy full reset
|
||||
# cleared neither exam tables nor storage — folded in here.)
|
||||
logger.info("\n[Supabase] Clearing entire exam-marker subsystem (public corpus, storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries)...")
|
||||
exam_storage = _clear_exam_storage()
|
||||
exam_cleared, exam_failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
|
||||
|
||||
|
||||
@ -105,6 +105,7 @@ class LoadReport:
|
||||
downloaded: int = 0
|
||||
download_cached: int = 0
|
||||
unseed_objects: int = 0
|
||||
unseed_user_files: int = 0
|
||||
unseed_exams: int = 0
|
||||
unseed_specs: int = 0
|
||||
unseed_templates: int = 0
|
||||
@ -117,6 +118,7 @@ class LoadReport:
|
||||
"downloaded": self.downloaded,
|
||||
"download_cached": self.download_cached,
|
||||
"unseed_objects": self.unseed_objects,
|
||||
"unseed_user_files": self.unseed_user_files,
|
||||
"unseed_exams": self.unseed_exams,
|
||||
"unseed_specs": self.unseed_specs,
|
||||
"unseed_templates": self.unseed_templates,
|
||||
@ -579,6 +581,84 @@ def _chunks(seq: List[Any], n: int = 100):
|
||||
for i in range(0, len(seq), n):
|
||||
yield seq[i:i + n]
|
||||
|
||||
def _storage_remove(storage: StorageAdmin, bucket: str, paths: List[str]) -> None:
    """Delete ``paths`` from ``bucket`` via the Supabase Storage API, raising on reported errors.

    According to the original note, the python client treats missing objects as a
    successful no-op, which keeps unseed idempotent. A reported failure is raised so
    callers can avoid deleting the matching DB rows while the objects may still exist.

    Raises:
        StorageError: If the response exposes a truthy ``error``, either as an
            attribute or, for dict responses, as a key.
    """
    bucket_api = storage.client.supabase.storage.from_(bucket)
    outcome = bucket_api.remove(paths)

    # Attribute form first, then the dict form some responses use.
    failure = getattr(outcome, "error", None)
    if not failure and isinstance(outcome, dict):
        failure = outcome.get("error")
    if failure:
        raise StorageError(str(failure))
|
||||
|
||||
def _delete_user_subset_files(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
                              exam_codes: Optional[List[str]], rep: LoadReport) -> None:
    """Remove ``--user-subset`` seed copies: storage objects first, then their ``files`` rows.

    Only ``files`` rows with bucket='cc.users', source='exam-corpus-seed' and a path
    under ``exam-marker/`` are selected. Objects are removed before rows; per the
    original note, a files GC trigger also tries to delete storage when rows are
    deleted, so clearing objects first avoids trigger failures and keeps the
    operation idempotent. If any storage batch fails, the error is recorded and no
    rows are deleted.

    Args:
        client: Service-role client; its ``supabase`` attribute runs the table queries.
        storage: Storage admin handed to ``_storage_remove``.
        exam_codes: ``None`` selects every seed row; a non-empty list restricts the
            selection to rows named ``<code>.pdf``; an empty list selects nothing.
        rep: Report whose ``unseed_objects``, ``unseed_user_files`` and ``errors``
            are updated in place.
    """
    sb = client.supabase
    seed_bucket = "cc.users"
    seed_prefix = "exam-marker/"

    def _seed_rows(name_filter: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        # One query shape for both modes; the name filter is added only when scoped.
        query = (
            sb.table("files").select("id, bucket, path, name, source")
            .eq("bucket", "cc.users").eq("source", "exam-corpus-seed")
            .like("path", "exam-marker/%")
        )
        if name_filter is not None:
            query = query.in_("name", name_filter)
        return getattr(query.execute(), "data", None) or []

    matched: List[Dict[str, Any]] = []
    if exam_codes is None:
        matched.extend(_seed_rows())
    elif exam_codes:
        pdf_names = [f"{code}.pdf" for code in exam_codes if code]
        for name_batch in _chunks(pdf_names, 100):
            matched.extend(_seed_rows(name_batch))

    # dict.fromkeys de-duplicates while keeping first-seen order.
    row_ids = list(dict.fromkeys(str(row["id"]) for row in matched if row.get("id")))
    object_paths = list(dict.fromkeys(
        row["path"]
        for row in matched
        if row.get("bucket") == seed_bucket
        and isinstance(row.get("path"), str)
        and row["path"].startswith(seed_prefix)
    ))

    if not row_ids and not object_paths:
        logger.info("[unseed] no user-subset cc.users files to remove")
        return

    for path_batch in _chunks(object_paths, 100):
        try:
            _storage_remove(storage, seed_bucket, path_batch)
            rep.unseed_objects += len(path_batch)
        except Exception as exc:
            logger.warning(f"[unseed] user-subset storage remove failed ({seed_bucket}, {len(path_batch)} objs): {exc}")
            rep.errors.append(f"user-subset storage remove {seed_bucket}: {exc}")
            # Stop here: rows must not be deleted while their objects may still exist.
            return

    for id_batch in _chunks(row_ids, 100):
        try:
            sb.table("files").delete().in_("id", id_batch).execute()
            rep.unseed_user_files += len(id_batch)
        except Exception as exc:
            logger.warning(f"[unseed] user-subset files delete failed: {exc}")
            rep.errors.append(f"user-subset files delete: {exc}")
|
||||
|
||||
def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
|
||||
board_filter: Optional[str], spec_filter: Optional[str],
|
||||
drop_specs: bool = True, drop_seed_templates: bool = True, rep: LoadReport) -> None:
|
||||
@ -597,6 +677,8 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
|
||||
specs = getattr(q.execute(), "data", None) or []
|
||||
spec_codes = [s["spec_code"] for s in specs]
|
||||
if not spec_codes:
|
||||
if not board_filter and not spec_filter:
|
||||
_delete_user_subset_files(client, storage, exam_codes=None, rep=rep)
|
||||
logger.info("[unseed] no matching specifications; nothing to do")
|
||||
return
|
||||
|
||||
@ -605,7 +687,14 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
|
||||
res = sb.table("eb_exams").select("id, exam_code, storage_loc").in_("spec_code", chunk).execute()
|
||||
exams.extend(getattr(res, "data", None) or [])
|
||||
|
||||
# 1) Storage objects (Storage API; batch-remove per bucket). Specs may carry a spec PDF too.
|
||||
# 1) User-subset storage/rows. Storage is removed before files rows so trg_files_gc has
|
||||
# nothing left to collect when rows are deleted.
|
||||
user_subset_exam_codes = None if not board_filter and not spec_filter else [
|
||||
e.get("exam_code") for e in exams if e.get("exam_code")
|
||||
]
|
||||
_delete_user_subset_files(client, storage, exam_codes=user_subset_exam_codes, rep=rep)
|
||||
|
||||
# 2) Storage objects (Storage API; batch-remove per bucket). Specs may carry a spec PDF too.
|
||||
by_bucket: Dict[str, List[str]] = {}
|
||||
for row in exams + specs:
|
||||
loc = row.get("storage_loc")
|
||||
@ -621,7 +710,7 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
|
||||
except Exception as exc:
|
||||
logger.warning(f"[unseed] storage remove failed ({bkt}, {len(chunk)} objs): {exc}")
|
||||
|
||||
# 2) First-sweep templates created by the seed (cascades questions/regions/boundaries/layout).
|
||||
# 3) First-sweep templates created by the seed (cascades questions/regions/boundaries/layout).
|
||||
if drop_seed_templates and exams:
|
||||
exam_codes = [e["exam_code"] for e in exams if e.get("exam_code")]
|
||||
for chunk in _chunks(exam_codes, 100):
|
||||
@ -632,7 +721,7 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
|
||||
except Exception as exc:
|
||||
logger.warning(f"[unseed] template delete failed: {exc}")
|
||||
|
||||
# 3) Catalogue rows: eb_exams (by id), then eb_specifications (by spec_code).
|
||||
# 4) Catalogue rows: eb_exams (by id), then eb_specifications (by spec_code).
|
||||
exam_ids = [e["id"] for e in exams]
|
||||
for chunk in _chunks(exam_ids, 100):
|
||||
try:
|
||||
@ -648,8 +737,8 @@ def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
|
||||
except Exception as exc:
|
||||
logger.warning(f"[unseed] eb_specifications delete failed: {exc}")
|
||||
|
||||
logger.info(f"unseed done: storage_objects={rep.unseed_objects} templates={rep.unseed_templates} "
|
||||
f"exams={rep.unseed_exams} specs={rep.unseed_specs}")
|
||||
logger.info(f"unseed done: storage_objects={rep.unseed_objects} user_files={rep.unseed_user_files} "
|
||||
f"templates={rep.unseed_templates} exams={rep.unseed_exams} specs={rep.unseed_specs}")
|
||||
|
||||
|
||||
# ─────────────────────────────── orchestration ───────────────────────────────
|
||||
|
||||
81
tests/test_docling_extract.py
Normal file
81
tests/test_docling_extract.py
Normal file
@ -0,0 +1,81 @@
|
||||
from api.services.docling.extract import aqa_questions_rapid
|
||||
|
||||
|
||||
def _text(raw, page, l, t, r=120, b=None):
|
||||
return {
|
||||
"text": raw,
|
||||
"prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
|
||||
}
|
||||
|
||||
|
||||
def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A margin label with a stray bracket ("02].3") is still reported as part 02.3."""
    page_file = tmp_path / "p1.json"
    page_file.write_text(
        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    page_glob = str(tmp_path / "p*.json")

    recovered = aqa_questions_rapid(page_glob)

    assert "02.3" in recovered
|
||||
|
||||
|
||||
def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """With only "07.2" detected, part "07.1" is also reported on the same page and left edge."""
    # Page 1 holds prose only; page 2 holds the single detected label "07.2".
    (tmp_path / "p1.json").write_text(
        '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    (tmp_path / "p2.json").write_text(
        '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert parts["07.1"]["page"] == 2
    assert parts["07.1"]["bbox"]["l"] == 49
    assert "07.2" in parts
|
||||
|
||||
|
||||
def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """Bare question numbers 07, 08, 11, 12, 13 yield ".0" parts for 07 through 13, including 09 and 10."""
    import json

    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
    for idx, n in enumerate(["07", "08", "11", "12", "13"]):
        # Stack the bare-number items down the page, 40 units apart.
        texts.append(_text(n + " Which statement is correct?", 1, 49, 780 - idx * 40))
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    for label in ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]:
        assert label in parts
|
||||
|
||||
|
||||
def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A label whose part digit is a circled glyph ("01.③") is reported as "01.3"."""
    import json

    (tmp_path / "p1.json").write_text(json.dumps({"texts": [_text("01.③", 1, 49, 515, r=102, b=500)]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "01.3" in parts
|
||||
|
||||
|
||||
def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """Given 05.2, 05.3 and 05.5 across two pages, parts 05.1 and 05.4 are also reported."""
    import json

    texts = [
        _text("05.2 Some question text", 1, 49, 700),
        _text("05.3 Middle question text", 1, 49, 620),
        _text("05.5 Later question text", 2, 49, 740),
    ]
    # First two items go on page 1, the last on page 2.
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts[:2]}))
    (tmp_path / "p2.json").write_text(json.dumps({"texts": texts[2:]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "05.1" in parts
    assert "05.4" in parts
    assert "05.5" in parts
|
||||
|
||||
|
||||
def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A lone bare question number ("03") is reported as part "03.0"."""
    import json

    (tmp_path / "p1.json").write_text(json.dumps({"texts": [_text("03", 1, 49, 775)]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "03.0" in parts
|
||||
@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
from api.services.docling import extract
|
||||
from api.services.docling.regions import detect_response_regions_from_image
|
||||
|
||||
|
||||
@ -37,3 +38,46 @@ def test_detects_answer_box() -> None:
|
||||
assert boxes
|
||||
assert boxes[0]["bbox"]["w"] > 600
|
||||
assert boxes[0]["bbox"]["h"] > 200
|
||||
|
||||
|
||||
def test_detect_response_region_taxonomy_for_lines_and_boxes():
    """An image with three ruled lines and one outlined rectangle yields both region types."""
    img = Image.new("RGB", (800, 1000), "white")
    draw = ImageDraw.Draw(img)
    # Three horizontal rules.
    for y in (220, 260, 300):
        draw.line((120, y, 680, y), fill="black", width=2)
    # One outlined box lower on the page.
    draw.rectangle((140, 520, 660, 640), outline="black", width=3)

    regions = detect_response_regions_from_image(img, min_confidence=0.1)
    types = {r.region_type for r in regions}

    assert "answer_lines" in types
    assert "answer_box" in types
|
||||
|
||||
|
||||
def test_attach_detected_response_regions_normalizes_taxonomy_and_uses_pdf_y(monkeypatch, tmp_path):
    """One stubbed "answer-box" candidate attaches to part 01.2 only, as type "answer_box" with source "opencv"."""
    pdf = tmp_path / "paper.pdf"
    pdf.write_bytes(b"%PDF test placeholder")
    parts = {
        "01.1": {"q": "01", "page": 1, "bbox": {"l": 50, "r": 80, "t": 700, "b": 680}, "regions": []},
        "01.2": {"q": "01", "page": 1, "bbox": {"l": 50, "r": 80, "t": 500, "b": 480}, "regions": []},
    }

    def fake_detect(path, min_confidence=0.32):
        # Single candidate in pixel coordinates; meta carries both page heights.
        return [{
            "page_index": 0,
            "region_type": "answer-box",
            "confidence": 0.77,
            "bbox": {"x": 100, "y": 335, "w": 500, "h": 40},
            "detection_method": "test",
            "meta": {"page_height_px": 1000, "page_height_pdf": 800},
        }]

    monkeypatch.setattr(extract.region_mod, "detect_response_regions_from_pdf", fake_detect)

    attached, candidates = extract.attach_detected_response_regions(parts, str(pdf))

    assert attached == 1
    assert len(candidates) == 1
    assert parts["01.1"]["regions"] == []
    assert parts["01.2"]["regions"][0]["type"] == "answer_box"
    assert parts["01.2"]["regions"][0]["source"] == "opencv"
|
||||
|
||||
@ -143,6 +143,9 @@ class _FakeStorageAdmin:
|
||||
def download_file(self, bucket_id, file_path):
    """Return fixed fake PDF bytes, ignoring the bucket and path."""
    return b"%PDF-1.7 fake"
|
||||
|
||||
def create_signed_url(self, bucket_id, file_path, expires_in=3600):
    """Return a dict with a deterministic fake ``signedURL`` built from the arguments."""
    return {"signedURL": f"https://storage.test/{bucket_id}/{file_path}?token=fake&expires_in={expires_in}"}
|
||||
|
||||
|
||||
class _FakeServiceRoleClient:
|
||||
def __init__(self, store):
|
||||
@ -171,6 +174,65 @@ def test_requires_auth_when_not_overridden():
|
||||
assert resp.status_code in (401, 403) # unauthenticated, not processed
|
||||
|
||||
|
||||
def test_catalogue_requires_auth_when_not_overridden():
    """Without any dependency override, GET /api/exam/catalogue answers 401 or 403."""
    app = FastAPI()
    app.include_router(router, prefix="/api/exam")

    resp = TestClient(app).get("/api/exam/catalogue")

    assert resp.status_code in (401, 403)
|
||||
|
||||
|
||||
def test_list_catalogue_papers_uses_as_user_metadata():
    """Of one QP row and one MS row in ``eb_exams``, only the QP row ("e1") is listed."""
    store = {
        "eb_exams": [
            {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.examboards/aqa/p.pdf"},
            {"id": "e2", "exam_code": "AQA-MS", "type_code": "MS", "storage_loc": "cc.examboards/aqa/ms.pdf"},
        ]
    }
    client, _ = make_client(store=store)

    resp = client.get("/api/exam/catalogue")

    assert resp.status_code == 200
    assert [p["id"] for p in resp.json()["papers"]] == ["e1"]
|
||||
|
||||
|
||||
def test_catalogue_signed_url_requires_auth_and_signs_examboard_pdf(monkeypatch):
    """The signed-url route splits ``storage_loc`` into bucket and path and echoes ``expires_in``."""
    store = {
        "eb_exams": [
            {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.examboards/aqa/physics/qp.pdf"},
        ]
    }
    client, _ = make_client(store=store)
    # Swap in the fake so no real storage backend is contacted.
    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)

    resp = client.get("/api/exam/catalogue/e1/signed-url?expires_in=120")

    assert resp.status_code == 200
    body = resp.json()
    assert body["bucket"] == "cc.examboards"
    assert body["path"] == "aqa/physics/qp.pdf"
    assert body["expires_in"] == 120
    assert "token=fake" in body["signed_url"]
|
||||
|
||||
|
||||
def test_catalogue_signed_url_rejects_non_examboard_storage(monkeypatch):
    """A QP row whose ``storage_loc`` starts with "cc.public/" gets a 404 from the signed-url route."""
    store = {
        "eb_exams": [
            {"id": "e1", "exam_code": "AQA-1", "type_code": "QP", "storage_loc": "cc.public/aqa/physics/qp.pdf"},
        ]
    }
    client, _ = make_client(store=store)
    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)

    assert client.get("/api/exam/catalogue/e1/signed-url").status_code == 404
|
||||
|
||||
|
||||
def test_catalogue_signed_url_rejects_non_catalogue_doc_type(monkeypatch):
    """A row with ``type_code`` "MS" gets a 404 from the signed-url route."""
    store = {
        "eb_exams": [
            {"id": "e1", "exam_code": "AQA-MS", "type_code": "MS", "storage_loc": "cc.examboards/aqa/physics/ms.pdf"},
        ]
    }
    client, _ = make_client(store=store)
    monkeypatch.setattr(templates_mod, "StorageAdmin", _FakeStorageAdmin)

    assert client.get("/api/exam/catalogue/e1/signed-url").status_code == 404
|
||||
|
||||
|
||||
def test_create_template_sets_owner_and_institute():
|
||||
client, store = make_client()
|
||||
resp = client.post("/api/exam/templates", json={"title": "AQA Physics 1H", "subject": "Physics"})
|
||||
@ -533,6 +595,27 @@ def test_box_to_canvas_uses_cropbox_as_page_origin():
|
||||
assert templates_mod._box_to_canvas(box, 1, pages) == {"x": 0.0, "y": 25.0, "w": 80.0, "h": 40.0}
|
||||
|
||||
|
||||
def test_auto_map_deduplicates_continued_part_labels(monkeypatch):
    """After adding a page-2 part band labelled "01.1", question ids are unique and "01.1" appears once."""
    # Stub the geometry helper with two identical 600x800 pages stacked vertically.
    monkeypatch.setattr(templates_mod, "_pdf_page_geometry", lambda _pdf: [
        {"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 0.0},
        {"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 800.0},
    ])
    first_pass = _first_pass_template()
    first_pass["meta"]["n_pages"] = 2
    first_pass["pages"]["2"] = {
        "role": "question", "role_source": "auto", "margins_enabled": True,
        "main_bands": [],
        "part_bands": [{"label": "01.1", "question": "01", "y_start": 760, "y_end": 600, "label_box": {"l": 50, "t": 760, "r": 90, "b": 740, "coord_origin": "BOTTOMLEFT"}, "source": "auto", "confirmed": False}],
        "furniture": [], "figures": [], "tables": [],
    }

    rows = templates_mod._map_first_pass_to_rows("t1", first_pass, b"%PDF", [])

    question_ids = [q["id"] for q in rows["questions"]]
    assert len(question_ids) == len(set(question_ids))
    assert [q["label"] for q in rows["questions"]].count("01.1") == 1
|
||||
|
||||
|
||||
def test_response_region_types_are_mapped_to_response_form_enum(monkeypatch):
|
||||
monkeypatch.setattr(templates_mod, "_pdf_page_geometry", lambda _pdf: [{"media_x0": 0.0, "crop_x0": 0.0, "crop_y0": 0.0, "page_pt_w": 600.0, "page_pt_h": 800.0, "rendered_w": 600.0, "rendered_h": 800.0, "page_top": 0.0}])
|
||||
first_pass = _first_pass_template()
|
||||
@ -559,6 +642,20 @@ def test_auto_map_fast_path_merges_ai_rows_and_returns_detail(monkeypatch):
|
||||
assert store["exam_boundaries"] and store["exam_boundaries"][0]["derivation"] == "docling-main-band"
|
||||
|
||||
|
||||
def test_auto_map_deduplicates_repeated_response_area_ids(monkeypatch):
    """When the detector returns the same region twice, stored response-area ids stay unique."""
    store = _template_with_source()
    client, store = make_client(store=store)
    _patch_auto_map(monkeypatch, store, fast=True)
    dup = {"page_index": 0, "bbox": {"l": 50, "t": 700, "r": 100, "b": 680, "coord_origin": "BOTTOMLEFT"}, "region_type": "answer_lines", "confidence": 0.9}
    # The stub returns the region plus an equal copy of it.
    monkeypatch.setattr(templates_mod, "detect_response_regions_from_pdf", lambda *_a, **_k: [dup, dict(dup)])

    resp = client.post("/api/exam/templates/t1/auto-map")

    assert resp.status_code == 200
    response_area_ids = [r["id"] for r in store["exam_response_areas"]]
    assert len(response_area_ids) == len(set(response_area_ids))
|
||||
|
||||
|
||||
def test_auto_map_preserves_manual_and_confirmed_rows_on_rerun(monkeypatch):
|
||||
store = _template_with_source()
|
||||
store.update({
|
||||
|
||||
103
tests/test_files_idor.py
Normal file
103
tests/test_files_idor.py
Normal file
@ -0,0 +1,103 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
import routers.database.files.files as files_router
|
||||
import routers.database.files.files_simplified as files_simplified_router
|
||||
|
||||
|
||||
ROUTERS = [files_router, files_simplified_router]
|
||||
|
||||
USER_A = "00000000-0000-0000-0000-000000000001"
|
||||
USER_B = "00000000-0000-0000-0000-000000000002"
|
||||
CAB_A = "10000000-0000-0000-0000-000000000001"
|
||||
CAB_B = "10000000-0000-0000-0000-000000000002"
|
||||
|
||||
|
||||
class FakeQuery:
    """In-memory query stub: only ``eq`` filters; ``select`` and ``limit`` are no-ops."""

    def __init__(self, rows):
        # Copy so filtering never mutates the caller's list.
        self.rows = list(rows)

    def select(self, *_args, **_kwargs):
        """Accept and ignore any column selection; return ``self`` for chaining."""
        return self

    def eq(self, key, value):
        """Keep only rows whose ``key`` equals ``value``; return ``self`` for chaining."""
        self.rows = [row for row in self.rows if row.get(key) == value]
        return self

    def limit(self, _n):
        """Ignore the limit; return ``self`` for chaining."""
        return self

    def execute(self):
        """Return an object whose ``data`` attribute is the remaining rows."""
        return SimpleNamespace(data=self.rows)
|
||||
|
||||
|
||||
class FakeSupabase:
    """Stub client backed by a dict that maps table names to lists of rows."""

    def __init__(self, store):
        self.store = store

    def table(self, name):
        """Return a ``FakeQuery`` over the named table, or over no rows if the table is absent."""
        return FakeQuery(self.store.get(name, []))
|
||||
|
||||
|
||||
class FakeServiceRoleClient:
    """Stub exposing a ``supabase`` attribute backed by ``FakeSupabase``."""

    def __init__(self, store):
        self.supabase = FakeSupabase(store)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("router_module", ROUTERS)
def test_list_files_hides_unowned_unshared_cabinet(monkeypatch, router_module):
    """USER_A listing USER_B's cabinet, with no memberships at all, gets an empty list."""
    store = {
        "file_cabinets": [
            {"id": CAB_A, "user_id": USER_A},
            {"id": CAB_B, "user_id": USER_B},
        ],
        "cabinet_memberships": [],
        "files": [
            {"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A},
            {"id": "file-b", "cabinet_id": CAB_B, "uploaded_by": USER_B},
        ],
    }
    monkeypatch.setattr(
        router_module,
        "SupabaseServiceRoleClient",
        lambda: FakeServiceRoleClient(store),
    )

    assert router_module.list_files(CAB_B, {"sub": USER_A}) == []
|
||||
|
||||
|
||||
@pytest.mark.parametrize("router_module", ROUTERS)
def test_list_files_allows_own_cabinet(monkeypatch, router_module):
    """USER_A listing their own cabinet gets back its single file row unchanged."""
    store = {
        "file_cabinets": [{"id": CAB_A, "user_id": USER_A}],
        "cabinet_memberships": [],
        "files": [{"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A}],
    }
    monkeypatch.setattr(
        router_module,
        "SupabaseServiceRoleClient",
        lambda: FakeServiceRoleClient(store),
    )

    assert router_module.list_files(CAB_A, {"sub": USER_A}) == [
        {"id": "file-a", "cabinet_id": CAB_A, "uploaded_by": USER_A}
    ]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("router_module", ROUTERS)
def test_list_files_denies_non_owner_even_with_cabinet_membership(monkeypatch, router_module):
    """A "viewer" membership row for USER_A on USER_B's cabinet still yields an empty list."""
    store = {
        "file_cabinets": [{"id": CAB_B, "user_id": USER_B}],
        "cabinet_memberships": [
            {"cabinet_id": CAB_B, "profile_id": USER_A, "role": "viewer"}
        ],
        "files": [{"id": "file-b", "cabinet_id": CAB_B, "uploaded_by": USER_B}],
    }
    monkeypatch.setattr(
        router_module,
        "SupabaseServiceRoleClient",
        lambda: FakeServiceRoleClient(store),
    )

    assert router_module.list_files(CAB_B, {"sub": USER_A}) == []
|
||||
51
tests/test_reset_environment_user_subset.py
Normal file
51
tests/test_reset_environment_user_subset.py
Normal file
@ -0,0 +1,51 @@
|
||||
from run.initialization import reset_environment
|
||||
|
||||
|
||||
def test_reset_user_subset_scope_only_runs_user_subset_cleanup(monkeypatch):
    """``reset(scope="user-subset")`` calls the guard once and returns only the user-subset report."""
    calls = []

    monkeypatch.setattr(
        reset_environment,
        "_sb_headers",
        lambda: ("http://192.168.0.94:8000", {"Authorization": "Bearer redacted"}),
    )
    # Record the guard invocation instead of enforcing it.
    monkeypatch.setattr(
        reset_environment,
        "_assert_reset_allowed",
        lambda url, scope: calls.append(("guard", url, scope)),
    )
    monkeypatch.setattr(
        reset_environment,
        "_clear_user_subset_files",
        lambda: {"files_rows_deleted": 2, "storage_objects_removed": 2, "errors": []},
    )

    def fail_if_called(*_args, **_kwargs):
        raise AssertionError("reset(scope='user-subset') must not clear unrelated tables or databases")

    # Any of these broader cleanups being invoked fails the test.
    monkeypatch.setattr(reset_environment, "_clear_tables", fail_if_called)
    monkeypatch.setattr(reset_environment, "_neo4j_drop_all_non_system", fail_if_called)
    monkeypatch.setattr(reset_environment, "_clear_exam_storage", fail_if_called)

    result = reset_environment.reset(scope="user-subset")

    assert calls == [("guard", "http://192.168.0.94:8000", "user-subset")]
    assert result == {
        "scope": "user-subset",
        "user_subset": {"files_rows_deleted": 2, "storage_objects_removed": 2, "errors": []},
    }
|
||||
|
||||
|
||||
def test_reset_accepts_case_insensitive_user_subset_scope(monkeypatch):
    """``scope="USER-SUBSET"`` is reported back as the lower-case "user-subset"."""
    monkeypatch.setattr(reset_environment, "_sb_headers", lambda: ("http://192.168.0.94:8000", {}))
    monkeypatch.setattr(reset_environment, "_assert_reset_allowed", lambda *_args, **_kwargs: None)
    monkeypatch.setattr(
        reset_environment,
        "_clear_user_subset_files",
        lambda: {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": []},
    )

    assert reset_environment.reset(scope="USER-SUBSET") == {
        "scope": "user-subset",
        "user_subset": {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": []},
    }
|
||||
171
tests/test_seed_exam_corpus_unseed.py
Normal file
171
tests/test_seed_exam_corpus_unseed.py
Normal file
@ -0,0 +1,171 @@
|
||||
from run.initialization.seed_exam_corpus import LoadReport, _delete_user_subset_files
|
||||
|
||||
|
||||
class _Result:
    """Minimal query result: ``data`` holds the rows, or ``[]`` when given nothing (or anything falsy)."""

    def __init__(self, data=None):
        self.data = data or []
|
||||
|
||||
|
||||
class _FilesQuery:
    """Chainable in-memory query over ``db.rows`` supporting ``eq``, prefix ``like`` and ``in_`` filters."""

    def __init__(self, db, op="select"):
        self.db = db
        self.op = op
        self.filters = []      # ("eq" | "like", key, value) tuples, checked in insertion order
        self.in_filters = []   # (key, set_of_allowed_values) tuples

    def select(self, *_args, **_kwargs):
        """Ignore the column selection; return ``self`` for chaining."""
        return self

    def delete(self, *_args, **_kwargs):
        """Switch the query into delete mode; return ``self`` for chaining."""
        self.op = "delete"
        return self

    def eq(self, key, value):
        """Record an equality filter."""
        self.filters.append(("eq", key, value))
        return self

    def like(self, key, pattern):
        """Record a LIKE filter; only patterns ending in ``%`` are supported."""
        self.filters.append(("like", key, pattern))
        return self

    def in_(self, key, values):
        """Record a membership filter (values are stored as a set)."""
        self.in_filters.append((key, set(values)))
        return self

    def _matches(self, row):
        """Return True when ``row`` satisfies every recorded filter."""
        for kind, key, value in self.filters:
            actual = row.get(key)
            if kind == "eq" and actual != value:
                return False
            if kind == "like":
                # Only trailing-wildcard patterns are emulated (prefix match).
                assert value.endswith("%")
                if not isinstance(actual, str) or not actual.startswith(value[:-1]):
                    return False
        for key, values in self.in_filters:
            if row.get(key) not in values:
                return False
        return True

    def execute(self):
        """Return the matching rows; in delete mode also log the deleted ids and drop those rows from ``db``."""
        matched = [row for row in self.db.rows if self._matches(row)]
        if self.op == "delete":
            self.db.ops.append(("delete", [row["id"] for row in matched]))
            self.db.rows = [row for row in self.db.rows if not self._matches(row)]
        return _Result(matched)
|
||||
|
||||
|
||||
class _FakeDb:
    """In-memory stand-in for the database client; only the ``files`` table exists."""

    def __init__(self, rows):
        self.rows = list(rows)
        self.ops = []  # log of destructive operations, appended to by queries

    def table(self, name):
        """Return a fresh ``_FilesQuery`` bound to this db; any table other than ``files`` is rejected."""
        assert name == "files"
        return _FilesQuery(self)
|
||||
|
||||
|
||||
class _FakeStorageBucket:
    """Bucket handle that logs ``remove`` calls on the owning fake storage."""

    def __init__(self, storage, bucket):
        self.storage = storage
        self.bucket = bucket

    def remove(self, paths):
        """Log the removal, then raise, return an error payload, or return ``[]`` per the storage's flags."""
        # The call is logged before any failure, so failed attempts are visible too.
        self.storage.ops.append(("remove", self.bucket, list(paths)))
        if self.storage.fail:
            raise RuntimeError("storage unavailable")
        if self.storage.result_error:
            return {"error": self.storage.result_error}
        return []
|
||||
|
||||
|
||||
class _FakeStorageRoot:
    """Stand-in for the ``storage`` attribute: hands out bucket handles via ``from_``."""

    def __init__(self, storage):
        self.storage = storage

    def from_(self, bucket):
        """Return a ``_FakeStorageBucket`` for ``bucket`` tied to the shared fake storage."""
        return _FakeStorageBucket(self.storage, bucket)
|
||||
|
||||
|
||||
class _FakeStorage:
    """Fake storage admin exposing ``client.supabase.storage`` and recording calls in ``ops``.

    ``fail`` makes bucket removals raise; ``result_error`` makes them return an
    error payload instead.
    """

    def __init__(self, fail=False, result_error=None):
        self.fail = fail
        self.result_error = result_error
        self.ops = []
        # Build throwaway classes so attribute access mirrors client.supabase.storage.
        supabase = type("SB", (), {"storage": _FakeStorageRoot(self)})()
        self.client = type("Client", (), {"supabase": supabase})()
|
||||
|
||||
|
||||
class _FakeClient:
    """Client wrapper whose ``supabase`` attribute is the supplied fake db."""

    def __init__(self, db):
        self.supabase = db
|
||||
|
||||
|
||||
def test_delete_user_subset_storage_before_files_rows_for_scoped_exams():
    """Scoped to exam "A", only the seeded exam-marker row f1 is removed from storage and the db."""
    db = _FakeDb([
        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
        # Same source and prefix but a different exam code.
        {"id": "f2", "bucket": "cc.users", "path": "exam-marker/i/c/f2/B.pdf", "name": "B.pdf", "source": "exam-corpus-seed"},
        # Same name and prefix but not created by the seeder.
        {"id": "f3", "bucket": "cc.users", "path": "exam-marker/i/c/f3/A.pdf", "name": "A.pdf", "source": "manual"},
        # Seeded, but outside the exam-marker/ prefix.
        {"id": "f4", "bucket": "cc.users", "path": "other/f4/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
    ])
    storage = _FakeStorage()
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)

    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
    assert db.ops == [("delete", ["f1"])]
    assert [row["id"] for row in db.rows] == ["f2", "f3", "f4"]
    assert rep.unseed_objects == 1
    assert rep.unseed_user_files == 1
    assert rep.errors == []
|
||||
|
||||
|
||||
def test_delete_user_subset_keeps_files_rows_when_storage_remove_fails():
    """When storage removal raises, no db delete is issued, counters stay 0 and an error is recorded."""
    db = _FakeDb([
        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
    ])
    storage = _FakeStorage(fail=True)
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)

    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
    assert db.ops == []
    assert [row["id"] for row in db.rows] == ["f1"]
    assert rep.unseed_objects == 0
    assert rep.unseed_user_files == 0
    assert rep.errors
|
||||
|
||||
|
||||
def test_delete_user_subset_keeps_files_rows_when_storage_remove_returns_error():
    """When storage removal returns an error payload, no db delete is issued and an error is recorded."""
    db = _FakeDb([
        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
    ])
    storage = _FakeStorage(result_error="permission denied")
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=["A"], rep=rep)

    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf"])]
    assert db.ops == []
    assert [row["id"] for row in db.rows] == ["f1"]
    assert rep.unseed_objects == 0
    assert rep.unseed_user_files == 0
    assert rep.errors
|
||||
|
||||
|
||||
def test_delete_user_subset_unscoped_cleans_all_seeded_exam_marker_rows():
    """With ``exam_codes=None`` both seeded exam-marker rows are removed in one storage call and one delete."""
    db = _FakeDb([
        {"id": "f1", "bucket": "cc.users", "path": "exam-marker/i/c/f1/A.pdf", "name": "A.pdf", "source": "exam-corpus-seed"},
        {"id": "f2", "bucket": "cc.users", "path": "exam-marker/i/c/f2/B.pdf", "name": "B.pdf", "source": "exam-corpus-seed"},
    ])
    storage = _FakeStorage()
    rep = LoadReport()

    _delete_user_subset_files(_FakeClient(db), storage, exam_codes=None, rep=rep)

    assert storage.ops == [("remove", "cc.users", ["exam-marker/i/c/f1/A.pdf", "exam-marker/i/c/f2/B.pdf"])]
    assert db.ops == [("delete", ["f1", "f2"])]
    assert db.rows == []
    assert rep.unseed_objects == 2
    assert rep.unseed_user_files == 2
|
||||
54
tests/test_upload_validation.py
Normal file
54
tests/test_upload_validation.py
Normal file
@ -0,0 +1,54 @@
|
||||
import asyncio
|
||||
|
||||
import pytest
|
||||
from fastapi import HTTPException
|
||||
|
||||
from modules.upload_validation import MAX_UPLOAD_BYTES, read_pdf_upload_bytes, read_upload_bytes
|
||||
|
||||
|
||||
class FakeUpload:
    """In-memory upload stub with an async, cursor-based ``read`` plus ``content_type`` and ``filename``."""

    def __init__(self, data: bytes, content_type: str, filename: str = "file.bin"):
        self._data = data
        self._pos = 0  # read cursor into _data
        self.content_type = content_type
        self.filename = filename

    async def read(self, size: int = -1) -> bytes:
        """Return up to ``size`` bytes from the cursor; ``None`` or a negative size reads the rest.

        Returns ``b""`` once the data is exhausted.
        """
        if self._pos >= len(self._data):
            return b""
        if size is None or size < 0:
            size = len(self._data) - self._pos
        chunk = self._data[self._pos : self._pos + size]
        self._pos += len(chunk)
        return chunk
|
||||
|
||||
|
||||
def run(coro):
    """Run ``coro`` to completion with ``asyncio.run`` and return its result."""
    return asyncio.run(coro)
|
||||
|
||||
|
||||
def test_valid_pdf_upload_passes_and_returns_mime():
    """A small ``%PDF-`` payload declared as application/pdf is returned along with its MIME type."""
    data, mime = run(read_upload_bytes(FakeUpload(b"%PDF-1.7\n", "application/pdf")))

    assert data.startswith(b"%PDF-")
    assert mime == "application/pdf"
|
||||
|
||||
|
||||
def test_disallowed_mime_rejected_with_415():
    """An upload declared as application/x-python is rejected with HTTP 415."""
    with pytest.raises(HTTPException) as exc:
        run(read_upload_bytes(FakeUpload(b"print(1)", "application/x-python")))

    assert exc.value.status_code == 415
    assert "Unsupported upload type" in exc.value.detail
|
||||
|
||||
|
||||
def test_oversize_upload_rejected_with_413():
    """A payload one byte over ``MAX_UPLOAD_BYTES`` is rejected with HTTP 413."""
    with pytest.raises(HTTPException) as exc:
        run(read_upload_bytes(FakeUpload(b"x" * (MAX_UPLOAD_BYTES + 1), "text/plain")))

    assert exc.value.status_code == 413
    assert "exceeds max size" in exc.value.detail
|
||||
|
||||
|
||||
def test_pdf_helper_rejects_spoofed_pdf_mime():
    """Content declared as application/pdf but lacking PDF bytes is rejected with HTTP 415."""
    with pytest.raises(HTTPException) as exc:
        run(read_pdf_upload_bytes(FakeUpload(b"not a pdf", "application/pdf")))

    assert exc.value.status_code == 415
    assert "not a valid PDF" in exc.value.detail
|
||||
Loading…
x
Reference in New Issue
Block a user