diff --git a/api/services/docling/.gitignore b/api/services/docling/.gitignore new file mode 100644 index 0000000..9339e1a --- /dev/null +++ b/api/services/docling/.gitignore @@ -0,0 +1,5 @@ +# B1 image-only eval corpus + pipeline outputs: fetched/generated at runtime, never committed. +# Exam-board PDFs are third-party copyright (served only via signed URLs); results/ are reproducible. +/samples/b1/ +/results/b1_rapid/ +/results/final/ diff --git a/api/services/docling/finalize.py b/api/services/docling/finalize.py index bb1d9c5..913642a 100644 --- a/api/services/docling/finalize.py +++ b/api/services/docling/finalize.py @@ -59,6 +59,61 @@ GEOMETRY = [ extract=["--docling", "results/genreport/ocrh556/ocr.json", "--board", "ocr", "--marks-fill", "results/genreport/ocrh556/marks_fill.json"]), ] + +B1_GEOMETRY = [ + dict(slug="b1-aqa-biology-7402-1-2023jun", title="AQA A-level Biology 7402/1 2023 Jun (image-only OCR baseline)", + board="aqa", level="A-level", path="B1 image-only OCR (RapidOCR margin-pass)", + storage_loc="cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf", + pdf="samples/b1/aqa-biology-7402-1-2023jun.pdf", + docling="results/b1_rapid/b1-aqa-biology-7402-1-2023jun/merged.json", + rapid="results/b1_rapid/b1-aqa-biology-7402-1-2023jun/p*.json", + gt_key="b1-aqa-biology-7402-1-2023jun"), + dict(slug="b1-aqa-chemistry-7405-1-2022jun", title="AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)", + board="aqa", level="A-level", path="B1 image-only OCR (RapidOCR margin-pass)", + storage_loc="cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf", + pdf="samples/b1/aqa-chemistry-7405-1-2022jun.pdf", + docling="results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/merged.json", + rapid="results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/p*.json", + gt_key="b1-aqa-chemistry-7405-1-2022jun"), + dict(slug="b1-aqa-physics-7408-1-2022jun", title="AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)", + board="aqa", level="A-level", path="B1 image-only OCR 
(RapidOCR margin-pass)", + storage_loc="cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf", + pdf="samples/b1/aqa-physics-7408-1-2022jun.pdf", + docling="results/b1_rapid/b1-aqa-physics-7408-1-2022jun/merged.json", + rapid="results/b1_rapid/b1-aqa-physics-7408-1-2022jun/p*.json", + gt_key="b1-aqa-physics-7408-1-2022jun"), + dict(slug="b1-aqa-biology-8461-1h-2022jun", title="AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)", + board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)", + storage_loc="cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf", + pdf="samples/b1/aqa-biology-8461-1h-2022jun.pdf", + docling="results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/merged.json", + rapid="results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/p*.json", + gt_key="b1-aqa-biology-8461-1h-2022jun"), + dict(slug="b1-aqa-chemistry-8462-1h-2022jun", title="AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)", + board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)", + storage_loc="cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf", + pdf="samples/b1/aqa-chemistry-8462-1h-2022jun.pdf", + docling="results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/merged.json", + rapid="results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/p*.json", + gt_key="b1-aqa-chemistry-8462-1h-2022jun"), + dict(slug="b1-aqa-combined-8464-b1h-2022jun", title="AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)", + board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)", + storage_loc="cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf", + pdf="samples/b1/aqa-combined-8464-b1h-2022jun.pdf", + docling="results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/merged.json", + rapid="results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/p*.json", + gt_key="b1-aqa-combined-8464-b1h-2022jun"), + dict(slug="b1-aqa-combined-8464-c1h-2022jun", title="AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun (image-only OCR 
baseline; 8465 not present in dev catalogue)", + board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)", + storage_loc="cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf", + pdf="samples/b1/aqa-combined-8464-c1h-2022jun.pdf", + docling="results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/merged.json", + rapid="results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/p*.json", + gt_key="b1-aqa-combined-8464-c1h-2022jun"), +] + +GT_LABELS_PATH = "fixtures/b1_gt_labels.json" + FAST = [ dict(slug="aqa-physics-7408-fast", title="AQA A-level Physics 7408/1 (born-digital)", board="aqa", level="A-level", pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf", @@ -95,16 +150,65 @@ def jload(p): return {} -def stats_from(struct, val): + +def load_gt_labels(): + try: + return json.load(open(GT_LABELS_PATH)) + except Exception: + return {} + + +def part_labels(struct): + labels = [] + for q in struct.get("questions", []) or []: + for part in q.get("parts", []) or []: + lab = part.get("label") + if lab: + labels.append(lab) + return labels + + +def coverage_against_labels(struct, labels): + if not labels: + return None + rec = set(part_labels(struct)) + gt = set(labels) + hit = sorted(rec & gt) + miss = sorted(gt - rec) + return {"coverage_pct": round(len(hit) / len(gt) * 100, 1), + "recovered": len(hit), "total": len(gt), "missed": miss, + "source": "fixtures/b1_gt_labels.json"} + + +def answer_region_count(struct): + top = len(struct.get("regions", []) or []) + per_part = 0 + for q in struct.get("questions", []) or []: + for part in q.get("parts", []) or []: + per_part += len(part.get("regions", []) or []) + return top + per_part + + +def ensure_rapid_cache(p): + if os.path.exists(p["docling"]): + return True + if not os.path.exists(p["pdf"]): + print(f" ! 
missing source PDF for {p['slug']}: {p['pdf']} (storage_loc={p.get('storage_loc')})") + return False + return run(["scripts/rapid_pass.py", p["pdf"], "b1_rapid/" + p["slug"]]) + +def stats_from(struct, val, gt_labels=None): st = struct.get("stats", {}) or {} mc = st.get("marks_check") or {} - cov = struct.get("coverage", {}) or {} + cov = coverage_against_labels(struct, gt_labels) if gt_labels else (struct.get("coverage", {}) or {}) return { "board": struct.get("board"), "paper_code": struct.get("paper_code"), "n_questions": st.get("n_questions"), "n_parts": st.get("n_parts"), "marks_sum": mc.get("sum"), "official_max": mc.get("expected_max"), "marks_pct": mc.get("pct"), - "coverage_pct": cov.get("coverage_pct"), "coverage_missed": cov.get("missed", []), + "coverage_pct": cov.get("coverage_pct"), "coverage_recovered": cov.get("recovered"), + "coverage_total": cov.get("total"), "coverage_source": cov.get("source"), + "coverage_missed": cov.get("missed", []), "answer_regions": answer_region_count(struct), "validate_verdict": (val.get("summary") or {}).get("worst_severity"), "validate_flags": val.get("flags", []), "questions_expected": (val.get("summary") or {}).get("questions_expected"), @@ -113,12 +217,15 @@ def stats_from(struct, val): } -def do_geometry(p, overlays): +def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False): d = os.path.join(FINAL, p["slug"]); os.makedirs(d, exist_ok=True) S, F, B, R, T, V = (os.path.join(d, f) for f in ("structured.json", "furniture.json", "bands.json", "page_roles.json", "template.json", "validate.json")) - ex = ["extract.py"] + p["extract"] + ["--out", S] + if prepare_ocr and not ensure_rapid_cache(p): + raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}") + extract_args = p.get("extract") or ["--docling", p["docling"], "--rapid", p["rapid"], "--board", p.get("board", "aqa")] + ex = ["extract.py"] + extract_args + ["--out", S] if p.get("gt"): ex += ["--gt", p["gt"]] run(ex) @@ -138,7 +245,7 @@ def 
do_geometry(p, overlays): odbg = os.path.join(d, "overlays", "debug") run(["scripts/overlay.py", S, p["pdf"], "--docling", p["docling"], "--bands", B, "--furniture", F, "--pages", "1,2,3,4,5", "--dpi", "120", "--out", odbg]) - return stats_from(jload(S), jload(V)), d + return stats_from(jload(S), jload(V), gt_labels), d def do_fast(p): @@ -164,6 +271,7 @@ def per_paper_report(p, s, d, kind): + (f" (missed {s['coverage_missed'][:8]})" if s.get('coverage_missed') else "") if s['coverage_pct'] is not None else "- **coverage vs GT:** n/a", f"- **G6 verdict:** {s['validate_verdict']}", + f"- **answer-region count:** {s.get('answer_regions')}", ] if s["validate_flags"]: lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]] @@ -178,21 +286,28 @@ def per_paper_report(p, s, d, kind): def main(): ap = argparse.ArgumentParser() ap.add_argument("--no-overlays", action="store_true") + ap.add_argument("--b1-only", action="store_true", help="run only the Sprint B1 image-only OCR eval corpus") + ap.add_argument("--prepare-ocr", action="store_true", help="populate missing B1 RapidOCR caches via dsync before running") a = ap.parse_args() os.makedirs(FINAL, exist_ok=True) catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"), "papers": []} total_imgs = 0 - for p in GEOMETRY: + gt_fixtures = load_gt_labels() + geometry = B1_GEOMETRY if a.b1_only else GEOMETRY + fast = [] if a.b1_only else FAST + + for p in geometry: print(f"[geometry] {p['slug']}") - s, d = do_geometry(p, not a.no_overlays) + gt_labels = (gt_fixtures.get(p.get("gt_key") or p["slug"], {}) or {}).get("labels") + s, d = do_geometry(p, not a.no_overlays, gt_labels=gt_labels, prepare_ocr=a.prepare_ocr) n = per_paper_report(p, s, d, p["path"]) total_imgs += n catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")}, "kind": "geometry", "path": p["path"], "dir": d, "overlay_images": n, **s}) - for p in FAST: + for p in fast: 
print(f"[fast] {p['slug']}") s, d = do_fast(p) per_paper_report(p, s, d, "born-digital fast-path") @@ -214,13 +329,13 @@ def write_index(catalog, total_imgs): "`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).", "Machine catalog: `catalog.json`.", "", "## Image-only / OCR-path (with geometry + overlays)", "", - "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 | Images |", - "|---|---|---|---|---|---|---|"] + "| Paper | Board / level | Q/parts | Marks/max | Coverage | Answer regions | G6 | Images |", + "|---|---|---|---|---|---|---|---|"] for p in g: cov = f"{p['coverage_pct']}%" if p['coverage_pct'] is not None else "n/a" L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | " f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} " - f"({p['marks_pct']}%) | {cov} | {p['validate_verdict']} | " + f"({p['marks_pct']}%) | {cov} | {p.get('answer_regions')} | {p['validate_verdict']} | " f"{p['overlay_images']} |") L += ["", "## Born-digital fast-path (CPU, no geometry)", "", "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |", diff --git a/api/services/docling/fixtures/b1_gt_labels.json b/api/services/docling/fixtures/b1_gt_labels.json new file mode 100644 index 0000000..7f39758 --- /dev/null +++ b/api/services/docling/fixtures/b1_gt_labels.json @@ -0,0 +1,356 @@ +{ + "b1-aqa-biology-7402-1-2023jun": { + "source_pdf": "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf", + "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.", + "board_detected": "aqa", + "paper_code_detected": "7402/1", + "labels": [ + "01.1", + "01.2", + "01.3", + "02.1", + "02.2", + "02.3", + "03.1", + "03.2", + "03.3", + "03.4", + "03.5", + "04.1", + "04.2", + "04.3", + "05.1", + "05.2", + "05.3", + "05.4", + "05.5", + "06.1", + "06.2", + "06.3", + "06.4", + "07.1", + "07.2", + 
"89.6", + "08.1", + "08.2", + "08.3", + "08.4", + "09.1", + "09.2", + "09.3", + "09.4", + "09.5", + "09.6", + "10.1", + "10.2", + "10.3" + ] + }, + "b1-aqa-chemistry-7405-1-2022jun": { + "source_pdf": "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf", + "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.", + "board_detected": "aqa", + "paper_code_detected": "7405/1", + "labels": [ + "01.1", + "01.2", + "01.3", + "01.4", + "01.5", + "01.6", + "02.1", + "02.2", + "02.3", + "02.4", + "02.5", + "03.1", + "03.2", + "03.3", + "03.4", + "03.5", + "04.1", + "04.2", + "04.3", + "04.4", + "04.5", + "05.1", + "05.2", + "05.3", + "05.4", + "05.5", + "05.6", + "05.7", + "06.1", + "06.2", + "06.3", + "06.4", + "06.5", + "06.6", + "06.7", + "07.1", + "07.2", + "07.3", + "07.4", + "07.5", + "07.6", + "07.7", + "08.1", + "08.2", + "08.3", + "08.4", + "08.5" + ] + }, + "b1-aqa-physics-7408-1-2022jun": { + "source_pdf": "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf", + "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.", + "board_detected": "aqa", + "paper_code_detected": "7408/1", + "labels": [ + "01.1", + "01.2", + "01.3", + "01.4", + "01.5", + "02.1", + "02.2", + "02.3", + "02.4", + "03.1", + "03.2", + "03.3", + "03.4", + "03.5", + "04.1", + "04.2", + "04.3", + "04.4", + "04.5", + "05.1", + "05.2", + "05.3", + "05.4", + "05.5", + "05.6", + "06.1", + "06.2", + "06.3", + "07.0", + "08.0", + "09.0", + "10.0", + "11.0", + "12.0", + "13.0", + "14.0", + "15.0", + "16.0", + "17.0", + "18.0", + "19.0", + "20.0", + "21.0", + "22.0", + "23.0", + "24.0", + "25.0", + "26.0", + "27.0", + "28.0", + "29.0", + "30.0", + "31.0" + ] + }, + "b1-aqa-biology-8461-1h-2022jun": { + "source_pdf": "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf", + "source_method": "AQA born-digital text-layer 
parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.", + "board_detected": "aqa", + "paper_code_detected": "8461/1", + "labels": [ + "01.1", + "01.2", + "01.3", + "01.4", + "01.5", + "01.6", + "01.7", + "01.8", + "01.9", + "02.1", + "02.2", + "02.3", + "02.4", + "02.5", + "02.6", + "03.1", + "03.2", + "03.3", + "03.4", + "03.5", + "04.1", + "04.2", + "04.3", + "04.4", + "04.5", + "05.1", + "05.2", + "05.3", + "05.4", + "05.5", + "06.1", + "06.2", + "06.3", + "06.4", + "06.5", + "07.1", + "07.2", + "07.3", + "07.4", + "07.5", + "07.6", + "07.7", + "07.8" + ] + }, + "b1-aqa-chemistry-8462-1h-2022jun": { + "source_pdf": "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf", + "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.", + "board_detected": "aqa", + "paper_code_detected": "8462/1", + "labels": [ + "01.1", + "01.2", + "01.3", + "01.4", + "01.5", + "01.6", + "01.7", + "02.1", + "02.2", + "02.3", + "02.4", + "02.5", + "02.6", + "03.1", + "03.2", + "03.3", + "03.4", + "03.5", + "04.1", + "04.2", + "04.3", + "04.4", + "04.5", + "04.6", + "04.7", + "05.1", + "05.2", + "05.3", + "05.4", + "05.5", + "06.1", + "06.2", + "06.3", + "06.4", + "06.5", + "06.6", + "07.1", + "07.2", + "07.3", + "07.4", + "07.5", + "07.6", + "08.1", + "08.2", + "08.3", + "08.4", + "08.5" + ] + }, + "b1-aqa-combined-8464-b1h-2022jun": { + "source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf", + "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.", + "board_detected": "aqa", + "paper_code_detected": null, + "labels": [ + "01.1", + "01.2", + "01.3", + "01.4", + "01.5", + "01.6", + "01.7", + "01.8", + "02.1", + "02.2", + "02.3", + "02.4", + "02.5", + "02.6", + "02.7", + "03.1", + "03.2", + "03.3", + "03.4", + 
"03.5", + "03.6", + "03.7", + "04.1", + "04.2", + "04.3", + "04.4", + "04.5", + "05.1", + "05.2", + "05.3", + "05.4", + "05.5", + "05.6", + "06.1", + "06.2", + "06.3" + ] + }, + "b1-aqa-combined-8464-c1h-2022jun": { + "source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf", + "source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.", + "board_detected": "aqa", + "paper_code_detected": null, + "labels": [ + "01.1", + "01.2", + "01.3", + "01.4", + "01.5", + "02.1", + "02.2", + "02.3", + "02.4", + "02.5", + "03.0", + "04.1", + "04.2", + "04.3", + "04.4", + "04.5", + "04.6", + "04.7", + "05.1", + "05.2", + "05.3", + "05.4", + "06.1", + "06.2", + "06.3", + "06.4", + "06.5", + "07.1", + "07.2", + "07.3", + "07.4", + "07.5", + "07.6" + ] + } +} diff --git a/api/services/docling/scripts/fetch_b1_corpus.py b/api/services/docling/scripts/fetch_b1_corpus.py new file mode 100644 index 0000000..c163e8a --- /dev/null +++ b/api/services/docling/scripts/fetch_b1_corpus.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""Populate the gitignored B1 image-only eval corpus from the .94 exam-board store. + +The B1 eval papers are NOT committed (third-party copyright; served only via signed URLs). +This script downloads each B1_GEOMETRY paper's `storage_loc` object from cc.examboards via the +Storage API into its local `pdf` path (under samples/b1/), so finalize.py --b1-only and the +B1-2/B1-3 generalization work can run against a real corpus. 
+ +Run from api/services/docling/ inside the cc-api-dev container (SUPABASE_URL/SERVICE_ROLE_KEY in env): + python3 scripts/fetch_b1_corpus.py # fetch all B1 papers (skip existing) + python3 scripts/fetch_b1_corpus.py --force # re-download + python3 scripts/fetch_b1_corpus.py --only b1-aqa-physics-7408-1-2022jun + python3 scripts/fetch_b1_corpus.py --list # show what would be fetched, no download +""" +from __future__ import annotations + +import argparse +import os +import sys + +# Import the canonical B1 corpus definition (slug, storage_loc, local pdf path) from finalize. +_HERE = os.path.dirname(os.path.abspath(__file__)) +_DOCLING_DIR = os.path.dirname(_HERE) +sys.path.insert(0, _DOCLING_DIR) +from finalize import B1_GEOMETRY # noqa: E402 + + +def _split_storage_loc(storage_loc: str) -> tuple[str, str]: + """'cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf' -> ('cc.examboards', 'aqa/.../qp.pdf').""" + bucket, _, path = storage_loc.partition("/") + if not bucket or not path: + raise ValueError(f"malformed storage_loc: {storage_loc!r}") + return bucket, path + + +def _entries(only: str | None): + for p in B1_GEOMETRY: + loc = p.get("storage_loc") + pdf = p.get("pdf") + if not loc or not pdf: + continue + if only and p.get("slug") != only: + continue + yield p["slug"], loc, pdf + + +def main() -> int: + ap = argparse.ArgumentParser(description="Fetch the B1 image-only eval corpus from .94 cc.examboards") + ap.add_argument("--force", action="store_true", help="re-download even if the local file exists") + ap.add_argument("--only", help="fetch a single paper by slug") + ap.add_argument("--list", action="store_true", help="list what would be fetched and exit") + args = ap.parse_args() + + todo = list(_entries(args.only)) + if not todo: + print("no matching B1 papers", file=sys.stderr) + return 1 + + if args.list: + for slug, loc, pdf in todo: + print(f"{slug}\t{loc}\t-> {pdf}") + return 0 + + from modules.database.supabase.utils.storage import StorageAdmin + storage 
= StorageAdmin() + + ok = skipped = 0 + for slug, loc, pdf in todo: + dest = os.path.join(_DOCLING_DIR, pdf) if not os.path.isabs(pdf) else pdf + if os.path.exists(dest) and not args.force: + print(f"[skip] {slug} (exists)") + skipped += 1 + continue + bucket, path = _split_storage_loc(loc) + data = storage.download_file(bucket, path) + os.makedirs(os.path.dirname(dest), exist_ok=True) + with open(dest, "wb") as fh: + fh.write(data) + print(f"[ok] {slug} <- {bucket}/{path} ({len(data)} bytes)") + ok += 1 + + print(f"fetched {ok}, skipped {skipped}, of {len(todo)}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/api/services/docling/scripts/make_b1_gt.py b/api/services/docling/scripts/make_b1_gt.py new file mode 100644 index 0000000..48c72ca --- /dev/null +++ b/api/services/docling/scripts/make_b1_gt.py @@ -0,0 +1,32 @@ +import json, sys +from pathlib import Path +base=Path('/app/api/services/docling') +sys.path.insert(0, str(base)) +import extract +papers=[ +('b1-aqa-biology-7402-1-2023jun','samples/b1/aqa-biology-7402-1-2023jun.pdf','cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf'), +('b1-aqa-chemistry-7405-1-2022jun','samples/b1/aqa-chemistry-7405-1-2022jun.pdf','cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf'), +('b1-aqa-physics-7408-1-2022jun','samples/b1/aqa-physics-7408-1-2022jun.pdf','cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf'), +('b1-aqa-biology-8461-1h-2022jun','samples/b1/aqa-biology-8461-1h-2022jun.pdf','cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf'), +('b1-aqa-chemistry-8462-1h-2022jun','samples/b1/aqa-chemistry-8462-1h-2022jun.pdf','cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf'), +('b1-aqa-combined-8464-b1h-2022jun','samples/b1/aqa-combined-8464-b1h-2022jun.pdf','cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf'), +('b1-aqa-combined-8464-c1h-2022jun','samples/b1/aqa-combined-8464-c1h-2022jun.pdf','cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf'), +] 
+out={}
+for slug, rel, storage in papers:
+    lines=extract.lines_from_pdftext(str(base/rel))
+    board, code=extract.detect_board(lines)
+    if board != 'aqa':
+        raise RuntimeError(f'{slug}: expected AQA board, detected {board!r} ({code!r})')
+    parts=extract.parse_text_by_board(lines, board)
+    labels=list(parts)
+    out[slug]={
+        'source_pdf': storage,
+        'source_method': 'AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.',
+        'board_detected': board,
+        'paper_code_detected': code,
+        'labels': labels,
+    }
+    print(slug, board, code, len(labels), labels[:5], labels[-5:])
+Path(base/'fixtures').mkdir(exist_ok=True)
+Path(base/'fixtures/b1_gt_labels.json').write_text(json.dumps(out, indent=2)+"\n")
diff --git a/api/services/docling/scripts/rapid_pass.py b/api/services/docling/scripts/rapid_pass.py
new file mode 100644
index 0000000..c5b0bc5
--- /dev/null
+++ b/api/services/docling/scripts/rapid_pass.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+"""
+rapid_pass.py — generalise the proven AQA "RapidOCR margin-pass" (95.2% on the image-only
+8463 paper) to any AQA paper. Born-digital AQA QPs ship a text layer, so we force RapidOCR
+over the *rendered* page (`force_ocr:true`) to simulate the image-only redistribution case
+and recover the boxed `NN.M` question numbers Tesseract shatters.
+
+For each page it writes results/{slug}/p{N}.json (a full per-page DoclingDocument, the
+shape extract.py's aqa_questions_rapid expects) and a merged.json (for board / front-matter
+detection). All GPU work is serialised + OOM-resilient through dsync.
+
+Usage:
+    python scripts/rapid_pass.py samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf rapid_7408
+    python scripts/rapid_pass.py PDF SLUG [first_page] [last_page]
+"""
+import os, sys, json, subprocess, re
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import dsync
+
+OPTS = {"ocr_engine": "rapidocr", "force_ocr": True}
+
+
+def npages(pdf):  # page count parsed from the "Pages:" field of `pdfinfo` output
+    out = subprocess.check_output(["pdfinfo", pdf]).decode()
+    return int(out.split("Pages:")[1].split()[0])
+
+
+def main():  # argv: PDF SLUG [first_page] [last_page]; writes results/{slug}/p{N}.json + merged.json
+    pdf = sys.argv[1]
+    slug = sys.argv[2]  # becomes a directory under results/, so it is validated before use
+    if os.path.isabs(slug) or ".." in slug.split(os.sep) or not re.fullmatch(r"[A-Za-z0-9._/-]+", slug):
+        raise SystemExit(f"unsafe output slug: {slug!r}")
+    n = npages(pdf)
+    first = int(sys.argv[3]) if len(sys.argv) > 3 else 1
+    last = min(int(sys.argv[4]), n) if len(sys.argv) > 4 else n  # clamp to the real page count
+    if first > n or first > last:
+        print(f"requested page range {first}-{last} is outside PDF ({n} pages); nothing to do")
+        return
+    outdir = os.path.join("results", slug)
+    os.makedirs(outdir, exist_ok=True)
+
+    r = dsync._redis()
+    print(f"redis: {'connected' if r else 'NO CACHE'} pdf={pdf} pages {first}-{last}/{n}")
+    merged = {"texts": [], "tables": [], "pictures": [], "pages": {}, "_failed_pages": []}
+    for pg in range(first, last + 1):
+        page_path = os.path.join(outdir, f"p{pg}.json")
+        if os.path.exists(page_path):  # reuse a page file written by an earlier run
+            doc = json.load(open(page_path))
+            print(f" p{pg}: file cache HIT ({len(doc.get('texts', []))} texts)")  # key must be quoted: bare `texts` is an undefined name
+        else:
+            doc = dsync.convert_page(pdf, pg, OPTS, r=r)
+            if not doc:
+                merged["_failed_pages"].append(pg)
+                print(f" p{pg}: FAILED")
+                continue  # failed pages are recorded but contribute nothing to merged.json
+            json.dump(doc, open(page_path, "w"))
+        for k in ("texts", "tables", "pictures"):
+            merged[k].extend(doc.get(k, []))
+        merged["pages"].update(doc.get("pages", {}))
+        nmarg = sum(1 for t in doc.get("texts", [])
+                    if (t.get("prov") or [{}])[0].get("bbox", {}).get("l", 999) <= 140)
+        print(f" p{pg}: {len(doc.get('texts', []))} texts ({nmarg} left-margin)")
+    json.dump(merged, open(os.path.join(outdir, "merged.json"), "w"))
+    print(f"-> {outdir}/ ({last-first+1-len(merged['_failed_pages'])} pages, "
+          f"failed={merged['_failed_pages']})")
+
+
+if __name__ == "__main__":
+    main()