feat(docling): B1 image-only OCR eval harness (overwatch-cleaned)
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
Eval harness for AQA A-level + GCSE-science image-only papers: finalize.py --b1-only, RapidOCR runner (rapid_pass.py via dsync), GT fixtures (make_b1_gt.py + b1_gt_labels.json), and fetch_b1_corpus.py to pull the eval corpus from .94 cc.examboards at runtime. Salvaged from t_15be12ed (which timed out on iteration budget re-running OCR): exam PDFs and generated OCR caches/reports are NOT committed (third-party copyright + reproducible) — gitignored and fetched/generated at runtime. Baseline coverage recorded in the task evidence file. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
34fc7edd68
commit
69d9c46abe
5
api/services/docling/.gitignore
vendored
Normal file
5
api/services/docling/.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# B1 image-only eval corpus + pipeline outputs: fetched/generated at runtime, never committed.
|
||||||
|
# Exam-board PDFs are third-party copyright (served only via signed URLs); results/ are reproducible.
|
||||||
|
/samples/b1/
|
||||||
|
/results/b1_rapid/
|
||||||
|
/results/final/
|
||||||
@ -59,6 +59,61 @@ GEOMETRY = [
|
|||||||
extract=["--docling", "results/genreport/ocrh556/ocr.json", "--board", "ocr",
|
extract=["--docling", "results/genreport/ocrh556/ocr.json", "--board", "ocr",
|
||||||
"--marks-fill", "results/genreport/ocrh556/marks_fill.json"]),
|
"--marks-fill", "results/genreport/ocrh556/marks_fill.json"]),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Report "path" label shared by every B1 paper.
_B1_PATH_LABEL = "B1 image-only OCR (RapidOCR margin-pass)"


def _b1_paper(slug, title, level, storage_loc):
    """Build one B1 corpus entry, deriving every local path from ``slug``.

    The local PDF name is the slug without its ``b1-`` prefix; the RapidOCR
    cache (``merged.json`` plus the per-page ``p*.json`` glob) lives under
    ``results/b1_rapid/<slug>/``; the ground-truth fixture key is the slug
    itself.  Deriving them here keeps the four copies of the slug from
    drifting apart.
    """
    return dict(
        slug=slug,
        title=title,
        board="aqa",
        level=level,
        path=_B1_PATH_LABEL,
        storage_loc=storage_loc,
        pdf=f"samples/b1/{slug[len('b1-'):]}.pdf",
        docling=f"results/b1_rapid/{slug}/merged.json",
        rapid=f"results/b1_rapid/{slug}/p*.json",
        gt_key=slug,
    )


# Sprint B1 image-only OCR eval corpus (AQA A-level + GCSE science papers).
# Each entry: (slug, title, level, storage_loc).
B1_GEOMETRY = [
    _b1_paper(
        "b1-aqa-biology-7402-1-2023jun",
        "AQA A-level Biology 7402/1 2023 Jun (image-only OCR baseline)",
        "A-level",
        "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf",
    ),
    _b1_paper(
        "b1-aqa-chemistry-7405-1-2022jun",
        "AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)",
        "A-level",
        "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
    ),
    _b1_paper(
        "b1-aqa-physics-7408-1-2022jun",
        "AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)",
        "A-level",
        "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
    ),
    _b1_paper(
        "b1-aqa-biology-8461-1h-2022jun",
        "AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)",
        "GCSE",
        "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
    ),
    _b1_paper(
        "b1-aqa-chemistry-8462-1h-2022jun",
        "AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)",
        "GCSE",
        "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
    ),
    _b1_paper(
        "b1-aqa-combined-8464-b1h-2022jun",
        "AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)",
        "GCSE",
        "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
    ),
    _b1_paper(
        "b1-aqa-combined-8464-c1h-2022jun",
        "AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun "
        "(image-only OCR baseline; 8465 not present in dev catalogue)",
        "GCSE",
        "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
    ),
]

# Ground-truth label fixture, relative to the working directory like the other paths here.
GT_LABELS_PATH = "fixtures/b1_gt_labels.json"
|
||||||
|
|
||||||
FAST = [
|
FAST = [
|
||||||
dict(slug="aqa-physics-7408-fast", title="AQA A-level Physics 7408/1 (born-digital)", board="aqa",
|
dict(slug="aqa-physics-7408-fast", title="AQA A-level Physics 7408/1 (born-digital)", board="aqa",
|
||||||
level="A-level", pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
|
level="A-level", pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
|
||||||
@ -95,16 +150,65 @@ def jload(p):
|
|||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
|
||||||
def stats_from(struct, val):
|
|
||||||
|
def load_gt_labels(path=None):
    """Load the ground-truth label fixtures as a dict keyed by paper.

    Args:
        path: JSON file to read; defaults to ``GT_LABELS_PATH``.

    Returns:
        The parsed JSON object, or ``{}`` when the file is missing,
        unreadable, not valid JSON, or not a JSON object.  The caller looks
        papers up with ``.get``, so anything other than a dict is unusable.
    """
    if path is None:
        path = GT_LABELS_PATH
    try:
        # The context manager closes the handle even if parsing fails.
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    except FileNotFoundError:
        # No fixtures present: every paper simply has no GT labels.
        return {}
    except (OSError, ValueError) as exc:
        # A present-but-broken fixture should not fail silently.
        print(f"  ! could not load GT labels from {path}: {exc}")
        return {}
    return data if isinstance(data, dict) else {}
|
||||||
|
|
||||||
|
|
||||||
|
def part_labels(struct):
    """Return the non-empty ``label`` of every question part, in document order.

    Args:
        struct: structured-extraction dict; ``questions`` is a list of dicts
            that each may carry a ``parts`` list of dicts.

    Returns:
        List of truthy part labels.  Missing or null ``questions`` / ``parts``
        and null entries inside those lists are skipped rather than raising.
    """
    return [
        part["label"]
        for question in (struct.get("questions") or [])
        if isinstance(question, dict)
        for part in (question.get("parts") or [])
        if isinstance(part, dict) and part.get("label")
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def coverage_against_labels(struct, labels):
    """Score how many ground-truth part labels the extraction recovered.

    Args:
        struct: structured-extraction dict, read through ``part_labels``.
        labels: ground-truth labels for the paper; duplicates are ignored.

    Returns:
        ``None`` when ``labels`` is empty or ``None``; otherwise a dict with
        ``coverage_pct`` (one decimal place), ``recovered``, ``total``,
        ``missed`` (sorted labels not found) and ``source``.
    """
    if not labels:
        return None
    expected = set(labels)
    recovered = expected.intersection(part_labels(struct))
    return {
        "coverage_pct": round(len(recovered) / len(expected) * 100, 1),
        "recovered": len(recovered),
        "total": len(expected),
        "missed": sorted(expected - recovered),
        # Report the fixture actually configured instead of a duplicated literal.
        "source": GT_LABELS_PATH,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def answer_region_count(struct):
    """Count answer regions at the top level plus those attached to each part.

    Args:
        struct: structured-extraction dict with an optional top-level
            ``regions`` list and optional per-part ``regions`` lists under
            ``questions[*].parts[*]``.

    Returns:
        Total number of region entries.  Missing or null lists count as
        empty, and null question/part entries are skipped.
    """
    total = len(struct.get("regions") or [])
    for question in struct.get("questions") or []:
        if not isinstance(question, dict):
            continue
        for part in question.get("parts") or []:
            if isinstance(part, dict):
                total += len(part.get("regions") or [])
    return total
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_rapid_cache(p):
    """Make sure the merged RapidOCR cache for paper ``p`` exists.

    Args:
        p: corpus entry providing ``slug``, ``pdf`` and ``docling`` (the
            merged cache path), and optionally ``storage_loc``.

    Returns:
        ``True`` when ``p["docling"]`` exists, either already or after
        running ``scripts/rapid_pass.py``; ``False`` when the source PDF is
        missing or the pass did not produce the cache.
    """
    if os.path.exists(p["docling"]):
        return True
    if not os.path.exists(p["pdf"]):
        print(f" ! missing source PDF for {p['slug']}: {p['pdf']} (storage_loc={p.get('storage_loc')})")
        return False
    run(["scripts/rapid_pass.py", p["pdf"], "b1_rapid/" + p["slug"]])
    # NOTE(review): run()'s return contract is not visible from here and other
    # callers ignore it, so success is judged by the artifact the pass is
    # meant to write rather than by run()'s truthiness.
    return os.path.exists(p["docling"])
|
||||||
|
|
||||||
|
def stats_from(struct, val, gt_labels=None):
|
||||||
st = struct.get("stats", {}) or {}
|
st = struct.get("stats", {}) or {}
|
||||||
mc = st.get("marks_check") or {}
|
mc = st.get("marks_check") or {}
|
||||||
cov = struct.get("coverage", {}) or {}
|
cov = coverage_against_labels(struct, gt_labels) if gt_labels else (struct.get("coverage", {}) or {})
|
||||||
return {
|
return {
|
||||||
"board": struct.get("board"), "paper_code": struct.get("paper_code"),
|
"board": struct.get("board"), "paper_code": struct.get("paper_code"),
|
||||||
"n_questions": st.get("n_questions"), "n_parts": st.get("n_parts"),
|
"n_questions": st.get("n_questions"), "n_parts": st.get("n_parts"),
|
||||||
"marks_sum": mc.get("sum"), "official_max": mc.get("expected_max"),
|
"marks_sum": mc.get("sum"), "official_max": mc.get("expected_max"),
|
||||||
"marks_pct": mc.get("pct"),
|
"marks_pct": mc.get("pct"),
|
||||||
"coverage_pct": cov.get("coverage_pct"), "coverage_missed": cov.get("missed", []),
|
"coverage_pct": cov.get("coverage_pct"), "coverage_recovered": cov.get("recovered"),
|
||||||
|
"coverage_total": cov.get("total"), "coverage_source": cov.get("source"),
|
||||||
|
"coverage_missed": cov.get("missed", []), "answer_regions": answer_region_count(struct),
|
||||||
"validate_verdict": (val.get("summary") or {}).get("worst_severity"),
|
"validate_verdict": (val.get("summary") or {}).get("worst_severity"),
|
||||||
"validate_flags": val.get("flags", []),
|
"validate_flags": val.get("flags", []),
|
||||||
"questions_expected": (val.get("summary") or {}).get("questions_expected"),
|
"questions_expected": (val.get("summary") or {}).get("questions_expected"),
|
||||||
@ -113,12 +217,15 @@ def stats_from(struct, val):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def do_geometry(p, overlays):
|
def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False):
|
||||||
d = os.path.join(FINAL, p["slug"]); os.makedirs(d, exist_ok=True)
|
d = os.path.join(FINAL, p["slug"]); os.makedirs(d, exist_ok=True)
|
||||||
S, F, B, R, T, V = (os.path.join(d, f) for f in
|
S, F, B, R, T, V = (os.path.join(d, f) for f in
|
||||||
("structured.json", "furniture.json", "bands.json", "page_roles.json",
|
("structured.json", "furniture.json", "bands.json", "page_roles.json",
|
||||||
"template.json", "validate.json"))
|
"template.json", "validate.json"))
|
||||||
ex = ["extract.py"] + p["extract"] + ["--out", S]
|
if prepare_ocr and not ensure_rapid_cache(p):
|
||||||
|
raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}")
|
||||||
|
extract_args = p.get("extract") or ["--docling", p["docling"], "--rapid", p["rapid"], "--board", p.get("board", "aqa")]
|
||||||
|
ex = ["extract.py"] + extract_args + ["--out", S]
|
||||||
if p.get("gt"):
|
if p.get("gt"):
|
||||||
ex += ["--gt", p["gt"]]
|
ex += ["--gt", p["gt"]]
|
||||||
run(ex)
|
run(ex)
|
||||||
@ -138,7 +245,7 @@ def do_geometry(p, overlays):
|
|||||||
odbg = os.path.join(d, "overlays", "debug")
|
odbg = os.path.join(d, "overlays", "debug")
|
||||||
run(["scripts/overlay.py", S, p["pdf"], "--docling", p["docling"], "--bands", B,
|
run(["scripts/overlay.py", S, p["pdf"], "--docling", p["docling"], "--bands", B,
|
||||||
"--furniture", F, "--pages", "1,2,3,4,5", "--dpi", "120", "--out", odbg])
|
"--furniture", F, "--pages", "1,2,3,4,5", "--dpi", "120", "--out", odbg])
|
||||||
return stats_from(jload(S), jload(V)), d
|
return stats_from(jload(S), jload(V), gt_labels), d
|
||||||
|
|
||||||
|
|
||||||
def do_fast(p):
|
def do_fast(p):
|
||||||
@ -164,6 +271,7 @@ def per_paper_report(p, s, d, kind):
|
|||||||
+ (f" (missed {s['coverage_missed'][:8]})" if s.get('coverage_missed') else "")
|
+ (f" (missed {s['coverage_missed'][:8]})" if s.get('coverage_missed') else "")
|
||||||
if s['coverage_pct'] is not None else "- **coverage vs GT:** n/a",
|
if s['coverage_pct'] is not None else "- **coverage vs GT:** n/a",
|
||||||
f"- **G6 verdict:** {s['validate_verdict']}",
|
f"- **G6 verdict:** {s['validate_verdict']}",
|
||||||
|
f"- **answer-region count:** {s.get('answer_regions')}",
|
||||||
]
|
]
|
||||||
if s["validate_flags"]:
|
if s["validate_flags"]:
|
||||||
lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]
|
lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]
|
||||||
@ -178,21 +286,28 @@ def per_paper_report(p, s, d, kind):
|
|||||||
def main():
|
def main():
|
||||||
ap = argparse.ArgumentParser()
|
ap = argparse.ArgumentParser()
|
||||||
ap.add_argument("--no-overlays", action="store_true")
|
ap.add_argument("--no-overlays", action="store_true")
|
||||||
|
ap.add_argument("--b1-only", action="store_true", help="run only the Sprint B1 image-only OCR eval corpus")
|
||||||
|
ap.add_argument("--prepare-ocr", action="store_true", help="populate missing B1 RapidOCR caches via dsync before running")
|
||||||
a = ap.parse_args()
|
a = ap.parse_args()
|
||||||
os.makedirs(FINAL, exist_ok=True)
|
os.makedirs(FINAL, exist_ok=True)
|
||||||
catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
|
catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
|
||||||
"papers": []}
|
"papers": []}
|
||||||
total_imgs = 0
|
total_imgs = 0
|
||||||
|
|
||||||
for p in GEOMETRY:
|
gt_fixtures = load_gt_labels()
|
||||||
|
geometry = B1_GEOMETRY if a.b1_only else GEOMETRY
|
||||||
|
fast = [] if a.b1_only else FAST
|
||||||
|
|
||||||
|
for p in geometry:
|
||||||
print(f"[geometry] {p['slug']}")
|
print(f"[geometry] {p['slug']}")
|
||||||
s, d = do_geometry(p, not a.no_overlays)
|
gt_labels = (gt_fixtures.get(p.get("gt_key") or p["slug"], {}) or {}).get("labels")
|
||||||
|
s, d = do_geometry(p, not a.no_overlays, gt_labels=gt_labels, prepare_ocr=a.prepare_ocr)
|
||||||
n = per_paper_report(p, s, d, p["path"])
|
n = per_paper_report(p, s, d, p["path"])
|
||||||
total_imgs += n
|
total_imgs += n
|
||||||
catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
|
catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
|
||||||
"kind": "geometry", "path": p["path"], "dir": d,
|
"kind": "geometry", "path": p["path"], "dir": d,
|
||||||
"overlay_images": n, **s})
|
"overlay_images": n, **s})
|
||||||
for p in FAST:
|
for p in fast:
|
||||||
print(f"[fast] {p['slug']}")
|
print(f"[fast] {p['slug']}")
|
||||||
s, d = do_fast(p)
|
s, d = do_fast(p)
|
||||||
per_paper_report(p, s, d, "born-digital fast-path")
|
per_paper_report(p, s, d, "born-digital fast-path")
|
||||||
@ -214,13 +329,13 @@ def write_index(catalog, total_imgs):
|
|||||||
"`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
|
"`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
|
||||||
"Machine catalog: `catalog.json`.", "",
|
"Machine catalog: `catalog.json`.", "",
|
||||||
"## Image-only / OCR-path (with geometry + overlays)", "",
|
"## Image-only / OCR-path (with geometry + overlays)", "",
|
||||||
"| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 | Images |",
|
"| Paper | Board / level | Q/parts | Marks/max | Coverage | Answer regions | G6 | Images |",
|
||||||
"|---|---|---|---|---|---|---|"]
|
"|---|---|---|---|---|---|---|---|"]
|
||||||
for p in g:
|
for p in g:
|
||||||
cov = f"{p['coverage_pct']}%" if p['coverage_pct'] is not None else "n/a"
|
cov = f"{p['coverage_pct']}%" if p['coverage_pct'] is not None else "n/a"
|
||||||
L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
|
L.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
|
||||||
f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
|
f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
|
||||||
f"({p['marks_pct']}%) | {cov} | {p['validate_verdict']} | "
|
f"({p['marks_pct']}%) | {cov} | {p.get('answer_regions')} | {p['validate_verdict']} | "
|
||||||
f"{p['overlay_images']} |")
|
f"{p['overlay_images']} |")
|
||||||
L += ["", "## Born-digital fast-path (CPU, no geometry)", "",
|
L += ["", "## Born-digital fast-path (CPU, no geometry)", "",
|
||||||
"| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
|
"| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
|
||||||
|
|||||||
356
api/services/docling/fixtures/b1_gt_labels.json
Normal file
356
api/services/docling/fixtures/b1_gt_labels.json
Normal file
@ -0,0 +1,356 @@
|
|||||||
|
{
|
||||||
|
"b1-aqa-biology-7402-1-2023jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "7402/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"89.6",
|
||||||
|
"08.1",
|
||||||
|
"08.2",
|
||||||
|
"08.3",
|
||||||
|
"08.4",
|
||||||
|
"09.1",
|
||||||
|
"09.2",
|
||||||
|
"09.3",
|
||||||
|
"09.4",
|
||||||
|
"09.5",
|
||||||
|
"09.6",
|
||||||
|
"10.1",
|
||||||
|
"10.2",
|
||||||
|
"10.3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-chemistry-7405-1-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "7405/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"05.6",
|
||||||
|
"05.7",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"06.6",
|
||||||
|
"06.7",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6",
|
||||||
|
"07.7",
|
||||||
|
"08.1",
|
||||||
|
"08.2",
|
||||||
|
"08.3",
|
||||||
|
"08.4",
|
||||||
|
"08.5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-physics-7408-1-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "7408/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"05.6",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"07.0",
|
||||||
|
"08.0",
|
||||||
|
"09.0",
|
||||||
|
"10.0",
|
||||||
|
"11.0",
|
||||||
|
"12.0",
|
||||||
|
"13.0",
|
||||||
|
"14.0",
|
||||||
|
"15.0",
|
||||||
|
"16.0",
|
||||||
|
"17.0",
|
||||||
|
"18.0",
|
||||||
|
"19.0",
|
||||||
|
"20.0",
|
||||||
|
"21.0",
|
||||||
|
"22.0",
|
||||||
|
"23.0",
|
||||||
|
"24.0",
|
||||||
|
"25.0",
|
||||||
|
"26.0",
|
||||||
|
"27.0",
|
||||||
|
"28.0",
|
||||||
|
"29.0",
|
||||||
|
"30.0",
|
||||||
|
"31.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-biology-8461-1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "8461/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"01.7",
|
||||||
|
"01.8",
|
||||||
|
"01.9",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"02.6",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6",
|
||||||
|
"07.7",
|
||||||
|
"07.8"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-chemistry-8462-1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": "8462/1",
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"01.7",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"02.6",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"04.6",
|
||||||
|
"04.7",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"06.6",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6",
|
||||||
|
"08.1",
|
||||||
|
"08.2",
|
||||||
|
"08.3",
|
||||||
|
"08.4",
|
||||||
|
"08.5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-combined-8464-b1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": null,
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"01.6",
|
||||||
|
"01.7",
|
||||||
|
"01.8",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"02.6",
|
||||||
|
"02.7",
|
||||||
|
"03.1",
|
||||||
|
"03.2",
|
||||||
|
"03.3",
|
||||||
|
"03.4",
|
||||||
|
"03.5",
|
||||||
|
"03.6",
|
||||||
|
"03.7",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"05.5",
|
||||||
|
"05.6",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"b1-aqa-combined-8464-c1h-2022jun": {
|
||||||
|
"source_pdf": "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
|
||||||
|
"source_method": "AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.",
|
||||||
|
"board_detected": "aqa",
|
||||||
|
"paper_code_detected": null,
|
||||||
|
"labels": [
|
||||||
|
"01.1",
|
||||||
|
"01.2",
|
||||||
|
"01.3",
|
||||||
|
"01.4",
|
||||||
|
"01.5",
|
||||||
|
"02.1",
|
||||||
|
"02.2",
|
||||||
|
"02.3",
|
||||||
|
"02.4",
|
||||||
|
"02.5",
|
||||||
|
"03.0",
|
||||||
|
"04.1",
|
||||||
|
"04.2",
|
||||||
|
"04.3",
|
||||||
|
"04.4",
|
||||||
|
"04.5",
|
||||||
|
"04.6",
|
||||||
|
"04.7",
|
||||||
|
"05.1",
|
||||||
|
"05.2",
|
||||||
|
"05.3",
|
||||||
|
"05.4",
|
||||||
|
"06.1",
|
||||||
|
"06.2",
|
||||||
|
"06.3",
|
||||||
|
"06.4",
|
||||||
|
"06.5",
|
||||||
|
"07.1",
|
||||||
|
"07.2",
|
||||||
|
"07.3",
|
||||||
|
"07.4",
|
||||||
|
"07.5",
|
||||||
|
"07.6"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
87
api/services/docling/scripts/fetch_b1_corpus.py
Normal file
87
api/services/docling/scripts/fetch_b1_corpus.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Populate the gitignored B1 image-only eval corpus from the .94 exam-board store.
|
||||||
|
|
||||||
|
The B1 eval papers are NOT committed (third-party copyright; served only via signed URLs).
|
||||||
|
This script downloads each B1_GEOMETRY paper's `storage_loc` object from cc.examboards via the
|
||||||
|
Storage API into its local `pdf` path (under samples/b1/), so finalize.py --b1-only and the
|
||||||
|
B1-2/B1-3 generalization work can run against a real corpus.
|
||||||
|
|
||||||
|
Run from api/services/docling/ inside the cc-api-dev container (SUPABASE_URL/SERVICE_ROLE_KEY in env):
|
||||||
|
python3 scripts/fetch_b1_corpus.py # fetch all B1 papers (skip existing)
|
||||||
|
python3 scripts/fetch_b1_corpus.py --force # re-download
|
||||||
|
python3 scripts/fetch_b1_corpus.py --only b1-aqa-physics-7408-1-2022jun
|
||||||
|
python3 scripts/fetch_b1_corpus.py --list # show what would be fetched, no download
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Import the canonical B1 corpus definition (slug, storage_loc, local pdf path) from finalize.
|
||||||
|
_HERE = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
_DOCLING_DIR = os.path.dirname(_HERE)
|
||||||
|
sys.path.insert(0, _DOCLING_DIR)
|
||||||
|
from finalize import B1_GEOMETRY # noqa: E402
|
||||||
|
|
||||||
|
|
||||||
|
def _split_storage_loc(storage_loc: str) -> tuple[str, str]:
    """Split ``'<bucket>/<object path>'`` at the first slash.

    Example: ``'cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf'`` becomes
    ``('cc.examboards', 'aqa/biology/7402/1/2023-jun/qp.pdf')``.

    Raises:
        ValueError: if either the bucket or the object path is empty (no
            slash, leading slash, or nothing after the first slash).
    """
    bucket, _, path = storage_loc.partition("/")
    if not bucket or not path:
        raise ValueError(f"malformed storage_loc: {storage_loc!r}")
    return bucket, path
|
||||||
|
|
||||||
|
|
||||||
|
def _entries(only: str | None):
    """Yield ``(slug, storage_loc, pdf)`` for each fetchable B1 paper.

    Papers lacking a ``storage_loc`` or a local ``pdf`` path are skipped.
    When ``only`` is given, only the paper whose slug equals it is yielded.
    """
    for p in B1_GEOMETRY:
        loc = p.get("storage_loc")
        pdf = p.get("pdf")
        if not loc or not pdf:
            continue
        if only and p.get("slug") != only:
            continue
        yield p["slug"], loc, pdf
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> int:
    """Download the B1 eval PDFs into their local ``pdf`` paths.

    Args:
        argv: command-line arguments; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success (including ``--list``), 1 when no
        paper matched or at least one download failed.
    """
    ap = argparse.ArgumentParser(description="Fetch the B1 image-only eval corpus from .94 cc.examboards")
    ap.add_argument("--force", action="store_true", help="re-download even if the local file exists")
    ap.add_argument("--only", help="fetch a single paper by slug")
    ap.add_argument("--list", action="store_true", help="list what would be fetched and exit")
    args = ap.parse_args(argv)

    todo = list(_entries(args.only))
    if not todo:
        print("no matching B1 papers", file=sys.stderr)
        return 1

    if args.list:
        for slug, loc, pdf in todo:
            print(f"{slug}\t{loc}\t-> {pdf}")
        return 0

    # Kept after the --list early return, so listing never touches the storage client.
    from modules.database.supabase.utils.storage import StorageAdmin
    storage = StorageAdmin()

    ok = skipped = failed = 0
    for slug, loc, pdf in todo:
        dest = pdf if os.path.isabs(pdf) else os.path.join(_DOCLING_DIR, pdf)
        if os.path.exists(dest) and not args.force:
            print(f"[skip] {slug} (exists)")
            skipped += 1
            continue
        tmp = dest + ".part"
        try:
            bucket, path = _split_storage_loc(loc)
            data = storage.download_file(bucket, path)
            if not data:
                raise ValueError("empty download")
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            # Write beside the target and rename, so an interrupted run never
            # leaves a truncated PDF that later runs would skip as "exists".
            with open(tmp, "wb") as fh:
                fh.write(data)
            os.replace(tmp, dest)
        except Exception as exc:  # per-paper boundary: report and keep going
            if os.path.exists(tmp):
                os.remove(tmp)
            print(f"[fail] {slug} <- {loc}: {exc}", file=sys.stderr)
            failed += 1
            continue
        print(f"[ok] {slug} <- {bucket}/{path} ({len(data)} bytes)")
        ok += 1

    print(f"fetched {ok}, skipped {skipped}, of {len(todo)}")
    if failed:
        print(f"{failed} download(s) failed", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
32
api/services/docling/scripts/make_b1_gt.py
Normal file
32
api/services/docling/scripts/make_b1_gt.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
"""Regenerate fixtures/b1_gt_labels.json for the B1 image-only eval corpus.

For every paper in finalize.B1_GEOMETRY the local PDF's text layer is parsed
with extract.py, and the resulting part labels are stored as the ground-truth
label set, keyed the same way finalize.py looks them up.
"""
import json
import sys
from pathlib import Path

# Locate the docling service directory from this file (scripts/..), so the
# script works wherever the checkout is mounted.
base = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(base))

import extract  # noqa: E402
from finalize import B1_GEOMETRY  # noqa: E402

SOURCE_METHOD = 'AQA born-digital text-layer parsed with existing extract.py AQA grammar; used as reproducible GT label set for image-only OCR baseline.'


def main():
    """Parse every B1 paper and write the label fixture.

    Raises:
        RuntimeError: if a paper is not detected as an AQA paper.
    """
    out = {}
    # B1_GEOMETRY is the single corpus definition; no second copy to keep in sync.
    for paper in B1_GEOMETRY:
        slug = paper.get("gt_key") or paper["slug"]
        lines = extract.lines_from_pdftext(str(base / paper["pdf"]))
        board, code = extract.detect_board(lines)
        if board != 'aqa':
            raise RuntimeError(f'{slug}: expected AQA board, detected {board!r} ({code!r})')
        parts = extract.parse_text_by_board(lines, board)
        labels = list(parts)
        out[slug] = {
            'source_pdf': paper["storage_loc"],
            'source_method': SOURCE_METHOD,
            'board_detected': board,
            'paper_code_detected': code,
            'labels': labels,
        }
        print(slug, board, code, len(labels), labels[:5], labels[-5:])
    fixtures = base / 'fixtures'
    fixtures.mkdir(exist_ok=True)
    (fixtures / 'b1_gt_labels.json').write_text(json.dumps(out, indent=2) + "\n")


if __name__ == "__main__":
    main()
|
||||||
69
api/services/docling/scripts/rapid_pass.py
Normal file
69
api/services/docling/scripts/rapid_pass.py
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
rapid_pass.py — generalise the proven AQA "RapidOCR margin-pass" (95.2% on the image-only
|
||||||
|
8463 paper) to any AQA paper. Born-digital AQA QPs ship a text layer, so we force RapidOCR
|
||||||
|
over the *rendered* page (`force_ocr:true`) to simulate the image-only redistribution case
|
||||||
|
and recover the boxed `NN.M` question numbers Tesseract shatters.
|
||||||
|
|
||||||
|
For each page it writes results/<outdir>/p{N}.json (a full per-page DoclingDocument, the
|
||||||
|
shape extract.py's aqa_questions_rapid expects) and a merged.json (for board / front-matter
|
||||||
|
detection). All GPU work is serialised + OOM-resilient through dsync.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/rapid_pass.py samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf rapid_7408
|
||||||
|
python scripts/rapid_pass.py <pdf> <outdir-slug> [first_page] [last_page]
|
||||||
|
"""
|
||||||
|
import os, sys, json, subprocess, re
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
import dsync
|
||||||
|
|
||||||
|
OPTS = {"ocr_engine": "rapidocr", "force_ocr": True}
|
||||||
|
|
||||||
|
|
||||||
|
def npages(pdf):
    """Return the page count that the ``pdfinfo`` command reports for ``pdf``.

    Raises:
        subprocess.CalledProcessError: if ``pdfinfo`` exits non-zero.
        ValueError: if the output has no ``Pages:`` line.
    """
    # Metadata fields (title, author, ...) are not guaranteed to be UTF-8;
    # decode leniently so such a field cannot abort the page count.
    out = subprocess.check_output(["pdfinfo", pdf]).decode(errors="replace")
    # Anchor at line start so "Pages:" inside another field is not matched.
    m = re.search(r"^Pages:\s*(\d+)", out, re.MULTILINE)
    if m is None:
        raise ValueError(f"pdfinfo reported no page count for {pdf!r}")
    return int(m.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None):
    """Run the RapidOCR pass over a page range and write per-page + merged JSON.

    Args:
        argv: ``[pdf, outdir_slug, first_page?, last_page?]``; ``None`` means
            ``sys.argv[1:]``.

    Writes ``results/<slug>/p{N}.json`` for each converted page and
    ``results/<slug>/merged.json`` with the concatenated ``texts`` /
    ``tables`` / ``pictures``, the combined ``pages`` map and the list of
    ``_failed_pages``.  Pages whose ``p{N}.json`` already exists are re-read
    from disk instead of being converted again.

    Raises:
        SystemExit: on missing arguments or an unsafe output slug.
    """
    args = list(sys.argv[1:] if argv is None else argv)
    if len(args) < 2:
        raise SystemExit("usage: rapid_pass.py <pdf> <outdir-slug> [first_page] [last_page]")
    pdf, slug = args[0], args[1]
    # The slug becomes a directory under results/: keep it relative, free of
    # ".." components, and limited to a conservative character set.  The regex
    # only admits "/" as a separator, so split on "/" rather than os.sep.
    if os.path.isabs(slug) or ".." in slug.split("/") or not re.fullmatch(r"[A-Za-z0-9._/-]+", slug):
        raise SystemExit(f"unsafe output slug: {slug!r}")
    n = npages(pdf)
    first = int(args[2]) if len(args) > 2 else 1
    last = min(int(args[3]), n) if len(args) > 3 else n
    if first < 1 or first > n or first > last:
        print(f"requested page range {first}-{last} is outside PDF ({n} pages); nothing to do")
        return
    outdir = os.path.join("results", slug)
    os.makedirs(outdir, exist_ok=True)

    r = dsync._redis()
    print(f"redis: {'connected' if r else 'NO CACHE'} pdf={pdf} pages {first}-{last}/{n}")
    merged = {"texts": [], "tables": [], "pictures": [], "pages": {}, "_failed_pages": []}
    for pg in range(first, last + 1):
        page_path = os.path.join(outdir, f"p{pg}.json")
        if os.path.exists(page_path):
            with open(page_path) as fh:
                doc = json.load(fh)
            # The key must be the string 'texts'; a bare name raised NameError on every cache hit.
            print(f" p{pg}: file cache HIT ({len(doc.get('texts', []))} texts)")
        else:
            doc = dsync.convert_page(pdf, pg, OPTS, r=r)
            if not doc:
                merged["_failed_pages"].append(pg)
                print(f" p{pg}: FAILED")
                continue
            with open(page_path, "w") as fh:
                json.dump(doc, fh)
        for k in ("texts", "tables", "pictures"):
            merged[k].extend(doc.get(k, []))
        merged["pages"].update(doc.get("pages", {}))
        # Count text items whose first provenance bbox starts at l <= 140;
        # items without a bbox default to 999 and so are not counted.
        nmarg = sum(1 for t in doc.get("texts", [])
                    if (t.get("prov") or [{}])[0].get("bbox", {}).get("l", 999) <= 140)
        print(f" p{pg}: {len(doc.get('texts', []))} texts ({nmarg} left-margin)")
    with open(os.path.join(outdir, "merged.json"), "w") as fh:
        json.dump(merged, fh)
    print(f"-> {outdir}/ ({last-first+1-len(merged['_failed_pages'])} pages, "
          f"failed={merged['_failed_pages']})")


if __name__ == "__main__":
    main()
|
||||||
Loading…
x
Reference in New Issue
Block a user