api/api/services/docling/finalize.py
CC Worker 69d9c46abe
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
feat(docling): B1 image-only OCR eval harness (overwatch-cleaned)
Eval harness for AQA A-level + GCSE-science image-only papers: finalize.py --b1-only,
RapidOCR runner (rapid_pass.py via dsync), GT fixtures (make_b1_gt.py + b1_gt_labels.json),
and fetch_b1_corpus.py to pull the eval corpus from .94 cc.examboards at runtime.

Salvaged from t_15be12ed (which timed out on iteration budget re-running OCR): exam PDFs and
generated OCR caches/reports are NOT committed (third-party copyright + reproducible) — gitignored
and fetched/generated at runtime. Baseline coverage recorded in the task evidence file.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-08 03:10:10 +00:00

364 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
finalize.py — produce the final corpus output bundle under results/final/.
Runs the full pipeline (via the real module CLIs, so the bundle is reproducible) across the corpus:
* geometry papers (image-only / OCR-path): structured + furniture + bands + page_roles + template
+ validate + overlays (template human-review view for ALL pages, rich debug for sample pages).
* born-digital fast-path papers: structured + validate (no geometry -> no overlays).
Writes per-paper report.md, a human INDEX.md, and a machine catalog.json.
Usage:
python finalize.py [--no-overlays] [--b1-only] [--prepare-ocr] # --no-overlays = JSON pipeline only (fast)
"""
import os, sys, glob, json, subprocess, argparse, datetime
# Root directory of the output bundle; each paper gets a sub-directory named after its slug.
FINAL = "results/final"
# Interpreter used to launch every pipeline script (the one running this file).
PY = sys.executable
# ------------------------------------------------------------------ corpus manifest
# Default geometry corpus: processed by do_geometry() unless --b1-only is given.
# Keys read by this file:
#   slug            -> output directory name under FINAL
#   title/board/level/path -> copied into the per-paper report and the catalog
#   pdf             -> passed to template.py and scripts/overlay.py
#   docling         -> passed to furniture.py, bands.py, page_roles.py and the debug overlay
#   rapid           -> optional; forwarded to bands.py as --rapid when set
#   gt              -> optional; forwarded to extract.py as --gt when set
#   extract         -> argument list handed to extract.py (before --out)
GEOMETRY = [
dict(slug="aqa-physics-8463-imageonly", title="AQA GCSE Physics 8463/1H (image-only)",
board="aqa", level="GCSE", path="image-only (RapidOCR margin-pass)",
pdf="samples/AQA-Physics-Paper-1H-2022-with-qr.pdf",
docling="results/E_tess_full.json", rapid="results/rapid_pages/p*.json",
extract=["--docling", "results/E_tess_full.json", "--rapid", "results/rapid_pages/p*.json",
"--granite", "cached"]),
dict(slug="aqa-physics-7408-ocr", title="AQA A-level Physics 7408/1 (rasterised OCR)",
board="aqa", level="A-level", path="OCR (RapidOCR margin-pass + Section-B MCQ)",
pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
docling="results/rapid_7408/merged.json", rapid="results/rapid_7408/p*.json",
gt="results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt",
extract=["--docling", "results/rapid_7408/merged.json", "--rapid", "results/rapid_7408/p*.json",
"--board", "aqa"]),
dict(slug="aqa-biology-8461-ocr", title="AQA GCSE Biology 8461/1H (rasterised OCR)",
board="aqa", level="GCSE", path="OCR (RapidOCR margin-pass)",
pdf="samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
docling="results/rapid_8461/merged.json", rapid="results/rapid_8461/p*.json",
gt="results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt",
extract=["--docling", "results/rapid_8461/merged.json", "--rapid", "results/rapid_8461/p*.json",
"--board", "aqa"]),
dict(slug="edexcel-maths-1ma1-1h-ocr", title="Edexcel GCSE Maths 1MA1/1H (rasterised OCR)",
board="edexcel", level="GCSE-H", path="OCR + gemma marks gap-fill",
pdf="samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
docling="results/genreport/edexcel1h/ocr.json", rapid=None,
gt="results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt",
extract=["--docling", "results/genreport/edexcel1h/ocr.json", "--board", "edexcel",
"--marks-fill", "results/genreport/edexcel1h/marks_fill.json"]),
dict(slug="edexcel-maths-1ma1-1f-ocr", title="Edexcel GCSE Maths 1MA1/1F (rasterised OCR)",
board="edexcel", level="GCSE-F", path="OCR + gemma marks gap-fill",
pdf="samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf",
docling="results/genreport/edexcel1f/ocr.json", rapid=None,
extract=["--docling", "results/genreport/edexcel1f/ocr.json", "--board", "edexcel",
"--marks-fill", "results/genreport/edexcel1f/marks_fill.json"]),
dict(slug="ocr-physics-h556-ocr", title="OCR A-level Physics H556/3 (rasterised OCR)",
board="ocr", level="A-level", path="OCR + gemma marks gap-fill",
pdf="samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
docling="results/genreport/ocrh556/ocr.json", rapid=None,
gt="results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt",
extract=["--docling", "results/genreport/ocrh556/ocr.json", "--board", "ocr",
"--marks-fill", "results/genreport/ocrh556/marks_fill.json"]),
]
# B1 eval corpus: replaces GEOMETRY when --b1-only is given (and no FAST papers are run).
# These entries carry no "extract" list, so do_geometry() builds the extract.py arguments
# from docling/rapid/board. Extra keys read by this file:
#   gt_key      -> key into the GT-label fixture (GT_LABELS_PATH) used for coverage
#   storage_loc -> only echoed in the "missing source PDF" message of ensure_rapid_cache()
B1_GEOMETRY = [
dict(slug="b1-aqa-biology-7402-1-2023jun", title="AQA A-level Biology 7402/1 2023 Jun (image-only OCR baseline)",
board="aqa", level="A-level", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf",
pdf="samples/b1/aqa-biology-7402-1-2023jun.pdf",
docling="results/b1_rapid/b1-aqa-biology-7402-1-2023jun/merged.json",
rapid="results/b1_rapid/b1-aqa-biology-7402-1-2023jun/p*.json",
gt_key="b1-aqa-biology-7402-1-2023jun"),
dict(slug="b1-aqa-chemistry-7405-1-2022jun", title="AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)",
board="aqa", level="A-level", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
pdf="samples/b1/aqa-chemistry-7405-1-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/p*.json",
gt_key="b1-aqa-chemistry-7405-1-2022jun"),
dict(slug="b1-aqa-physics-7408-1-2022jun", title="AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)",
board="aqa", level="A-level", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
pdf="samples/b1/aqa-physics-7408-1-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-physics-7408-1-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-physics-7408-1-2022jun/p*.json",
gt_key="b1-aqa-physics-7408-1-2022jun"),
dict(slug="b1-aqa-biology-8461-1h-2022jun", title="AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-biology-8461-1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/p*.json",
gt_key="b1-aqa-biology-8461-1h-2022jun"),
dict(slug="b1-aqa-chemistry-8462-1h-2022jun", title="AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-chemistry-8462-1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/p*.json",
gt_key="b1-aqa-chemistry-8462-1h-2022jun"),
dict(slug="b1-aqa-combined-8464-b1h-2022jun", title="AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-combined-8464-b1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/p*.json",
gt_key="b1-aqa-combined-8464-b1h-2022jun"),
dict(slug="b1-aqa-combined-8464-c1h-2022jun", title="AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun (image-only OCR baseline; 8465 not present in dev catalogue)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-combined-8464-c1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/p*.json",
gt_key="b1-aqa-combined-8464-c1h-2022jun"),
]
# JSON fixture read by load_gt_labels(); main() looks up <gt_key or slug> -> {"labels": [...]} in it.
GT_LABELS_PATH = "fixtures/b1_gt_labels.json"
# Born-digital corpus: processed by do_fast() (extract.py --text + validate.py only).
# Skipped entirely when --b1-only is given. "gt" is optional and forwarded as --gt.
FAST = [
dict(slug="aqa-physics-7408-fast", title="AQA A-level Physics 7408/1 (born-digital)", board="aqa",
level="A-level", pdf="samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
gt="results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt"),
dict(slug="aqa-biology-8461-fast", title="AQA GCSE Biology 8461/1H (born-digital)", board="aqa",
level="GCSE", pdf="samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
gt="results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt"),
dict(slug="edexcel-maths-1ma1-1h-fast", title="Edexcel GCSE Maths 1MA1/1H (born-digital)",
board="edexcel", level="GCSE-H", pdf="samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
gt="results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt"),
dict(slug="edexcel-maths-1ma1-1f-fast", title="Edexcel GCSE Maths 1MA1/1F (born-digital)",
board="edexcel", level="GCSE-F", pdf="samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf"),
dict(slug="ocr-physics-h556-fast", title="OCR A-level Physics H556/3 (born-digital)", board="ocr",
level="A-level", pdf="samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
gt="results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt"),
dict(slug="aqa-chemistry-8462-fast", title="AQA GCSE Chemistry 8462/1H (born-digital)", board="aqa",
level="GCSE", pdf="samples/chemistry-p1h-2023-qp.pdf"),
dict(slug="aqa-physics-8463-twin-fast", title="AQA GCSE Physics 8463/1H born-digital twin",
board="aqa", level="GCSE", pdf="samples/physics-p1h-2022-qp.pdf"),
]
def run(cmd):
    """Execute one pipeline script with the current interpreter.

    ``cmd`` is a list: the script path followed by its arguments. It is run
    as ``[PY] + cmd`` with stdout/stderr captured as text. When the exit
    status is non-zero, the command line and the last 400 characters of
    stderr are printed.

    Returns:
        True if the process exited with status 0, otherwise False.
    """
    proc = subprocess.run([PY] + cmd, capture_output=True, text=True)
    succeeded = proc.returncode == 0
    if not succeeded:
        print(f" ! FAILED: {' '.join(cmd)}\n{proc.stderr[-400:]}")
    return succeeded
def jload(p):
    """Load the JSON document stored at path ``p``; return ``{}`` if that fails.

    Deliberately best-effort: a pipeline step that failed leaves no (or a
    broken) artifact behind, and callers carry on with ``.get`` lookups on
    the empty dict instead of aborting the whole bundle.
    """
    try:
        # The context manager closes the handle promptly instead of leaving
        # it to the garbage collector; JSON is read as UTF-8 regardless of locale.
        with open(p, encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError, TypeError):
        # OSError: missing/unreadable file; ValueError: malformed JSON or
        # undecodable bytes; TypeError: ``p`` is not a usable path.
        return {}
def load_gt_labels():
    """Load the ground-truth label fixture at ``GT_LABELS_PATH``.

    Returns the parsed JSON, or ``{}`` when the fixture is missing or
    unreadable, in which case callers find no labels for any paper.
    """
    try:
        # Close the handle deterministically and read as UTF-8 regardless of locale.
        with open(GT_LABELS_PATH, encoding="utf-8") as fh:
            return json.load(fh)
    except (OSError, ValueError):
        # OSError: fixture absent/unreadable; ValueError: malformed JSON or bad encoding.
        return {}
def part_labels(struct):
    """Return the non-empty ``label`` of every question part, in document order.

    Missing or null ``questions`` / ``parts`` entries are treated as empty.
    """
    questions = struct.get("questions", []) or []
    candidates = (
        part.get("label")
        for question in questions
        for part in (question.get("parts", []) or [])
    )
    return [label for label in candidates if label]
def coverage_against_labels(struct, labels):
    """Score the recovered part labels against a ground-truth label list.

    Returns ``None`` when ``labels`` is empty. Otherwise returns a dict with
    the percentage of distinct ground-truth labels found among the part
    labels of ``struct`` (one decimal place), the recovered/total counts,
    the sorted list of missed labels and the fixture name as ``source``.
    """
    if not labels:
        return None
    found = set(part_labels(struct))
    expected = set(labels)
    hits = sorted(found & expected)
    misses = sorted(expected - found)
    return {
        "coverage_pct": round(len(hits) / len(expected) * 100, 1),
        "recovered": len(hits),
        "total": len(expected),
        "missed": misses,
        "source": "fixtures/b1_gt_labels.json",
    }
def answer_region_count(struct):
    """Count regions listed at the top level of ``struct`` plus those on every part.

    Missing or null ``regions`` / ``questions`` / ``parts`` values count as empty.
    """
    count = len(struct.get("regions", []) or [])
    count += sum(
        len(part.get("regions", []) or [])
        for question in (struct.get("questions", []) or [])
        for part in (question.get("parts", []) or [])
    )
    return count
def ensure_rapid_cache(p):
    """Make sure the OCR cache for manifest entry ``p`` is available.

    Returns True immediately when ``p["docling"]`` already exists. If it does
    not and the source PDF is missing too, prints a diagnostic (including the
    entry's ``storage_loc``) and returns False. Otherwise launches
    ``scripts/rapid_pass.py`` on the PDF and returns whether it succeeded.
    """
    cache_ready = os.path.exists(p["docling"])
    if cache_ready:
        return True
    source_pdf = p["pdf"]
    if os.path.exists(source_pdf):
        # Second argument is presumably the output location understood by
        # rapid_pass.py -- that script is not visible from this file.
        return run(["scripts/rapid_pass.py", source_pdf, "b1_rapid/" + p["slug"]])
    print(f" ! missing source PDF for {p['slug']}: {p['pdf']} (storage_loc={p.get('storage_loc')})")
    return False
def stats_from(struct, val, gt_labels=None):
    """Flatten extract/validate output into the per-paper stats record.

    Args:
        struct: parsed ``structured.json`` (``{}`` if it could not be loaded).
        val: parsed ``validate.json`` (``{}`` if it could not be loaded).
        gt_labels: optional ground-truth label list. When given, coverage is
            computed with ``coverage_against_labels``; otherwise the
            ``coverage`` section already present in ``struct`` is used.

    Returns:
        A dict of scalar stats plus ``coverage_missed``, ``validate_flags``
        and ``second_pass_slots`` (labels of validator question slots that
        were not recovered).
    """
    st = struct.get("stats", {}) or {}
    mc = st.get("marks_check") or {}
    if gt_labels:
        cov = coverage_against_labels(struct, gt_labels) or {}
    else:
        cov = struct.get("coverage", {}) or {}
    summary = val.get("summary") or {}
    # A key that is present but null must behave like a missing one, matching
    # the ``... or []`` guards used by the other helpers in this file.
    sequence = val.get("question_sequence") or []
    return {
        "board": struct.get("board"), "paper_code": struct.get("paper_code"),
        "n_questions": st.get("n_questions"), "n_parts": st.get("n_parts"),
        "marks_sum": mc.get("sum"), "official_max": mc.get("expected_max"),
        "marks_pct": mc.get("pct"),
        "coverage_pct": cov.get("coverage_pct"), "coverage_recovered": cov.get("recovered"),
        "coverage_total": cov.get("total"), "coverage_source": cov.get("source"),
        "coverage_missed": cov.get("missed", []), "answer_regions": answer_region_count(struct),
        "validate_verdict": summary.get("worst_severity"),
        "validate_flags": val.get("flags") or [],
        "questions_expected": summary.get("questions_expected"),
        "questions_recovered": summary.get("questions_recovered"),
        # An entry without a "recovered" field is treated as not recovered.
        "second_pass_slots": [q["label"] for q in sequence if not q.get("recovered")],
    }
def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False):
    """Run the geometry pipeline for one manifest entry and collect its stats.

    Steps, each launched as a separate script via ``run``: extract, furniture,
    bands, page_roles, template, validate, and (optionally) the two overlay
    renders. A failing step is reported by ``run`` but does not stop the
    remaining steps; missing artifacts simply load as ``{}`` at the end.

    Args:
        p: manifest entry (see GEOMETRY / B1_GEOMETRY).
        overlays: when true, also render the template and debug overlays.
        gt_labels: optional ground-truth labels forwarded to ``stats_from``.
        prepare_ocr: when true, build the OCR cache first if it is missing.

    Returns:
        ``(stats, out_dir)`` -- the stats record and the paper's output directory.

    Raises:
        RuntimeError: ``prepare_ocr`` was requested and the cache could not be prepared.
    """
    d = os.path.join(FINAL, p["slug"])
    os.makedirs(d, exist_ok=True)
    S, F, B, R, T, V = (os.path.join(d, f) for f in
                        ("structured.json", "furniture.json", "bands.json", "page_roles.json",
                         "template.json", "validate.json"))
    if prepare_ocr and not ensure_rapid_cache(p):
        raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}")

    rapid = p.get("rapid")
    extract_args = p.get("extract")
    if not extract_args:
        # Entry has no explicit extract.py arguments: derive them. --rapid is
        # only added when the entry has one, otherwise a None would end up in
        # the argv and subprocess would raise TypeError.
        extract_args = ["--docling", p["docling"]]
        if rapid:
            extract_args += ["--rapid", rapid]
        extract_args += ["--board", p.get("board", "aqa")]
    ex = ["extract.py"] + extract_args + ["--out", S]
    if p.get("gt"):
        ex += ["--gt", p["gt"]]
    run(ex)

    run(["furniture.py", p["docling"], "--out", F])

    bands = ["bands.py", S, "--docling", p["docling"], "--out", B]
    if rapid:
        bands += ["--rapid", rapid]
    run(bands)

    run(["page_roles.py", p["docling"], "--bands", B, "--out", R])
    run(["template.py", "--structured", S, "--bands", B, "--furniture", F,
         "--page-roles", R, "--pdf", p["pdf"], "--out", T])
    run(["validate.py", S, "--out", V])

    if overlays:
        otpl = os.path.join(d, "overlays", "template")
        run(["scripts/overlay.py", S, p["pdf"], "--template", T, "--dpi", "120", "--out", otpl])
        # rich debug view on the first few pages (cover + early questions)
        odbg = os.path.join(d, "overlays", "debug")
        run(["scripts/overlay.py", S, p["pdf"], "--docling", p["docling"], "--bands", B,
             "--furniture", F, "--pages", "1,2,3,4,5", "--dpi", "120", "--out", odbg])
    return stats_from(jload(S), jload(V), gt_labels), d
def do_fast(p):
    """Run the born-digital fast path (extract --text, then validate) for entry ``p``.

    Returns ``(stats, out_dir)``: the stats record built from the two JSON
    artifacts and the paper's output directory under ``FINAL``.
    """
    out_dir = os.path.join(FINAL, p["slug"])
    os.makedirs(out_dir, exist_ok=True)
    structured_path = os.path.join(out_dir, "structured.json")
    validate_path = os.path.join(out_dir, "validate.json")

    # Ground-truth file is optional; only forwarded when the entry names one.
    gt_args = ["--gt", p["gt"]] if p.get("gt") else []
    run(["extract.py", "--text", p["pdf"], "--out", structured_path] + gt_args)
    run(["validate.py", structured_path, "--out", validate_path])

    stats = stats_from(jload(structured_path), jload(validate_path))
    return stats, out_dir
def per_paper_report(p, s, d, kind):
    """Write ``report.md`` for one paper into directory ``d``.

    Args:
        p: manifest entry (``title``, ``slug``, ``board``, ``level`` are read).
        s: stats record as produced by ``stats_from``.
        d: the paper's output directory; overlay PNGs are counted beneath
            ``d/overlays`` (recursively).
        kind: pipeline-path description shown in the report. The exact value
            ``"born-digital fast-path"`` selects the no-geometry artifact list.

    Returns:
        The number of overlay PNG images found under ``d/overlays``.
    """
    n_imgs = len(glob.glob(os.path.join(d, "overlays", "**", "*.png"), recursive=True))

    marks = f"- **marks:** {s['marks_sum']}/{s['official_max']}"
    if s["marks_pct"] is not None:
        marks += f" ({s['marks_pct']}% of official max)"

    if s["coverage_pct"] is not None:
        coverage = f"- **coverage vs GT:** {s['coverage_pct']}%"
        if s.get("coverage_missed"):
            # Only the first eight missed labels are listed to keep the line short.
            coverage += f" (missed {s['coverage_missed'][:8]})"
    else:
        coverage = "- **coverage vs GT:** n/a"

    if kind != "born-digital fast-path":
        artifacts_tail = (", `furniture.json`, `bands.json`, `page_roles.json`, `template.json`, "
                          f"`overlays/` ({n_imgs} images)")
    else:
        artifacts_tail = " (born-digital: no page geometry → no overlays)"

    lines = [f"# {p['title']}", "",
             f"- **slug:** `{p['slug']}` · **board:** {p['board']} · **level:** {p['level']} "
             f"· **path:** {kind}",
             f"- **questions/parts:** {s['n_questions']} / {s['n_parts']}",
             marks,
             coverage,
             f"- **G6 verdict:** {s['validate_verdict']}",
             f"- **answer-region count:** {s.get('answer_regions')}",
             ]
    if s["validate_flags"]:
        lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]
    lines += ["", "**Artifacts:** `structured.json`, `validate.json`" + artifacts_tail]

    # The report contains non-ASCII characters, so the encoding is pinned to
    # UTF-8 rather than left to the locale; ``with`` guarantees flush + close.
    with open(os.path.join(d, "report.md"), "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")
    return n_imgs
def main():
    """Command-line entry point: build the bundle under ``FINAL``.

    Flags:
        --no-overlays   skip the overlay renders (JSON pipeline only).
        --b1-only       process B1_GEOMETRY instead of GEOMETRY and skip FAST.
        --prepare-ocr   build a missing OCR cache before each geometry paper.

    Writes a ``report.md`` per paper, then ``catalog.json`` and ``INDEX.md``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--no-overlays", action="store_true")
    ap.add_argument("--b1-only", action="store_true", help="run only the Sprint B1 image-only OCR eval corpus")
    ap.add_argument("--prepare-ocr", action="store_true", help="populate missing B1 RapidOCR caches via dsync before running")
    a = ap.parse_args()

    os.makedirs(FINAL, exist_ok=True)
    catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
               "papers": []}
    total_imgs = 0
    gt_fixtures = load_gt_labels()
    geometry = B1_GEOMETRY if a.b1_only else GEOMETRY
    fast = [] if a.b1_only else FAST

    for p in geometry:
        print(f"[geometry] {p['slug']}")
        # Fixture entries are keyed by gt_key when the manifest has one, else by slug.
        gt_labels = (gt_fixtures.get(p.get("gt_key") or p["slug"], {}) or {}).get("labels")
        s, d = do_geometry(p, not a.no_overlays, gt_labels=gt_labels, prepare_ocr=a.prepare_ocr)
        n = per_paper_report(p, s, d, p["path"])
        total_imgs += n
        # NOTE(review): **s is merged last, so the "board" reported in the stats
        # overrides the manifest board in the catalog -- confirm this is intended.
        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
                                  "kind": "geometry", "path": p["path"], "dir": d,
                                  "overlay_images": n, **s})

    for p in fast:
        print(f"[fast] {p['slug']}")
        s, d = do_fast(p)
        per_paper_report(p, s, d, "born-digital fast-path")
        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
                                  "kind": "fast", "path": "born-digital fast-path", "dir": d, **s})

    # ``with`` guarantees the catalog is flushed and closed before the index is written.
    with open(os.path.join(FINAL, "catalog.json"), "w", encoding="utf-8") as fh:
        json.dump(catalog, fh, indent=2)
    write_index(catalog, total_imgs)
    print(f"\n-> {len(catalog['papers'])} papers, {total_imgs} overlay images -> {FINAL}/")
def write_index(catalog, total_imgs):
    """Write the human-readable ``INDEX.md`` summarising every paper in ``catalog``.

    Args:
        catalog: the catalog dict built by ``main`` (``generated_at`` plus a
            ``papers`` list whose entries carry ``kind`` = "geometry" or "fast").
        total_imgs: total number of overlay images across all geometry papers.

    The index contains one table per pipeline path and a description of the
    per-paper directory layout.
    """
    geometry_papers = [p for p in catalog["papers"] if p["kind"] == "geometry"]
    fast_papers = [p for p in catalog["papers"] if p["kind"] == "fast"]

    def coverage_cell(paper):
        # Missing coverage renders as plain "n/a" in both tables (no stray "%").
        pct = paper["coverage_pct"]
        return f"{pct}%" if pct is not None else "n/a"

    lines = ["# Final corpus output — exam-extraction spike", "",
             f"Generated {catalog['generated_at']}. {len(catalog['papers'])} paper-runs across "
             f"3 boards × 2 levels, both pipeline paths; {total_imgs} overlay debug images.", "",
             "Each `<slug>/` holds the machine artifacts (JSON) + `report.md`; geometry papers also have "
             "`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
             "Machine catalog: `catalog.json`.", "",
             "## Image-only / OCR-path (with geometry + overlays)", "",
             "| Paper | Board / level | Q/parts | Marks/max | Coverage | Answer regions | G6 | Images |",
             "|---|---|---|---|---|---|---|---|"]
    for p in geometry_papers:
        lines.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
                     f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
                     f"({p['marks_pct']}%) | {coverage_cell(p)} | {p.get('answer_regions')} | {p['validate_verdict']} | "
                     f"{p['overlay_images']} |")

    lines += ["", "## Born-digital fast-path (CPU, no geometry)", "",
              "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
              "|---|---|---|---|---|---|"]
    for p in fast_papers:
        lines.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
                     f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
                     f"({p['marks_pct']}%) | {coverage_cell(p)} | "
                     f"{p['validate_verdict']} |")

    lines += ["", "## Per-paper directory layout", "```",
              "<slug>/",
              " structured.json extract.py output (questions->parts->marks/bbox/regions)",
              " validate.json G6 consistency judge (confidence + flags)",
              " furniture.json recurring-furniture mask + content margins [geometry only]",
              " bands.json main + part y-bands [geometry only]",
              " page_roles.json per-page role + margin override [geometry only]",
              " template.json editable first-pass template (source/confirmed) [geometry only]",
              " overlays/template/ human-review view, all pages [geometry only]",
              " overlays/debug/ raw-detection view, sample pages [geometry only]",
              " report.md per-paper human summary", "```"]

    # The index contains non-ASCII characters, so the encoding is pinned to
    # UTF-8 rather than left to the locale; ``with`` guarantees flush + close.
    with open(os.path.join(FINAL, "INDEX.md"), "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")
# Script entry point: build the bundle only when executed directly, not on import.
if __name__ == "__main__":
    main()