Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
Eval harness for AQA A-level + GCSE-science image-only papers: finalize.py --b1-only, RapidOCR runner (rapid_pass.py via dsync), GT fixtures (make_b1_gt.py + b1_gt_labels.json), and fetch_b1_corpus.py to pull the eval corpus from .94 cc.examboards at runtime. Salvaged from t_15be12ed (which timed out on iteration budget re-running OCR): exam PDFs and generated OCR caches/reports are NOT committed (third-party copyright + reproducible) — gitignored and fetched/generated at runtime. Baseline coverage recorded in the task evidence file. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
364 lines
20 KiB
Python
364 lines
20 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
finalize.py — produce the final corpus output bundle under results/final/.
|
||
|
||
Runs the full pipeline (via the real module CLIs, so the bundle is reproducible) across the corpus:
|
||
* geometry papers (image-only / OCR-path): structured + furniture + bands + page_roles + template
|
||
+ validate + overlays (template human-review view for ALL pages, rich debug for sample pages).
|
||
* born-digital fast-path papers: structured + validate (no geometry -> no overlays).
|
||
Writes per-paper report.md, a human INDEX.md, and a machine catalog.json.
|
||
|
||
Usage:
|
||
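python finalize.py [--no-overlays] [--b1-only] [--prepare-ocr] # --no-overlays = JSON pipeline only (fast); --b1-only = B1 image-only eval corpus only; --prepare-ocr = build missing RapidOCR caches first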
|
||
"""
|
||
import os, sys, glob, json, subprocess, argparse, datetime
|
||
|
||
# Root of the output bundle; every paper is written to FINAL/<slug>/.
FINAL = "results/final"
# Interpreter used to launch each pipeline script as a subprocess.
PY = sys.executable

# ------------------------------------------------------------------ corpus manifest
# Geometry (image-only / OCR-path) papers. "extract" holds the argument list
# handed to extract.py; "gt" (optional) is forwarded to it via --gt.
GEOMETRY = [
    {
        "slug": "aqa-physics-8463-imageonly",
        "title": "AQA GCSE Physics 8463/1H (image-only)",
        "board": "aqa",
        "level": "GCSE",
        "path": "image-only (RapidOCR margin-pass)",
        "pdf": "samples/AQA-Physics-Paper-1H-2022-with-qr.pdf",
        "docling": "results/E_tess_full.json",
        "rapid": "results/rapid_pages/p*.json",
        "extract": [
            "--docling", "results/E_tess_full.json",
            "--rapid", "results/rapid_pages/p*.json",
            "--granite", "cached",
        ],
    },
    {
        "slug": "aqa-physics-7408-ocr",
        "title": "AQA A-level Physics 7408/1 (rasterised OCR)",
        "board": "aqa",
        "level": "A-level",
        "path": "OCR (RapidOCR margin-pass + Section-B MCQ)",
        "pdf": "samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
        "docling": "results/rapid_7408/merged.json",
        "rapid": "results/rapid_7408/p*.json",
        "gt": "results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt",
        "extract": [
            "--docling", "results/rapid_7408/merged.json",
            "--rapid", "results/rapid_7408/p*.json",
            "--board", "aqa",
        ],
    },
    {
        "slug": "aqa-biology-8461-ocr",
        "title": "AQA GCSE Biology 8461/1H (rasterised OCR)",
        "board": "aqa",
        "level": "GCSE",
        "path": "OCR (RapidOCR margin-pass)",
        "pdf": "samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
        "docling": "results/rapid_8461/merged.json",
        "rapid": "results/rapid_8461/p*.json",
        "gt": "results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt",
        "extract": [
            "--docling", "results/rapid_8461/merged.json",
            "--rapid", "results/rapid_8461/p*.json",
            "--board", "aqa",
        ],
    },
    {
        "slug": "edexcel-maths-1ma1-1h-ocr",
        "title": "Edexcel GCSE Maths 1MA1/1H (rasterised OCR)",
        "board": "edexcel",
        "level": "GCSE-H",
        "path": "OCR + gemma marks gap-fill",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
        "docling": "results/genreport/edexcel1h/ocr.json",
        "rapid": None,
        "gt": "results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt",
        "extract": [
            "--docling", "results/genreport/edexcel1h/ocr.json",
            "--board", "edexcel",
            "--marks-fill", "results/genreport/edexcel1h/marks_fill.json",
        ],
    },
    {
        "slug": "edexcel-maths-1ma1-1f-ocr",
        "title": "Edexcel GCSE Maths 1MA1/1F (rasterised OCR)",
        "board": "edexcel",
        "level": "GCSE-F",
        "path": "OCR + gemma marks gap-fill",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf",
        "docling": "results/genreport/edexcel1f/ocr.json",
        "rapid": None,
        "extract": [
            "--docling", "results/genreport/edexcel1f/ocr.json",
            "--board", "edexcel",
            "--marks-fill", "results/genreport/edexcel1f/marks_fill.json",
        ],
    },
    {
        "slug": "ocr-physics-h556-ocr",
        "title": "OCR A-level Physics H556/3 (rasterised OCR)",
        "board": "ocr",
        "level": "A-level",
        "path": "OCR + gemma marks gap-fill",
        "pdf": "samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
        "docling": "results/genreport/ocrh556/ocr.json",
        "rapid": None,
        "gt": "results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt",
        "extract": [
            "--docling", "results/genreport/ocrh556/ocr.json",
            "--board", "ocr",
            "--marks-fill", "results/genreport/ocrh556/marks_fill.json",
        ],
    },
]
|
||
|
||
# Sprint B1 image-only OCR eval corpus (selected with --b1-only). Entries have
# no "extract" list, so do_geometry builds the extract.py arguments from
# "docling", "rapid" and "board"; "gt_key" selects the entry in the GT-label
# fixtures and "storage_loc" is only echoed when the source PDF is missing.
B1_GEOMETRY = [
    {
        "slug": "b1-aqa-biology-7402-1-2023jun",
        "title": "AQA A-level Biology 7402/1 2023 Jun (image-only OCR baseline)",
        "board": "aqa",
        "level": "A-level",
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": "cc.examboards/aqa/biology/7402/1/2023-jun/qp.pdf",
        "pdf": "samples/b1/aqa-biology-7402-1-2023jun.pdf",
        "docling": "results/b1_rapid/b1-aqa-biology-7402-1-2023jun/merged.json",
        "rapid": "results/b1_rapid/b1-aqa-biology-7402-1-2023jun/p*.json",
        "gt_key": "b1-aqa-biology-7402-1-2023jun",
    },
    {
        "slug": "b1-aqa-chemistry-7405-1-2022jun",
        "title": "AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)",
        "board": "aqa",
        "level": "A-level",
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": "cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
        "pdf": "samples/b1/aqa-chemistry-7405-1-2022jun.pdf",
        "docling": "results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/merged.json",
        "rapid": "results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/p*.json",
        "gt_key": "b1-aqa-chemistry-7405-1-2022jun",
    },
    {
        "slug": "b1-aqa-physics-7408-1-2022jun",
        "title": "AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)",
        "board": "aqa",
        "level": "A-level",
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": "cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
        "pdf": "samples/b1/aqa-physics-7408-1-2022jun.pdf",
        "docling": "results/b1_rapid/b1-aqa-physics-7408-1-2022jun/merged.json",
        "rapid": "results/b1_rapid/b1-aqa-physics-7408-1-2022jun/p*.json",
        "gt_key": "b1-aqa-physics-7408-1-2022jun",
    },
    {
        "slug": "b1-aqa-biology-8461-1h-2022jun",
        "title": "AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)",
        "board": "aqa",
        "level": "GCSE",
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": "cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
        "pdf": "samples/b1/aqa-biology-8461-1h-2022jun.pdf",
        "docling": "results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/merged.json",
        "rapid": "results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/p*.json",
        "gt_key": "b1-aqa-biology-8461-1h-2022jun",
    },
    {
        "slug": "b1-aqa-chemistry-8462-1h-2022jun",
        "title": "AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)",
        "board": "aqa",
        "level": "GCSE",
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": "cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
        "pdf": "samples/b1/aqa-chemistry-8462-1h-2022jun.pdf",
        "docling": "results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/merged.json",
        "rapid": "results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/p*.json",
        "gt_key": "b1-aqa-chemistry-8462-1h-2022jun",
    },
    {
        "slug": "b1-aqa-combined-8464-b1h-2022jun",
        "title": "AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)",
        "board": "aqa",
        "level": "GCSE",
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": "cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
        "pdf": "samples/b1/aqa-combined-8464-b1h-2022jun.pdf",
        "docling": "results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/merged.json",
        "rapid": "results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/p*.json",
        "gt_key": "b1-aqa-combined-8464-b1h-2022jun",
    },
    {
        "slug": "b1-aqa-combined-8464-c1h-2022jun",
        "title": "AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun (image-only OCR baseline; 8465 not present in dev catalogue)",
        "board": "aqa",
        "level": "GCSE",
        "path": "B1 image-only OCR (RapidOCR margin-pass)",
        "storage_loc": "cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
        "pdf": "samples/b1/aqa-combined-8464-c1h-2022jun.pdf",
        "docling": "results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/merged.json",
        "rapid": "results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/p*.json",
        "gt_key": "b1-aqa-combined-8464-c1h-2022jun",
    },
]
|
||
|
||
# JSON fixture read by load_gt_labels(); main() looks entries up by a paper's
# "gt_key" (or slug) and reads the "labels" list from each entry.
GT_LABELS_PATH = "fixtures/b1_gt_labels.json"

# Born-digital fast-path papers: extract.py is run with --text on the PDF, so
# there are no OCR inputs; "gt" (optional) is forwarded via --gt.
FAST = [
    {
        "slug": "aqa-physics-7408-fast",
        "title": "AQA A-level Physics 7408/1 (born-digital)",
        "board": "aqa",
        "level": "A-level",
        "pdf": "samples/extra/aqa-alevel-physics-7408-1-jun22-qp.pdf",
        "gt": "results/gt_extra/aqa-alevel-physics-7408-1-jun22-qp.txt",
    },
    {
        "slug": "aqa-biology-8461-fast",
        "title": "AQA GCSE Biology 8461/1H (born-digital)",
        "board": "aqa",
        "level": "GCSE",
        "pdf": "samples/extra/aqa-gcse-biology-8461-1h-jun22-qp.pdf",
        "gt": "results/gt_extra/aqa-gcse-biology-8461-1h-jun22-qp.txt",
    },
    {
        "slug": "edexcel-maths-1ma1-1h-fast",
        "title": "Edexcel GCSE Maths 1MA1/1H (born-digital)",
        "board": "edexcel",
        "level": "GCSE-H",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.pdf",
        "gt": "results/gt_extra/edexcel-gcse-maths-1ma1-1h-jun22-qp.txt",
    },
    {
        "slug": "edexcel-maths-1ma1-1f-fast",
        "title": "Edexcel GCSE Maths 1MA1/1F (born-digital)",
        "board": "edexcel",
        "level": "GCSE-F",
        "pdf": "samples/extra/edexcel-gcse-maths-1ma1-1f-jun22-qp.pdf",
    },
    {
        "slug": "ocr-physics-h556-fast",
        "title": "OCR A-level Physics H556/3 (born-digital)",
        "board": "ocr",
        "level": "A-level",
        "pdf": "samples/extra/ocr-alevel-physics-h556-3-jun22-qp.pdf",
        "gt": "results/gt_extra/ocr-alevel-physics-h556-3-jun22-qp.txt",
    },
    {
        "slug": "aqa-chemistry-8462-fast",
        "title": "AQA GCSE Chemistry 8462/1H (born-digital)",
        "board": "aqa",
        "level": "GCSE",
        "pdf": "samples/chemistry-p1h-2023-qp.pdf",
    },
    {
        "slug": "aqa-physics-8463-twin-fast",
        "title": "AQA GCSE Physics 8463/1H born-digital twin",
        "board": "aqa",
        "level": "GCSE",
        "pdf": "samples/physics-p1h-2022-qp.pdf",
    },
]
|
||
|
||
|
||
def run(cmd):
    """Launch one pipeline script with the current interpreter.

    ``cmd`` is the argument list that follows the interpreter (script path
    first). Output is captured; on a non-zero exit status a failure line and
    the last 400 characters of the child's stderr are printed.

    Returns True when the child exited with status 0, False otherwise.
    """
    proc = subprocess.run([PY, *cmd], capture_output=True, text=True)
    succeeded = proc.returncode == 0
    if not succeeded:
        print(f" ! FAILED: {' '.join(cmd)}\n{proc.stderr[-400:]}")
    return succeeded
|
||
|
||
|
||
def jload(p):
    """Best-effort JSON loader.

    Returns the parsed content of the file at path ``p``. If the file cannot
    be opened or read (``OSError``) or does not contain valid JSON
    (``ValueError``, which covers ``json.JSONDecodeError`` and decode errors),
    an empty dict is returned instead, so callers can keep using ``.get``.
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(p) as fh:
            return json.load(fh)
    except (OSError, ValueError):
        return {}
|
||
|
||
|
||
|
||
def load_gt_labels():
    """Load the ground-truth label fixtures from ``GT_LABELS_PATH``.

    Returns the parsed JSON content, or an empty dict when the fixture file
    is missing/unreadable (``OSError``) or is not valid JSON (``ValueError``).
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(GT_LABELS_PATH) as fh:
            return json.load(fh)
    except (OSError, ValueError):
        return {}
|
||
|
||
|
||
def part_labels(struct):
    """Return the truthy ``label`` of every part, in document order.

    Walks ``struct["questions"]`` and each question's ``"parts"``; a missing
    or ``None`` list at either level is treated as empty, and parts whose
    label is absent or falsy are skipped.
    """
    candidates = (
        part.get("label")
        for question in (struct.get("questions") or [])
        for part in (question.get("parts") or [])
    )
    return [label for label in candidates if label]
|
||
|
||
|
||
def coverage_against_labels(struct, labels):
    """Score the recovered part labels of ``struct`` against expected ``labels``.

    Returns ``None`` when ``labels`` is empty/``None``. Otherwise returns a
    dict with the percentage of distinct expected labels found among
    ``part_labels(struct)`` (rounded to one decimal), the recovered and total
    counts, the sorted list of missed labels, and the fixture file name as
    ``source``.
    """
    if not labels:
        return None
    expected = set(labels)
    hits = sorted(expected & set(part_labels(struct)))
    missed = sorted(expected.difference(hits))
    n_hit, n_expected = len(hits), len(expected)
    return {
        "coverage_pct": round(n_hit / n_expected * 100, 1),
        "recovered": n_hit,
        "total": n_expected,
        "missed": missed,
        "source": "fixtures/b1_gt_labels.json",
    }
|
||
|
||
|
||
def answer_region_count(struct):
    """Count regions listed at the top level of ``struct`` plus those on every part.

    Missing or ``None`` ``"regions"``, ``"questions"`` and ``"parts"`` entries
    count as empty.
    """
    total = len(struct.get("regions") or [])
    for question in struct.get("questions") or []:
        total += sum(len(part.get("regions") or []) for part in question.get("parts") or [])
    return total
|
||
|
||
|
||
def ensure_rapid_cache(p):
    """Make sure the OCR cache file ``p["docling"]`` is available for paper ``p``.

    Returns True immediately when the cache file already exists. When it does
    not and the source PDF ``p["pdf"]`` is missing too, prints a diagnostic
    (including ``storage_loc``) and returns False. Otherwise runs
    ``scripts/rapid_pass.py`` on the PDF with ``"b1_rapid/<slug>"`` as the
    second argument and returns whether that script exited successfully.
    """
    if os.path.exists(p["docling"]):
        return True
    pdf = p["pdf"]
    if os.path.exists(pdf):
        return run(["scripts/rapid_pass.py", pdf, "b1_rapid/" + p["slug"]])
    print(f" ! missing source PDF for {p['slug']}: {p['pdf']} (storage_loc={p.get('storage_loc')})")
    return False
|
||
|
||
def stats_from(struct, val, gt_labels=None):
    """Flatten a structured.json dict and a validate.json dict into one stats record.

    ``struct`` supplies board, paper code, question/part counts and the marks
    check; ``val`` supplies the verdict, flags, expected/recovered question
    counts and the labels of unrecovered question slots. Coverage is computed
    against ``gt_labels`` when that list is non-empty, otherwise taken from
    ``struct["coverage"]``.
    """
    stats = struct.get("stats") or {}
    marks = stats.get("marks_check") or {}
    if gt_labels:
        coverage = coverage_against_labels(struct, gt_labels)
    else:
        coverage = struct.get("coverage") or {}
    summary = val.get("summary") or {}
    # Question slots the validator listed but that were not recovered.
    unrecovered = [q["label"] for q in val.get("question_sequence", []) if not q["recovered"]]
    return {
        "board": struct.get("board"),
        "paper_code": struct.get("paper_code"),
        "n_questions": stats.get("n_questions"),
        "n_parts": stats.get("n_parts"),
        "marks_sum": marks.get("sum"),
        "official_max": marks.get("expected_max"),
        "marks_pct": marks.get("pct"),
        "coverage_pct": coverage.get("coverage_pct"),
        "coverage_recovered": coverage.get("recovered"),
        "coverage_total": coverage.get("total"),
        "coverage_source": coverage.get("source"),
        "coverage_missed": coverage.get("missed", []),
        "answer_regions": answer_region_count(struct),
        "validate_verdict": summary.get("worst_severity"),
        "validate_flags": val.get("flags", []),
        "questions_expected": summary.get("questions_expected"),
        "questions_recovered": summary.get("questions_recovered"),
        "second_pass_slots": unrecovered,
    }
|
||
|
||
|
||
def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False):
    """Run the geometry (OCR-path) pipeline for paper ``p`` into ``FINAL/<slug>/``.

    Steps, in order: extract.py, furniture.py, bands.py, page_roles.py,
    template.py, validate.py and, when ``overlays`` is true, two
    scripts/overlay.py passes (template view, then a debug view of pages 1-5).
    The result of each step is not checked here: a failing step is reported
    by ``run`` and the remaining steps still execute.

    When ``prepare_ocr`` is true the OCR cache is ensured first and a
    ``RuntimeError`` is raised if that fails.

    Returns ``(stats, out_dir)`` where ``stats`` is built by ``stats_from``
    from the written structured.json and validate.json.
    """
    out_dir = os.path.join(FINAL, p["slug"])
    os.makedirs(out_dir, exist_ok=True)
    structured, furniture, bands_json, roles, template, validation = [
        os.path.join(out_dir, name)
        for name in ("structured.json", "furniture.json", "bands.json",
                     "page_roles.json", "template.json", "validate.json")
    ]

    if prepare_ocr and not ensure_rapid_cache(p):
        raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}")

    # Manifest entries without an explicit "extract" list get the default arguments.
    extract_args = p.get("extract")
    if not extract_args:
        extract_args = ["--docling", p["docling"], "--rapid", p["rapid"],
                        "--board", p.get("board", "aqa")]
    extract_cmd = ["extract.py", *extract_args, "--out", structured]
    if p.get("gt"):
        extract_cmd.extend(["--gt", p["gt"]])
    run(extract_cmd)

    run(["furniture.py", p["docling"], "--out", furniture])

    bands_cmd = ["bands.py", structured, "--docling", p["docling"], "--out", bands_json]
    if p.get("rapid"):
        bands_cmd.extend(["--rapid", p["rapid"]])
    run(bands_cmd)

    run(["page_roles.py", p["docling"], "--bands", bands_json, "--out", roles])
    run(["template.py", "--structured", structured, "--bands", bands_json,
         "--furniture", furniture, "--page-roles", roles, "--pdf", p["pdf"],
         "--out", template])
    run(["validate.py", structured, "--out", validation])

    if overlays:
        template_view = os.path.join(out_dir, "overlays", "template")
        run(["scripts/overlay.py", structured, p["pdf"], "--template", template,
             "--dpi", "120", "--out", template_view])
        # rich debug view on the first few pages (cover + early questions)
        debug_view = os.path.join(out_dir, "overlays", "debug")
        run(["scripts/overlay.py", structured, p["pdf"], "--docling", p["docling"],
             "--bands", bands_json, "--furniture", furniture,
             "--pages", "1,2,3,4,5", "--dpi", "120", "--out", debug_view])

    return stats_from(jload(structured), jload(validation), gt_labels), out_dir
|
||
|
||
|
||
def do_fast(p):
    """Run the born-digital fast path for paper ``p`` into ``FINAL/<slug>/``.

    Runs extract.py with ``--text`` on the PDF (adding ``--gt`` when the
    manifest entry has one) followed by validate.py. Step results are not
    checked here; failures are reported by ``run``.

    Returns ``(stats, out_dir)`` built from the written structured.json and
    validate.json.
    """
    out_dir = os.path.join(FINAL, p["slug"])
    os.makedirs(out_dir, exist_ok=True)
    structured = os.path.join(out_dir, "structured.json")
    validation = os.path.join(out_dir, "validate.json")

    extract_cmd = ["extract.py", "--text", p["pdf"], "--out", structured]
    if p.get("gt"):
        extract_cmd.extend(["--gt", p["gt"]])
    run(extract_cmd)
    run(["validate.py", structured, "--out", validation])

    return stats_from(jload(structured), jload(validation)), out_dir
|
||
|
||
|
||
def per_paper_report(p, s, d, kind):
    """Write ``report.md`` for one paper into directory ``d``.

    Args:
        p: manifest entry; ``title``, ``slug``, ``board`` and ``level`` are read.
        s: stats record as produced by ``stats_from``.
        d: the paper's output directory (also searched for overlay PNGs).
        kind: pipeline path label; the exact value ``"born-digital fast-path"``
            selects the artifact line without geometry files.

    Returns:
        The number of ``*.png`` files found anywhere below ``d/overlays``.
    """
    n_imgs = len(glob.glob(os.path.join(d, "overlays", "**", "*.png"), recursive=True))

    marks = f"- **marks:** {s['marks_sum']}/{s['official_max']}"
    if s["marks_pct"] is not None:
        marks += f" ({s['marks_pct']}% of official max)"

    if s["coverage_pct"] is not None:
        coverage = f"- **coverage vs GT:** {s['coverage_pct']}%"
        if s.get("coverage_missed"):
            # Only the first eight missed labels are shown to keep the line short.
            coverage += f" (missed {s['coverage_missed'][:8]})"
    else:
        coverage = "- **coverage vs GT:** n/a"

    lines = [
        f"# {p['title']}",
        "",
        f"- **slug:** `{p['slug']}` · **board:** {p['board']} · **level:** {p['level']} "
        f"· **path:** {kind}",
        f"- **questions/parts:** {s['n_questions']} / {s['n_parts']}",
        marks,
        coverage,
        f"- **G6 verdict:** {s['validate_verdict']}",
        f"- **answer-region count:** {s.get('answer_regions')}",
    ]
    if s["validate_flags"]:
        lines += ["", "**Flags (human-review hints):**"]
        lines += [f"- {flag}" for flag in s["validate_flags"]]

    artifacts = "**Artifacts:** `structured.json`, `validate.json`"
    if kind != "born-digital fast-path":
        artifacts += (", `furniture.json`, `bands.json`, `page_roles.json`, `template.json`, "
                      f"`overlays/` ({n_imgs} images)")
    else:
        artifacts += " (born-digital: no page geometry → no overlays)"
    lines += ["", artifacts]

    # The report contains non-ASCII characters ("·", "→"), so the encoding is
    # pinned instead of relying on the locale default; the context manager
    # guarantees the file is flushed and closed.
    with open(os.path.join(d, "report.md"), "w", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")
    return n_imgs
|
||
|
||
|
||
def main():
    """Command-line entry point: build the whole bundle under ``FINAL``.

    Flags:
        --no-overlays  skip the overlay rendering steps of geometry papers.
        --b1-only      process only ``B1_GEOMETRY`` (no fast-path papers);
                       otherwise ``GEOMETRY`` and ``FAST`` are processed.
        --prepare-ocr  ensure each geometry paper's OCR cache before running.

    Writes a ``report.md`` per paper, then ``catalog.json`` and ``INDEX.md``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--no-overlays", action="store_true")
    ap.add_argument("--b1-only", action="store_true",
                    help="run only the Sprint B1 image-only OCR eval corpus")
    ap.add_argument("--prepare-ocr", action="store_true",
                    help="populate missing B1 RapidOCR caches via dsync before running")
    a = ap.parse_args()

    os.makedirs(FINAL, exist_ok=True)
    catalog = {"generated_at": datetime.datetime.now().isoformat(timespec="seconds"),
               "papers": []}
    total_imgs = 0

    gt_fixtures = load_gt_labels()
    geometry = B1_GEOMETRY if a.b1_only else GEOMETRY
    fast = [] if a.b1_only else FAST

    for p in geometry:
        print(f"[geometry] {p['slug']}")
        # Fixture entries are keyed by gt_key, falling back to the slug.
        gt_labels = (gt_fixtures.get(p.get("gt_key") or p["slug"], {}) or {}).get("labels")
        s, d = do_geometry(p, not a.no_overlays, gt_labels=gt_labels, prepare_ocr=a.prepare_ocr)
        n = per_paper_report(p, s, d, p["path"])
        total_imgs += n
        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
                                  "kind": "geometry", "path": p["path"], "dir": d,
                                  "overlay_images": n, **s})

    for p in fast:
        print(f"[fast] {p['slug']}")
        s, d = do_fast(p)
        per_paper_report(p, s, d, "born-digital fast-path")
        catalog["papers"].append({**{k: p[k] for k in ("slug", "title", "board", "level")},
                                  "kind": "fast", "path": "born-digital fast-path", "dir": d, **s})

    # Write inside a context manager so the catalog is flushed and closed
    # deterministically instead of relying on garbage collection of the handle.
    with open(os.path.join(FINAL, "catalog.json"), "w") as fh:
        json.dump(catalog, fh, indent=2)
    write_index(catalog, total_imgs)
    print(f"\n-> {len(catalog['papers'])} papers, {total_imgs} overlay images -> {FINAL}/")
|
||
|
||
|
||
def write_index(catalog, total_imgs):
    """Write the human-readable ``INDEX.md`` into ``FINAL``.

    Args:
        catalog: dict with ``generated_at`` and a ``papers`` list whose
            entries carry ``kind`` (``"geometry"`` or ``"fast"``) plus the
            stats fields rendered in the tables.
        total_imgs: overlay image count quoted in the header line.

    A missing coverage value is rendered as ``n/a`` in both tables.
    """
    def coverage_cell(paper):
        # Shared by both tables so a missing value never renders as "n/a%".
        pct = paper["coverage_pct"]
        return f"{pct}%" if pct is not None else "n/a"

    geometry_papers = [p for p in catalog["papers"] if p["kind"] == "geometry"]
    fast_papers = [p for p in catalog["papers"] if p["kind"] == "fast"]

    out = ["# Final corpus output — exam-extraction spike", "",
           f"Generated {catalog['generated_at']}. {len(catalog['papers'])} paper-runs across "
           f"3 boards × 2 levels, both pipeline paths; {total_imgs} overlay debug images.", "",
           "Each `<slug>/` holds the machine artifacts (JSON) + `report.md`; geometry papers also have "
           "`overlays/template/` (human-review view, all pages) and `overlays/debug/` (raw-detection view).",
           "Machine catalog: `catalog.json`.", "",
           "## Image-only / OCR-path (with geometry + overlays)", "",
           "| Paper | Board / level | Q/parts | Marks/max | Coverage | Answer regions | G6 | Images |",
           "|---|---|---|---|---|---|---|---|"]
    for p in geometry_papers:
        out.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
                   f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
                   f"({p['marks_pct']}%) | {coverage_cell(p)} | {p.get('answer_regions')} | "
                   f"{p['validate_verdict']} | {p['overlay_images']} |")

    out += ["", "## Born-digital fast-path (CPU, no geometry)", "",
            "| Paper | Board / level | Q/parts | Marks/max | Coverage | G6 |",
            "|---|---|---|---|---|---|"]
    for p in fast_papers:
        out.append(f"| [{p['title']}]({p['slug']}/report.md) | {p['board']} {p['level']} | "
                   f"{p['n_questions']}/{p['n_parts']} | {p['marks_sum']}/{p['official_max']} "
                   f"({p['marks_pct']}%) | {coverage_cell(p)} | "
                   f"{p['validate_verdict']} |")

    out += ["", "## Per-paper directory layout", "```",
            "<slug>/",
            " structured.json extract.py output (questions->parts->marks/bbox/regions)",
            " validate.json G6 consistency judge (confidence + flags)",
            " furniture.json recurring-furniture mask + content margins [geometry only]",
            " bands.json main + part y-bands [geometry only]",
            " page_roles.json per-page role + margin override [geometry only]",
            " template.json editable first-pass template (source/confirmed) [geometry only]",
            " overlays/template/ human-review view, all pages [geometry only]",
            " overlays/debug/ raw-detection view, sample pages [geometry only]",
            " report.md per-paper human summary", "```"]

    # The index contains non-ASCII characters ("—", "×"), so the encoding is
    # pinned; the context manager guarantees the file is flushed and closed.
    with open(os.path.join(FINAL, "INDEX.md"), "w", encoding="utf-8") as fh:
        fh.write("\n".join(out) + "\n")
|
||
|
||
|
||
# Script entry point: build the bundle only when executed directly, not on import.
if __name__ == "__main__":
    main()
|