api/run/initialization/manifests/generate_corpus_manifest.py
CC Worker 5750413f43 feat(seed): implement exam-corpus loader + filled 505-paper manifest
Implements the seed_exam_corpus.py skeleton TODOs against the real APIs and
fills the public exam corpus from official board sources.

Loader (run/initialization/seed_exam_corpus.py):
- _resolve_source_bytes: local path | url: fetch with on-disk cache + PDF validation
- upload_file: real StorageAdmin.upload_file, skip-if-exists+sha256 unless --force
- upsert_specification/upsert_paper: real upserts on spec_code/exam_code.
  Fix: QP/MS/INSERT/ER role -> eb_exams.type_code; doc_type set to 'pdf'
  (doc_type is CHECK-constrained to file formats; the skeleton wrote the role there).
- copy_user_test_subset: copy a QP subset into a test user's cc.users exam space + files rows
- first_sweep: auto_map + the /auto-map row mapper over seeded QPs -> system-owned
  exam_templates + questions/response_areas/boundaries/layout (idempotent)
- identity discovery via institute_memberships.profile_id

Manifest (run/initialization/manifests/):
- exam-corpus.yaml: 505 papers / 18 specs / AQA+Edexcel+OCR, every source URL HEAD-verified.
  AQA sciences GCSE 8461/8462/8463/8464 + AS/A-level 7401-7408, sessions JUN18-JUN24, QP+MS+ER, F+H.
- generate_corpus_manifest.py: regenerates + re-verifies all URLs from official hosts.

seed_curriculum.py: deprecation banner -> superseded by seed_exam_corpus.py; storage_loc
standardised on cc.examboards.

Verified on dev .94: full 505-paper seed (eb_specifications=18, eb_exams=505, QP=211),
idempotent re-runs, first-sweep + user-subset, 6/6 buckets provisioned.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-07 22:58:03 +00:00

292 lines
16 KiB
Python

#!/usr/bin/env python3
"""
generate_corpus_manifest.py — build the public exam-corpus manifest from OFFICIAL sources,
verifying every source URL is live before it is written.
Output: exam-corpus.yaml (consumed by run/initialization/seed_exam_corpus.py).
Sources (all official exam-board hosts; public past-paper PDFs):
AQA filestore.aqa.org.uk — fully templatable; enumerated + HEAD-verified here.
Edexcel qualifications.pearson.com — date suffix non-derivable; confirmed URLs embedded.
OCR www.ocr.org.uk/Images — opaque doc-id; confirmed URLs embedded.
Every URL is probed with a small ranged GET (status 200/206 + a PDF content-type) before
inclusion, so the committed manifest never carries a dead or wrong-cased link. Re-run to
refresh as more sessions go public.
Conventions (locked — see ~/cc/ideas/2026-06-07-exam-paper-ingestion.md):
session = "YYYY-Mon" e.g. 2022-Jun
exam_code = BOARD-award-PAPER-SESSIONCOMPACT-ROLE e.g. AQA-8463-1H-2022JUN-QP
"""
from __future__ import annotations
import concurrent.futures as cf
import os
import sys
import urllib.error
import urllib.request
from typing import Any, Dict, List, Optional, Tuple
import yaml
# Root of AQA's public filestore; aqa_url() appends /<year>/<month folder>/<file name>.
AQA_BASE = "https://filestore.aqa.org.uk/sample-papers-and-mark-schemes"
# Manifest role -> token embedded in AQA file names (ER is published under "WRE").
ROLE_TOKEN = {"QP": "QP", "MS": "MS", "ER": "WRE"} # AQA filestore role tokens
# Session month prefix -> (filestore folder name, month label used in "YYYY-Mon" sessions).
MONTHS = {"JUN": ("june", "Jun"), "NOV": ("november", "Nov")}
# Date written to every item's provenance["fetched"] and to the manifest's collected_at.
# NOTE(review): this is hard-coded, so a later re-run still records this date — confirm
# whether it should be bumped (or derived from the run date) when the manifest is refreshed.
FETCHED = "2026-06-07"
def head_ok(url: str, timeout: int = 20) -> bool:
    """Return True iff *url* answers with a PDF, judged by status code and content-type.

    Despite the name this sends a ranged GET (``Range: bytes=0-3``) rather than a HEAD
    request, so a server that honours the range only returns a few bytes. Redirects are
    followed by urllib's default opener. AQA soft-404s redirect to
    www.aqa.org.uk/req_path=... (text/html), which is why the content-type is checked in
    addition to the status code.

    Args:
        url: Absolute URL to probe.
        timeout: Socket timeout in seconds, passed to ``urlopen``.

    Returns:
        True when the final response status is 200 or 206 and its content-type contains
        "pdf". False for every other outcome: malformed URL, HTTP error status,
        timeout, connection failure or any other ordinary exception.
    """
    try:
        # Request() validates the URL and raises ValueError on a malformed one, so it is
        # built inside the try: a bad URL must count as "not live", not abort the build.
        req = urllib.request.Request(
            url, headers={"Range": "bytes=0-3", "User-Agent": "cc-corpus/1.0"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as r:
            ctype = (r.headers.get("content-type") or "").lower()
            return r.status in (200, 206) and "pdf" in ctype
    except urllib.error.HTTPError:
        # urllib raises HTTPError only for non-2xx responses (404, unfollowable
        # redirects, ...), so nothing that lands here can be a live PDF.
        return False
    except Exception:
        # Deliberate best-effort: this runs inside worker threads, and any network or
        # protocol failure simply means the candidate is left out of the manifest.
        return False
# ─────────────────────────── AQA catalogue ───────────────────────────
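# AQA_SPECS row layout:
#   (spec_code, subject, award, award_level, first_teach, [(filestore_papercode, paper_code, tier), ...])
# The helpers below build that trailing paper list for each qualification type.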
def _gcse_single(award: str) -> List[Tuple[str, str, Optional[str]]]:
out = []
for paper in ("1", "2"):
for tier in ("F", "H"):
out.append((f"{award}{paper}{tier}", f"{award}/{paper}{tier}", tier))
return out
def _trilogy(award: str) -> List[Tuple[str, str, Optional[str]]]:
out = []
for subj in ("B", "C", "P"):
for paper in ("1", "2"):
for tier in ("F", "H"):
out.append((f"{award}{subj}{paper}{tier}", f"{award}/{subj}/{paper}{tier}", tier))
return out
def _alevel(award: str, papers=("1", "2", "3")) -> List[Tuple[str, str, Optional[str]]]:
return [(f"{award}{p}", f"{award}/{p}", None) for p in papers]
# One row per AQA specification; the last element of each row is the paper list built by
# the helper matching that qualification type (single-science GCSE, Trilogy, AS/A-level).
AQA_SPECS = [
("AQA-BIOL-8461", "BIOLOGY", "8461", "GCSE", "2016", _gcse_single("8461")),
("AQA-CHEM-8462", "CHEMISTRY", "8462", "GCSE", "2016", _gcse_single("8462")),
("AQA-PHYS-8463", "PHYSICS", "8463", "GCSE", "2016", _gcse_single("8463")),
("AQA-COMB-8464", "COMBINED SCIENCE TRILOGY", "8464", "GCSE", "2016", _trilogy("8464")),
# AS awards are enumerated with papers 1-2 only; A-level awards use the default 1-3.
("AQA-BIOL-7401", "BIOLOGY", "7401", "AS", "2015", _alevel("7401", ("1", "2"))),
("AQA-BIOL-7402", "BIOLOGY", "7402", "A-level", "2015", _alevel("7402")),
("AQA-CHEM-7404", "CHEMISTRY", "7404", "AS", "2015", _alevel("7404", ("1", "2"))),
("AQA-CHEM-7405", "CHEMISTRY", "7405", "A-level", "2015", _alevel("7405")),
("AQA-PHYS-7407", "PHYSICS", "7407", "AS", "2015", _alevel("7407", ("1", "2"))),
("AQA-PHYS-7408", "PHYSICS", "7408", "A-level", "2015", _alevel("7408")),
]
# Sessions probed for every paper, written <MON><YY>; the month prefix must be a key of MONTHS.
AQA_SESSIONS = ["JUN18", "JUN19", "NOV20", "NOV21", "JUN22", "JUN23", "JUN24"]
# Document roles probed for every paper/session; each must be a key of ROLE_TOKEN.
AQA_ROLES = ["QP", "MS", "ER"]
def aqa_url(papercode: str, role: str, session: str) -> Tuple[str, str]:
    """Derive the AQA filestore URL and file name for one paper / role / session.

    Args:
        papercode: Filestore paper code, e.g. "84631H".
        role: Manifest role; must be a key of ROLE_TOKEN.
        session: Compact session such as "JUN22" (month prefix + two-digit year);
            the month prefix must be a key of MONTHS.

    Returns:
        (url, file_name), where the URL is AQA_BASE/<20YY>/<month folder>/<file_name>.

    Raises:
        KeyError: if the month prefix or the role is unknown.
    """
    month_key, short_year = session[:3], session[3:]
    month_folder = MONTHS[month_key][0]
    file_name = f"AQA-{papercode}-{ROLE_TOKEN[role]}-{session}.PDF"
    full_year = "20" + short_year
    return f"{AQA_BASE}/{full_year}/{month_folder}/{file_name}", file_name
def session_pretty(session: str) -> Tuple[str, str]:
    """Convert a compact AQA session such as "JUN22" into its two manifest spellings.

    Returns:
        (display, compact): display is "YYYY-Mon" (e.g. "2022-Jun"); compact is the
        year-first "YYYYMON" form (e.g. "2022JUN") used inside exam_code, matching the
        locked exam_code convention and the Edexcel/OCR entries.

    Raises:
        KeyError: if the three-letter month prefix is not a key of MONTHS.
    """
    month_key = session[:3]          # "JUN" | "NOV"
    full_year = "20" + session[3:]   # "22" -> "2022"
    month_label = MONTHS[month_key][1]
    display = f"{full_year}-{month_label}"
    compact = f"{full_year}{month_key}"
    return display, compact
def build_aqa() -> Dict[str, Any]:
    """Build the AQA board entry from every templated URL that verifies as a live PDF.

    Enumerates AQA_SPECS x papers x AQA_SESSIONS x AQA_ROLES, derives each candidate URL
    with aqa_url(), probes all of them concurrently with head_ok(), and keeps only the
    candidates that verified. Progress and per-spec counts are written to stderr.

    Returns:
        {"exam_board_code": "AQA", "specifications": [...]}. Specifications follow
        AQA_SPECS order, each carrying its live papers sorted by exam_code; a
        specification with no live paper is omitted.
    """
    # One row per candidate document:
    # (spec_code, subject, award, paper_fc, paper_code, tier, role, session, url, fname)
    candidates: List[Tuple[str, str, str, str, str, Optional[str], str, str, str, str]] = []
    for spec_code, subject, award, _level, _first_teach, papers in AQA_SPECS:
        for paper_fc, paper_code, tier in papers:
            for session in AQA_SESSIONS:
                for role in AQA_ROLES:
                    url, fname = aqa_url(paper_fc, role, session)
                    candidates.append((spec_code, subject, award, paper_fc, paper_code, tier,
                                       role, session, url, fname))

    print(f"[AQA] HEAD-verifying {len(candidates)} candidate URLs...", file=sys.stderr)
    # candidate index -> whether its URL verified; filled in completion order.
    live: Dict[int, bool] = {}
    with cf.ThreadPoolExecutor(max_workers=24) as ex:
        futs = {
            ex.submit(head_ok, url): idx
            for idx, (*_, url, _fname) in enumerate(candidates)
        }
        done = 0
        for fut in cf.as_completed(futs):
            live[futs[fut]] = fut.result()
            done += 1
            if done % 60 == 0:
                print(f" ...{done}/{len(candidates)} ({sum(live.values())} live)", file=sys.stderr)

    # Walk the candidates in enumeration order so the result does not depend on the
    # order in which the probes happened to finish.
    papers_by_spec: Dict[str, List[Dict[str, Any]]] = {}
    for idx, row in enumerate(candidates):
        if not live.get(idx):
            continue
        spec_code, _subject, award, paper_fc, paper_code, tier, role, session, url, fname = row
        sess_pretty, sess_compact = session_pretty(session)
        token = paper_fc[len(award):]  # "1H" / "P1H" / "1"
        exam_code = f"AQA-{award}-{token}-{sess_compact}-{role}"
        papers_by_spec.setdefault(spec_code, []).append({
            "exam_code": exam_code,
            "paper_code": paper_code,
            "tier": tier,
            "session": sess_pretty,
            "doc_type": role,
            "file": {
                "source": f"url:{url}",
                "original_name": fname,
                "provenance": {"source_url": url, "fetched": FETCHED,
                               "license": "AQA public past paper"},
            },
        })

    spec_list = []
    for spec_code, subject, award, level, first_teach, _papers in AQA_SPECS:
        if spec_code not in papers_by_spec:
            continue
        papers = sorted(papers_by_spec[spec_code], key=lambda p: p["exam_code"])
        spec_list.append({
            "spec_code": spec_code, "exam_board_code": "AQA", "subject_code": subject,
            "award_code": award, "award_level": level, "first_teach": first_teach,
            "papers": papers,
        })
        print(f"[AQA] {spec_code}: {len(papers)} live papers", file=sys.stderr)
    return {"exam_board_code": "AQA", "specifications": spec_list}
# ─────────────── Edexcel / OCR — confirmed direct URLs (re-verified at build) ───────────────
# *_SPECS maps spec_code -> (subject, award, level, first_teach).
# *_PAPERS rows are (spec_code, exam_code, paper_code, tier, session, role, url) — the order
# build_board() unpacks them in. original_name is not stored here; build_board() derives it
# from the URL's basename.
EDEXCEL_SPECS = {
"EDX-BIOL-1BI0": ("BIOLOGY", "1BI0", "GCSE", "2016"),
"EDX-CHEM-1CH0": ("CHEMISTRY", "1CH0", "GCSE", "2016"),
"EDX-PHYS-1PH0": ("PHYSICS", "1PH0", "GCSE", "2016"),
"EDX-COMB-1SC0": ("COMBINED SCIENCE", "1SC0", "GCSE", "2016"),
}
# Common prefix of every Edexcel URL below.
_EDX = "https://qualifications.pearson.com/content/dam/pdf/GCSE/Science/2016"
# NOTE(review): the two 2022 rows use a lower-case "exam-materials" path segment while the
# rest use "Exam-materials" — presumably the real casing on the host; confirm before
# normalising. The 1SC0 row carries tier None.
EDEXCEL_PAPERS = [
("EDX-BIOL-1BI0", "EDX-1BI0-1H-2024JUN-QP", "1BI0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-1h-que-20240511.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-2F-2023JUN-QP", "1BI0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2f-que-20230610.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-2H-2023JUN-QP", "1BI0/2H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2h-que-20230610.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-1F-2023JUN-MS", "1BI0/1F", "F", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1f-rms-20230824.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-1H-2024JUN-MS", "1BI0/1H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1h-rms-20240822.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-1H-2022JUN-MS", "1BI0/1H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1bi0-1h-rms-20220825.pdf"),
("EDX-CHEM-1CH0", "EDX-1CH0-1F-2023JUN-QP", "1CH0/1F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1f-que-20230523.pdf"),
("EDX-CHEM-1CH0", "EDX-1CH0-1H-2024JUN-QP", "1CH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1h-que-20240518.pdf"),
("EDX-CHEM-1CH0", "EDX-1CH0-2H-2024JUN-MS", "1CH0/2H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1ch0-2h-rms-20240822.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-1H-2023JUN-QP", "1PH0/1H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20230526.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-2F-2023JUN-QP", "1PH0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-2f-que-20230617.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-1H-2024JUN-QP", "1PH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20240523.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-2H-2023JUN-MS", "1PH0/2H", "H", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1ph0-2h-rms-20230824.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-2H-2022JUN-MS", "1PH0/2H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1ph0-2h-rms-20220825.pdf"),
("EDX-COMB-1SC0", "EDX-1SC0-1CH-2023JUN-MS", "1SC0/1CH", None, "2023-Jun", "MS", f"{_EDX}/Exam-materials/1sc0-1ch-rms-20230824.pdf"),
]
OCR_SPECS = {
"OCR-BIOL-J247": ("BIOLOGY", "J247", "GCSE", "2016"),
"OCR-CHEM-J248": ("CHEMISTRY", "J248", "GCSE", "2016"),
"OCR-PHYS-J249": ("PHYSICS", "J249", "GCSE", "2016"),
"OCR-COMB-J250": ("COMBINED SCIENCE", "J250", "GCSE", "2016"),
}
# Common prefix of every OCR URL below; the numeric document id in each file name is opaque.
_OCR = "https://www.ocr.org.uk/Images"
OCR_PAPERS = [
("OCR-BIOL-J247", "OCR-J247-1-2024JUN-QP", "J247/01", "F", "2024-Jun", "QP", f"{_OCR}/727713-question-paper-paper-1.pdf"),
("OCR-BIOL-J247", "OCR-J247-1-2024JUN-MS", "J247/01", "F", "2024-Jun", "MS", f"{_OCR}/727745-mark-scheme-paper-1.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2024JUN-QP", "J247/03", "H", "2024-Jun", "QP", f"{_OCR}/727715-question-paper-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2024JUN-MS", "J247/03", "H", "2024-Jun", "MS", f"{_OCR}/727747-mark-scheme-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-1-2023JUN-QP", "J247/01", "F", "2023-Jun", "QP", f"{_OCR}/704945-question-paper-paper-1.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2023JUN-MS", "J247/03", "H", "2023-Jun", "MS", f"{_OCR}/704979-mark-scheme-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2022JUN-QP", "J247/03", "H", "2022-Jun", "QP", f"{_OCR}/678031-question-paper-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-1-2022JUN-MS", "J247/01", "F", "2022-Jun", "MS", f"{_OCR}/678076-mark-scheme-paper-1.pdf"),
("OCR-CHEM-J248", "OCR-J248-1-2024JUN-QP", "J248/01", "F", "2024-Jun", "QP", f"{_OCR}/727718-question-paper-paper-1.pdf"),
("OCR-CHEM-J248", "OCR-J248-3-2024JUN-MS", "J248/03", "H", "2024-Jun", "MS", f"{_OCR}/727751-mark-scheme-paper-3.pdf"),
("OCR-CHEM-J248", "OCR-J248-1-2023JUN-QP", "J248/01", "F", "2023-Jun", "QP", f"{_OCR}/704950-question-paper-paper-1.pdf"),
("OCR-CHEM-J248", "OCR-J248-3-2022JUN-QP", "J248/03", "H", "2022-Jun", "QP", f"{_OCR}/678036-question-paper-paper-3.pdf"),
("OCR-PHYS-J249", "OCR-J249-1-2024JUN-QP", "J249/01", "F", "2024-Jun", "QP", f"{_OCR}/727724-question-paper-paper-1.pdf"),
("OCR-PHYS-J249", "OCR-J249-3-2024JUN-MS", "J249/03", "H", "2024-Jun", "MS", f"{_OCR}/727755-mark-scheme-paper-3.pdf"),
("OCR-PHYS-J249", "OCR-J249-1-2023JUN-QP", "J249/01", "F", "2023-Jun", "QP", f"{_OCR}/704956-question-paper-paper-1.pdf"),
("OCR-PHYS-J249", "OCR-J249-3-2022JUN-MS", "J249/03", "H", "2022-Jun", "MS", f"{_OCR}/678086-mark-scheme-paper-3.pdf"),
("OCR-COMB-J250", "OCR-J250-1-2024JUN-QP", "J250/01", "F", "2024-Jun", "QP", f"{_OCR}/727730-question-paper-paper-1.pdf"),
("OCR-COMB-J250", "OCR-J250-7-2024JUN-MS", "J250/07", "H", "2024-Jun", "MS", f"{_OCR}/727763-mark-scheme-paper-7.pdf"),
]
def build_board(board_code: str, specs_meta: Dict, papers: List) -> Dict[str, Any]:
    """Build one board entry from a hand-confirmed paper list, re-verifying every URL.

    Args:
        board_code: Board identifier written to the manifest and used in log lines
            and in each item's license text.
        specs_meta: spec_code -> (subject, award, level, first_teach).
        papers: Rows of (spec_code, exam_code, paper_code, tier, session, role, url).

    Returns:
        {"exam_board_code": board_code, "specifications": [...]}. Specifications follow
        the iteration order of *specs_meta*, each with its live papers sorted by
        exam_code. URLs that fail head_ok() are reported on stderr and dropped; a
        specification left with no live paper is omitted.
    """
    print(f"[{board_code}] re-verifying {len(papers)} confirmed URLs...", file=sys.stderr)

    # URLs are probed one at a time, in list order.
    live_by_spec: Dict[str, List[Dict[str, Any]]] = {}
    for spec_code, exam_code, paper_code, tier, session, role, url in papers:
        if head_ok(url):
            provenance = {
                "source_url": url,
                "fetched": FETCHED,
                "license": f"{board_code} public past paper",
            }
            file_info = {
                "source": f"url:{url}",
                "original_name": os.path.basename(url),
                "provenance": provenance,
            }
            entry = {
                "exam_code": exam_code,
                "paper_code": paper_code,
                "tier": tier,
                "session": session,
                "doc_type": role,
                "file": file_info,
            }
            live_by_spec.setdefault(spec_code, []).append(entry)
        else:
            print(f" DROP (not live): {url}", file=sys.stderr)

    specifications = []
    for spec_code, meta in specs_meta.items():
        if spec_code in live_by_spec:
            subject, award, level, first_teach = meta
            ordered = list(live_by_spec[spec_code])
            ordered.sort(key=lambda item: item["exam_code"])
            specifications.append({
                "spec_code": spec_code,
                "exam_board_code": board_code,
                "subject_code": subject,
                "award_code": award,
                "award_level": level,
                "first_teach": first_teach,
                "papers": ordered,
            })
            print(f"[{board_code}] {spec_code}: {len(live_by_spec[spec_code])} live papers",
                  file=sys.stderr)

    return {"exam_board_code": board_code, "specifications": specifications}
def main() -> None:
    """Verify all three boards and write exam-corpus.yaml next to this script.

    Builds the AQA, Edexcel and OCR entries in that order, wraps them in the manifest
    envelope (version, defaults, provenance, boards) and dumps it as block-style YAML
    with key order preserved. A one-line summary is printed to stderr.
    """
    script_dir = os.path.dirname(__file__)
    out_path = os.path.join(script_dir, "exam-corpus.yaml")

    boards = [build_aqa()]
    boards.append(build_board("EDEXCEL", EDEXCEL_SPECS, EDEXCEL_PAPERS))
    boards.append(build_board("OCR", OCR_SPECS, OCR_PAPERS))

    # Totals for the closing summary line.
    n_specs = 0
    n_papers = 0
    for board in boards:
        n_specs += len(board["specifications"])
        for spec in board["specifications"]:
            n_papers += len(spec["papers"])

    license_posture = (
        "Public exam-board past papers downloaded from each board's own "
        "official site (AQA filestore, Pearson DAM, OCR Images). Stored in "
        "the private dev cc.examboards bucket for internal exam-marker dev/test. "
        "Each item records its source_url. Review redistribution rights before "
        "any public exposure."
    )
    provenance = {
        "collected_by": "kcar",
        "collected_at": FETCHED,
        "license_posture": license_posture,
        "sources": {
            "AQA": "https://filestore.aqa.org.uk/sample-papers-and-mark-schemes/",
            "EDEXCEL": "https://qualifications.pearson.com/en/support/support-topics/exams/past-papers.html",
            "OCR": "https://www.ocr.org.uk/qualifications/past-paper-finder/",
        },
    }
    manifest = {
        "version": 1,
        "defaults": {"bucket": "cc.examboards"},
        "provenance": provenance,
        # Optional: uncomment + set on dev .94 to exercise user-side flows / first-sweep.
        # "test_subset": {"user_email": "teacher@kevlarai.test", "papers": 2},
        # "system_identity": {"user_email": "teacher@kevlarai.test"},
        "boards": boards,
    }

    # sort_keys=False keeps the insertion order above in the written file.
    with open(out_path, "w") as fh:
        yaml.safe_dump(manifest, fh, sort_keys=False, default_flow_style=False, width=120)

    print(f"\nWROTE {out_path}: {n_specs} specs, {n_papers} papers across {len(boards)} boards",
          file=sys.stderr)


if __name__ == "__main__":
    main()