PRIMARY — corpus breadth (505->1178 papers, 18->60 specs, all URLs HEAD-verified): - AQA (enumerated): Maths, English Lang/Lit, Geography, Computer Science, Business, Psychology, MFL (French/Spanish/German), GCSE + A-level, on top of round-1 sciences. - Edexcel + OCR (confirmed direct URLs via research): Maths, English, Geography, History, Business, Computer Science, GCSE + A-level. - generate_corpus_manifest.py: _subj/_mfl AQA builders, Edexcel/OCR spec+URL tables, derived exam_code (_mk_exam_code) matching the locked convention, concurrent re-verify. Verified on dev .94: eb_specifications=60, eb_exams=1178, QP=469, doc_type all 'pdf', seed idempotent (uploaded=673 new, skipped=505), failed=0. SECONDARY: - --download-only + persistent bucket-shaped local store (manifests/_corpus_store/, gitignored): download-once, seed-many, offline-repeatable; --store-dir/--no-store. (_store_path/_item_bytes/ download_corpus). Verified: store populated, seed reads offline (download_cached). - --unseed [--board/--spec]: inverse loader — storage objects (Storage API; protect_delete blocks raw SQL), first-sweep seed templates, eb_exams, eb_specifications. Verified reversible on .94. - Granular admin reset: POST /admin/reset?scope=all|exam-corpus|timetable. reset_environment.reset(scope) adds EXAM_CORPUS_TABLES (10) + cc.examboards storage cleanup + TIMETABLE_TABLES (13); 'all' now also clears the exam subsystem the legacy reset missed. No schema migration required. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
502 lines
35 KiB
Python
502 lines
35 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
generate_corpus_manifest.py — build the public exam-corpus manifest from OFFICIAL sources,
|
|
verifying every source URL is live before it is written.
|
|
|
|
Output: exam-corpus.yaml (consumed by run/initialization/seed_exam_corpus.py).
|
|
|
|
Sources (all official exam-board hosts; public past-paper PDFs):
|
|
AQA filestore.aqa.org.uk — fully templatable; enumerated + HEAD-verified here.
|
|
Edexcel qualifications.pearson.com — date suffix non-derivable; confirmed URLs embedded.
|
|
OCR www.ocr.org.uk/Images — opaque doc-id; confirmed URLs embedded.
|
|
|
|
Every URL is HEAD/GET-checked (200 + application/pdf) before inclusion, so the committed
|
|
manifest never carries a dead or wrong-cased link. Re-run to refresh as more sessions go public.
|
|
|
|
Conventions (locked — see ~/cc/ideas/2026-06-07-exam-paper-ingestion.md):
|
|
session = "YYYY-Mon" e.g. 2022-Jun
|
|
exam_code = BOARD-award-PAPER-SESSIONCOMPACT-ROLE e.g. AQA-8463-1H-2022JUN-QP
|
|
"""
|
|
from __future__ import annotations
|
|
import concurrent.futures as cf
|
|
import os
|
|
import sys
|
|
import urllib.error
|
|
import urllib.request
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
import yaml
|
|
|
|
# Root of AQA's public filestore; aqa_url() appends "/<year>/<month folder>/<file name>".
AQA_BASE = "https://filestore.aqa.org.uk/sample-papers-and-mark-schemes"
# Manifest role -> token embedded in the AQA file name (see aqa_url()).
# "ER" is presumably the examiner report, published under the "WRE" token — confirm.
ROLE_TOKEN = {"QP": "QP", "MS": "MS", "ER": "WRE"}  # AQA filestore role tokens
# Session month prefix -> (filestore folder name, month label used in the "YYYY-Mon" session).
MONTHS = {"JUN": ("june", "Jun"), "NOV": ("november", "Nov")}
# Date written into every paper's provenance record under "fetched".
FETCHED = "2026-06-07"
|
|
|
|
|
|
def head_ok(url: str, timeout: int = 20) -> bool:
    """Return True iff ``url`` resolves to a real PDF (200/206 + a PDF content-type).

    Redirects are followed. AQA soft-404s redirect to www.aqa.org.uk/req_path=... (text/html),
    so the status code alone is not enough and the content-type is checked as well. The probe
    is a tiny Range GET (stdlib urllib), so the whole PDF is never pulled just to verify it.

    Args:
        url: Candidate source URL.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        True when the response is 200 or 206 and its content-type mentions "pdf". False for
        every other outcome, including HTTP errors, network failures and malformed URLs.
    """
    try:
        # Request() itself raises ValueError for a malformed URL ("unknown url type"), so it
        # must sit inside the try: one bad table entry should be dropped, not abort the run.
        req = urllib.request.Request(
            url, headers={"Range": "bytes=0-3", "User-Agent": "cc-corpus/1.0"}
        )
        with urllib.request.urlopen(req, timeout=timeout) as r:
            ctype = (r.headers.get("content-type") or "").lower()
            return r.status in (200, 206) and "pdf" in ctype
    except Exception:
        # Deliberate best-effort: urllib only raises HTTPError for non-2xx responses, so an
        # HTTPError can never be a live PDF, and any other failure (DNS, timeout, TLS, bad URL)
        # likewise means "not verifiably live".
        return False
|
|
|
|
|
|
# ─────────────────────────── AQA catalogue ───────────────────────────
|
|
# spec_code, subject, award, award_level, first_teach, [(filestore_papercode, paper_code, tier), ...]
|
|
def _gcse_single(award: str) -> List[Tuple[str, str, Optional[str]]]:
|
|
out = []
|
|
for paper in ("1", "2"):
|
|
for tier in ("F", "H"):
|
|
out.append((f"{award}{paper}{tier}", f"{award}/{paper}{tier}", tier))
|
|
return out
|
|
|
|
def _trilogy(award: str) -> List[Tuple[str, str, Optional[str]]]:
|
|
out = []
|
|
for subj in ("B", "C", "P"):
|
|
for paper in ("1", "2"):
|
|
for tier in ("F", "H"):
|
|
out.append((f"{award}{subj}{paper}{tier}", f"{award}/{subj}/{paper}{tier}", tier))
|
|
return out
|
|
|
|
def _alevel(award: str, papers=("1", "2", "3")) -> List[Tuple[str, str, Optional[str]]]:
|
|
return [(f"{award}{p}", f"{award}/{p}", None) for p in papers]
|
|
|
|
def _subj(award: str, papers, tiers=(None,)) -> List[Tuple[str, str, Optional[str]]]:
|
|
"""Generic GCSE/A-level builder. tiers=('F','H') for tiered subjects (Maths/Science),
|
|
tiers=(None,) for untiered (English/Geography/CS/Business/Psychology)."""
|
|
out = []
|
|
for p in papers:
|
|
for t in tiers:
|
|
tl = t or ""
|
|
out.append((f"{award}{p}{tl}", f"{award}/{p}{tl}", t))
|
|
return out
|
|
|
|
def _mfl(award: str) -> List[Tuple[str, str, Optional[str]]]:
|
|
"""AQA MFL: Listening/Reading/Writing papers, each Foundation/Higher (Speaking is teacher-conducted,
|
|
no public QP). Filestore code encodes skill+tier, e.g. 8658LH = French Listening Higher."""
|
|
out = []
|
|
for skill in ("L", "R", "W"):
|
|
for t in ("F", "H"):
|
|
out.append((f"{award}{skill}{t}", f"{award}/{skill}{t}", t))
|
|
return out
|
|
|
|
# AQA catalogue. Each row is
#   (spec_code, subject, award, award_level, first_teach, papers)
# where `papers` is the list of (filestore_papercode, paper_code, tier) triples produced by
# one of the builder helpers. build_aqa() crosses every paper with AQA_SESSIONS x AQA_ROLES
# and keeps only the URLs that verify, so listing a paper here does not guarantee it appears
# in the manifest.
AQA_SPECS = [
    # ── Sciences (round 1 — kept at full depth) ──────────────────────────────────────
    ("AQA-BIOL-8461", "BIOLOGY", "8461", "GCSE", "2016", _gcse_single("8461")),
    ("AQA-CHEM-8462", "CHEMISTRY", "8462", "GCSE", "2016", _gcse_single("8462")),
    ("AQA-PHYS-8463", "PHYSICS", "8463", "GCSE", "2016", _gcse_single("8463")),
    ("AQA-COMB-8464", "COMBINED SCIENCE TRILOGY", "8464", "GCSE", "2016", _trilogy("8464")),
    ("AQA-BIOL-7401", "BIOLOGY", "7401", "AS", "2015", _alevel("7401", ("1", "2"))),
    ("AQA-BIOL-7402", "BIOLOGY", "7402", "A-level", "2015", _alevel("7402")),
    ("AQA-CHEM-7404", "CHEMISTRY", "7404", "AS", "2015", _alevel("7404", ("1", "2"))),
    ("AQA-CHEM-7405", "CHEMISTRY", "7405", "A-level", "2015", _alevel("7405")),
    ("AQA-PHYS-7407", "PHYSICS", "7407", "AS", "2015", _alevel("7407", ("1", "2"))),
    ("AQA-PHYS-7408", "PHYSICS", "7408", "A-level", "2015", _alevel("7408")),
    # ── Round 2 breadth — high-volume core (Maths, English) ───────────────────────────
    ("AQA-MATH-8300", "MATHEMATICS", "8300", "GCSE", "2015", _subj("8300", ("1", "2", "3"), ("F", "H"))),
    ("AQA-MATH-7357", "MATHEMATICS", "7357", "A-level", "2017", _alevel("7357", ("1", "2", "3"))),
    ("AQA-MATH-7356", "MATHEMATICS", "7356", "AS", "2017", _alevel("7356", ("1", "2"))),
    ("AQA-ENGL-8700", "ENGLISH LANGUAGE", "8700", "GCSE", "2015", _subj("8700", ("1", "2"))),
    ("AQA-ENGLIT-8702", "ENGLISH LITERATURE", "8702", "GCSE", "2015", _subj("8702", ("1", "2"))),
    ("AQA-ENGL-7702", "ENGLISH LANGUAGE", "7702", "A-level", "2015", _alevel("7702", ("1", "2"))),
    ("AQA-ENGLIT-7712", "ENGLISH LITERATURE A", "7712", "A-level", "2015", _alevel("7712", ("1", "2"))),
    # ── Round 2 breadth — humanities / others ─────────────────────────────────────────
    ("AQA-GEOG-8035", "GEOGRAPHY", "8035", "GCSE", "2016", _subj("8035", ("1", "2", "3"))),
    ("AQA-GEOG-7037", "GEOGRAPHY", "7037", "A-level", "2016", _alevel("7037", ("1", "2"))),
    ("AQA-COMP-8525", "COMPUTER SCIENCE", "8525", "GCSE", "2020", _subj("8525", ("1", "2"))),
    ("AQA-COMP-7517", "COMPUTER SCIENCE", "7517", "A-level", "2015", _alevel("7517", ("1", "2"))),
    ("AQA-BUS-8132", "BUSINESS", "8132", "GCSE", "2017", _subj("8132", ("1", "2"))),
    ("AQA-BUS-7132", "BUSINESS", "7132", "A-level", "2015", _alevel("7132", ("1", "2", "3"))),
    ("AQA-PSYC-8182", "PSYCHOLOGY", "8182", "GCSE", "2017", _subj("8182", ("1", "2"))),
    ("AQA-PSYC-7182", "PSYCHOLOGY", "7182", "A-level", "2015", _alevel("7182", ("1", "2", "3"))),
    # ── Round 2 breadth — modern foreign languages (Listening/Reading/Writing, F+H) ───
    ("AQA-FREN-8658", "FRENCH", "8658", "GCSE", "2016", _mfl("8658")),
    ("AQA-SPAN-8698", "SPANISH", "8698", "GCSE", "2016", _mfl("8698")),
    ("AQA-GERM-8668", "GERMAN", "8668", "GCSE", "2016", _mfl("8668")),
    ("AQA-FREN-7652", "FRENCH", "7652", "A-level", "2016", _alevel("7652", ("1", "2"))),
    ("AQA-SPAN-7692", "SPANISH", "7692", "A-level", "2016", _alevel("7692", ("1", "2"))),
    ("AQA-GERM-7662", "GERMAN", "7662", "A-level", "2016", _alevel("7662", ("1", "2"))),
]
|
|
# Sessions to enumerate, written <MON><YY>. aqa_url() and session_pretty() slice the first
# three characters as the month, which must be a key of MONTHS, and prefix the rest with "20".
AQA_SESSIONS = ["JUN18", "JUN19", "NOV20", "NOV21", "JUN22", "JUN23", "JUN24"]
# Document roles tried for every paper/session; each must be a key of ROLE_TOKEN.
AQA_ROLES = ["QP", "MS", "ER"]
|
|
|
|
|
|
def aqa_url(papercode: str, role: str, session: str) -> Tuple[str, str]:
    """Build the AQA filestore URL and file name for one paper/role/session.

    Args:
        papercode: Filestore paper code, e.g. "84631H".
        role: Manifest role; must be a key of ROLE_TOKEN.
        session: Session as <MON><YY>, e.g. "JUN22"; the month must be a key of MONTHS.

    Returns:
        (url, file_name), where the URL is AQA_BASE/<20YY>/<month folder>/<file_name>.

    Raises:
        KeyError: If the session month or the role is not recognised.
    """
    month_key, short_year = session[:3], session[3:]
    month_folder = MONTHS[month_key][0]
    file_name = f"AQA-{papercode}-{ROLE_TOKEN[role]}-{session}.PDF"
    return f"{AQA_BASE}/20{short_year}/{month_folder}/{file_name}", file_name
|
|
|
|
|
|
def session_pretty(session: str) -> Tuple[str, str]:
    """Convert a <MON><YY> session such as "JUN22" into its two manifest spellings.

    Returns:
        (display, compact): "2022-Jun" for the session field and "2022JUN" for the exam_code.
        The compact form is year-first, matching the locked exam_code convention and the
        Edexcel/OCR entries.

    Raises:
        KeyError: If the month prefix is not a key of MONTHS.
    """
    month_key = session[:3]        # "JUN" | "NOV"
    full_year = f"20{session[3:]}"  # "22" -> "2022"
    month_label = MONTHS[month_key][1]
    return f"{full_year}-{month_label}", f"{full_year}{month_key}"
|
|
|
|
|
|
def build_aqa() -> Dict[str, Any]:
    """Enumerate every templated AQA paper URL, keep the live ones, and group them by spec.

    Candidates are the cross product of each AQA_SPECS paper with AQA_SESSIONS and AQA_ROLES.
    Every candidate URL is probed concurrently with head_ok(); only candidates that verify
    are written to the result. Progress and per-spec totals are printed to stderr.

    Returns:
        {"exam_board_code": "AQA", "specifications": [...]}. Specifications follow AQA_SPECS
        order, specs without a single live paper are omitted, and each spec's papers are
        sorted by exam_code.
    """
    # One row per URL to probe:
    # (spec_code, award, paper_fc, paper_code, tier, role, session, url, fname)
    candidates: List[Tuple[str, str, str, str, Optional[str], str, str, str, str]] = []
    for spec_code, _subject, award, _level, _first_teach, papers in AQA_SPECS:
        for paper_fc, paper_code, tier in papers:
            for session in AQA_SESSIONS:
                for role in AQA_ROLES:
                    url, fname = aqa_url(paper_fc, role, session)
                    candidates.append((spec_code, award, paper_fc, paper_code, tier,
                                       role, session, url, fname))

    total = len(candidates)
    print(f"[AQA] HEAD-verifying {total} candidate URLs...", file=sys.stderr)
    live: Dict[int, bool] = {}
    with cf.ThreadPoolExecutor(max_workers=24) as ex:
        # Map each future back to its candidate index so results can arrive in any order.
        futs = {
            ex.submit(head_ok, url): i
            for i, (*_meta, url, _fname) in enumerate(candidates)
        }
        for done, fut in enumerate(cf.as_completed(futs), start=1):
            live[futs[fut]] = fut.result()
            if done % 60 == 0:
                print(f" ...{done}/{total} ({sum(live.values())} live)", file=sys.stderr)

    papers_by_spec: Dict[str, List[Dict[str, Any]]] = {}
    for i, candidate in enumerate(candidates):
        if not live.get(i):
            continue
        spec_code, award, paper_fc, paper_code, tier, role, session, url, fname = candidate
        sess_pretty, sess_compact = session_pretty(session)
        # The filestore code is the award followed by the paper token: "1H" / "P1H" / "1".
        token = paper_fc[len(award):]
        papers_by_spec.setdefault(spec_code, []).append({
            "exam_code": f"AQA-{award}-{token}-{sess_compact}-{role}",
            "paper_code": paper_code,
            "tier": tier,
            "session": sess_pretty,
            "doc_type": role,
            "file": {
                "source": f"url:{url}",
                "original_name": fname,
                "provenance": {"source_url": url, "fetched": FETCHED,
                               "license": "AQA public past paper"},
            },
        })

    spec_list = []
    for spec_code, subject, award, level, first_teach, _papers in AQA_SPECS:
        if spec_code not in papers_by_spec:
            continue
        papers = sorted(papers_by_spec[spec_code], key=lambda p: p["exam_code"])
        spec_list.append({
            "spec_code": spec_code, "exam_board_code": "AQA", "subject_code": subject,
            "award_code": award, "award_level": level, "first_teach": first_teach,
            "papers": papers,
        })
        print(f"[AQA] {spec_code}: {len(papers)} live papers", file=sys.stderr)
    return {"exam_board_code": "AQA", "specifications": spec_list}
|
|
|
|
|
|
# ─────────────── Edexcel / OCR — confirmed direct URLs (re-verified at build) ───────────────
# These boards aren't templatable (Edexcel has a non-derivable date suffix; OCR uses opaque
# doc-ids), so confirmed URLs are listed as 6-tuples: (spec_code, paper_code, tier, session, role,
# url). exam_code is DERIVED (see _mk_exam_code) so it always matches the locked convention.
#
# Board code -> prefix used at the front of derived exam codes. build_board() looks its
# board_code up here, so a board missing from this mapping raises KeyError there.
EXAM_CODE_PREFIX = {"EDEXCEL": "EDX", "OCR": "OCR"}
|
|
|
|
def _ec_token(paper_code: str) -> str:
|
|
t = paper_code.split("/")[-1]
|
|
return str(int(t)) if t.isdigit() else t # "01"->"1", "1H"->"1H", "1CH"->"1CH", "11"->"11"
|
|
|
|
def _mk_exam_code(prefix: str, award: str, paper_code: str, session: str, role: str) -> str:
    """Derive an exam_code of the form PREFIX-award-PAPER-SESSIONCOMPACT-ROLE.

    Args:
        prefix: Board prefix, e.g. "EDX".
        award: Award code, e.g. "1MA1".
        paper_code: Full paper code; reduced to its token by _ec_token().
        session: Session as "YYYY-Mon", e.g. "2023-Jun".
        role: Document role, e.g. "QP".

    Raises:
        ValueError: If ``session`` does not split into exactly two parts on "-".
    """
    year, month = session.split("-")
    paper_token = _ec_token(paper_code)
    compact_session = f"{year}{month.upper()}"  # "2023-Jun" -> "2023JUN"
    return f"{prefix}-{award}-{paper_token}-{compact_session}-{role}"
|
|
|
|
# Common root of the Pearson (Edexcel) PDF URLs listed in EDEXCEL_PAPERS.
_PE = "https://qualifications.pearson.com/content/dam/pdf"
# Shorthand for the GCSE Science (2016) subtree under _PE, used by the science rows.
_EDX = f"{_PE}/GCSE/Science/2016"
# Common root of the OCR document URLs listed in OCR_PAPERS.
_OCR = "https://www.ocr.org.uk/Images"
|
|
|
|
# Edexcel specifications: spec_code -> (subject, award code, award level, first-teach year).
# build_board() reads the award code (index 1) to derive exam codes and emits the specs in
# this insertion order, skipping any spec that ends up with no live paper.
EDEXCEL_SPECS = {
    "EDX-BIOL-1BI0": ("BIOLOGY", "1BI0", "GCSE", "2016"),
    "EDX-CHEM-1CH0": ("CHEMISTRY", "1CH0", "GCSE", "2016"),
    "EDX-PHYS-1PH0": ("PHYSICS", "1PH0", "GCSE", "2016"),
    "EDX-COMB-1SC0": ("COMBINED SCIENCE", "1SC0", "GCSE", "2016"),
    "EDX-MATH-1MA1": ("MATHEMATICS", "1MA1", "GCSE", "2015"),
    "EDX-ENGL-1EN0": ("ENGLISH LANGUAGE", "1EN0", "GCSE", "2015"),
    "EDX-ENGLIT-1ET0": ("ENGLISH LITERATURE", "1ET0", "GCSE", "2015"),
    "EDX-GEOG-1GA0": ("GEOGRAPHY A", "1GA0", "GCSE", "2016"),
    "EDX-HIST-1HI0": ("HISTORY", "1HI0", "GCSE", "2016"),
    "EDX-BUS-1BS0": ("BUSINESS", "1BS0", "GCSE", "2017"),
    "EDX-COMP-1CP2": ("COMPUTER SCIENCE", "1CP2", "GCSE", "2020"),
    "EDX-MATH-9MA0": ("MATHEMATICS", "9MA0", "A-level", "2017"),
    "EDX-ENGL-9EN0": ("ENGLISH LANGUAGE", "9EN0", "A-level", "2015"),
    "EDX-ENGLIT-9ET0": ("ENGLISH LITERATURE", "9ET0", "A-level", "2015"),
    "EDX-GEOG-9GE0": ("GEOGRAPHY", "9GE0", "A-level", "2016"),
}
|
|
# Confirmed Edexcel paper URLs, one 6-tuple per document:
#   (spec_code, paper_code, tier, session, role, url)
# spec_code must be a key of EDEXCEL_SPECS; session is "YYYY-Mon"; tier is "F", "H" or None.
# build_board() re-verifies every URL and drops the rows that are no longer live.
# NOTE(review): path casing varies between rows ("Exam-materials" / "exam-materials",
# "Mathematics" / "mathematics", "Computer-science" / "Computer-Science"). Presumably each is
# as published by Pearson — confirm before normalising, since the links may be case-sensitive.
EDEXCEL_PAPERS = [
    # ── Sciences (round 1) ──
    ("EDX-BIOL-1BI0", "1BI0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-1h-que-20240511.pdf"),
    ("EDX-BIOL-1BI0", "1BI0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2f-que-20230610.pdf"),
    ("EDX-BIOL-1BI0", "1BI0/2H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2h-que-20230610.pdf"),
    ("EDX-BIOL-1BI0", "1BI0/1F", "F", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1f-rms-20230824.pdf"),
    ("EDX-BIOL-1BI0", "1BI0/1H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1h-rms-20240822.pdf"),
    ("EDX-BIOL-1BI0", "1BI0/1H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1bi0-1h-rms-20220825.pdf"),
    ("EDX-CHEM-1CH0", "1CH0/1F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1f-que-20230523.pdf"),
    ("EDX-CHEM-1CH0", "1CH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1h-que-20240518.pdf"),
    ("EDX-CHEM-1CH0", "1CH0/2H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1ch0-2h-rms-20240822.pdf"),
    ("EDX-PHYS-1PH0", "1PH0/1H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20230526.pdf"),
    ("EDX-PHYS-1PH0", "1PH0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-2f-que-20230617.pdf"),
    ("EDX-PHYS-1PH0", "1PH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20240523.pdf"),
    ("EDX-PHYS-1PH0", "1PH0/2H", "H", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1ph0-2h-rms-20230824.pdf"),
    ("EDX-PHYS-1PH0", "1PH0/2H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1ph0-2h-rms-20220825.pdf"),
    # NOTE(review): "1CH" looks like a Higher-tier paper code, yet tier is None — confirm
    # whether this row should carry tier "H".
    ("EDX-COMB-1SC0", "1SC0/1CH", None, "2023-Jun", "MS", f"{_EDX}/Exam-materials/1sc0-1ch-rms-20230824.pdf"),
    # ── Maths 1MA1 (round 2) ──
    ("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-que-20230520.pdf"),
    ("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-rms-20230824.pdf"),
    ("EDX-MATH-1MA1", "1MA1/1F", "F", "2023-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-rms-20230824.pdf"),
    ("EDX-MATH-1MA1", "1MA1/1F", "F", "2024-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-que-20240517.pdf"),
    ("EDX-MATH-1MA1", "1MA1/1H", "H", "2024-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-que-20240517.pdf"),
    ("EDX-MATH-1MA1", "1MA1/1F", "F", "2024-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-rms-20240822.pdf"),
    ("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Nov", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-rms-20240111.pdf"),
    ("EDX-MATH-1MA1", "1MA1/1H", "H", "2022-Jun", "MS", f"{_PE}/GCSE/mathematics/2015/exam-materials/1ma1-1h-rms-20220825.pdf"),
    ("EDX-MATH-1MA1", "1MA1/3H", "H", "2022-Jun", "MS", f"{_PE}/GCSE/mathematics/2015/exam-materials/1ma1-3h-rms-20220825.pdf"),
    # ── English Language 1EN0 / Literature 1ET0 (round 2) ──
    ("EDX-ENGL-1EN0", "1EN0/01", None, "2024-Jun", "QP", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-que-20240524.pdf"),
    ("EDX-ENGL-1EN0", "1EN0/01", None, "2023-Nov", "QP", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-que-20231108.pdf"),
    ("EDX-ENGL-1EN0", "1EN0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-rms-20240822.pdf"),
    ("EDX-ENGL-1EN0", "1EN0/02", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-02-rms-20240822.pdf"),
    ("EDX-ENGL-1EN0", "1EN0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-rms-20230824.pdf"),
    ("EDX-ENGL-1EN0", "1EN0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-02-rms-20230824.pdf"),
    ("EDX-ENGLIT-1ET0", "1ET0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-que-20230518.pdf"),
    ("EDX-ENGLIT-1ET0", "1ET0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-02-que-20230525.pdf"),
    ("EDX-ENGLIT-1ET0", "1ET0/02", None, "2024-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-02-que-20240521.pdf"),
    ("EDX-ENGLIT-1ET0", "1ET0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-rms-20230824.pdf"),
    ("EDX-ENGLIT-1ET0", "1ET0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-rms-20240822.pdf"),
    # ── A-level Maths 9MA0 / English 9EN0 / 9ET0 (round 2) ──
    ("EDX-MATH-9MA0", "9MA0/01", None, "2023-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-01-que-20230607.pdf"),
    ("EDX-MATH-9MA0", "9MA0/31", None, "2023-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-31-que-20230621.pdf"),
    ("EDX-MATH-9MA0", "9MA0/02", None, "2024-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-02-que-20240612.pdf"),
    ("EDX-MATH-9MA0", "9MA0/31", None, "2023-Jun", "MS", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-31-rms-20230817.pdf"),
    ("EDX-MATH-9MA0", "9MA0/01", None, "2024-Jun", "MS", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-01-rms-20240815.pdf"),
    ("EDX-ENGL-9EN0", "9EN0/01", None, "2024-Jun", "MS", f"{_PE}/A-Level/English-Language/2015/Exam-materials/9en0-01-rms-20240815.pdf"),
    ("EDX-ENGL-9EN0", "9EN0/02", None, "2024-Jun", "MS", f"{_PE}/A-Level/English-Language/2015/Exam-materials/9en0-02-rms-20240815.pdf"),
    ("EDX-ENGLIT-9ET0", "9ET0/01", None, "2024-Jun", "QP", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-01-que-20240525.pdf"),
    ("EDX-ENGLIT-9ET0", "9ET0/01", None, "2023-Jun", "MS", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-01-rms-20230817.pdf"),
    ("EDX-ENGLIT-9ET0", "9ET0/03", None, "2023-Jun", "MS", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-03-rms-20230817.pdf"),
    # ── Humanities (round 2) ──
    ("EDX-GEOG-1GA0", "1GA0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-que-20230523.pdf"),
    ("EDX-GEOG-1GA0", "1GA0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-rms-20230824.pdf"),
    ("EDX-GEOG-1GA0", "1GA0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-02-que-20230610.pdf"),
    ("EDX-GEOG-1GA0", "1GA0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-02-rms-20230824.pdf"),
    ("EDX-GEOG-1GA0", "1GA0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-rms-20240822.pdf"),
    ("EDX-GEOG-1GA0", "1GA0/03", None, "2024-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-03-que-20240615.pdf"),
    ("EDX-HIST-1HI0", "1HI0/10", None, "2023-Jun", "QP", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-10-que-20230519.pdf"),
    ("EDX-HIST-1HI0", "1HI0/10", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-10-rms-20230824.pdf"),
    ("EDX-HIST-1HI0", "1HI0/12", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-12-rms-20230824.pdf"),
    ("EDX-HIST-1HI0", "1HI0/13", None, "2024-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-13-rms-20240822.pdf"),
    ("EDX-HIST-1HI0", "1HI0/33", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-33-rms-20230824.pdf"),
    ("EDX-BUS-1BS0", "1BS0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-01-que-20230519.pdf"),
    ("EDX-BUS-1BS0", "1BS0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-que-20230613.pdf"),
    ("EDX-BUS-1BS0", "1BS0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-rms-20230824.pdf"),
    ("EDX-BUS-1BS0", "1BS0/02", None, "2024-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-que-20240606.pdf"),
    ("EDX-BUS-1BS0", "1BS0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-01-rms-20240822.pdf"),
    ("EDX-COMP-1CP2", "1CP2/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-que-20230520.pdf"),
    ("EDX-COMP-1CP2", "1CP2/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-rms-20230824.pdf"),
    ("EDX-COMP-1CP2", "1CP2/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-02-que-20230526.pdf"),
    ("EDX-COMP-1CP2", "1CP2/01", None, "2024-Jun", "QP", f"{_PE}/GCSE/Computer-Science/2020/Exam-materials/1cp2-01-que-20240702.pdf"),
    ("EDX-COMP-1CP2", "1CP2/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-rms-20240822.pdf"),
    ("EDX-GEOG-9GE0", "9GE0/01", None, "2023-Jun", "QP", f"{_PE}/A-Level/Geography/2016/Exam-materials/9ge0-01-que-20230518.pdf"),
]
|
|
|
|
# OCR specifications: spec_code -> (subject, award code, award level, first-teach year).
# build_board() reads the award code (index 1) to derive exam codes and emits the specs in
# this insertion order, skipping any spec that ends up with no live paper.
OCR_SPECS = {
    "OCR-BIOL-J247": ("BIOLOGY", "J247", "GCSE", "2016"),
    "OCR-CHEM-J248": ("CHEMISTRY", "J248", "GCSE", "2016"),
    "OCR-PHYS-J249": ("PHYSICS", "J249", "GCSE", "2016"),
    "OCR-COMB-J250": ("COMBINED SCIENCE", "J250", "GCSE", "2016"),
    "OCR-MATH-J560": ("MATHEMATICS", "J560", "GCSE", "2015"),
    "OCR-ENGL-J351": ("ENGLISH LANGUAGE", "J351", "GCSE", "2015"),
    "OCR-ENGLIT-J352": ("ENGLISH LITERATURE", "J352", "GCSE", "2015"),
    "OCR-COMP-J277": ("COMPUTER SCIENCE", "J277", "GCSE", "2020"),
    "OCR-GEOG-J383": ("GEOGRAPHY A", "J383", "GCSE", "2016"),
    "OCR-BUS-J204": ("BUSINESS", "J204", "GCSE", "2017"),
    "OCR-HIST-J411": ("HISTORY B (SHP)", "J411", "GCSE", "2016"),
    "OCR-MATH-H240": ("MATHEMATICS A", "H240", "A-level", "2017"),
    "OCR-ENGLIT-H472": ("ENGLISH LITERATURE", "H472", "A-level", "2015"),
    "OCR-ENGL-H470": ("ENGLISH LANGUAGE", "H470", "A-level", "2015"),
}
|
|
# Confirmed OCR paper URLs, one 6-tuple per document:
#   (spec_code, paper_code, tier, session, role, url)
# spec_code must be a key of OCR_SPECS; session is "YYYY-Mon"; tier is "F", "H" or None.
# The numeric prefix of each file name is OCR's opaque document id, so these URLs cannot be
# derived from a template. build_board() re-verifies every URL and drops dead rows.
OCR_PAPERS = [
    # ── Sciences (round 1) ──
    ("OCR-BIOL-J247", "J247/01", "F", "2024-Jun", "QP", f"{_OCR}/727713-question-paper-paper-1.pdf"),
    ("OCR-BIOL-J247", "J247/01", "F", "2024-Jun", "MS", f"{_OCR}/727745-mark-scheme-paper-1.pdf"),
    ("OCR-BIOL-J247", "J247/03", "H", "2024-Jun", "QP", f"{_OCR}/727715-question-paper-paper-3.pdf"),
    ("OCR-BIOL-J247", "J247/03", "H", "2024-Jun", "MS", f"{_OCR}/727747-mark-scheme-paper-3.pdf"),
    ("OCR-BIOL-J247", "J247/01", "F", "2023-Jun", "QP", f"{_OCR}/704945-question-paper-paper-1.pdf"),
    ("OCR-BIOL-J247", "J247/03", "H", "2023-Jun", "MS", f"{_OCR}/704979-mark-scheme-paper-3.pdf"),
    ("OCR-BIOL-J247", "J247/03", "H", "2022-Jun", "QP", f"{_OCR}/678031-question-paper-paper-3.pdf"),
    ("OCR-BIOL-J247", "J247/01", "F", "2022-Jun", "MS", f"{_OCR}/678076-mark-scheme-paper-1.pdf"),
    ("OCR-CHEM-J248", "J248/01", "F", "2024-Jun", "QP", f"{_OCR}/727718-question-paper-paper-1.pdf"),
    ("OCR-CHEM-J248", "J248/03", "H", "2024-Jun", "MS", f"{_OCR}/727751-mark-scheme-paper-3.pdf"),
    ("OCR-CHEM-J248", "J248/01", "F", "2023-Jun", "QP", f"{_OCR}/704950-question-paper-paper-1.pdf"),
    ("OCR-CHEM-J248", "J248/03", "H", "2022-Jun", "QP", f"{_OCR}/678036-question-paper-paper-3.pdf"),
    ("OCR-PHYS-J249", "J249/01", "F", "2024-Jun", "QP", f"{_OCR}/727724-question-paper-paper-1.pdf"),
    ("OCR-PHYS-J249", "J249/03", "H", "2024-Jun", "MS", f"{_OCR}/727755-mark-scheme-paper-3.pdf"),
    ("OCR-PHYS-J249", "J249/01", "F", "2023-Jun", "QP", f"{_OCR}/704956-question-paper-paper-1.pdf"),
    ("OCR-PHYS-J249", "J249/03", "H", "2022-Jun", "MS", f"{_OCR}/678086-mark-scheme-paper-3.pdf"),
    ("OCR-COMB-J250", "J250/01", "F", "2024-Jun", "QP", f"{_OCR}/727730-question-paper-paper-1.pdf"),
    ("OCR-COMB-J250", "J250/07", "H", "2024-Jun", "MS", f"{_OCR}/727763-mark-scheme-paper-7.pdf"),
    # ── Maths J560 (round 2) ──
    ("OCR-MATH-J560", "J560/01", "F", "2024-Jun", "QP", f"{_OCR}/727817-question-paper-paper-1.pdf"),
    ("OCR-MATH-J560", "J560/01", "F", "2024-Jun", "MS", f"{_OCR}/727824-mark-scheme-paper-1.pdf"),
    ("OCR-MATH-J560", "J560/04", "H", "2024-Jun", "QP", f"{_OCR}/727820-question-paper-paper-4.pdf"),
    ("OCR-MATH-J560", "J560/04", "H", "2024-Jun", "MS", f"{_OCR}/727827-mark-scheme-paper-4.pdf"),
    ("OCR-MATH-J560", "J560/01", "F", "2023-Jun", "QP", f"{_OCR}/705050-question-paper-paper-1.pdf"),
    ("OCR-MATH-J560", "J560/01", "F", "2023-Jun", "MS", f"{_OCR}/705057-mark-scheme-paper-1.pdf"),
    ("OCR-MATH-J560", "J560/04", "H", "2023-Jun", "QP", f"{_OCR}/705053-question-paper-paper-4.pdf"),
    ("OCR-MATH-J560", "J560/04", "H", "2023-Jun", "MS", f"{_OCR}/705060-mark-scheme-paper-4.pdf"),
    ("OCR-MATH-J560", "J560/01", "F", "2022-Jun", "QP", f"{_OCR}/678149-question-paper-paper-1.pdf"),
    ("OCR-MATH-J560", "J560/01", "F", "2022-Jun", "MS", f"{_OCR}/678156-mark-scheme-paper-1.pdf"),
    ("OCR-MATH-J560", "J560/04", "H", "2022-Jun", "QP", f"{_OCR}/678152-question-paper-paper-4.pdf"),
    ("OCR-MATH-J560", "J560/04", "H", "2022-Jun", "MS", f"{_OCR}/678159-mark-scheme-paper-4.pdf"),
    # ── English Language J351 / Literature J352 (round 2) ──
    ("OCR-ENGL-J351", "J351/01", None, "2024-Jun", "QP", f"{_OCR}/727556-question-paper-communicating-information-and-ideas.pdf"),
    ("OCR-ENGL-J351", "J351/01", None, "2024-Jun", "MS", f"{_OCR}/727658-mark-scheme-communication-information-and-ideas.pdf"),
    ("OCR-ENGL-J351", "J351/02", None, "2024-Jun", "QP", f"{_OCR}/727558-question-paper-exploring-effects-and-impact.pdf"),
    ("OCR-ENGL-J351", "J351/02", None, "2024-Jun", "MS", f"{_OCR}/727659-mark-scheme-exploring-effects-and-impact.pdf"),
    ("OCR-ENGL-J351", "J351/01", None, "2023-Jun", "QP", f"{_OCR}/704782-question-paper-communicating-information-and-ideas.pdf"),
    ("OCR-ENGL-J351", "J351/01", None, "2023-Jun", "MS", f"{_OCR}/704888-mark-scheme-communication-information-and-ideas.pdf"),
    ("OCR-ENGL-J351", "J351/01", None, "2022-Jun", "QP", f"{_OCR}/677852-question-paper-communicating-information-and-ideas.pdf"),
    ("OCR-ENGL-J351", "J351/01", None, "2022-Jun", "MS", f"{_OCR}/677967-mark-scheme-communication-information-and-ideas.pdf"),
    ("OCR-ENGLIT-J352", "J352/01", None, "2024-Jun", "QP", f"{_OCR}/727830-question-paper-exploring-modern-and-literary-heritage-texts.pdf"),
    ("OCR-ENGLIT-J352", "J352/01", None, "2024-Jun", "MS", f"{_OCR}/727832-mark-scheme-exploring-modern-and-literary-heritage-texts.pdf"),
    ("OCR-ENGLIT-J352", "J352/02", None, "2024-Jun", "QP", f"{_OCR}/727831-question-paper-exploring-poetry-and-shakespeare.pdf"),
    ("OCR-ENGLIT-J352", "J352/02", None, "2024-Jun", "MS", f"{_OCR}/727833-mark-scheme-exploring-poetry-and-shakespeare.pdf"),
    ("OCR-ENGLIT-J352", "J352/01", None, "2023-Jun", "QP", f"{_OCR}/705069-question-paper-exploring-modern-and-literary-heritage-texts.pdf"),
    ("OCR-ENGLIT-J352", "J352/01", None, "2023-Jun", "MS", f"{_OCR}/705075-mark-scheme-exploring-modern-and-literary-heritage-texts.pdf"),
    # ── A-level Maths H240 / English Lit H472 / Lang H470 (round 2) ──
    ("OCR-MATH-H240", "H240/01", None, "2024-Jun", "QP", f"{_OCR}/726654-question-paper-pure-mathematics.pdf"),
    ("OCR-MATH-H240", "H240/01", None, "2024-Jun", "MS", f"{_OCR}/726795-mark-scheme-pure-mathematics.pdf"),
    ("OCR-MATH-H240", "H240/02", None, "2024-Jun", "QP", f"{_OCR}/726656-question-paper-pure-mathematics-and-statistics.pdf"),
    ("OCR-MATH-H240", "H240/02", None, "2024-Jun", "MS", f"{_OCR}/726796-mark-scheme-pure-mathematics-and-statistics.pdf"),
    ("OCR-MATH-H240", "H240/01", None, "2023-Jun", "QP", f"{_OCR}/703866-question-paper-pure-mathematics.pdf"),
    ("OCR-MATH-H240", "H240/01", None, "2023-Jun", "MS", f"{_OCR}/704008-mark-scheme-pure-mathematics.pdf"),
    ("OCR-MATH-H240", "H240/01", None, "2022-Jun", "QP", f"{_OCR}/676845-question-paper-pure-mathematics.pdf"),
    ("OCR-MATH-H240", "H240/01", None, "2022-Jun", "MS", f"{_OCR}/677005-mark-scheme-pure-mathematics.pdf"),
    ("OCR-ENGLIT-H472", "H472/01", None, "2024-Jun", "QP", f"{_OCR}/726602-question-paper-drama-and-poetry-pre-1900.pdf"),
    ("OCR-ENGLIT-H472", "H472/01", None, "2024-Jun", "MS", f"{_OCR}/726762-mark-scheme-drama-and-poetry-pre-1900.pdf"),
    ("OCR-ENGLIT-H472", "H472/01", None, "2023-Jun", "QP", f"{_OCR}/703813-question-paper-drama-and-poetry-pre-1900.pdf"),
    ("OCR-ENGLIT-H472", "H472/01", None, "2023-Jun", "MS", f"{_OCR}/703974-mark-scheme-drama-and-poetry-pre-1900.pdf"),
    ("OCR-ENGLIT-H472", "H472/01", None, "2022-Jun", "QP", f"{_OCR}/676783-question-paper-drama-and-poetry-pre-1900.pdf"),
    ("OCR-ENGLIT-H472", "H472/01", None, "2022-Jun", "MS", f"{_OCR}/676965-mark-scheme-drama-and-poetry-pre-1900.pdf"),
    ("OCR-ENGL-H470", "H470/01", None, "2024-Jun", "QP", f"{_OCR}/726595-question-paper-exploring-language.pdf"),
    ("OCR-ENGL-H470", "H470/01", None, "2024-Jun", "MS", f"{_OCR}/726764-mark-scheme-exploring-language.pdf"),
    ("OCR-ENGL-H470", "H470/01", None, "2023-Jun", "QP", f"{_OCR}/703806-question-paper-exploring-language.pdf"),
    ("OCR-ENGL-H470", "H470/01", None, "2023-Jun", "MS", f"{_OCR}/703976-mark-scheme-exploring-language.pdf"),
    ("OCR-ENGL-H470", "H470/01", None, "2022-Jun", "QP", f"{_OCR}/676772-question-paper-exploring-language.pdf"),
    ("OCR-ENGL-H470", "H470/01", None, "2022-Jun", "MS", f"{_OCR}/676967-mark-scheme-exploring-language.pdf"),
    # ── Humanities (round 2) ──
    ("OCR-COMP-J277", "J277/01", None, "2024-Jun", "QP", f"{_OCR}/727534-question-paper-computer-systems.pdf"),
    ("OCR-COMP-J277", "J277/01", None, "2024-Jun", "MS", f"{_OCR}/727652-mark-scheme-computer-systems.pdf"),
    ("OCR-COMP-J277", "J277/02", None, "2024-Jun", "QP", f"{_OCR}/727535-question-paper-computational-thinking-algorithms-and-programming.pdf"),
    ("OCR-COMP-J277", "J277/02", None, "2024-Jun", "MS", f"{_OCR}/727653-mark-scheme-computational-thinking-algorithms-and-programming.pdf"),
    ("OCR-GEOG-J383", "J383/01", None, "2024-Jun", "QP", f"{_OCR}/727564-question-paper-living-in-the-uk-today.pdf"),
    ("OCR-GEOG-J383", "J383/01", None, "2024-Jun", "MS", f"{_OCR}/727661-mark-scheme-living-in-the-uk-today.pdf"),
    ("OCR-GEOG-J383", "J383/02", None, "2024-Jun", "QP", f"{_OCR}/727566-question-paper-the-world-around-us.pdf"),
    ("OCR-GEOG-J383", "J383/02", None, "2024-Jun", "MS", f"{_OCR}/727662-mark-scheme-the-world-around-us.pdf"),
    ("OCR-BUS-J204", "J204/01", None, "2024-Jun", "QP", f"{_OCR}/727519-question-paper-business-1-business-activity-marketing-and-people.pdf"),
    ("OCR-BUS-J204", "J204/01", None, "2024-Jun", "MS", f"{_OCR}/727634-mark-scheme-business-1-business-activity-marketing-and-people.pdf"),
    ("OCR-BUS-J204", "J204/02", None, "2024-Jun", "QP", f"{_OCR}/727520-question-paper-business-2-operations-finance-and-influences-on-business.pdf"),
    ("OCR-BUS-J204", "J204/02", None, "2024-Jun", "MS", f"{_OCR}/727635-mark-scheme-business-2-operations-finance-and-influences-on-business.pdf"),
    ("OCR-BUS-J204", "J204/01", None, "2023-Jun", "QP", f"{_OCR}/704745-question-paper-business-1-business-activity-marketing-and-people.pdf"),
    ("OCR-BUS-J204", "J204/01", None, "2023-Jun", "MS", f"{_OCR}/704864-mark-scheme-business-1-business-activity-marketing-and-people.pdf"),
    ("OCR-HIST-J411", "J411/11", None, "2024-Jun", "QP", f"{_OCR}/727590-question-paper-the-people-s-health-c.1250-to-present-with-the-norman-conquest-1065-1087.pdf"),
    ("OCR-HIST-J411", "J411/11", None, "2024-Jun", "MS", f"{_OCR}/727678-mark-scheme-the-people-s-health-c.1250-to-present-with-the-norman-conquest-1065-1087.pdf"),
]
|
|
|
|
|
|
def build_board(board_code: str, specs_meta: Dict, papers: List) -> Dict[str, Any]:
    """Build one board's manifest section from its spec table and paper list.

    Each paper URL is handed to ``head_ok`` on a thread pool. Papers whose
    check comes back falsy are reported on stderr and left out of the result.
    The surviving papers are grouped under their specification.

    Args:
        board_code: Board identifier; also the key into ``EXAM_CODE_PREFIX``.
        specs_meta: Maps a spec code to ``(subject, award, level, first_teach)``.
        papers: Rows of ``(spec_code, paper_code, tier, session, role, url)``.

    Returns:
        ``{"exam_board_code": ..., "specifications": [...]}``. Specifications
        appear in ``specs_meta`` order, those without any surviving paper are
        omitted, and each specification's papers are sorted by ``exam_code``.
    """
    code_prefix = EXAM_CODE_PREFIX[board_code]
    print(f"[{board_code}] re-verifying {len(papers)} confirmed URLs...", file=sys.stderr)

    # Submit every check up front so they run concurrently, then read the
    # outcomes back in paper order (index 5 of a row is its URL).
    with cf.ThreadPoolExecutor(max_workers=24) as pool:
        checks = [pool.submit(head_ok, row[5]) for row in papers]
        url_is_live = [check.result() for check in checks]

    papers_by_spec: Dict[str, List[Dict[str, Any]]] = {}
    for row, is_live in zip(papers, url_is_live):
        spec_code, paper_code, tier, session, role, url = row
        if not is_live:
            print(f" DROP (not live): {url}", file=sys.stderr)
            continue

        # The award code is the second field of the spec's metadata tuple.
        award_code = specs_meta[spec_code][1]
        exam_code = _mk_exam_code(code_prefix, award_code, paper_code, session, role)
        file_info = {
            "source": f"url:{url}",
            "original_name": os.path.basename(url),
            "provenance": {
                "source_url": url,
                "fetched": FETCHED,
                "license": f"{board_code} public past paper",
            },
        }
        entry = {
            "exam_code": exam_code,
            "paper_code": paper_code,
            "tier": tier,
            "session": session,
            "doc_type": role,
            "file": file_info,
        }
        if spec_code not in papers_by_spec:
            papers_by_spec[spec_code] = []
        papers_by_spec[spec_code].append(entry)

    specifications = []
    for spec_code, meta in specs_meta.items():
        subject, award, level, first_teach = meta
        if spec_code not in papers_by_spec:
            # Nothing survived verification for this spec; leave it out.
            continue
        live_papers = papers_by_spec[spec_code]
        specifications.append({
            "spec_code": spec_code,
            "exam_board_code": board_code,
            "subject_code": subject,
            "award_code": award,
            "award_level": level,
            "first_teach": first_teach,
            "papers": sorted(live_papers, key=lambda paper: paper["exam_code"]),
        })
        print(f"[{board_code}] {spec_code}: {len(live_papers)} live papers", file=sys.stderr)

    return {"exam_board_code": board_code, "specifications": specifications}
|
|
|
|
|
|
def main() -> None:
    """Assemble the manifest for every board and write it beside this script.

    Builds the AQA, EDEXCEL and OCR sections (in that order), wraps them in
    the manifest-level metadata, dumps the result as ``exam-corpus.yaml`` in
    this file's directory, and prints a one-line summary to stderr.
    """
    script_dir = os.path.dirname(__file__)
    out_path = os.path.join(script_dir, "exam-corpus.yaml")

    boards = [build_aqa()]
    boards.append(build_board("EDEXCEL", EDEXCEL_SPECS, EDEXCEL_PAPERS))
    boards.append(build_board("OCR", OCR_SPECS, OCR_PAPERS))

    # Totals for the summary line printed after the file is written.
    n_specs = 0
    n_papers = 0
    for board in boards:
        for spec in board["specifications"]:
            n_specs += 1
            n_papers += len(spec["papers"])

    license_posture = (
        "Public exam-board past papers downloaded from each board's own "
        "official site (AQA filestore, Pearson DAM, OCR Images). Stored in "
        "the private dev cc.examboards bucket for internal exam-marker dev/test. "
        "Each item records its source_url. Review redistribution rights before "
        "any public exposure."
    )
    source_pages = {
        "AQA": "https://filestore.aqa.org.uk/sample-papers-and-mark-schemes/",
        "EDEXCEL": "https://qualifications.pearson.com/en/support/support-topics/exams/past-papers.html",
        "OCR": "https://www.ocr.org.uk/qualifications/past-paper-finder/",
    }
    provenance = {
        "collected_by": "kcar",
        "collected_at": FETCHED,
        "license_posture": license_posture,
        "sources": source_pages,
    }

    # Key order is significant: the dump below uses sort_keys=False, so the
    # YAML follows this insertion order.
    manifest = {
        "version": 1,
        "defaults": {"bucket": "cc.examboards"},
        "provenance": provenance,
        # Optional: uncomment + set on dev .94 to exercise user-side flows / first-sweep.
        # "test_subset": {"user_email": "teacher@kevlarai.test", "papers": 2},
        # "system_identity": {"user_email": "teacher@kevlarai.test"},
        "boards": boards,
    }

    with open(out_path, "w") as fh:
        yaml.safe_dump(manifest, fh, sort_keys=False, default_flow_style=False, width=120)

    print(
        f"\nWROTE {out_path}: {n_specs} specs, {n_papers} papers across {len(boards)} boards",
        file=sys.stderr,
    )
|
|
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|