feat(seed): expand corpus to 1178 papers + download-only/unseed/granular reset

PRIMARY — corpus breadth (505->1178 papers, 18->60 specs, all URLs HEAD-verified):
- AQA (enumerated): Maths, English Lang/Lit, Geography, Computer Science, Business,
  Psychology, MFL (French/Spanish/German), GCSE + A-level, on top of round-1 sciences.
- Edexcel + OCR (confirmed direct URLs via research): Maths, English, Geography, History,
  Business, Computer Science, GCSE + A-level.
- generate_corpus_manifest.py: _subj/_mfl AQA builders, Edexcel/OCR spec+URL tables,
  derived exam_code (_mk_exam_code) matching the locked convention, concurrent re-verify.
Verified on dev .94: eb_specifications=60, eb_exams=1178, QP=469, doc_type all 'pdf',
seed idempotent (uploaded=673 new, skipped=505), failed=0.

SECONDARY:
- --download-only + persistent bucket-shaped local store (manifests/_corpus_store/, gitignored):
  download-once, seed-many, offline-repeatable; --store-dir/--no-store. (_store_path/_item_bytes/
  download_corpus). Verified: store populated, seed reads offline (download_cached).
- --unseed [--board/--spec]: inverse loader — storage objects (Storage API; protect_delete blocks
  raw SQL), first-sweep seed templates, eb_exams, eb_specifications. Verified reversible on .94.
- Granular admin reset: POST /admin/reset?scope=all|exam-corpus|timetable. reset_environment.reset(scope)
  adds EXAM_CORPUS_TABLES (10) + cc.examboards storage cleanup + TIMETABLE_TABLES (13); 'all' now also
  clears the exam subsystem the legacy reset missed. No schema migration required.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
CC Worker 2026-06-07 23:33:20 +00:00
parent 5750413f43
commit cdc105ae54
6 changed files with 8958 additions and 52 deletions

View File

@ -126,13 +126,24 @@ async def platform_stats(
@router.post("/reset")
async def reset_environment(
    scope: str = "all",
    _: dict = Depends(require_platform_admin),
) -> Dict[str, Any]:
    """DESTRUCTIVE: wipe test data. Platform admin only.

    scope (query param):
      - all         : full wipe (Neo4j + Supabase data + auth users) AND exam subsystem + storage.
      - exam-corpus : ONLY the exam corpus eb_*/exam_* tables + cc.examboards storage objects
                      (load/unload the public corpus without touching schools/users).
      - timetable   : ONLY timetable/calendar materialization tables.

    Returns:
        ``{"status": "ok"}`` merged with the mapping returned by ``reset(scope)``.

    Raises:
        HTTPException: 400 when ``scope`` is not one of the accepted values.
    """
    # Validate before importing/running anything destructive.
    if scope not in ("all", "exam-corpus", "timetable"):
        raise HTTPException(status_code=400, detail="scope must be one of: all, exam-corpus, timetable")

    # Function-scope imports, as in the original handler.
    import asyncio
    import functools

    from run.initialization.reset_environment import reset as _reset

    # Run the reset exactly once, with the requested scope, off the event loop
    # thread (presumably reset() is blocking, hence the executor). We are inside
    # a coroutine, so get_running_loop() is the right way to obtain the loop.
    loop = asyncio.get_running_loop()
    result = await loop.run_in_executor(None, functools.partial(_reset, scope))
    return {"status": "ok", **result}

View File

@ -0,0 +1,3 @@
# Persistent local corpus store — PDFs are NOT committed (re-downloadable from manifest).
*
!.gitignore

File diff suppressed because it is too large Load Diff

View File

@ -70,7 +70,27 @@ def _trilogy(award: str) -> List[Tuple[str, str, Optional[str]]]:
def _alevel(award: str, papers=("1", "2", "3")) -> List[Tuple[str, str, Optional[str]]]:
return [(f"{award}{p}", f"{award}/{p}", None) for p in papers]
def _subj(award: str, papers, tiers=(None,)) -> List[Tuple[str, str, Optional[str]]]:
"""Generic GCSE/A-level builder. tiers=('F','H') for tiered subjects (Maths/Science),
tiers=(None,) for untiered (English/Geography/CS/Business/Psychology)."""
out = []
for p in papers:
for t in tiers:
tl = t or ""
out.append((f"{award}{p}{tl}", f"{award}/{p}{tl}", t))
return out
def _mfl(award: str) -> List[Tuple[str, str, Optional[str]]]:
"""AQA MFL: Listening/Reading/Writing papers, each Foundation/Higher (Speaking is teacher-conducted,
no public QP). Filestore code encodes skill+tier, e.g. 8658LH = French Listening Higher."""
out = []
for skill in ("L", "R", "W"):
for t in ("F", "H"):
out.append((f"{award}{skill}{t}", f"{award}/{skill}{t}", t))
return out
AQA_SPECS = [
# ── Sciences (round 1 — kept at full depth) ──────────────────────────────────────
("AQA-BIOL-8461", "BIOLOGY", "8461", "GCSE", "2016", _gcse_single("8461")),
("AQA-CHEM-8462", "CHEMISTRY", "8462", "GCSE", "2016", _gcse_single("8462")),
("AQA-PHYS-8463", "PHYSICS", "8463", "GCSE", "2016", _gcse_single("8463")),
@ -81,6 +101,30 @@ AQA_SPECS = [
("AQA-CHEM-7405", "CHEMISTRY", "7405", "A-level", "2015", _alevel("7405")),
("AQA-PHYS-7407", "PHYSICS", "7407", "AS", "2015", _alevel("7407", ("1", "2"))),
("AQA-PHYS-7408", "PHYSICS", "7408", "A-level", "2015", _alevel("7408")),
# ── Round 2 breadth — high-volume core (Maths, English) ───────────────────────────
("AQA-MATH-8300", "MATHEMATICS", "8300", "GCSE", "2015", _subj("8300", ("1", "2", "3"), ("F", "H"))),
("AQA-MATH-7357", "MATHEMATICS", "7357", "A-level", "2017", _alevel("7357", ("1", "2", "3"))),
("AQA-MATH-7356", "MATHEMATICS", "7356", "AS", "2017", _alevel("7356", ("1", "2"))),
("AQA-ENGL-8700", "ENGLISH LANGUAGE", "8700", "GCSE", "2015", _subj("8700", ("1", "2"))),
("AQA-ENGLIT-8702", "ENGLISH LITERATURE", "8702", "GCSE", "2015", _subj("8702", ("1", "2"))),
("AQA-ENGL-7702", "ENGLISH LANGUAGE", "7702", "A-level", "2015", _alevel("7702", ("1", "2"))),
("AQA-ENGLIT-7712", "ENGLISH LITERATURE A", "7712", "A-level", "2015", _alevel("7712", ("1", "2"))),
# ── Round 2 breadth — humanities / others ─────────────────────────────────────────
("AQA-GEOG-8035", "GEOGRAPHY", "8035", "GCSE", "2016", _subj("8035", ("1", "2", "3"))),
("AQA-GEOG-7037", "GEOGRAPHY", "7037", "A-level", "2016", _alevel("7037", ("1", "2"))),
("AQA-COMP-8525", "COMPUTER SCIENCE", "8525", "GCSE", "2020", _subj("8525", ("1", "2"))),
("AQA-COMP-7517", "COMPUTER SCIENCE", "7517", "A-level", "2015", _alevel("7517", ("1", "2"))),
("AQA-BUS-8132", "BUSINESS", "8132", "GCSE", "2017", _subj("8132", ("1", "2"))),
("AQA-BUS-7132", "BUSINESS", "7132", "A-level", "2015", _alevel("7132", ("1", "2", "3"))),
("AQA-PSYC-8182", "PSYCHOLOGY", "8182", "GCSE", "2017", _subj("8182", ("1", "2"))),
("AQA-PSYC-7182", "PSYCHOLOGY", "7182", "A-level", "2015", _alevel("7182", ("1", "2", "3"))),
# ── Round 2 breadth — modern foreign languages (Listening/Reading/Writing, F+H) ───
("AQA-FREN-8658", "FRENCH", "8658", "GCSE", "2016", _mfl("8658")),
("AQA-SPAN-8698", "SPANISH", "8698", "GCSE", "2016", _mfl("8698")),
("AQA-GERM-8668", "GERMAN", "8668", "GCSE", "2016", _mfl("8668")),
("AQA-FREN-7652", "FRENCH", "7652", "A-level", "2016", _alevel("7652", ("1", "2"))),
("AQA-SPAN-7692", "SPANISH", "7692", "A-level", "2016", _alevel("7692", ("1", "2"))),
("AQA-GERM-7662", "GERMAN", "7662", "A-level", "2016", _alevel("7662", ("1", "2"))),
]
AQA_SESSIONS = ["JUN18", "JUN19", "NOV20", "NOV21", "JUN22", "JUN23", "JUN24"]
AQA_ROLES = ["QP", "MS", "ER"]
@ -167,31 +211,113 @@ def build_aqa() -> Dict[str, Any]:
# ─────────────── Edexcel / OCR — confirmed direct URLs (re-verified at build) ───────────────
# Each tuple: (spec_code, subject, award, level, first_teach, exam_code, paper_code, tier,
# session, role, url, original_name)
# These boards aren't templatable (Edexcel has a non-derivable date suffix; OCR uses opaque
# doc-ids), so confirmed URLs are listed as 6-tuples: (spec_code, paper_code, tier, session, role,
# url). exam_code is DERIVED (see _mk_exam_code) so it always matches the locked convention.
EXAM_CODE_PREFIX = {"EDEXCEL": "EDX", "OCR": "OCR"}
def _ec_token(paper_code: str) -> str:
t = paper_code.split("/")[-1]
return str(int(t)) if t.isdigit() else t # "01"->"1", "1H"->"1H", "1CH"->"1CH", "11"->"11"
def _mk_exam_code(prefix: str, award: str, paper_code: str, session: str, role: str) -> str:
    """Derive an exam_code of the form ``PREFIX-AWARD-TOKEN-YYYYMON-ROLE``.

    ``session`` must be ``"<year>-<month>"`` (exactly one ``-``), e.g.
    ``"2024-Jun"``; the month is upper-cased. The paper token comes from
    ``_ec_token(paper_code)``.
    """
    year, month = session.split("-")
    token = _ec_token(paper_code)
    stamp = year + month.upper()
    return f"{prefix}-{award}-{token}-{stamp}-{role}"
_PE = "https://qualifications.pearson.com/content/dam/pdf"
_EDX = f"{_PE}/GCSE/Science/2016"
_OCR = "https://www.ocr.org.uk/Images"
# Edexcel specification metadata, keyed by spec_code.
# Value layout appears to be (subject, award code, level, first-teach year);
# build_board reads index 1 of its specs_meta entries as the award when deriving
# exam_code. NOTE(review): presumably this dict is what gets passed as specs_meta
# for the EDEXCEL board — confirm against the caller.
EDEXCEL_SPECS = {
"EDX-BIOL-1BI0": ("BIOLOGY", "1BI0", "GCSE", "2016"),
"EDX-CHEM-1CH0": ("CHEMISTRY", "1CH0", "GCSE", "2016"),
"EDX-PHYS-1PH0": ("PHYSICS", "1PH0", "GCSE", "2016"),
"EDX-COMB-1SC0": ("COMBINED SCIENCE", "1SC0", "GCSE", "2016"),
"EDX-MATH-1MA1": ("MATHEMATICS", "1MA1", "GCSE", "2015"),
"EDX-ENGL-1EN0": ("ENGLISH LANGUAGE", "1EN0", "GCSE", "2015"),
"EDX-ENGLIT-1ET0": ("ENGLISH LITERATURE", "1ET0", "GCSE", "2015"),
"EDX-GEOG-1GA0": ("GEOGRAPHY A", "1GA0", "GCSE", "2016"),
"EDX-HIST-1HI0": ("HISTORY", "1HI0", "GCSE", "2016"),
"EDX-BUS-1BS0": ("BUSINESS", "1BS0", "GCSE", "2017"),
"EDX-COMP-1CP2": ("COMPUTER SCIENCE", "1CP2", "GCSE", "2020"),
"EDX-MATH-9MA0": ("MATHEMATICS", "9MA0", "A-level", "2017"),
"EDX-ENGL-9EN0": ("ENGLISH LANGUAGE", "9EN0", "A-level", "2015"),
"EDX-ENGLIT-9ET0": ("ENGLISH LITERATURE", "9ET0", "A-level", "2015"),
"EDX-GEOG-9GE0": ("GEOGRAPHY", "9GE0", "A-level", "2016"),
}
_EDX = "https://qualifications.pearson.com/content/dam/pdf/GCSE/Science/2016"
EDEXCEL_PAPERS = [
("EDX-BIOL-1BI0", "EDX-1BI0-1H-2024JUN-QP", "1BI0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-1h-que-20240511.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-2F-2023JUN-QP", "1BI0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2f-que-20230610.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-2H-2023JUN-QP", "1BI0/2H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2h-que-20230610.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-1F-2023JUN-MS", "1BI0/1F", "F", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1f-rms-20230824.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-1H-2024JUN-MS", "1BI0/1H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1h-rms-20240822.pdf"),
("EDX-BIOL-1BI0", "EDX-1BI0-1H-2022JUN-MS", "1BI0/1H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1bi0-1h-rms-20220825.pdf"),
("EDX-CHEM-1CH0", "EDX-1CH0-1F-2023JUN-QP", "1CH0/1F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1f-que-20230523.pdf"),
("EDX-CHEM-1CH0", "EDX-1CH0-1H-2024JUN-QP", "1CH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1h-que-20240518.pdf"),
("EDX-CHEM-1CH0", "EDX-1CH0-2H-2024JUN-MS", "1CH0/2H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1ch0-2h-rms-20240822.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-1H-2023JUN-QP", "1PH0/1H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20230526.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-2F-2023JUN-QP", "1PH0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-2f-que-20230617.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-1H-2024JUN-QP", "1PH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20240523.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-2H-2023JUN-MS", "1PH0/2H", "H", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1ph0-2h-rms-20230824.pdf"),
("EDX-PHYS-1PH0", "EDX-1PH0-2H-2022JUN-MS", "1PH0/2H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1ph0-2h-rms-20220825.pdf"),
("EDX-COMB-1SC0", "EDX-1SC0-1CH-2023JUN-MS", "1SC0/1CH", None, "2023-Jun", "MS", f"{_EDX}/Exam-materials/1sc0-1ch-rms-20230824.pdf"),
# ── Sciences (round 1) ──
("EDX-BIOL-1BI0", "1BI0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-1h-que-20240511.pdf"),
("EDX-BIOL-1BI0", "1BI0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2f-que-20230610.pdf"),
("EDX-BIOL-1BI0", "1BI0/2H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1bi0-2h-que-20230610.pdf"),
("EDX-BIOL-1BI0", "1BI0/1F", "F", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1f-rms-20230824.pdf"),
("EDX-BIOL-1BI0", "1BI0/1H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1bi0-1h-rms-20240822.pdf"),
("EDX-BIOL-1BI0", "1BI0/1H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1bi0-1h-rms-20220825.pdf"),
("EDX-CHEM-1CH0", "1CH0/1F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1f-que-20230523.pdf"),
("EDX-CHEM-1CH0", "1CH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ch0-1h-que-20240518.pdf"),
("EDX-CHEM-1CH0", "1CH0/2H", "H", "2024-Jun", "MS", f"{_EDX}/Exam-materials/1ch0-2h-rms-20240822.pdf"),
("EDX-PHYS-1PH0", "1PH0/1H", "H", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20230526.pdf"),
("EDX-PHYS-1PH0", "1PH0/2F", "F", "2023-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-2f-que-20230617.pdf"),
("EDX-PHYS-1PH0", "1PH0/1H", "H", "2024-Jun", "QP", f"{_EDX}/Exam-materials/1ph0-1h-que-20240523.pdf"),
("EDX-PHYS-1PH0", "1PH0/2H", "H", "2023-Jun", "MS", f"{_EDX}/Exam-materials/1ph0-2h-rms-20230824.pdf"),
("EDX-PHYS-1PH0", "1PH0/2H", "H", "2022-Jun", "MS", f"{_EDX}/exam-materials/1ph0-2h-rms-20220825.pdf"),
("EDX-COMB-1SC0", "1SC0/1CH", None, "2023-Jun", "MS", f"{_EDX}/Exam-materials/1sc0-1ch-rms-20230824.pdf"),
# ── Maths 1MA1 (round 2) ──
("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-que-20230520.pdf"),
("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-rms-20230824.pdf"),
("EDX-MATH-1MA1", "1MA1/1F", "F", "2023-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-rms-20230824.pdf"),
("EDX-MATH-1MA1", "1MA1/1F", "F", "2024-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-que-20240517.pdf"),
("EDX-MATH-1MA1", "1MA1/1H", "H", "2024-Jun", "QP", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-que-20240517.pdf"),
("EDX-MATH-1MA1", "1MA1/1F", "F", "2024-Jun", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1f-rms-20240822.pdf"),
("EDX-MATH-1MA1", "1MA1/1H", "H", "2023-Nov", "MS", f"{_PE}/GCSE/Mathematics/2015/Exam-materials/1ma1-1h-rms-20240111.pdf"),
("EDX-MATH-1MA1", "1MA1/1H", "H", "2022-Jun", "MS", f"{_PE}/GCSE/mathematics/2015/exam-materials/1ma1-1h-rms-20220825.pdf"),
("EDX-MATH-1MA1", "1MA1/3H", "H", "2022-Jun", "MS", f"{_PE}/GCSE/mathematics/2015/exam-materials/1ma1-3h-rms-20220825.pdf"),
# ── English Language 1EN0 / Literature 1ET0 (round 2) ──
("EDX-ENGL-1EN0", "1EN0/01", None, "2024-Jun", "QP", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-que-20240524.pdf"),
("EDX-ENGL-1EN0", "1EN0/01", None, "2023-Nov", "QP", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-que-20231108.pdf"),
("EDX-ENGL-1EN0", "1EN0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-rms-20240822.pdf"),
("EDX-ENGL-1EN0", "1EN0/02", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-02-rms-20240822.pdf"),
("EDX-ENGL-1EN0", "1EN0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-01-rms-20230824.pdf"),
("EDX-ENGL-1EN0", "1EN0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Language/2015/Exam-materials/1en0-02-rms-20230824.pdf"),
("EDX-ENGLIT-1ET0", "1ET0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-que-20230518.pdf"),
("EDX-ENGLIT-1ET0", "1ET0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-02-que-20230525.pdf"),
("EDX-ENGLIT-1ET0", "1ET0/02", None, "2024-Jun", "QP", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-02-que-20240521.pdf"),
("EDX-ENGLIT-1ET0", "1ET0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-rms-20230824.pdf"),
("EDX-ENGLIT-1ET0", "1ET0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/English-Literature/2015/Exam-materials/1et0-01-rms-20240822.pdf"),
# ── A-level Maths 9MA0 / English 9EN0 / 9ET0 (round 2) ──
("EDX-MATH-9MA0", "9MA0/01", None, "2023-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-01-que-20230607.pdf"),
("EDX-MATH-9MA0", "9MA0/31", None, "2023-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-31-que-20230621.pdf"),
("EDX-MATH-9MA0", "9MA0/02", None, "2024-Jun", "QP", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-02-que-20240612.pdf"),
("EDX-MATH-9MA0", "9MA0/31", None, "2023-Jun", "MS", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-31-rms-20230817.pdf"),
("EDX-MATH-9MA0", "9MA0/01", None, "2024-Jun", "MS", f"{_PE}/A-Level/Mathematics/2017/Exam-materials/9ma0-01-rms-20240815.pdf"),
("EDX-ENGL-9EN0", "9EN0/01", None, "2024-Jun", "MS", f"{_PE}/A-Level/English-Language/2015/Exam-materials/9en0-01-rms-20240815.pdf"),
("EDX-ENGL-9EN0", "9EN0/02", None, "2024-Jun", "MS", f"{_PE}/A-Level/English-Language/2015/Exam-materials/9en0-02-rms-20240815.pdf"),
("EDX-ENGLIT-9ET0", "9ET0/01", None, "2024-Jun", "QP", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-01-que-20240525.pdf"),
("EDX-ENGLIT-9ET0", "9ET0/01", None, "2023-Jun", "MS", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-01-rms-20230817.pdf"),
("EDX-ENGLIT-9ET0", "9ET0/03", None, "2023-Jun", "MS", f"{_PE}/A-Level/English-Literature/2015/Exam-materials/9et0-03-rms-20230817.pdf"),
# ── Humanities (round 2) ──
("EDX-GEOG-1GA0", "1GA0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-que-20230523.pdf"),
("EDX-GEOG-1GA0", "1GA0/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-rms-20230824.pdf"),
("EDX-GEOG-1GA0", "1GA0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-02-que-20230610.pdf"),
("EDX-GEOG-1GA0", "1GA0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-02-rms-20230824.pdf"),
("EDX-GEOG-1GA0", "1GA0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-01-rms-20240822.pdf"),
("EDX-GEOG-1GA0", "1GA0/03", None, "2024-Jun", "QP", f"{_PE}/GCSE/Geography-A/2016/Exam-materials/1ga0-03-que-20240615.pdf"),
("EDX-HIST-1HI0", "1HI0/10", None, "2023-Jun", "QP", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-10-que-20230519.pdf"),
("EDX-HIST-1HI0", "1HI0/10", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-10-rms-20230824.pdf"),
("EDX-HIST-1HI0", "1HI0/12", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-12-rms-20230824.pdf"),
("EDX-HIST-1HI0", "1HI0/13", None, "2024-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-13-rms-20240822.pdf"),
("EDX-HIST-1HI0", "1HI0/33", None, "2023-Jun", "MS", f"{_PE}/GCSE/History/2016/Exam-materials/1hi0-33-rms-20230824.pdf"),
("EDX-BUS-1BS0", "1BS0/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-01-que-20230519.pdf"),
("EDX-BUS-1BS0", "1BS0/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-que-20230613.pdf"),
("EDX-BUS-1BS0", "1BS0/02", None, "2023-Jun", "MS", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-rms-20230824.pdf"),
("EDX-BUS-1BS0", "1BS0/02", None, "2024-Jun", "QP", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-02-que-20240606.pdf"),
("EDX-BUS-1BS0", "1BS0/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Business/2017/Exam-materials/1bs0-01-rms-20240822.pdf"),
("EDX-COMP-1CP2", "1CP2/01", None, "2023-Jun", "QP", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-que-20230520.pdf"),
("EDX-COMP-1CP2", "1CP2/01", None, "2023-Jun", "MS", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-rms-20230824.pdf"),
("EDX-COMP-1CP2", "1CP2/02", None, "2023-Jun", "QP", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-02-que-20230526.pdf"),
("EDX-COMP-1CP2", "1CP2/01", None, "2024-Jun", "QP", f"{_PE}/GCSE/Computer-Science/2020/Exam-materials/1cp2-01-que-20240702.pdf"),
("EDX-COMP-1CP2", "1CP2/01", None, "2024-Jun", "MS", f"{_PE}/GCSE/Computer-science/2020/Exam-materials/1cp2-01-rms-20240822.pdf"),
("EDX-GEOG-9GE0", "9GE0/01", None, "2023-Jun", "QP", f"{_PE}/A-Level/Geography/2016/Exam-materials/9ge0-01-que-20230518.pdf"),
]
OCR_SPECS = {
@ -199,39 +325,123 @@ OCR_SPECS = {
"OCR-CHEM-J248": ("CHEMISTRY", "J248", "GCSE", "2016"),
"OCR-PHYS-J249": ("PHYSICS", "J249", "GCSE", "2016"),
"OCR-COMB-J250": ("COMBINED SCIENCE", "J250", "GCSE", "2016"),
"OCR-MATH-J560": ("MATHEMATICS", "J560", "GCSE", "2015"),
"OCR-ENGL-J351": ("ENGLISH LANGUAGE", "J351", "GCSE", "2015"),
"OCR-ENGLIT-J352": ("ENGLISH LITERATURE", "J352", "GCSE", "2015"),
"OCR-COMP-J277": ("COMPUTER SCIENCE", "J277", "GCSE", "2020"),
"OCR-GEOG-J383": ("GEOGRAPHY A", "J383", "GCSE", "2016"),
"OCR-BUS-J204": ("BUSINESS", "J204", "GCSE", "2017"),
"OCR-HIST-J411": ("HISTORY B (SHP)", "J411", "GCSE", "2016"),
"OCR-MATH-H240": ("MATHEMATICS A", "H240", "A-level", "2017"),
"OCR-ENGLIT-H472": ("ENGLISH LITERATURE", "H472", "A-level", "2015"),
"OCR-ENGL-H470": ("ENGLISH LANGUAGE", "H470", "A-level", "2015"),
}
_OCR = "https://www.ocr.org.uk/Images"
OCR_PAPERS = [
("OCR-BIOL-J247", "OCR-J247-1-2024JUN-QP", "J247/01", "F", "2024-Jun", "QP", f"{_OCR}/727713-question-paper-paper-1.pdf"),
("OCR-BIOL-J247", "OCR-J247-1-2024JUN-MS", "J247/01", "F", "2024-Jun", "MS", f"{_OCR}/727745-mark-scheme-paper-1.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2024JUN-QP", "J247/03", "H", "2024-Jun", "QP", f"{_OCR}/727715-question-paper-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2024JUN-MS", "J247/03", "H", "2024-Jun", "MS", f"{_OCR}/727747-mark-scheme-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-1-2023JUN-QP", "J247/01", "F", "2023-Jun", "QP", f"{_OCR}/704945-question-paper-paper-1.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2023JUN-MS", "J247/03", "H", "2023-Jun", "MS", f"{_OCR}/704979-mark-scheme-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-3-2022JUN-QP", "J247/03", "H", "2022-Jun", "QP", f"{_OCR}/678031-question-paper-paper-3.pdf"),
("OCR-BIOL-J247", "OCR-J247-1-2022JUN-MS", "J247/01", "F", "2022-Jun", "MS", f"{_OCR}/678076-mark-scheme-paper-1.pdf"),
("OCR-CHEM-J248", "OCR-J248-1-2024JUN-QP", "J248/01", "F", "2024-Jun", "QP", f"{_OCR}/727718-question-paper-paper-1.pdf"),
("OCR-CHEM-J248", "OCR-J248-3-2024JUN-MS", "J248/03", "H", "2024-Jun", "MS", f"{_OCR}/727751-mark-scheme-paper-3.pdf"),
("OCR-CHEM-J248", "OCR-J248-1-2023JUN-QP", "J248/01", "F", "2023-Jun", "QP", f"{_OCR}/704950-question-paper-paper-1.pdf"),
("OCR-CHEM-J248", "OCR-J248-3-2022JUN-QP", "J248/03", "H", "2022-Jun", "QP", f"{_OCR}/678036-question-paper-paper-3.pdf"),
("OCR-PHYS-J249", "OCR-J249-1-2024JUN-QP", "J249/01", "F", "2024-Jun", "QP", f"{_OCR}/727724-question-paper-paper-1.pdf"),
("OCR-PHYS-J249", "OCR-J249-3-2024JUN-MS", "J249/03", "H", "2024-Jun", "MS", f"{_OCR}/727755-mark-scheme-paper-3.pdf"),
("OCR-PHYS-J249", "OCR-J249-1-2023JUN-QP", "J249/01", "F", "2023-Jun", "QP", f"{_OCR}/704956-question-paper-paper-1.pdf"),
("OCR-PHYS-J249", "OCR-J249-3-2022JUN-MS", "J249/03", "H", "2022-Jun", "MS", f"{_OCR}/678086-mark-scheme-paper-3.pdf"),
("OCR-COMB-J250", "OCR-J250-1-2024JUN-QP", "J250/01", "F", "2024-Jun", "QP", f"{_OCR}/727730-question-paper-paper-1.pdf"),
("OCR-COMB-J250", "OCR-J250-7-2024JUN-MS", "J250/07", "H", "2024-Jun", "MS", f"{_OCR}/727763-mark-scheme-paper-7.pdf"),
# ── Sciences (round 1) ──
("OCR-BIOL-J247", "J247/01", "F", "2024-Jun", "QP", f"{_OCR}/727713-question-paper-paper-1.pdf"),
("OCR-BIOL-J247", "J247/01", "F", "2024-Jun", "MS", f"{_OCR}/727745-mark-scheme-paper-1.pdf"),
("OCR-BIOL-J247", "J247/03", "H", "2024-Jun", "QP", f"{_OCR}/727715-question-paper-paper-3.pdf"),
("OCR-BIOL-J247", "J247/03", "H", "2024-Jun", "MS", f"{_OCR}/727747-mark-scheme-paper-3.pdf"),
("OCR-BIOL-J247", "J247/01", "F", "2023-Jun", "QP", f"{_OCR}/704945-question-paper-paper-1.pdf"),
("OCR-BIOL-J247", "J247/03", "H", "2023-Jun", "MS", f"{_OCR}/704979-mark-scheme-paper-3.pdf"),
("OCR-BIOL-J247", "J247/03", "H", "2022-Jun", "QP", f"{_OCR}/678031-question-paper-paper-3.pdf"),
("OCR-BIOL-J247", "J247/01", "F", "2022-Jun", "MS", f"{_OCR}/678076-mark-scheme-paper-1.pdf"),
("OCR-CHEM-J248", "J248/01", "F", "2024-Jun", "QP", f"{_OCR}/727718-question-paper-paper-1.pdf"),
("OCR-CHEM-J248", "J248/03", "H", "2024-Jun", "MS", f"{_OCR}/727751-mark-scheme-paper-3.pdf"),
("OCR-CHEM-J248", "J248/01", "F", "2023-Jun", "QP", f"{_OCR}/704950-question-paper-paper-1.pdf"),
("OCR-CHEM-J248", "J248/03", "H", "2022-Jun", "QP", f"{_OCR}/678036-question-paper-paper-3.pdf"),
("OCR-PHYS-J249", "J249/01", "F", "2024-Jun", "QP", f"{_OCR}/727724-question-paper-paper-1.pdf"),
("OCR-PHYS-J249", "J249/03", "H", "2024-Jun", "MS", f"{_OCR}/727755-mark-scheme-paper-3.pdf"),
("OCR-PHYS-J249", "J249/01", "F", "2023-Jun", "QP", f"{_OCR}/704956-question-paper-paper-1.pdf"),
("OCR-PHYS-J249", "J249/03", "H", "2022-Jun", "MS", f"{_OCR}/678086-mark-scheme-paper-3.pdf"),
("OCR-COMB-J250", "J250/01", "F", "2024-Jun", "QP", f"{_OCR}/727730-question-paper-paper-1.pdf"),
("OCR-COMB-J250", "J250/07", "H", "2024-Jun", "MS", f"{_OCR}/727763-mark-scheme-paper-7.pdf"),
# ── Maths J560 (round 2) ──
("OCR-MATH-J560", "J560/01", "F", "2024-Jun", "QP", f"{_OCR}/727817-question-paper-paper-1.pdf"),
("OCR-MATH-J560", "J560/01", "F", "2024-Jun", "MS", f"{_OCR}/727824-mark-scheme-paper-1.pdf"),
("OCR-MATH-J560", "J560/04", "H", "2024-Jun", "QP", f"{_OCR}/727820-question-paper-paper-4.pdf"),
("OCR-MATH-J560", "J560/04", "H", "2024-Jun", "MS", f"{_OCR}/727827-mark-scheme-paper-4.pdf"),
("OCR-MATH-J560", "J560/01", "F", "2023-Jun", "QP", f"{_OCR}/705050-question-paper-paper-1.pdf"),
("OCR-MATH-J560", "J560/01", "F", "2023-Jun", "MS", f"{_OCR}/705057-mark-scheme-paper-1.pdf"),
("OCR-MATH-J560", "J560/04", "H", "2023-Jun", "QP", f"{_OCR}/705053-question-paper-paper-4.pdf"),
("OCR-MATH-J560", "J560/04", "H", "2023-Jun", "MS", f"{_OCR}/705060-mark-scheme-paper-4.pdf"),
("OCR-MATH-J560", "J560/01", "F", "2022-Jun", "QP", f"{_OCR}/678149-question-paper-paper-1.pdf"),
("OCR-MATH-J560", "J560/01", "F", "2022-Jun", "MS", f"{_OCR}/678156-mark-scheme-paper-1.pdf"),
("OCR-MATH-J560", "J560/04", "H", "2022-Jun", "QP", f"{_OCR}/678152-question-paper-paper-4.pdf"),
("OCR-MATH-J560", "J560/04", "H", "2022-Jun", "MS", f"{_OCR}/678159-mark-scheme-paper-4.pdf"),
# ── English Language J351 / Literature J352 (round 2) ──
("OCR-ENGL-J351", "J351/01", None, "2024-Jun", "QP", f"{_OCR}/727556-question-paper-communicating-information-and-ideas.pdf"),
("OCR-ENGL-J351", "J351/01", None, "2024-Jun", "MS", f"{_OCR}/727658-mark-scheme-communication-information-and-ideas.pdf"),
("OCR-ENGL-J351", "J351/02", None, "2024-Jun", "QP", f"{_OCR}/727558-question-paper-exploring-effects-and-impact.pdf"),
("OCR-ENGL-J351", "J351/02", None, "2024-Jun", "MS", f"{_OCR}/727659-mark-scheme-exploring-effects-and-impact.pdf"),
("OCR-ENGL-J351", "J351/01", None, "2023-Jun", "QP", f"{_OCR}/704782-question-paper-communicating-information-and-ideas.pdf"),
("OCR-ENGL-J351", "J351/01", None, "2023-Jun", "MS", f"{_OCR}/704888-mark-scheme-communication-information-and-ideas.pdf"),
("OCR-ENGL-J351", "J351/01", None, "2022-Jun", "QP", f"{_OCR}/677852-question-paper-communicating-information-and-ideas.pdf"),
("OCR-ENGL-J351", "J351/01", None, "2022-Jun", "MS", f"{_OCR}/677967-mark-scheme-communication-information-and-ideas.pdf"),
("OCR-ENGLIT-J352", "J352/01", None, "2024-Jun", "QP", f"{_OCR}/727830-question-paper-exploring-modern-and-literary-heritage-texts.pdf"),
("OCR-ENGLIT-J352", "J352/01", None, "2024-Jun", "MS", f"{_OCR}/727832-mark-scheme-exploring-modern-and-literary-heritage-texts.pdf"),
("OCR-ENGLIT-J352", "J352/02", None, "2024-Jun", "QP", f"{_OCR}/727831-question-paper-exploring-poetry-and-shakespeare.pdf"),
("OCR-ENGLIT-J352", "J352/02", None, "2024-Jun", "MS", f"{_OCR}/727833-mark-scheme-exploring-poetry-and-shakespeare.pdf"),
("OCR-ENGLIT-J352", "J352/01", None, "2023-Jun", "QP", f"{_OCR}/705069-question-paper-exploring-modern-and-literary-heritage-texts.pdf"),
("OCR-ENGLIT-J352", "J352/01", None, "2023-Jun", "MS", f"{_OCR}/705075-mark-scheme-exploring-modern-and-literary-heritage-texts.pdf"),
# ── A-level Maths H240 / English Lit H472 / Lang H470 (round 2) ──
("OCR-MATH-H240", "H240/01", None, "2024-Jun", "QP", f"{_OCR}/726654-question-paper-pure-mathematics.pdf"),
("OCR-MATH-H240", "H240/01", None, "2024-Jun", "MS", f"{_OCR}/726795-mark-scheme-pure-mathematics.pdf"),
("OCR-MATH-H240", "H240/02", None, "2024-Jun", "QP", f"{_OCR}/726656-question-paper-pure-mathematics-and-statistics.pdf"),
("OCR-MATH-H240", "H240/02", None, "2024-Jun", "MS", f"{_OCR}/726796-mark-scheme-pure-mathematics-and-statistics.pdf"),
("OCR-MATH-H240", "H240/01", None, "2023-Jun", "QP", f"{_OCR}/703866-question-paper-pure-mathematics.pdf"),
("OCR-MATH-H240", "H240/01", None, "2023-Jun", "MS", f"{_OCR}/704008-mark-scheme-pure-mathematics.pdf"),
("OCR-MATH-H240", "H240/01", None, "2022-Jun", "QP", f"{_OCR}/676845-question-paper-pure-mathematics.pdf"),
("OCR-MATH-H240", "H240/01", None, "2022-Jun", "MS", f"{_OCR}/677005-mark-scheme-pure-mathematics.pdf"),
("OCR-ENGLIT-H472", "H472/01", None, "2024-Jun", "QP", f"{_OCR}/726602-question-paper-drama-and-poetry-pre-1900.pdf"),
("OCR-ENGLIT-H472", "H472/01", None, "2024-Jun", "MS", f"{_OCR}/726762-mark-scheme-drama-and-poetry-pre-1900.pdf"),
("OCR-ENGLIT-H472", "H472/01", None, "2023-Jun", "QP", f"{_OCR}/703813-question-paper-drama-and-poetry-pre-1900.pdf"),
("OCR-ENGLIT-H472", "H472/01", None, "2023-Jun", "MS", f"{_OCR}/703974-mark-scheme-drama-and-poetry-pre-1900.pdf"),
("OCR-ENGLIT-H472", "H472/01", None, "2022-Jun", "QP", f"{_OCR}/676783-question-paper-drama-and-poetry-pre-1900.pdf"),
("OCR-ENGLIT-H472", "H472/01", None, "2022-Jun", "MS", f"{_OCR}/676965-mark-scheme-drama-and-poetry-pre-1900.pdf"),
("OCR-ENGL-H470", "H470/01", None, "2024-Jun", "QP", f"{_OCR}/726595-question-paper-exploring-language.pdf"),
("OCR-ENGL-H470", "H470/01", None, "2024-Jun", "MS", f"{_OCR}/726764-mark-scheme-exploring-language.pdf"),
("OCR-ENGL-H470", "H470/01", None, "2023-Jun", "QP", f"{_OCR}/703806-question-paper-exploring-language.pdf"),
("OCR-ENGL-H470", "H470/01", None, "2023-Jun", "MS", f"{_OCR}/703976-mark-scheme-exploring-language.pdf"),
("OCR-ENGL-H470", "H470/01", None, "2022-Jun", "QP", f"{_OCR}/676772-question-paper-exploring-language.pdf"),
("OCR-ENGL-H470", "H470/01", None, "2022-Jun", "MS", f"{_OCR}/676967-mark-scheme-exploring-language.pdf"),
# ── Humanities (round 2) ──
("OCR-COMP-J277", "J277/01", None, "2024-Jun", "QP", f"{_OCR}/727534-question-paper-computer-systems.pdf"),
("OCR-COMP-J277", "J277/01", None, "2024-Jun", "MS", f"{_OCR}/727652-mark-scheme-computer-systems.pdf"),
("OCR-COMP-J277", "J277/02", None, "2024-Jun", "QP", f"{_OCR}/727535-question-paper-computational-thinking-algorithms-and-programming.pdf"),
("OCR-COMP-J277", "J277/02", None, "2024-Jun", "MS", f"{_OCR}/727653-mark-scheme-computational-thinking-algorithms-and-programming.pdf"),
("OCR-GEOG-J383", "J383/01", None, "2024-Jun", "QP", f"{_OCR}/727564-question-paper-living-in-the-uk-today.pdf"),
("OCR-GEOG-J383", "J383/01", None, "2024-Jun", "MS", f"{_OCR}/727661-mark-scheme-living-in-the-uk-today.pdf"),
("OCR-GEOG-J383", "J383/02", None, "2024-Jun", "QP", f"{_OCR}/727566-question-paper-the-world-around-us.pdf"),
("OCR-GEOG-J383", "J383/02", None, "2024-Jun", "MS", f"{_OCR}/727662-mark-scheme-the-world-around-us.pdf"),
("OCR-BUS-J204", "J204/01", None, "2024-Jun", "QP", f"{_OCR}/727519-question-paper-business-1-business-activity-marketing-and-people.pdf"),
("OCR-BUS-J204", "J204/01", None, "2024-Jun", "MS", f"{_OCR}/727634-mark-scheme-business-1-business-activity-marketing-and-people.pdf"),
("OCR-BUS-J204", "J204/02", None, "2024-Jun", "QP", f"{_OCR}/727520-question-paper-business-2-operations-finance-and-influences-on-business.pdf"),
("OCR-BUS-J204", "J204/02", None, "2024-Jun", "MS", f"{_OCR}/727635-mark-scheme-business-2-operations-finance-and-influences-on-business.pdf"),
("OCR-BUS-J204", "J204/01", None, "2023-Jun", "QP", f"{_OCR}/704745-question-paper-business-1-business-activity-marketing-and-people.pdf"),
("OCR-BUS-J204", "J204/01", None, "2023-Jun", "MS", f"{_OCR}/704864-mark-scheme-business-1-business-activity-marketing-and-people.pdf"),
("OCR-HIST-J411", "J411/11", None, "2024-Jun", "QP", f"{_OCR}/727590-question-paper-the-people-s-health-c.1250-to-present-with-the-norman-conquest-1065-1087.pdf"),
("OCR-HIST-J411", "J411/11", None, "2024-Jun", "MS", f"{_OCR}/727678-mark-scheme-the-people-s-health-c.1250-to-present-with-the-norman-conquest-1065-1087.pdf"),
]
def build_board(board_code: str, specs_meta: Dict, papers: List) -> Dict[str, Any]:
prefix = EXAM_CODE_PREFIX[board_code]
print(f"[{board_code}] re-verifying {len(papers)} confirmed URLs...", file=sys.stderr)
live: Dict[int, bool] = {}
with cf.ThreadPoolExecutor(max_workers=24) as ex:
futs = {ex.submit(head_ok, p[5]): i for i, p in enumerate(papers)}
for fut in cf.as_completed(futs):
live[futs[fut]] = fut.result()
by_spec: Dict[str, List[Dict[str, Any]]] = {}
for spec_code, exam_code, paper_code, tier, session, role, url in papers:
if not head_ok(url):
for i, (spec_code, paper_code, tier, session, role, url) in enumerate(papers):
if not live.get(i):
print(f" DROP (not live): {url}", file=sys.stderr)
continue
award = specs_meta[spec_code][1]
by_spec.setdefault(spec_code, []).append({
"exam_code": exam_code, "paper_code": paper_code, "tier": tier,
"exam_code": _mk_exam_code(prefix, award, paper_code, session, role),
"paper_code": paper_code, "tier": tier,
"session": session, "doc_type": role,
"file": {"source": f"url:{url}", "original_name": os.path.basename(url),
"provenance": {"source_url": url, "fetched": FETCHED,

View File

@ -82,6 +82,41 @@ SUPABASE_TABLES_TO_CLEAR = [
"admin_profiles",
]
# Exam subsystem tables, FK child-first. NOT in the list above — the previous full reset()
# never cleared exam data or storage at all; the granular scopes below fold it in.
# Consumed by reset(): passed to _clear_tables() for scope='exam-corpus' and again at the end
# of the full (scope='all') wipe. _clear_tables() walks the list top to bottom, so the order
# of the entries below is the order in which the tables are cleared.
EXAM_CORPUS_TABLES = [
"mark_entries",
"student_submissions",
"marking_batches",
"exam_response_areas",
"exam_boundaries",
"exam_template_layout",
"exam_questions",
"exam_templates",
"eb_exams",
"eb_specifications",
]
# Timetable / calendar materialization subset (for scope='timetable').
# Consumed by reset(scope='timetable'), which hands the list to _clear_tables(); tables are
# cleared in the order listed here.
TIMETABLE_TABLES = [
"lesson_deliveries",
"lesson_collaborators",
"taught_lessons",
"academic_periods",
"academic_days",
"academic_weeks",
"academic_term_breaks",
"academic_terms",
"academic_years",
"teacher_timetable_slots",
"teacher_timetables",
"school_timetables",
"planned_lessons",
]
# Bucket whose objects the exam-corpus reset clears (Storage API — protect_delete blocks raw SQL).
# NOTE(review): _clear_exam_storage() derives the bucket from each row's storage_loc prefix and
# does not read this constant — confirm it is referenced elsewhere, or wire it in.
EXAM_STORAGE_BUCKET = "cc.examboards"
def _sb_headers():
url = os.environ["SUPABASE_URL"]
@ -146,13 +181,84 @@ def _supabase_delete_auth_user(url: str, headers: dict, uid: str):
logger.warning(f" Delete auth user {uid}: {r.status_code} {r.text[:80]}")
# ─── Granular helpers ───────────────────────────────────────────────────────────
def _clear_tables(url: str, headers: dict, tables: List[str]) -> "tuple[List[str], List[str]]":
    """Clear each table in ``tables`` (in order) and sort the names by outcome.

    A table counts as cleared when ``_sb_clear_table`` answers 200 or 204; any other
    result puts it on the failed list. Returns ``(cleared, failed)``.
    """
    succeeded: List[str] = []
    rejected: List[str] = []
    for table in tables:
        status = _sb_clear_table(url, headers, table)
        if status not in (200, 204):
            rejected.append(table)
            continue
        succeeded.append(table)
        logger.info(f"{table}")
    return succeeded, rejected
def _clear_exam_storage() -> Dict[str, Any]:
    """Remove the storage objects referenced by eb_exams / eb_specifications rows.

    Objects are removed through the Storage API (protect_delete blocks raw SQL deletes).
    Call this BEFORE the rows are cleared: each row's ``storage_loc``
    ("<bucket>/<object path>") is what tells us which objects to remove.

    The ``storage_loc`` values are read page by page. A single PostgREST request returns
    at most the server's row cap (1000 by default on Supabase), so an unpaginated select
    would silently miss every object past the cap once the corpus grows beyond it.

    Returns:
        ``{"removed": int, "failed": int, "buckets": [...]}``. ``removed`` / ``failed`` count
        the object paths in batches whose remove call succeeded / raised. When the Supabase
        helpers cannot be imported the dict also carries an ``"error"`` message.
    """
    try:
        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
        from modules.database.supabase.utils.storage import StorageAdmin
    except Exception as exc:
        logger.warning(f"  exam storage clear skipped (import): {exc}")
        return {"removed": 0, "failed": 0, "buckets": [], "error": str(exc)}

    sb = SupabaseServiceRoleClient().supabase
    storage = StorageAdmin()

    page_size = 1000
    locs: List[str] = []
    for table in ("eb_exams", "eb_specifications"):
        offset = 0
        try:
            while True:
                # Ordering by storage_loc keeps paging deterministic for the only value we
                # need; ties carry the same value, so tie order cannot lose a location.
                rows = (
                    sb.table(table)
                    .select("storage_loc")
                    .order("storage_loc")
                    .range(offset, offset + page_size - 1)
                    .execute()
                    .data
                    or []
                )
                if not rows:
                    break
                locs += [r["storage_loc"] for r in rows if r.get("storage_loc")]
                # Advance by what was actually returned so a smaller server cap still works.
                offset += len(rows)
        except Exception as exc:
            logger.warning(f"  storage_loc gather {table}: {exc}")

    by_bucket: Dict[str, List[str]] = {}
    # dict.fromkeys de-duplicates while keeping order, so no object is counted twice.
    for loc in dict.fromkeys(locs):
        if "/" in loc:
            b, _, p = loc.partition("/")
            by_bucket.setdefault(b, []).append(p)

    removed = 0
    failed = 0
    for b, paths in by_bucket.items():
        for i in range(0, len(paths), 100):
            chunk = paths[i:i + 100]
            try:
                storage.client.supabase.storage.from_(b).remove(chunk)
            except Exception as exc:
                # Best effort: keep going, but surface the failure in the result so the
                # caller does not report a clean wipe while objects are left behind.
                failed += len(chunk)
                logger.warning(f"  storage remove {b}: {exc}")
            else:
                removed += len(chunk)

    logger.info(f"  exam storage removed {removed} objects from {list(by_bucket)}")
    if failed:
        logger.warning(f"  exam storage: {failed} objects could not be removed")
    return {"removed": removed, "failed": failed, "buckets": list(by_bucket)}
# ─── Main reset ───────────────────────────────────────────────────────────────
def reset() -> Dict[str, Any]:
def reset(scope: str = "all") -> Dict[str, Any]:
"""Destructive reset. scope ∈ {all, exam-corpus, timetable}.
- all : full wipe (Neo4j + Supabase data + auth users) AND the exam subsystem + storage.
- exam-corpus : ONLY eb_*/exam_* tables + cc.examboards storage objects (load/unload the corpus).
- timetable : ONLY timetable/calendar materialization tables.
"""
scope = (scope or "all").lower()
if scope not in ("all", "exam-corpus", "timetable"):
raise ValueError(f"invalid scope {scope!r} (want all|exam-corpus|timetable)")
url, headers = _sb_headers()
if scope == "exam-corpus":
logger.info("RESET (scope=exam-corpus) — exam tables + cc.examboards storage")
storage = _clear_exam_storage()
cleared, failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
return {"scope": scope, "exam_storage": storage, "tables_cleared": cleared, "tables_failed": failed}
if scope == "timetable":
logger.info("RESET (scope=timetable) — timetable/calendar tables")
cleared, failed = _clear_tables(url, headers, TIMETABLE_TABLES)
return {"scope": scope, "tables_cleared": cleared, "tables_failed": failed}
logger.info("=" * 60)
logger.info("RESET ENVIRONMENT — full destructive wipe starting")
logger.info("=" * 60)
results: Dict[str, Any] = {}
results: Dict[str, Any] = {"scope": scope}
# ── 1. Neo4j: drop everything except system + neo4j ──────────────────────
logger.info("\n[Neo4j] Dropping all non-system databases...")
@ -213,11 +319,22 @@ def reset() -> Dict[str, Any]:
)
logger.info(" kcar → admin_profiles restored ✓")
# ── 5. Exam subsystem: storage objects (Storage API) + exam tables ───────────
# (The legacy full reset cleared neither exam tables nor storage — folded in here.)
logger.info("\n[Supabase] Clearing exam subsystem (storage + eb_*/exam_* tables)...")
exam_storage = _clear_exam_storage()
exam_cleared, exam_failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
results["supabase"] = {
"tables_cleared": cleared,
"tables_failed": failed,
"deleted_users": deleted_emails,
}
results["exam"] = {
"storage": exam_storage,
"tables_cleared": exam_cleared,
"tables_failed": exam_failed,
}
logger.info("\n" + "=" * 60)
logger.info("RESET COMPLETE")

View File

@ -60,6 +60,13 @@ DOC_ROLES = {"QP", "MS", "INSERT", "ER", "SPECIMEN", "GRADE_BOUNDARIES", "DATA_S
TIERS = {"H", "F", None}
# Default working dir for cached url: downloads (override with --cache-dir / EXAM_CORPUS_CACHE).
DEFAULT_CACHE_DIR = os.getenv("EXAM_CORPUS_CACHE", "/tmp/exam-corpus-cache")
# Persistent, mountable local store laid out exactly like the bucket (download once, seed many,
# offline-repeatable). Distinct from --cache-dir, which is a throwaway url hash-cache.
# Resolution: the EXAM_CORPUS_STORE environment variable if set, otherwise
# <directory of this file>/manifests/_corpus_store. This value is only the default for
# --store-dir; on a seeding run --no-store makes main() pass store_dir=None instead, so the
# store is neither read nor populated.
DEFAULT_STORE_DIR = os.getenv(
"EXAM_CORPUS_STORE",
os.path.join(os.path.dirname(os.path.abspath(__file__)), "manifests", "_corpus_store"),
)
# ─────────────────────────────── canonical storage paths ───────────────────────────────
@ -95,12 +102,24 @@ class LoadReport:
user_copies: int = 0
swept: int = 0
sweep_failed: int = 0
downloaded: int = 0
download_cached: int = 0
unseed_objects: int = 0
unseed_exams: int = 0
unseed_specs: int = 0
unseed_templates: int = 0
errors: List[str] = field(default_factory=list)
def as_dict(self) -> Dict[str, Any]:
return {
"specs_upserted": self.specs_upserted,
"papers_upserted": self.papers_upserted,
"downloaded": self.downloaded,
"download_cached": self.download_cached,
"unseed_objects": self.unseed_objects,
"unseed_exams": self.unseed_exams,
"unseed_specs": self.unseed_specs,
"unseed_templates": self.unseed_templates,
"files_uploaded": self.files_uploaded,
"files_skipped": self.files_skipped,
"files_failed": self.files_failed,
@ -181,6 +200,70 @@ def _resolve_source_bytes(source: str, *, cache_dir: str) -> bytes:
return fh.read()
# ─────────────────────── persistent local store (download-once, seed-many) ───────────────────────
def _store_path(store_dir: str, storage_loc: str) -> str:
    """Local path mirroring the bucket layout (so the store is directly mountable as the corpus):
    storage_loc 'cc.examboards/aqa/physics/8463/1h/2022-jun/qp.pdf'
    -> {store_dir}/aqa/physics/8463/1h/2022-jun/qp.pdf

    The leading '<bucket>/' segment is dropped; the rest is joined onto ``store_dir`` unchanged.

    Raises:
        ValueError: if ``storage_loc`` has no object path after the bucket (the result would be
            the store directory itself), or if the path is absolute / climbs out with '..' so
            the file would land outside ``store_dir``.
    """
    _, _, path = storage_loc.partition("/")
    if not path:
        raise ValueError(f"storage_loc has no object path: {storage_loc!r}")
    candidate = os.path.join(store_dir, path)
    # Compare normalised absolute paths: os.path.join silently discards store_dir when `path`
    # is absolute, and '..' segments can walk back out of the store.
    root = os.path.abspath(store_dir)
    resolved = os.path.abspath(candidate)
    if resolved == root or os.path.commonpath([root, resolved]) != root:
        raise ValueError(f"storage_loc escapes the local store: {storage_loc!r}")
    return candidate
def _item_bytes(source: str, storage_loc: str, *, store_dir: Optional[str], cache_dir: str,
                populate: bool = True, rep: Optional[LoadReport] = None) -> bytes:
    """Return the bytes for one corpus item, serving from the local store when possible.

    With a ``store_dir``, a non-empty file already sitting at the item's store path is read
    and returned (counted in ``rep.download_cached``). Otherwise the bytes come from
    ``_resolve_source_bytes``; when a store is configured and ``populate`` is true they are
    also written to the store path via a '.part' file that is renamed into place (counted in
    ``rep.downloaded``).
    """
    target = _store_path(store_dir, storage_loc) if store_dir else None

    # Store hit: an existing, non-empty file is authoritative.
    if target is not None and os.path.exists(target) and os.path.getsize(target) > 0:
        if rep is not None:
            rep.download_cached += 1
        with open(target, "rb") as stored:
            return stored.read()

    payload = _resolve_source_bytes(source, cache_dir=cache_dir)
    if target is None or not populate:
        return payload

    # Write beside the final name, then rename, so readers never see a half-written file.
    os.makedirs(os.path.dirname(target), exist_ok=True)
    partial = target + ".part"
    with open(partial, "wb") as out:
        out.write(payload)
    os.replace(partial, target)
    if rep is not None:
        rep.downloaded += 1
    return payload
def download_corpus(m: Dict[str, Any], *, store_dir: str, board_filter: Optional[str],
                    spec_filter: Optional[str], cache_dir: str, rep: LoadReport) -> None:
    """--download-only: populate the persistent local store from the manifest. No DB/bucket writes.

    A later run with the same --store-dir (e.g. mounted into the container) seeds offline from it.

    Every spec file and paper is handled independently: any failure for one item, including a
    manifest entry missing a required key, is appended to ``rep.errors`` and the walk continues.
    Null/absent ``boards``, ``specifications`` and ``papers`` are treated as empty.

    Args:
        m: parsed manifest mapping.
        store_dir: root of the bucket-shaped local store to populate.
        board_filter: when set, only boards with this ``exam_board_code`` are processed.
        spec_filter: when set, only specifications with this ``spec_code`` are processed.
        cache_dir: forwarded to ``_item_bytes`` for source resolution.
        rep: report updated in place (download counters and ``errors``).
    """
    for board in (m or {}).get("boards") or []:
        if board_filter and board.get("exam_board_code") != board_filter:
            continue
        for spec in board.get("specifications") or []:
            if spec_filter and spec.get("spec_code") != spec_filter:
                continue
            subject = spec.get("subject_code", "")
            award = spec.get("award_code", "")

            sf = spec.get("spec_file")
            if sf and sf.get("source"):
                try:
                    # The storage location is built inside the try so a bad entry is
                    # reported per item instead of aborting the whole download.
                    sloc = spec_storage_loc(board["exam_board_code"], subject, award,
                                            spec.get("spec_ver", ""))
                    _item_bytes(sf["source"], sloc, store_dir=store_dir, cache_dir=cache_dir, rep=rep)
                except Exception as exc:
                    rep.errors.append(f"download spec {spec.get('spec_code')}: {exc}")

            for p in spec.get("papers") or []:
                try:
                    ploc = paper_storage_loc(board["exam_board_code"], subject, award,
                                             p["paper_code"], p["session"], p["doc_type"])
                    _item_bytes(p["file"]["source"], ploc, store_dir=store_dir, cache_dir=cache_dir, rep=rep)
                except Exception as exc:
                    rep.errors.append(f"download {p.get('exam_code')}: {exc}")
    logger.info(f"download-only done: downloaded={rep.downloaded} already_in_store={rep.download_cached} "
                f"errors={len(rep.errors)} store={store_dir}")
# ─────────────────────────────── storage upload (skip-if-exists + sha256) ───────────────────────────────
def _split_loc(storage_loc: str) -> Tuple[str, str]:
bucket, _, path = storage_loc.partition("/")
@ -491,10 +574,88 @@ def first_sweep(client: SupabaseServiceRoleClient, storage: StorageAdmin,
rep.swept += 1
# ─────────────────────────────── unseed (inverse of the loader) ───────────────────────────────
def _chunks(seq: List[Any], n: int = 100):
    """Lazily yield consecutive slices of ``seq``, each holding at most ``n`` items."""
    starts = range(0, len(seq), n)
    yield from (seq[start:start + n] for start in starts)
def _unseed_select_all(make_query, order_by: str, page_size: int = 1000) -> List[Dict[str, Any]]:
    """Run a select to exhaustion, page by page, and return all rows.

    A single PostgREST request returns at most the server's row cap (1000 by default on
    Supabase), so an unpaginated select silently truncates larger result sets.

    ``make_query`` is called once per page and must return a fresh, already-filtered select
    builder. Paging stops on the first empty page and advances by the number of rows actually
    returned, so it stays correct whatever the server cap is.
    """
    rows: List[Dict[str, Any]] = []
    while True:
        first = len(rows)
        res = make_query().order(order_by).range(first, first + page_size - 1).execute()
        page = getattr(res, "data", None) or []
        if not page:
            return rows
        rows.extend(page)


def unseed(client: SupabaseServiceRoleClient, storage: StorageAdmin, *,
           board_filter: Optional[str], spec_filter: Optional[str],
           drop_specs: bool = True, drop_seed_templates: bool = True, rep: LoadReport) -> None:
    """Inverse of the loader: remove the seeded public corpus, scoped by --board/--spec (or all).

    Deletes (in FK-safe order): cc.examboards storage objects (via the Storage API, since the
    protect_delete trigger blocks direct SQL deletes), first-sweep exam_templates created by the
    seed (title '... (auto-map seed)', cascades children), eb_exams rows, then eb_specifications.

    All selects are paginated so corpora larger than the API row cap are removed completely.
    Each delete/remove batch is best-effort: a failing batch is logged and the run continues.

    Args:
        client: service-role client; its ``supabase`` attribute is used for table access.
        storage: storage admin; ``storage.client.supabase.storage`` is used to remove objects.
        board_filter: restrict to specifications with this ``exam_board_code``.
        spec_filter: restrict to the specification with this ``spec_code``.
        drop_specs: also delete the matching eb_specifications rows.
        drop_seed_templates: also delete exam_templates whose title contains '(auto-map seed)'.
        rep: report updated in place (``unseed_*`` counters count attempted batch sizes).
    """
    sb = client.supabase

    def _spec_query():
        q = sb.table("eb_specifications").select("spec_code, storage_loc, exam_board_code")
        if board_filter:
            q = q.eq("exam_board_code", board_filter)
        if spec_filter:
            q = q.eq("spec_code", spec_filter)
        return q

    # NOTE(review): paging orders by spec_code, presumed unique since it is the delete key below.
    specs = _unseed_select_all(_spec_query, order_by="spec_code")
    spec_codes = [s["spec_code"] for s in specs]
    if not spec_codes:
        logger.info("[unseed] no matching specifications; nothing to do")
        return

    exams: List[Dict[str, Any]] = []
    for chunk in _chunks(spec_codes):
        # chunk is bound as a default so every page of this batch uses the same codes.
        exams.extend(_unseed_select_all(
            lambda chunk=chunk: sb.table("eb_exams").select("id, exam_code, storage_loc")
            .in_("spec_code", chunk),
            order_by="id",
        ))

    # 1) Storage objects (Storage API; batch-remove per bucket). Specs may carry a spec PDF too.
    by_bucket: Dict[str, List[str]] = {}
    for row in exams + specs:
        loc = row.get("storage_loc")
        if not loc or "/" not in loc:
            continue
        bkt, _, path = loc.partition("/")
        by_bucket.setdefault(bkt, []).append(path)
    for bkt, paths in by_bucket.items():
        for chunk in _chunks(paths, 100):
            try:
                storage.client.supabase.storage.from_(bkt).remove(chunk)
                rep.unseed_objects += len(chunk)
            except Exception as exc:
                logger.warning(f"[unseed] storage remove failed ({bkt}, {len(chunk)} objs): {exc}")

    # 2) First-sweep templates created by the seed (cascades questions/regions/boundaries/layout).
    if drop_seed_templates and exams:
        exam_codes = [e["exam_code"] for e in exams if e.get("exam_code")]
        for chunk in _chunks(exam_codes, 100):
            try:
                res = sb.table("exam_templates").delete(count="exact") \
                    .in_("exam_code", chunk).like("title", "%(auto-map seed)%").execute()
                rep.unseed_templates += getattr(res, "count", None) or len(getattr(res, "data", []) or [])
            except Exception as exc:
                logger.warning(f"[unseed] template delete failed: {exc}")

    # 3) Catalogue rows: eb_exams (by id), then eb_specifications (by spec_code).
    exam_ids = [e["id"] for e in exams]
    for chunk in _chunks(exam_ids, 100):
        try:
            sb.table("eb_exams").delete().in_("id", chunk).execute()
            rep.unseed_exams += len(chunk)
        except Exception as exc:
            logger.warning(f"[unseed] eb_exams delete failed: {exc}")
    if drop_specs:
        for chunk in _chunks(spec_codes, 100):
            try:
                sb.table("eb_specifications").delete().in_("spec_code", chunk).execute()
                rep.unseed_specs += len(chunk)
            except Exception as exc:
                logger.warning(f"[unseed] eb_specifications delete failed: {exc}")
    logger.info(f"unseed done: storage_objects={rep.unseed_objects} templates={rep.unseed_templates} "
                f"exams={rep.unseed_exams} specs={rep.unseed_specs}")
# ─────────────────────────────── orchestration ───────────────────────────────
def load(manifest_path: str, *, dry_run: bool, force: bool, board_filter: Optional[str],
spec_filter: Optional[str], user_subset: bool, do_first_sweep: bool,
cache_dir: str = DEFAULT_CACHE_DIR) -> LoadReport:
cache_dir: str = DEFAULT_CACHE_DIR, store_dir: Optional[str] = None) -> LoadReport:
with open(manifest_path) as f:
m = yaml.safe_load(f)
rep = LoadReport()
@ -526,7 +687,9 @@ def load(manifest_path: str, *, dry_run: bool, force: bool, board_filter: Option
spec.get("award_code", ""), spec.get("spec_ver", ""))
if not dry_run:
try:
spec_sha = upload_file(storage, sloc, _resolve_source_bytes(sf["source"], cache_dir=cache_dir),
spec_sha = upload_file(storage, sloc,
_item_bytes(sf["source"], sloc, store_dir=store_dir,
cache_dir=cache_dir, rep=rep),
force=force, rep=rep)
except Exception as exc:
logger.error(f"[spec-file] {spec.get('spec_code')}: {exc}")
@ -543,7 +706,9 @@ def load(manifest_path: str, *, dry_run: bool, force: bool, board_filter: Option
continue
psha = None
try:
psha = upload_file(storage, ploc, _resolve_source_bytes(p["file"]["source"], cache_dir=cache_dir),
psha = upload_file(storage, ploc,
_item_bytes(p["file"]["source"], ploc, store_dir=store_dir,
cache_dir=cache_dir, rep=rep),
force=force, rep=rep)
except Exception as exc:
logger.error(f"[paper-file] {p.get('exam_code')}: {exc}")
@ -563,19 +728,49 @@ def load(manifest_path: str, *, dry_run: bool, force: bool, board_filter: Option
def main() -> None:
    """CLI entry point: seed the corpus, populate the local store only, or unseed.

    Modes (mutually exclusive):
      * default         — ``load()`` the manifest into the DB/bucket (honours --dry-run).
      * --download-only — fill the local store from the manifest; no DB/bucket writes.
      * --unseed        — remove the seeded corpus, scoped by --board/--spec.

    Flag combinations that contradict each other are rejected up front rather than silently
    resolved. In particular --dry-run is refused with --unseed and --download-only, because
    neither mode has a dry-run path and both would still delete or write.
    The resulting report is printed to stdout as JSON.
    """
    import json

    ap = argparse.ArgumentParser(description="Seed (or unseed) the public exam-paper corpus from a manifest.")
    ap.add_argument("--manifest", help="corpus manifest (required except for --unseed)")
    ap.add_argument("--dry-run", action="store_true", help="validate + report, no writes")
    ap.add_argument("--force", action="store_true", help="re-upload/overwrite existing storage objects")
    ap.add_argument("--board", default=None, help="only this exam_board_code")
    ap.add_argument("--spec", default=None, help="only this spec_code")
    ap.add_argument("--user-subset", action="store_true", help="also seed a user-side test subset")
    ap.add_argument("--first-sweep", action="store_true", help="run docling/auto-map first pass on seeded papers")
    ap.add_argument("--cache-dir", default=DEFAULT_CACHE_DIR, help="throwaway url-hash cache dir")
    ap.add_argument("--store-dir", default=DEFAULT_STORE_DIR,
                    help="persistent, bucket-shaped local store (download-once, seed-many)")
    ap.add_argument("--no-store", action="store_true",
                    help="ignore the local store; always fetch from source (don't read/populate the store)")
    ap.add_argument("--download-only", action="store_true",
                    help="populate the local store from the manifest; no DB/bucket writes")
    ap.add_argument("--unseed", action="store_true",
                    help="INVERSE: remove seeded eb_*/storage/first-sweep templates (scoped by --board/--spec)")
    a = ap.parse_args()

    # Reject contradictory flags before anything destructive can run.
    if a.unseed and a.download_only:
        ap.error("--unseed and --download-only are mutually exclusive")
    if a.dry_run and (a.unseed or a.download_only):
        ap.error("--dry-run is not supported with --unseed/--download-only (both would still write)")
    if a.download_only and a.no_store:
        ap.error("--download-only populates the local store and cannot be combined with --no-store")

    store_dir = None if a.no_store else a.store_dir

    if a.unseed:
        rep = LoadReport()
        unseed(SupabaseServiceRoleClient(), StorageAdmin(),
               board_filter=a.board, spec_filter=a.spec, rep=rep)
        print(json.dumps(rep.as_dict(), indent=2))
        return

    if not a.manifest:
        ap.error("--manifest is required unless --unseed is given")

    if a.download_only:
        with open(a.manifest) as f:
            m = yaml.safe_load(f)
        rep = LoadReport()
        download_corpus(m, store_dir=a.store_dir, board_filter=a.board, spec_filter=a.spec,
                        cache_dir=a.cache_dir, rep=rep)
        print(json.dumps(rep.as_dict(), indent=2))
        return

    rep = load(a.manifest, dry_run=a.dry_run, force=a.force, board_filter=a.board, spec_filter=a.spec,
               user_subset=a.user_subset, do_first_sweep=a.first_sweep, cache_dir=a.cache_dir,
               store_dir=store_dir)
    print(json.dumps(rep.as_dict(), indent=2))