api/run/initialization/seed_curriculum.py
CC Worker 5750413f43 feat(seed): implement exam-corpus loader + filled 505-paper manifest
Implements the seed_exam_corpus.py skeleton TODOs against the real APIs and
fills the public exam corpus from official board sources.

Loader (run/initialization/seed_exam_corpus.py):
- _resolve_source_bytes: local path | url: fetch with on-disk cache + PDF validation
- upload_file: real StorageAdmin.upload_file, skip-if-exists+sha256 unless --force
- upsert_specification/upsert_paper: real upserts on spec_code/exam_code.
  Fix: QP/MS/INSERT/ER role -> eb_exams.type_code; doc_type set to 'pdf'
  (doc_type is CHECK-constrained to file formats; the skeleton wrote the role there).
- copy_user_test_subset: copy a QP subset into a test user's cc.users exam space + files rows
- first_sweep: auto_map + the /auto-map row mapper over seeded QPs -> system-owned
  exam_templates + questions/response_areas/boundaries/layout (idempotent)
- identity discovery via institute_memberships.profile_id

Manifest (run/initialization/manifests/):
- exam-corpus.yaml: 505 papers / 18 specs / AQA+Edexcel+OCR, every source URL HEAD-verified.
  AQA sciences GCSE 8461/8462/8463/8464 + AS/A-level 7401-7408, sessions JUN18-JUN24, QP+MS+ER, F+H.
- generate_corpus_manifest.py: regenerates + re-verifies all URLs from official hosts.

seed_curriculum.py: deprecation banner -> superseded by seed_exam_corpus.py; storage_loc
standardised on cc.examboards.

Verified on dev .94: full 505-paper seed (eb_specifications=18, eb_exams=505, QP=211),
idempotent re-runs, first-sweep + user-subset, 6/6 buckets provisioned.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-07 22:58:03 +00:00

390 lines
18 KiB
Python

"""
seed_curriculum.py — DEPRECATED hardcoded curriculum/exam seeder.
⚠️ SUPERSEDED (2026-06-07) by the manifest-driven corpus loader:
run/initialization/seed_exam_corpus.py (+ manifests/exam-corpus.yaml)
The exam-board parts of this file (eb_specifications / eb_exams) are now seeded from a
verified, provenance-bearing manifest with real uploaded PDFs — not the hardcoded rows
below. This module also had a storage_loc inconsistency the overhaul standardises away:
exam-board files belong in the `cc.examboards` bucket at the canonical path
`cc.examboards/{board}/{subject}/{award}/{paper}/{session}/{role}.pdf`, NOT under
`cc.public.snapshots/curriculum/...` (the placeholder rows below still show the old path).
KEEP ONLY for the Neo4j `curriculum_topics` seed (step [3]) which has no replacement yet.
Do NOT use the eb_specifications/eb_exams blocks for new work — use seed_exam_corpus.py.
Run (only the Neo4j curriculum-topics step [3] is still a supported use; note that seed()
also executes the deprecated eb_specifications/eb_exams steps [1] and [2]):
python3 -c "from run.initialization.seed_curriculum import seed; seed()"
"""
import os
import time
import uuid
import requests
from typing import Dict, Any, List, Optional
# Base URL of the Supabase project. The REST (/rest/v1/...) and auth (/auth/v1/...)
# endpoints used in this module are built from it. Required: a missing variable
# raises KeyError at import time.
SUPA_URL = os.environ["SUPABASE_URL"]
# Service-role key, sent as both the `apikey` header and the Bearer token.
# Required: a missing variable raises KeyError at import time.
SERVICE_KEY = os.environ["SERVICE_ROLE_KEY"]
# Base URL of the API; optional, falls back to http://localhost:8000.
# NOTE(review): not referenced in the visible part of this module — confirm it is still needed.
API_BASE = os.environ.get("API_BASE_URL", "http://localhost:8000")
# ─── School constants ────────────────────────────────────────────────────────
# Institute UUIDs. seed() strips the dashes from each one to build the name of the
# per-school Neo4j database (cc.institutes.<id>).
KEVLARAI_ID = "6585bf91-6ae8-4d72-ab54-cddf3ba4e648"
GREENFIELD_ID = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
# ─── Exam board specifications ───────────────────────────────────────────────
# Realistic UK exam board data for the subjects we teach.
#
# Each dict is posted as one row to the `eb_specifications` table by seed() step [1];
# `spec_code` is the conflict key and is what EXAMS rows reference.
# `exam_board_code` must be spelled identically for every row of the same board
# ("AQA", "Edexcel", "OCR") — the Edexcel Maths row previously read "EDexcel".
# The `cc.public.snapshots/curriculum/...` storage_loc values are legacy placeholders
# (see the module docstring); they are intentionally left as-is in this deprecated seed.
SPECIFICATIONS = [
    # AQA Physics
    {
        "spec_code": "AQA-PHYS-8201",
        "exam_board_code": "AQA",
        "award_code": "8201",
        "subject_code": "PHYSICS",
        "first_teach": "2016",
        "spec_ver": "1.3",
        "storage_loc": "cc.public.snapshots/curriculum/aqa/physics/8201_spec.pdf",
        "doc_type": "pdf",
    },
    {
        "spec_code": "AQA-PHYS-8203",
        "exam_board_code": "AQA",
        "award_code": "8203",
        "subject_code": "PHYSICS",
        "first_teach": "2016",
        "spec_ver": "1.3",
        "storage_loc": "cc.public.snapshots/curriculum/aqa/physics/8203_spec.pdf",
        "doc_type": "pdf",
    },
    # AQA GCSE Physics 8463 (standalone) — the real spec for the exam-marker test paper
    # (AQA Physics Paper 1H 2022). Spec graph: cc.public.exams Specification AQA-PHYS-8463.
    {
        "spec_code": "AQA-PHYS-8463",
        "exam_board_code": "AQA",
        "award_code": "8463",
        "subject_code": "PHYSICS",
        "first_teach": "2016",
        "spec_ver": "1.0",
        "storage_loc": "cc.examboards/aqa/physics/8463/8463_spec.pdf",  # placeholder (no file yet)
        "doc_type": "pdf",
    },
    # Edexcel Maths
    {
        "spec_code": "EDX-MATH-1MA1",
        "exam_board_code": "Edexcel",  # was "EDexcel": must match the other Edexcel row
        "award_code": "1MA1",
        "subject_code": "MATHEMATICS",
        "first_teach": "2015",
        "spec_ver": "2.0",
        "storage_loc": "cc.public.snapshots/curriculum/edexcel/maths/1MA1_spec.pdf",
        "doc_type": "pdf",
    },
    # OCR Maths
    {
        "spec_code": "OCR-MATH-FMH1",
        "exam_board_code": "OCR",
        "award_code": "FMH1",
        "subject_code": "MATHEMATICS",
        "first_teach": "2017",
        "spec_ver": "1.1",
        "storage_loc": "cc.public.snapshots/curriculum/ocr/maths/FMH1_spec.pdf",
        "doc_type": "pdf",
    },
    # AQA Computer Science
    {
        "spec_code": "AQA-COMP-7516",
        "exam_board_code": "AQA",
        "award_code": "7516",
        "subject_code": "COMPUTER SCIENCE",
        "first_teach": "2016",
        "spec_ver": "1.2",
        "storage_loc": "cc.public.snapshots/curriculum/aqa/cs/7516_spec.pdf",
        "doc_type": "pdf",
    },
    # Edexcel Computer Science
    {
        "spec_code": "EDX-COMP-X042",
        "exam_board_code": "Edexcel",
        "award_code": "X042",
        "subject_code": "COMPUTER SCIENCE",
        "first_teach": "2016",
        "spec_ver": "1.0",
        "storage_loc": "cc.public.snapshots/curriculum/edexcel/cs/X042_spec.pdf",
        "doc_type": "pdf",
    },
]
# ─── Exam papers ─────────────────────────────────────────────────────────────
# Realistic exam paper references linked to specifications.
def _exam_row(exam_code, spec_code, paper_code, tier, type_code, storage_loc=None):
    """Build one exam seed row.

    Every paper in this seed sits in the "June" session, so the session is filled
    in here. `storage_loc` is only added to the row when one is supplied.
    """
    row = {
        "exam_code": exam_code,
        "spec_code": spec_code,
        "paper_code": paper_code,
        "tier": tier,
        "session": "June",
        "type_code": type_code,
    }
    if storage_loc is not None:
        row["storage_loc"] = storage_loc
    return row


# Columns: exam_code, spec_code, paper_code, tier, type_code (QP / MS / ER).
EXAMS = [
    # AQA GCSE Physics 8463/1 Higher: the exam-marker test paper, whose real PDF is
    # uploaded to cc.examboards. exam_code is the join key for cc.public.exams
    # ExamPaper.exam_code.
    _exam_row(
        "AQA-PHYS-8463-1H-22-JUN", "AQA-PHYS-8463", "8463/1", "higher", "QP",
        storage_loc="cc.examboards/aqa/physics/8463/AQA-PHYS-8463-1H-22-JUN.pdf",
    ),
    # AQA Physics 8201, paper 1 (Foundation)
    _exam_row("AQA-PHYS-8201-1-23-JUN", "AQA-PHYS-8201", "8201/1", "foundation", "QP"),
    _exam_row("AQA-PHYS-8201-MS-23-JUN", "AQA-PHYS-8201", "8201/1", "foundation", "MS"),
    _exam_row("AQA-PHYS-8201-ER-23-JUN", "AQA-PHYS-8201", "8201/1", "foundation", "ER"),
    # AQA Physics 8201, paper 2 (Higher)
    _exam_row("AQA-PHYS-8201-2-23-JUN", "AQA-PHYS-8201", "8201/2", "higher", "QP"),
    _exam_row("AQA-PHYS-8201-MS-23-JUN-H", "AQA-PHYS-8201", "8201/2", "higher", "MS"),
    # Edexcel Maths 1MA1, paper 1 (Foundation)
    _exam_row("EDX-MATH-1MA1-1-24-JUN", "EDX-MATH-1MA1", "1MA1/1F", "foundation", "QP"),
    _exam_row("EDX-MATH-1MA1-MS-24-JUN", "EDX-MATH-1MA1", "1MA1/1F", "foundation", "MS"),
    # Edexcel Maths 1MA1, paper 2 (Higher)
    _exam_row("EDX-MATH-1MA1-2-24-JUN", "EDX-MATH-1MA1", "1MA1/2H", "higher", "QP"),
    _exam_row("EDX-MATH-1MA1-MS-24-JUN-H", "EDX-MATH-1MA1", "1MA1/2H", "higher", "MS"),
    # OCR Maths FMH1, paper 1
    _exam_row("OCR-MATH-FMH1-1-24-JUN", "OCR-MATH-FMH1", "FMH1/1", "higher", "QP"),
    _exam_row("OCR-MATH-FMH1-MS-24-JUN", "OCR-MATH-FMH1", "FMH1/1", "higher", "MS"),
    # AQA Computer Science 7516, paper 1 (untiered)
    _exam_row("AQA-COMP-7516-1-23-JUN", "AQA-COMP-7516", "7516/1", None, "QP"),
    _exam_row("AQA-COMP-7516-MS-23-JUN", "AQA-COMP-7516", "7516/1", None, "MS"),
    # AQA Computer Science 7516, paper 2 (untiered)
    _exam_row("AQA-COMP-7516-2-23-JUN", "AQA-COMP-7516", "7516/2", None, "QP"),
    _exam_row("AQA-COMP-7516-ER-23-JUN", "AQA-COMP-7516", "7516/2", None, "ER"),
]
# ─── Neo4j curriculum topics ─────────────────────────────────────────────────
# Curriculum topics stored in Neo4j school databases (not Supabase).
# Field names of one topic record, in the order used by the rows below.
_TOPIC_FIELDS = ("topic_code", "title", "year_group", "key_stage", "description")

# subject -> rows of (topic_code, title, year_group, key_stage, description)
_TOPIC_ROWS = {
    "Physics": [
        ("PHYS-KS3-01", "Forces", "9", "3",
         "Contact and non-contact forces, resultant forces, moments"),
        ("PHYS-KS3-02", "Energy", "9", "3",
         "Energy stores, transfers, conservation, dissipation"),
        ("PHYS-KS3-03", "Waves", "9", "3",
         "Transverse and longitudinal waves, reflection, refraction, diffraction"),
        ("PHYS-KS4-01", "Electricity", "10", "4",
         "Circuits, current, potential difference, resistance, power"),
        ("PHYS-KS4-02", "Magnetism and Electromagnetism", "10", "4",
         "Magnetic fields, electromagnets, motors, generators"),
        ("PHYS-KS4-03", "Atomic Structure", "10", "4",
         "Atoms, isotopes, radioactivity, half-life"),
        ("PHYS-KS4-04", "Particle Physics", "11", "4",
         "Standard model, quarks, leptons, bosons"),
        ("PHYS-KS4-05", "Cosmology", "11", "4",
         "Big Bang, stellar evolution, redshift"),
    ],
    "Mathematics": [
        ("MATH-KS3-01", "Number", "9", "3",
         "Integers, fractions, decimals, percentages, ratio, proportion"),
        ("MATH-KS3-02", "Algebra", "9", "3",
         "Expressions, equations, inequalities, sequences"),
        ("MATH-KS3-03", "Geometry", "9", "3",
         "Angles, polygons, circles, transformations, constructions"),
        ("MATH-KS4-01", "Number and Algebra", "10", "4",
         "Surds, indices, standard form, expanding brackets, factorising"),
        ("MATH-KS4-02", "Graphs and Functions", "10", "4",
         "Linear, quadratic, cubic graphs, gradients, intercepts"),
        ("MATH-KS4-03", "Statistics and Probability", "10", "4",
         "Data types, charts, expected frequency, tree diagrams, two-way tables"),
        ("MATH-KS4-04", "Geometry and Measures", "10", "4",
         "Area, volume, surface area, Pythagoras, trigonometry, bearings"),
        ("MATH-KS4-05", "Simultaneous Equations and Quadratics", "11", "4",
         "Solving simultaneous equations, completing the square, quadratic formula"),
    ],
    "Computer Science": [
        ("CS-KS4-01", "Data Representation", "10", "4",
         "Binary, hexadecimal, bit operations, compression, encryption"),
        ("CS-KS4-02", "Computer Systems", "10", "4",
         "CPU architecture, memory, storage, networks, topologies"),
        ("CS-KS4-03", "Algorithms and Programming", "10", "4",
         "Algorithms, flowcharts, pseudocode, debugging, testing"),
        ("CS-KS4-04", "Data Types and Structures", "11", "4",
         "Strings, arrays, lists, records, 2D arrays"),
        ("CS-KS4-05", "Boolean Logic and Search", "11", "4",
         "Boolean operators, linear search, binary search, sorting"),
    ],
}

# Public shape: subject title -> list of topic dicts keyed by _TOPIC_FIELDS.
CURRICULUM_TOPICS = {
    subject_title: [dict(zip(_TOPIC_FIELDS, row)) for row in rows]
    for subject_title, rows in _TOPIC_ROWS.items()
}
# ─── Helpers ───────────────────────────────────────────────────────────────────
def _sb_headers() -> Dict:
    """Return the JSON request headers for service-role calls to Supabase.

    The service-role key is sent twice: as the `apikey` header and as the
    Bearer token in `Authorization`.
    """
    headers = {"apikey": SERVICE_KEY}
    headers["Authorization"] = "Bearer " + SERVICE_KEY
    headers["Content-Type"] = "application/json"
    return headers
def _sign_in(email: str, password: str, timeout: float = 30.0) -> str:
    """Exchange an email/password pair for an access token.

    Posts to the `/auth/v1/token?grant_type=password` endpoint under SUPA_URL.

    Args:
        email: Account email address.
        password: Account password.
        timeout: Seconds to wait for the auth endpoint before giving up. Without
            a timeout `requests` waits indefinitely on a stalled connection.

    Returns:
        The `access_token` field of the JSON response.

    Raises:
        requests.HTTPError: The endpoint answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within `timeout` seconds.
        KeyError: The response body has no `access_token` field.
    """
    r = requests.post(
        f"{SUPA_URL}/auth/v1/token?grant_type=password",
        headers={"apikey": SERVICE_KEY, "Content-Type": "application/json"},
        json={"email": email, "password": password},
        timeout=timeout,
    )
    r.raise_for_status()
    return r.json()["access_token"]
# ─── Main seed ─────────────────────────────────────────────────────────────────
# (connect, read) timeout in seconds for every Supabase REST call made by the seed,
# so a stalled endpoint fails the row instead of hanging the whole run.
_HTTP_TIMEOUT = (5, 30)


def _seed_table(table: str, rows: List[Dict[str, Any]], key: str, label: str,
                describe, errors: List[str]) -> Dict[str, int]:
    """POST each row to a Supabase REST table and tally the outcome.

    Args:
        table: Table name appended to `/rest/v1/`.
        rows: Seed rows; each is sent with a fresh `id` and empty
            `doc_details` / `docling_docs`.
        key: Row field used as the `on_conflict` column and in log lines.
        label: Short prefix for error messages ("spec" / "exam").
        describe: Callable taking a row and returning the line printed on success.
        errors: Shared list that failure messages are appended to.

    Returns:
        `{"created": n, "skipped": m}` — 200/201 responses and 409 responses.
    """
    created = 0
    skipped = 0
    for row in rows:
        try:
            r = requests.post(
                f"{SUPA_URL}/rest/v1/{table}",
                headers={**_sb_headers(), "Prefer": "return=representation"},
                json={
                    **row,
                    "id": str(uuid.uuid4()),
                    "doc_details": {},
                    "docling_docs": {},
                },
                # NOTE(review): no `Prefer: resolution=...` is sent, so this looks like
                # a plain insert and an existing key comes back as 409 (handled below)
                # — confirm against the PostgREST version in use.
                params={"on_conflict": key},
                timeout=_HTTP_TIMEOUT,
            )
        except requests.RequestException as exc:
            # Transport failures are recorded like any other per-row error so the
            # remaining rows and the Neo4j step still run.
            err = f"{label} {row[key]}: {exc}"
            print(err)
            errors.append(err)
            continue
        if r.status_code in (200, 201):
            created += 1
            print(describe(row))
        elif r.status_code == 409:
            skipped += 1
            print(f" ~ SKIP (exists): {row[key]}")
        else:
            err = f"{label} {row[key]}: {r.status_code} {r.text[:100]}"
            print(err)
            errors.append(err)
    return {"created": created, "skipped": skipped}


def _seed_neo4j_topics() -> Dict[str, int]:
    """MERGE the Subject and CurriculumTopic nodes into each school's Neo4j database.

    Returns:
        `{"created": n, "skipped": m}` — topic nodes newly created vs. already present.

    Raises:
        Exception: Anything raised by the driver (or a missing `neo4j` package)
            propagates to the caller; the driver is always closed first.
    """
    from neo4j import GraphDatabase

    # SECURITY NOTE(review): the fallback host and credentials are the values that
    # were hard-coded here; they are kept only for backward compatibility. Set
    # NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD, rotate the password, and drop the
    # literals. The variable names are new in this module — align with project config.
    uri = os.environ.get("NEO4J_URI", "bolt://192.168.0.209:7687")
    user = os.environ.get("NEO4J_USER", "neo4j")
    password = os.environ.get("NEO4J_PASSWORD", "&%N304j&%")

    driver = GraphDatabase.driver(uri, auth=(user, password))
    created = 0
    skipped = 0
    try:
        for school_id, school_name in [(KEVLARAI_ID, "KevlarAI"), (GREENFIELD_ID, "Greenfield Academy")]:
            db_name = f"cc.institutes.{school_id.replace('-', '')}"
            print(f"\n [{school_name}] -> {db_name}")
            school_topics = 0
            with driver.session(database=db_name) as s:
                for subject, topics in CURRICULUM_TOPICS.items():
                    # Create subject node. consume() forces the statement to finish
                    # here so a failure is raised at the statement that caused it.
                    s.run(
                        "MERGE (s:Subject {code: $subject}) "
                        "SET s.title = $title, s.school_id = $school_id",
                        subject=subject, title=subject, school_id=school_id,
                    ).consume()
                    for topic in topics:
                        summary = s.run(
                            "MERGE (t:CurriculumTopic {code: $code}) "
                            "SET t.title = $title, "
                            " t.year_group = $year_group, "
                            " t.key_stage = $key_stage, "
                            " t.description = $description, "
                            " t.subject_code = $subject, "
                            " t.school_id = $school_id "
                            "MERGE (s:Subject {code: $subject}) "
                            "MERGE (s)-[:CONTAINS_TOPIC]->(t)",
                            code=topic["topic_code"],
                            title=topic["title"],
                            year_group=topic["year_group"],
                            key_stage=topic["key_stage"],
                            description=topic["description"],
                            subject=subject,
                            school_id=school_id,
                        ).consume()
                        # The Subject node already exists at this point, so a created
                        # node means the topic itself was new rather than matched.
                        if summary.counters.nodes_created:
                            created += 1
                        else:
                            skipped += 1
                        school_topics += 1
            # Report the number of topics actually processed for this school (the
            # subjects have different topic counts, so it cannot be derived by
            # multiplying the subject count by the first subject's length).
            print(f"{school_name}: {school_topics} topic nodes")
    finally:
        # Release the connection pool even when a statement fails part-way.
        driver.close()
    return {"created": created, "skipped": skipped}


def seed() -> Dict[str, Any]:
    """Run the three seed steps and return a summary of what happened.

    Steps: [1] POST SPECIFICATIONS to `eb_specifications`, [2] POST EXAMS to
    `eb_exams`, [3] MERGE CURRICULUM_TOPICS into each school's Neo4j database.
    Per-row and per-step failures are collected rather than raised.

    Returns:
        A dict with:
        - "specifications" / "exams": `{"created": n, "skipped": m}`
        - "neo4j_topics": `{"created": n, "skipped": m}`, or `{"error": str}`
          if step [3] failed
        - "success": True when no error was recorded
        - "errors": list of error messages
    """
    print("=" * 60)
    print("Curriculum seed — exam board specs and exams")
    print("=" * 60)
    results: Dict[str, Any] = {}
    errors: List[str] = []

    # ── [1] Seed eb_specifications ──────────────────────────────────────────
    print("\n[1] Seeding exam board specifications...")
    results["specifications"] = _seed_table(
        "eb_specifications",
        SPECIFICATIONS,
        "spec_code",
        "spec",
        lambda spec: f"{spec['spec_code']} ({spec['exam_board_code']}/{spec['subject_code']})",
        errors,
    )

    # ── [2] Seed eb_exams ───────────────────────────────────────────────────
    print("\n[2] Seeding exam papers...")
    results["exams"] = _seed_table(
        "eb_exams",
        EXAMS,
        "exam_code",
        "exam",
        lambda exam: f"{exam['exam_code']} ({exam['type_code']})",
        errors,
    )

    # ── [3] Seed Neo4j curriculum topics ────────────────────────────────────
    print("\n[3] Seeding Neo4j curriculum topics...")
    try:
        results["neo4j_topics"] = _seed_neo4j_topics()
    except Exception as e:
        # Step boundary: record the failure (including a missing neo4j package)
        # and still produce the summary.
        err = f"neo4j_topics: {e}"
        print(err)
        errors.append(err)
        results["neo4j_topics"] = {"error": str(e)}

    # ── Summary ─────────────────────────────────────────────────────────────
    print("\n" + "=" * 60)
    results["success"] = len(errors) == 0
    results["errors"] = errors
    print(f"COMPLETE — {results['specifications']['created']} specs, "
          f"{results['exams']['created']} exams, "
          f"{results.get('neo4j_topics', {}).get('created', '?')} topics")
    if errors:
        print(f"Errors ({len(errors)}):")
        for e in errors:
            print(e)
    print("=" * 60)
    return results
if __name__ == "__main__":
    # Script entry point: run the seed and dump its summary dict as indented JSON.
    # default=str stringifies any value the JSON encoder cannot serialise natively.
    import json

    seed_report = seed()
    print(json.dumps(seed_report, indent=2, default=str))