feat(seed): provision taxonomy buckets (infra) + exam-corpus loader skeleton

infra (buckets.py): add cc.public / cc.institutes / cc.admin to the bucket
provisioner alongside cc.examboards; make initialize_buckets idempotent
(already-exists treated as success). Bucket provisioning stays in infra init.

new (seed_exam_corpus.py): manifest-driven loader scaffold that USES the buckets
(does not create them) — validate -> upload to cc.examboards (canonical path) ->
upsert eb_specifications/eb_exams -> optional user test subset -> optional
--first-sweep auto-map pass. TODOs marked for the gathering task to complete.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
CC Worker 2026-06-07 22:22:48 +00:00
parent e6be762f0c
commit 9aabc12062
2 changed files with 248 additions and 6 deletions

View File

@ -46,7 +46,7 @@ def initialize_buckets() -> dict:
file_size_limit=1000 * 1024 * 1024, # 1GB
)
},
# Exam Board files
# Exam Board files (admin-curated public exam corpus: QP/MS/insert/ER + specs)
{
"id": "cc.examboards",
"options": CreateBucketOptions(
@ -55,6 +55,34 @@ def initialize_buckets() -> dict:
file_size_limit=1000 * 1024 * 1024, # 1GB
)
},
# ── Storage taxonomy bins (access scoped by RLS on bucket + leading path segment; RLS = D1) ──
# Platform-managed public/shared assets (readable by all authenticated users).
{
"id": "cc.public",
"options": CreateBucketOptions(
name="Classroom Copilot Public",
public=False,
file_size_limit=1000 * 1024 * 1024, # 1GB
)
},
# Institute-scoped operational assets: cc.institutes/{institute_id}/...
{
"id": "cc.institutes",
"options": CreateBucketOptions(
name="Classroom Copilot Institutes",
public=False,
file_size_limit=1000 * 1024 * 1024, # 1GB
)
},
# Platform-admin-only assets, seeds, intake/staging for unidentified papers.
{
"id": "cc.admin",
"options": CreateBucketOptions(
name="Classroom Copilot Admin",
public=False,
file_size_limit=1000 * 1024 * 1024, # 1GB
)
},
]
results = {}
@ -81,11 +109,17 @@ def initialize_buckets() -> dict:
logger.error(f"Failed to create bucket: {bucket['id']}")
except Exception as e:
results[bucket["id"]] = {
"status": "error",
"error": str(e)
}
logger.error(f"Error creating bucket {bucket['id']}: {str(e)}")
# Idempotent: an already-existing bucket is not a failure on re-run.
if any(s in str(e).lower() for s in ("already exists", "duplicate", "resource already")):
results[bucket["id"]] = {"status": "exists", "result": str(e)}
success_count += 1
logger.info(f"Bucket already exists (ok): {bucket['id']}")
else:
results[bucket["id"]] = {
"status": "error",
"error": str(e)
}
logger.error(f"Error creating bucket {bucket['id']}: {str(e)}")
# Determine overall success
if success_count == total_count:

View File

@ -0,0 +1,208 @@
"""
seed_exam_corpus.py — manifest-driven loader for the public exam-paper corpus.
SCOPE (separate from infra): assumes storage buckets already exist (provisioned by
run/initialization/buckets.py during infra init). This loader UPLOADS papers and
SEEDS the catalogue; it does NOT create buckets.
Pipeline per manifest item:
validate -> upload file to cc.examboards (canonical path) -> upsert eb_specifications
/ eb_exams (catalogue) -> (optional) copy a subset into a test user's exam space
-> (optional, --first-sweep) run the docling/auto-map first pass to gather structure.
Manifest template: ~/cc/specs/exam-corpus-manifest.example.yaml
Catalogue columns (real):
eb_specifications(spec_code, exam_board_code, award_code, subject_code, first_teach,
spec_ver, storage_loc, doc_type, doc_details jsonb)
eb_exams(exam_code, spec_code, paper_code, tier, session, type_code, storage_loc,
doc_type, doc_details jsonb)
Run inside the api container, e.g.:
python3 -m run.initialization.seed_exam_corpus --manifest /path/exam-corpus.yaml --dry-run
python3 -m run.initialization.seed_exam_corpus --manifest ... --board AQA --first-sweep
"""
from __future__ import annotations
import argparse
import hashlib
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import yaml # PyYAML
from modules.logger_tool import initialise_logger
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
# Module-level logger; it is handed the LOG_LEVEL / LOG_PATH environment values.
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), "default", True)
# Bucket holding the exam corpus; also the leading segment of every canonical storage path below.
EXAM_BUCKET = "cc.examboards"
# doc_type values accepted for a paper by validate_manifest.
DOC_TYPES = {"QP", "MS", "INSERT", "ER", "SPECIMEN", "GRADE_BOUNDARIES", "DATA_SHEET"}
# tier values accepted by validate_manifest; None admits papers that carry no tier.
# NOTE(review): "H"/"F" presumably mean higher/foundation — confirm against the manifest template.
TIERS = {"H", "F", None}
# ─────────────────────────────── canonical storage paths ───────────────────────────────
def _lc(s: str) -> str:
return (s or "").strip().lower().replace(" ", "-")
def spec_storage_loc(board: str, subject: str, award: str, spec_ver: str) -> str:
    """Build the canonical storage location of a specification PDF.

    Shape: ``<EXAM_BUCKET>/<board>/<subject>/<award>/spec/<spec_ver>.pdf``, where
    every variable segment is normalised with ``_lc``. A falsy ``spec_ver`` falls
    back to the literal ``spec``.

    Example: cc.examboards/aqa/physics/8463/spec/1.1.pdf
    """
    version = spec_ver if spec_ver else "spec"
    segments = [
        EXAM_BUCKET,
        _lc(board),
        _lc(subject),
        _lc(award),
        "spec",
        f"{_lc(version)}.pdf",
    ]
    return "/".join(segments)
def paper_storage_loc(board: str, subject: str, award: str, paper_code: str, session: str, doc_type: str) -> str:
    """Build the canonical storage location of one exam-paper document.

    Shape: ``<EXAM_BUCKET>/<board>/<subject>/<award>/<paper>/<session>/<doc_type>.pdf``,
    where every variable segment is normalised with ``_lc``.

    Example: cc.examboards/aqa/physics/8463/1h/2022-jun/qp.pdf
    """
    # Only the part after the last "/" of the normalised paper code is kept,
    # so the paper code always contributes exactly one path segment.
    leaf = _lc(paper_code).rsplit("/", 1)[-1]
    parts = (
        EXAM_BUCKET,
        _lc(board),
        _lc(subject),
        _lc(award),
        leaf,
        _lc(session),
        f"{_lc(doc_type)}.pdf",
    )
    return "/".join(parts)
# ─────────────────────────────── validation ───────────────────────────────
@dataclass
class LoadReport:
    """Running tally of what a corpus load did, plus any collected error messages."""

    # Catalogue rows handled by upsert_specification / upsert_paper.
    specs_upserted: int = 0
    papers_upserted: int = 0

    # Storage objects handled by upload_file, and uploads that were skipped.
    files_uploaded: int = 0
    files_skipped: int = 0

    # Counters for the optional steps.
    # NOTE(review): presumably bumped by the user-subset / first-sweep steps once implemented.
    user_copies: int = 0
    swept: int = 0

    # Error messages; load() stores the manifest validation errors here.
    errors: List[str] = field(default_factory=list)
def validate_manifest(m: Dict[str, Any]) -> List[str]:
    """Check a parsed manifest and return a list of human-readable problems.

    Checks performed:
      * the manifest root is a mapping (an empty YAML file parses to ``None``);
      * every board has an ``exam_board_code`` (the loader indexes it directly);
      * every ``spec_code`` is present and unique across the manifest;
      * every ``exam_code`` is present and unique across the manifest;
      * every paper has a ``doc_type`` in ``DOC_TYPES`` and a ``tier`` in ``TIERS``;
      * every paper has ``paper_code`` and ``session`` (both are needed to build
        the paper's canonical storage path).

    Keys that are present but null (``boards:``, ``specifications:``, ``papers:``
    with no value) are treated as empty lists.

    Args:
        m: the object returned by ``yaml.safe_load`` for the manifest file.

    Returns:
        A list of error strings; empty when the manifest passed every check.
    """
    if not isinstance(m, dict):
        return [f"manifest root must be a mapping, got {type(m).__name__}"]
    errs: List[str] = []
    seen_specs, seen_exams = set(), set()
    # `x.get(key) or []` rather than `x.get(key, [])`: a YAML key with no value is None.
    for idx, board in enumerate(m.get("boards") or []):
        if not board.get("exam_board_code"):
            errs.append(f"boards[{idx}]: exam_board_code missing")
        for spec in board.get("specifications") or []:
            sc = spec.get("spec_code")
            if not sc or sc in seen_specs:
                errs.append(f"spec_code missing/duplicate: {sc!r}")
            seen_specs.add(sc)
            for p in spec.get("papers") or []:
                ec = p.get("exam_code")
                if not ec or ec in seen_exams:
                    errs.append(f"exam_code missing/duplicate: {ec!r}")
                seen_exams.add(ec)
                if p.get("doc_type") not in DOC_TYPES:
                    errs.append(f"{ec}: bad doc_type {p.get('doc_type')!r}")
                if p.get("tier") not in TIERS:
                    errs.append(f"{ec}: bad tier {p.get('tier')!r}")
                # Both values become segments of the canonical storage path.
                for key in ("paper_code", "session"):
                    if not p.get(key):
                        errs.append(f"{ec}: {key} missing")
                # TODO(agent): resolve p['file']['source'] (local path or url:) and confirm it exists.
    return errs
# ─────────────────────────────── loader steps (TODOs for the gathering agent) ───────────────────────────────
def _resolve_source_bytes(source: str) -> bytes:
"""Local path or 'url:https://...'. TODO(agent): implement url fetch + caching."""
if source.startswith("url:"):
raise NotImplementedError("url: sources — implement fetch in the gathering task")
with open(source, "rb") as f:
return f.read()
def upload_file(client: SupabaseServiceRoleClient, storage_loc: str, data: bytes, *, force: bool, rep: LoadReport) -> None:
    """Record the upload of *data* to *storage_loc* (``<bucket>/<object path>``).

    Scaffold: no bytes are sent to storage yet. The function splits the
    location, hashes the payload, logs the intended upload and increments
    ``rep.files_uploaded``. *client* is not used until the TODO is implemented.

    Args:
        client: service-role storage client (currently unused).
        storage_loc: canonical location, bucket id first.
        data: file contents.
        force: logged only for now; meant to bypass skip-if-exists.
        rep: report whose ``files_uploaded`` counter is incremented.
    """
    # Split on the first "/" only: everything after it is the object path.
    _bucket_name, _sep, _object_path = storage_loc.partition("/")
    # TODO(agent): skip-if-exists + checksum (sha256) unless force; then client.upload_file(bucket, path, data).
    _digest = hashlib.sha256(data).hexdigest()  # computed for the TODO above; not consumed yet
    size = len(data)
    logger.info(f"[upload] {storage_loc} ({size} bytes) force={force}")
    rep.files_uploaded = rep.files_uploaded + 1
def upsert_specification(spec: Dict[str, Any], storage_loc: Optional[str], rep: LoadReport) -> None:
    """Build the ``eb_specifications`` row for *spec* and count it in *rep*.

    Scaffold: the row is assembled and logged, but not written to the database
    yet (see the TODO below).

    Args:
        spec: one specification entry from the manifest. ``spec_code`` and
            ``exam_board_code`` are required; the other keys are optional.
        storage_loc: canonical location of the specification PDF, or ``None``
            when the manifest entry has no ``spec_file``.
        rep: report whose ``specs_upserted`` counter is incremented.

    Raises:
        KeyError: if ``spec_code`` or ``exam_board_code`` is missing from *spec*.
    """
    # `spec_file` may be absent or explicitly null in the YAML; `.get("spec_file", {})`
    # would return None for the null case and break the provenance lookup.
    spec_file = spec.get("spec_file") or {}
    row = {
        "spec_code": spec["spec_code"], "exam_board_code": spec["exam_board_code"],
        "award_code": spec.get("award_code"), "subject_code": spec.get("subject_code"),
        "first_teach": spec.get("first_teach"), "spec_ver": spec.get("spec_ver"),
        "storage_loc": storage_loc, "doc_type": "pdf",
        "doc_details": {"award_level": spec.get("award_level"),
                        "provenance": spec_file.get("provenance")},
    }
    # TODO(agent): upsert into eb_specifications on conflict (spec_code).
    logger.info(f"[spec] upsert {row['spec_code']}")
    rep.specs_upserted += 1
def upsert_paper(spec_code: str, p: Dict[str, Any], storage_loc: str, rep: LoadReport) -> None:
    """Build the ``eb_exams`` row for paper *p* and count it in *rep*.

    Scaffold: the row is assembled and logged, but not written to the database
    yet (see the TODO below).

    Args:
        spec_code: code of the specification the paper belongs to.
        p: one paper entry from the manifest. ``exam_code`` and ``doc_type``
            are required; the other keys are optional.
        storage_loc: canonical location of the paper's PDF.
        rep: report whose ``papers_upserted`` counter is incremented.

    Raises:
        KeyError: if ``exam_code`` or ``doc_type`` is missing from *p*.
    """
    # `file` may be absent or explicitly null in the YAML; `.get("file", {})`
    # would return None for the null case and break the lookups below.
    file_info = p.get("file") or {}
    row = {
        "exam_code": p["exam_code"], "spec_code": spec_code, "paper_code": p.get("paper_code"),
        "tier": p.get("tier"), "session": p.get("session"), "type_code": p.get("type_code"),
        "doc_type": p["doc_type"], "storage_loc": storage_loc,
        "doc_details": {"original_name": file_info.get("original_name"),
                        "provenance": file_info.get("provenance")},
    }
    # TODO(agent): upsert into eb_exams on conflict (exam_code).
    logger.info(f"[paper] upsert {row['exam_code']} doc_type={row['doc_type']}")
    rep.papers_upserted += 1
def copy_user_test_subset(m: Dict[str, Any], rep: LoadReport) -> None:
    """Placeholder for seeding a user-side test subset; currently only logs a TODO line.

    Neither *m* (the parsed manifest) nor *rep* is read or modified yet.

    TODO(agent): copy a small subset of admin papers into a test user's exam space
    (cc.users/{user_id}/exam-marker/... or cc.institutes/...) + create user exam_templates rows,
    so user-side flows (upload-as-exam / promote-from-cabinet / mark) are testable.
    """
    logger.info("[user-subset] TODO: seed user test papers from admin subset")
def first_sweep(m: Dict[str, Any], rep: LoadReport) -> None:
    """Placeholder for the auto-map first pass; currently only logs a TODO line.

    Neither *m* (the parsed manifest) nor *rep* is read or modified yet.

    TODO(agent): run the docling/auto_map first pass over seeded papers to populate
    exam_templates/questions/regions/layout structure as part of seeding (calls
    api.services.docling.auto_map + the /auto-map upsert mapper path).
    """
    logger.info("[first-sweep] TODO: run auto-map first pass on seeded papers")
# ─────────────────────────────── orchestration ───────────────────────────────
def load(manifest_path: str, *, dry_run: bool, force: bool, board_filter: Optional[str],
         spec_filter: Optional[str], user_subset: bool, do_first_sweep: bool) -> LoadReport:
    """Run the corpus load described by the manifest at *manifest_path*.

    Steps: parse the YAML manifest, validate it, then for each selected board and
    specification upload the spec file (if any), upsert the specification, and
    upload + upsert each paper. The optional user-subset and first-sweep steps
    run last.

    Args:
        manifest_path: path of the YAML manifest.
        dry_run: when true, no storage client is created and no upload/upsert
            step runs; canonical paths are still computed. A dry run also keeps
            going after validation errors, while a real run returns immediately.
        force: forwarded to ``upload_file``.
        board_filter: if set, only boards with this exact ``exam_board_code``.
        spec_filter: if set, only specifications with this exact ``spec_code``.
        user_subset: run ``copy_user_test_subset`` afterwards (never on dry run).
        do_first_sweep: run ``first_sweep`` afterwards (never on dry run).

    Returns:
        The ``LoadReport`` with counters and any validation errors.
    """
    # Explicit encoding so parsing does not depend on the platform default.
    with open(manifest_path, encoding="utf-8") as f:
        # safe_load returns None for an empty document; treat it as an empty manifest.
        m = yaml.safe_load(f) or {}
    rep = LoadReport()
    errs = validate_manifest(m)
    if errs:
        rep.errors = errs
        logger.error(f"manifest validation failed: {len(errs)} error(s)")
        for e in errs[:20]:
            logger.error(f"  - {e}")
        if not dry_run:
            return rep
    client = None if dry_run else SupabaseServiceRoleClient()
    # `x.get(key) or []` rather than `x.get(key, [])`: a YAML key with no value is None.
    for board in m.get("boards") or []:
        if board_filter and board.get("exam_board_code") != board_filter:
            continue
        for spec in board.get("specifications") or []:
            if spec_filter and spec.get("spec_code") != spec_filter:
                continue
            sloc = None
            sf = spec.get("spec_file")
            if sf:
                sloc = spec_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                        spec.get("award_code", ""), spec.get("spec_ver", ""))
                if not dry_run:
                    upload_file(client, sloc, _resolve_source_bytes(sf["source"]), force=force, rep=rep)
            if not dry_run:
                # upsert_specification indexes spec["exam_board_code"], but the code lives on
                # the board in this walk; inherit it when the spec does not repeat it.
                spec_row = spec
                if "exam_board_code" not in spec and "exam_board_code" in board:
                    spec_row = {**spec, "exam_board_code": board["exam_board_code"]}
                upsert_specification(spec_row, sloc, rep)
            for p in spec.get("papers") or []:
                ploc = paper_storage_loc(board["exam_board_code"], spec.get("subject_code", ""),
                                         spec.get("award_code", ""), p["paper_code"], p["session"], p["doc_type"])
                if not dry_run:
                    upload_file(client, ploc, _resolve_source_bytes(p["file"]["source"]), force=force, rep=rep)
                    upsert_paper(spec["spec_code"], p, ploc, rep)
    if user_subset and not dry_run:
        copy_user_test_subset(m, rep)
    if do_first_sweep and not dry_run:
        first_sweep(m, rep)
    logger.info(f"corpus load done: specs={rep.specs_upserted} papers={rep.papers_upserted} "
                f"uploaded={rep.files_uploaded} skipped={rep.files_skipped} errors={len(rep.errors)}")
    return rep
def main() -> None:
    """Command-line entry point: parse the arguments and run ``load``.

    Raises:
        SystemExit: with status 1 when the returned report contains errors, so
            that shells and CI jobs can detect a failed (or invalid dry-run) load.
    """
    ap = argparse.ArgumentParser(description="Seed the public exam-paper corpus from a manifest.")
    ap.add_argument("--manifest", required=True)
    ap.add_argument("--dry-run", action="store_true", help="validate + report, no writes")
    ap.add_argument("--force", action="store_true", help="re-upload/overwrite existing storage objects")
    ap.add_argument("--board", default=None, help="only this exam_board_code")
    ap.add_argument("--spec", default=None, help="only this spec_code")
    ap.add_argument("--user-subset", action="store_true", help="also seed a user-side test subset")
    ap.add_argument("--first-sweep", action="store_true", help="run docling/auto-map first pass on seeded papers")
    a = ap.parse_args()
    rep = load(a.manifest, dry_run=a.dry_run, force=a.force, board_filter=a.board, spec_filter=a.spec,
               user_subset=a.user_subset, do_first_sweep=a.first_sweep)
    # Previously the report was discarded and the process always exited 0.
    if rep.errors:
        raise SystemExit(1)
# Run the CLI only when this module is executed as a script (e.g. `python3 -m ...`), not on import.
if __name__ == "__main__":
    main()