api/run/initialization/reset_environment.py
CC Worker a6753d092f
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
fix(reset): fold --user-subset cleanup into scope=all and scope=exam-corpus
t_d1600327 added a standalone scope=user-subset, but a full reset (scope=all)
and scope=exam-corpus still left the --user-subset cc.users storage objects
orphaned (files rows are wiped by the table clear, but the Storage API objects
are not). Call the same _clear_user_subset_files() helper in both paths so the
finding-#2 gap is fully closed: storage removed before rows, idempotent.

Closes overwatch review finding #2 (user-subset not cleaned by reset).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-08 00:26:24 +00:00

430 lines
19 KiB
Python

"""
reset_environment.py — DESTRUCTIVE wipe of all non-permanent data.
Clears:
- Neo4j: drops ALL databases except system, neo4j (including gaisdata, cc.users.*, cc.institutes.*)
- Supabase: deletes ALL data tables except gais_local_authorities and gais_schools
- Supabase: deletes all auth users except kcar, then re-seeds kcar profile state
- Granular scopes can clear exam corpus, timetable data, or --user-subset seed copies
Safe invariants (never touched):
- kcar auth account
- gais_local_authorities and gais_schools Supabase tables
- system / neo4j Neo4j system databases
Run from inside the ccapi container:
python3 -c "from run.initialization.reset_environment import reset; reset()"
"""
import os
import time
import requests
from typing import List, Dict, Any
from modules.logger_tool import initialise_logger
import modules.database.tools.neo4j_driver_tools as dt
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), "default", True)
# Identity of the single account that survives a full reset: the auth-user
# deletion loop skips KCAR_EMAIL, and the profile cleanup/restore steps are
# keyed on KCAR_ID.
KCAR_ID = "d9e1d1a9-04c4-4611-bb05-57babf4a9a28"
KCAR_EMAIL = "kcar@kevlarai.com"
# Neo4j system databases — never drop these
NEO4J_SYSTEM_DBS = {"system", "neo4j"}
# Supabase tables to clear, in FK child-first order.
# gais_local_authorities and gais_schools are intentionally absent.
# The list is walked top to bottom by the full reset, so a child table must
# appear before the parent it references.
SUPABASE_TABLES_TO_CLEAR = [
    # ── Transcription (deepest children first) ───────────────────────────────
    "canvas_events",
    "keyword_events",
    "transcription_summaries",
    "transcription_segments",
    "keyword_watches",
    "transcription_sessions",
    # ── Lesson delivery chain ────────────────────────────────────────────────
    "lesson_deliveries",
    "lesson_collaborators",
    # ── Timetable materialization ────────────────────────────────────────────
    "taught_lessons",
    # ── Academic calendar (children → parents) ───────────────────────────────
    "academic_periods",
    "academic_days",
    "academic_weeks",
    "academic_term_breaks",
    "academic_terms",
    "academic_years",
    # ── Teacher timetables ───────────────────────────────────────────────────
    "teacher_timetable_slots",
    "teacher_timetables",
    "school_timetables",
    # ── Lesson plans ─────────────────────────────────────────────────────────
    "planned_lessons",
    # ── Whiteboard rooms ─────────────────────────────────────────────────────
    "whiteboard_rooms",
    # ── Classes & enrollment ─────────────────────────────────────────────────
    "enrollment_requests",
    "class_students",
    "class_teachers",
    "classes",
    # ── Files & brains ───────────────────────────────────────────────────────
    "document_artefacts",
    "brain_files",
    "cabinet_memberships",
    "files",
    "file_cabinets",
    "brains",
    # ── Invitations & memberships ────────────────────────────────────────────
    "invitations",
    "institute_memberships",
    "institute_membership_requests",
    # ── Institutes ───────────────────────────────────────────────────────────
    "institutes",
    # ── Profiles (non-kcar cleared separately via auth deletion cascade) ─────
    # admin_profiles is wiped here and the kcar row is re-created by reset().
    "admin_profiles",
]
# Exam-marker subsystem tables, FK child-first. scope="exam-corpus" is deliberately
# broader than "public papers": it wipes public corpus eb_* rows, templates, layouts,
# questions, boundaries, response areas, marking batches, student submissions, and mark
# entries. These tables are NOT part of SUPABASE_TABLES_TO_CLEAR — the previous full
# reset() never cleared exam data or storage at all; reset() now clears this list for
# both scope="exam-corpus" and scope="all".
EXAM_CORPUS_TABLES = [
    "mark_entries",
    "student_submissions",
    "marking_batches",
    "exam_response_areas",
    "exam_boundaries",
    "exam_template_layout",
    "exam_questions",
    "exam_templates",
    "eb_exams",
    "eb_specifications",
]
# Timetable / calendar materialization subset (for scope='timetable').
# Same entries, in the same child-first order, as the lesson-delivery through
# lesson-plan sections of SUPABASE_TABLES_TO_CLEAR — keep the two lists in sync.
TIMETABLE_TABLES = [
    "lesson_deliveries",
    "lesson_collaborators",
    "taught_lessons",
    "academic_periods",
    "academic_days",
    "academic_weeks",
    "academic_term_breaks",
    "academic_terms",
    "academic_years",
    "teacher_timetable_slots",
    "teacher_timetables",
    "school_timetables",
    "planned_lessons",
]
# Bucket whose objects scope="exam-corpus" clears for the whole exam-marker subsystem
# (Storage API — protect_delete blocks raw SQL).
# NOTE(review): no code visible in this module reads this constant;
# _clear_exam_storage() takes the bucket name from the first path segment of each
# storage_loc instead — confirm whether this is used elsewhere.
EXAM_STORAGE_BUCKET = "cc.examboards"
def _sb_headers():
    """Build the Supabase base URL and the service-role request headers.

    Both values come from the environment (SUPABASE_URL, SERVICE_ROLE_KEY);
    a missing variable raises KeyError.

    Returns:
        A ``(url, headers)`` pair. The headers authenticate with the service
        role key and ask PostgREST for minimal response bodies.
    """
    env = os.environ
    base_url = env["SUPABASE_URL"]
    service_key = env["SERVICE_ROLE_KEY"]

    request_headers = {"apikey": service_key}
    request_headers["Authorization"] = "Bearer " + service_key
    request_headers["Content-Type"] = "application/json"
    request_headers["Prefer"] = "return=minimal"
    return base_url, request_headers
# Substrings that mark a Supabase URL as a production target. A destructive reset
# against any of them is refused by default (project rule: ".94 only; .156 human-gated");
# setting RESET_ALLOW_PROD=1 lifts the refusal.
PROD_TARGET_MARKERS = ("192.168.0.156", "supabase.classroomcopilot")


def _assert_reset_allowed(url: str, scope: str) -> None:
    """Refuse a destructive reset when the Supabase target looks like production.

    The /admin/reset route and this module both act on os.environ['SUPABASE_URL'];
    without this check a platform-admin call on a prod-deployed API would wipe
    prod data, the exam corpus and storage.

    Args:
        url: Supabase base URL about to be reset; matched case-insensitively
            against PROD_TARGET_MARKERS. None/empty never matches.
        scope: Reset scope, used only in the error message.

    Raises:
        RuntimeError: the URL contains a production marker and RESET_ALLOW_PROD
            is not set to 1/true/yes.
    """
    normalized = url.lower() if url else ""

    # An explicit opt-in short-circuits the marker scan entirely.
    opt_in = os.environ.get("RESET_ALLOW_PROD", "").strip().lower()
    if opt_in in ("1", "true", "yes"):
        return

    for marker in PROD_TARGET_MARKERS:
        if marker in normalized:
            raise RuntimeError(
                f"refusing destructive reset (scope={scope}) against production-looking target {normalized!r}; "
                f"this is human-gated — set RESET_ALLOW_PROD=1 to override."
            )
# ─── Neo4j helpers ────────────────────────────────────────────────────────────
def _neo4j_drop_all_non_system() -> List[str]:
    """Drop every Neo4j DB except the system-reserved ones.

    Lists databases through the ``system`` database, skips the names in
    NEO4J_SYSTEM_DBS, and issues ``DROP DATABASE ... IF EXISTS`` for each
    remaining one. A failure on one database is logged and does not stop
    the others.

    Returns:
        Names of the databases whose DROP statement ran without raising,
        in listing order.
    """
    with dt.get_session(database="system") as s:
        all_dbs = [r["name"] for r in s.run("SHOW DATABASES YIELD name RETURN name")]

    # SHOW DATABASES can return the same name more than once (one row per
    # server in a cluster); de-duplicate so each database is dropped and
    # reported exactly once.
    to_drop = [db for db in dict.fromkeys(all_dbs) if db not in NEO4J_SYSTEM_DBS]

    dropped: List[str] = []
    for db in to_drop:
        logger.info(f" DROP DATABASE `{db}`")
        try:
            with dt.get_session(database="system") as s:
                s.run(f"DROP DATABASE `{db}` IF EXISTS")
            dropped.append(db)
        except Exception as e:
            # Best effort: keep going so one stuck database does not block the wipe.
            logger.warning(f" Could not drop `{db}`: {e}")
    return dropped
# ─── Supabase helpers ─────────────────────────────────────────────────────────
# Tables without an `id` column — map to the column to use as the delete filter.
# Any table not listed here is filtered on "id".
TABLE_FILTER_COLUMN = {
    "brain_files": "brain_id",
}
def _sb_clear_table(url: str, headers: dict, table: str) -> int:
    """Issue one REST DELETE that removes every row of *table*.

    The request is filtered with ``<column>=not.is.null``; the column is
    looked up in TABLE_FILTER_COLUMN and falls back to ``id``.

    Args:
        url: Supabase base URL.
        headers: Request headers carrying the service-role credentials.
        table: Name of the table to empty.

    Returns:
        The HTTP status code of the DELETE. Anything other than 200/204 is
        also logged as a warning with the start of the response body.
    """
    filter_column = TABLE_FILTER_COLUMN[table] if table in TABLE_FILTER_COLUMN else "id"
    endpoint = f"{url}/rest/v1/{table}"
    response = requests.delete(endpoint, headers=headers, params={filter_column: "not.is.null"})

    status = response.status_code
    if status != 200 and status != 204:
        logger.warning(f" Clear {table}: {status} {response.text[:120]}")
    return status
def _supabase_list_auth_users(url: str, headers: dict) -> List[Dict]:
    """Return every auth user from the Supabase admin API, following pagination.

    The previous implementation fetched a single page of 200 users, so a
    project with more accounts than that was only partially wiped. This
    version requests successive pages until a short page is returned.

    Args:
        url: Supabase base URL.
        headers: Request headers carrying the service-role credentials.

    Returns:
        All user objects, in the order the API returned them.

    Raises:
        requests.HTTPError: any page request returned an error status.
    """
    per_page = 200
    users: List[Dict] = []
    seen_ids = set()
    page = 1
    while True:
        r = requests.get(
            f"{url}/auth/v1/admin/users",
            headers=headers,
            params={"page": page, "per_page": per_page},
        )
        r.raise_for_status()
        batch = r.json().get("users", [])

        fresh = [u for u in batch if u.get("id") not in seen_ids]
        users.extend(fresh)
        seen_ids.update(u.get("id") for u in fresh)

        # Stop on a short page, or when a page brings nothing new — the latter
        # prevents an endless loop if the server ignores the `page` parameter.
        if len(batch) < per_page or not fresh:
            break
        page += 1
    return users
def _supabase_delete_auth_user(url: str, headers: dict, uid: str):
    """Delete one auth user through the Supabase admin API.

    Best effort: a non-200/204 response is logged as a warning and nothing
    is raised or returned, so callers cannot tell success from failure.

    Args:
        url: Supabase base URL.
        headers: Request headers carrying the service-role credentials.
        uid: Id of the auth user to delete.
    """
    endpoint = f"{url}/auth/v1/admin/users/{uid}"
    resp = requests.delete(endpoint, headers=headers)
    if resp.status_code in (200, 204):
        return
    logger.warning(f" Delete auth user {uid}: {resp.status_code} {resp.text[:80]}")
# ─── Granular helpers ───────────────────────────────────────────────────────────
def _clear_tables(url: str, headers: dict, tables: List[str]) -> "tuple[List[str], List[str]]":
    """Empty each table in *tables*, in the order given.

    Args:
        url: Supabase base URL.
        headers: Request headers carrying the service-role credentials.
        tables: Table names, expected in FK child-first order.

    Returns:
        ``(cleared, failed)`` — the names whose DELETE answered 200/204 and
        the names that answered anything else, each in input order.
    """
    outcome = {True: [], False: []}
    for name in tables:
        succeeded = _sb_clear_table(url, headers, name) in (200, 204)
        outcome[succeeded].append(name)
        if succeeded:
            # Failures are already reported by _sb_clear_table itself.
            logger.info(f"{name}")
    return outcome[True], outcome[False]
def _clear_exam_storage() -> Dict[str, Any]:
    """Delete the storage objects referenced by the exam-board corpus rows.

    Reads ``storage_loc`` from eb_exams and eb_specifications, so it has to run
    BEFORE those tables are cleared. Each location is split at its first "/"
    into a bucket name and an object path; locations without a "/" are ignored.
    Objects are then removed through the Storage API in batches of 100 per
    bucket. Every step is best effort: a failing query or batch is logged and
    skipped.

    Returns:
        ``{"removed": <count>, "buckets": [<bucket>, ...]}`` where the count is
        the number of paths in batches whose remove call did not raise, or
        ``{"removed": 0, "error": <message>}`` when the Supabase helper
        modules cannot be imported.
    """
    try:
        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
        from modules.database.supabase.utils.storage import StorageAdmin
    except Exception as exc:
        logger.warning(f" exam storage clear skipped (import): {exc}")
        return {"removed": 0, "error": str(exc)}

    db_client = SupabaseServiceRoleClient().supabase
    storage_admin = StorageAdmin()

    # Step 1: collect every non-empty storage_loc from the corpus tables.
    storage_locs: List[str] = []
    for source_table in ("eb_exams", "eb_specifications"):
        try:
            fetched = db_client.table(source_table).select("storage_loc").execute().data or []
            storage_locs.extend([row["storage_loc"] for row in fetched if row.get("storage_loc")])
        except Exception as exc:
            logger.warning(f" storage_loc gather {source_table}: {exc}")

    # Step 2: group object paths under the bucket named by the first segment.
    paths_by_bucket: Dict[str, List[str]] = {}
    for loc in storage_locs:
        if "/" not in loc:
            continue
        bucket, _sep, object_path = loc.partition("/")
        paths_by_bucket.setdefault(bucket, []).append(object_path)

    # Step 3: remove the objects, 100 paths per Storage API call.
    removed_count = 0
    for bucket, object_paths in paths_by_bucket.items():
        offset = 0
        while offset < len(object_paths):
            batch = object_paths[offset:offset + 100]
            offset += 100
            try:
                storage_admin.client.supabase.storage.from_(bucket).remove(batch)
            except Exception as exc:
                logger.warning(f" storage remove {bucket}: {exc}")
            else:
                removed_count += len(batch)

    bucket_names = list(paths_by_bucket)
    logger.info(f" exam storage removed {removed_count} objects from {bucket_names}")
    return {"removed": removed_count, "buckets": bucket_names}
def _clear_user_subset_files() -> Dict[str, Any]:
    """Remove the files rows and cc.users storage objects made by --user-subset seeding.

    Delegates to the seeder's own ``_delete_user_subset_files`` so that a reset
    has the same storage-before-row deletion order and idempotency guarantees as
    ``seed_exam_corpus.py --unseed``. The helper only targets rows marked by the
    seeder: bucket='cc.users', source='exam-corpus-seed', path LIKE 'exam-marker/%'.
    ``exam_codes=None`` is passed, i.e. no restriction by exam code.

    Returns:
        A dict with ``files_rows_deleted``, ``storage_objects_removed`` and
        ``errors``, copied from the seeder's LoadReport. When the Supabase or
        seeder modules cannot be imported, both counts are 0 and ``errors``
        holds the import error message.
    """
    try:
        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
        from modules.database.supabase.utils.storage import StorageAdmin
        from run.initialization.seed_exam_corpus import LoadReport, _delete_user_subset_files
    except Exception as exc:
        logger.warning(f" user-subset clear skipped (import): {exc}")
        return {"files_rows_deleted": 0, "storage_objects_removed": 0, "errors": [str(exc)]}

    # The report object is filled in by the seeder helper as it deletes.
    report = LoadReport()
    db_client = SupabaseServiceRoleClient()
    storage_admin = StorageAdmin()
    _delete_user_subset_files(db_client, storage_admin, exam_codes=None, rep=report)

    summary: Dict[str, Any] = {}
    summary["files_rows_deleted"] = report.unseed_user_files
    summary["storage_objects_removed"] = report.unseed_objects
    summary["errors"] = report.errors
    return summary
# ─── Main reset ───────────────────────────────────────────────────────────────
def reset(scope: str = "all") -> Dict[str, Any]:
    """Destructive reset. scope ∈ {all, exam-corpus, timetable, user-subset}.

    - all         : full wipe (Neo4j + Supabase data + auth users) AND the entire
                    exam-marker subsystem listed below, including --user-subset copies.
    - exam-corpus : ONLY the entire exam-marker subsystem, not just public papers:
                    public corpus/eb_* data, cc.examboards storage objects, exam
                    templates, template layouts, questions, boundaries, response
                    areas, marking batches, student submissions, mark entries, and
                    --user-subset cc.users copies.
    - timetable   : ONLY timetable/calendar materialization tables.
    - user-subset : ONLY files rows and cc.users storage objects created by
                    seed_exam_corpus.py --user-subset.

    Args:
        scope: One of the values above, case-insensitive; None or "" means "all".

    Returns:
        A dict that always contains ``scope``. The granular scopes add
        ``tables_cleared`` / ``tables_failed`` and/or ``user_subset`` /
        ``exam_storage``; scope="all" adds ``neo4j``, ``user_subset``,
        ``supabase`` and ``exam`` sections.

    Raises:
        ValueError: *scope* is not a recognised value.
        KeyError: SUPABASE_URL or SERVICE_ROLE_KEY is missing from the environment.
        RuntimeError: the Supabase target looks like production and
            RESET_ALLOW_PROD is not set.
    """
    scope = (scope or "all").lower()
    if scope not in ("all", "exam-corpus", "timetable", "user-subset"):
        raise ValueError(f"invalid scope {scope!r} (want all|exam-corpus|timetable|user-subset)")
    url, headers = _sb_headers()
    _assert_reset_allowed(url, scope)

    if scope == "exam-corpus":
        logger.info("RESET (scope=exam-corpus) — entire exam-marker subsystem: public corpus/eb_* data, cc.examboards storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries, and --user-subset copies")
        # Storage objects first: their locations live in the rows cleared afterwards.
        user_subset = _clear_user_subset_files()
        storage = _clear_exam_storage()
        cleared, failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)
        return {"scope": scope, "user_subset": user_subset, "exam_storage": storage, "tables_cleared": cleared, "tables_failed": failed}

    if scope == "timetable":
        logger.info("RESET (scope=timetable) — timetable/calendar tables")
        cleared, failed = _clear_tables(url, headers, TIMETABLE_TABLES)
        return {"scope": scope, "tables_cleared": cleared, "tables_failed": failed}

    if scope == "user-subset":
        logger.info("RESET (scope=user-subset) — --user-subset cc.users storage objects and files rows")
        user_subset = _clear_user_subset_files()
        return {"scope": scope, "user_subset": user_subset}

    logger.info("=" * 60)
    logger.info("RESET ENVIRONMENT — full destructive wipe starting")
    logger.info("=" * 60)
    results: Dict[str, Any] = {"scope": scope}

    # ── 1. Neo4j: drop everything except system + neo4j ──────────────────────
    logger.info("\n[Neo4j] Dropping all non-system databases...")
    dropped = _neo4j_drop_all_non_system()
    logger.info(f" Dropped {len(dropped)}: {dropped}")
    results["neo4j"] = {"dropped": dropped}

    # ── 2. Supabase: clear all data tables (GAIS preserved) ──────────────────
    # First remove --user-subset cc.users storage objects (+ their files rows) via the
    # Storage API, so the generic files-table clear below doesn't strand orphaned objects.
    results["user_subset"] = _clear_user_subset_files()
    logger.info("\n[Supabase] Clearing data tables (preserving gais_*)...")
    # Same per-table loop as the granular scopes; url/headers from above are reused.
    cleared, failed = _clear_tables(url, headers, SUPABASE_TABLES_TO_CLEAR)
    logger.info(f" Cleared {len(cleared)} tables, {len(failed)} failed")

    # ── 3. Supabase: delete all auth users except kcar ────────────────────────
    logger.info("\n[Supabase] Deleting test auth users...")
    all_users = _supabase_list_auth_users(url, headers)
    deleted_emails = []
    for u in all_users:
        email = u.get("email")
        # Protect kcar by id as well as by email, and tolerate accounts that
        # carry no email at all (a bare u["email"] would abort the wipe midway).
        if email == KCAR_EMAIL or u.get("id") == KCAR_ID:
            continue
        _supabase_delete_auth_user(url, headers, u["id"])
        # NOTE(review): the delete helper only logs failures, so this list
        # records deletions that were attempted, not confirmed.
        deleted_emails.append(email or u["id"])
        time.sleep(0.05)
    logger.info(f" Deleted {len(deleted_emails)} auth users")

    # Explicit cleanup in case cascade didn't fire
    _reset_request_ok(
        requests.delete(f"{url}/rest/v1/profiles", headers=headers,
                        params={"id": f"neq.{KCAR_ID}"}),
        "Delete non-kcar profiles",
    )

    # ── 4. Reset kcar profile to known-good platform_admin state ──────────────
    logger.info("\n[Supabase] Resetting kcar profile...")
    patched = requests.patch(
        f"{url}/rest/v1/profiles",
        headers=headers,
        params={"id": f"eq.{KCAR_ID}"},
        json={"school_id": None},
    )
    # Only claim success when the request actually succeeded.
    if _reset_request_ok(patched, "Reset kcar profile"):
        logger.info(" kcar → school_id: null ✓")

    # Restore admin_profiles row (wiped with other tables above)
    restored = requests.post(
        f"{url}/rest/v1/admin_profiles",
        headers={**headers, "Prefer": "resolution=merge-duplicates"},
        json={
            "id": KCAR_ID,
            "email": KCAR_EMAIL,
            "display_name": "Kevin Carroll",
            "admin_role": "super_admin",
            "is_super_admin": True,
        },
    )
    if _reset_request_ok(restored, "Restore kcar admin_profiles"):
        logger.info(" kcar → admin_profiles restored ✓")

    # ── 5. Exam-marker subsystem: storage objects (Storage API) + all exam tables ──
    # This is the same destructive surface as scope="exam-corpus": public corpus/eb_*
    # rows, cc.examboards storage, templates/layout/questions/boundaries/response
    # areas, marking batches, submissions, and mark entries. (The legacy full reset
    # cleared neither exam tables nor storage — folded in here.)
    logger.info("\n[Supabase] Clearing entire exam-marker subsystem (public corpus, storage, templates/layout/questions/boundaries/response areas, marking batches, submissions, mark entries)...")
    exam_storage = _clear_exam_storage()
    exam_cleared, exam_failed = _clear_tables(url, headers, EXAM_CORPUS_TABLES)

    results["supabase"] = {
        "tables_cleared": cleared,
        "tables_failed": failed,
        "deleted_users": deleted_emails,
    }
    results["exam"] = {
        "storage": exam_storage,
        "tables_cleared": exam_cleared,
        "tables_failed": exam_failed,
    }
    logger.info("\n" + "=" * 60)
    logger.info("RESET COMPLETE")
    logger.info("=" * 60)
    return results


def _reset_request_ok(resp, what: str) -> bool:
    """Return True for a 2xx response; otherwise log a warning naming *what*."""
    if 200 <= resp.status_code < 300:
        return True
    logger.warning(f" {what}: {resp.status_code} {resp.text[:120]}")
    return False
# Script entry point: run reset() with its default scope ("all") and print the
# result dict as indented JSON.
if __name__ == "__main__":
    import json
    # default=str stringifies any value json cannot serialise natively.
    print(json.dumps(reset(), indent=2, default=str))