feat(seed): wire exam-corpus mode into the init entrypoint (gated)

Add 'exam-corpus' INIT_MODE: docker-entrypoint.sh case -> main.py --mode
exam-corpus -> run_exam_corpus_mode() -> seed_exam_corpus.load(). Driven by
EXAM_CORPUS_MANIFEST (+ DRY_RUN/FORCE/BOARD/SPEC/USER_SUBSET/FIRST_SWEEP env).
Skips gracefully (success) when no manifest is configured, so it is safe in a
comma list like INIT_MODE=infra,seed,exam-corpus before papers are gathered.
Bucket provisioning stays in infra mode.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
CC Worker 2026-06-07 22:26:58 +00:00
parent 9aabc12062
commit d8cf3bbc62
2 changed files with 61 additions and 2 deletions

View File

@ -75,6 +75,14 @@ if [ "$RUN_INIT" = "true" ]; then
}
print_success "GAIS data import completed"
;;
"exam-corpus")
print_status "Seeding exam-paper corpus (manifest-gated; skips if none configured)..."
python3 main.py --mode exam-corpus || {
print_error "Exam corpus seed failed!"
exit 1
}
print_success "Exam corpus seed completed"
;;
"full")
print_status "Running full initialization..."
python3 main.py --mode infra || exit 1

53
main.py
View File

@ -323,6 +323,52 @@ def run_gais_data_mode():
# Old clear_dev_redis_queue function removed - now handled by Redis Manager
def run_exam_corpus_mode():
    """Seed the public exam-paper corpus from a manifest (optional, gated).

    Env controls:
        EXAM_CORPUS_MANIFEST    - path to the corpus manifest (required to do anything)
        EXAM_CORPUS_DRY_RUN     - 'true' to validate + report only
        EXAM_CORPUS_FORCE       - 'true' to re-upload/overwrite existing objects
        EXAM_CORPUS_BOARD/_SPEC - filter to one exam_board_code / spec_code
        EXAM_CORPUS_USER_SUBSET - 'true' to also seed a user-side test subset
        EXAM_CORPUS_FIRST_SWEEP - 'true' to run the docling/auto-map first pass

    Skips gracefully (success) when no manifest is configured/present, so it is
    safe in a comma-mode list (e.g. INIT_MODE=infra,seed,exam-corpus) before
    papers exist.

    Buckets are NOT created here; infra mode (buckets.py) owns provisioning.

    Returns:
        True when the seed completed without reported errors, or when it was
        skipped because no manifest is available. False when the loader's
        report contains errors or when any exception was raised.
    """
    logger.info("Running in exam-corpus seed mode")

    manifest = os.getenv("EXAM_CORPUS_MANIFEST")
    # An unset/empty variable and a path that does not exist are both treated
    # as "nothing to seed yet": report success so a multi-mode init run goes on.
    if not manifest or not os.path.exists(manifest):
        logger.warning(
            f"exam-corpus: no manifest at EXAM_CORPUS_MANIFEST={manifest!r}; skipping (nothing to seed yet)"
        )
        return True

    try:
        # The import lives inside the try block so that a failure to import the
        # seeder is reported as a failed seed (False) instead of propagating.
        from run.initialization.seed_exam_corpus import load

        rep = load(
            manifest,
            dry_run=_truthy_env("EXAM_CORPUS_DRY_RUN"),
            force=_truthy_env("EXAM_CORPUS_FORCE"),
            # Empty-string filters are normalised to None (no filter).
            board_filter=os.getenv("EXAM_CORPUS_BOARD") or None,
            spec_filter=os.getenv("EXAM_CORPUS_SPEC") or None,
            user_subset=_truthy_env("EXAM_CORPUS_USER_SUBSET"),
            do_first_sweep=_truthy_env("EXAM_CORPUS_FIRST_SWEEP"),
        )

        if rep.errors:
            logger.error(f"exam-corpus seed completed with {len(rep.errors)} error(s)")
            return False

        logger.info(
            f"exam-corpus seed ok: specs={rep.specs_upserted} papers={rep.papers_upserted} "
            f"uploaded={rep.files_uploaded}"
        )
        return True
    except Exception as e:
        # Top-level boundary for this mode: convert any failure into a False
        # result, but log with the traceback so the root cause is not lost.
        logger.exception(f"exam-corpus seed failed: {e}")
        return False
def run_development_mode():
"""Run the server in development mode with auto-reload"""
logger.info("Running in development mode")
@ -411,7 +457,7 @@ Startup modes:
parser.add_argument(
'--mode', '-m',
choices=['infra', 'seed', 'seed-test', 'gais-data', 'dev', 'prod'],
choices=['infra', 'seed', 'seed-test', 'gais-data', 'exam-corpus', 'dev', 'prod'],
default='dev',
help='Startup mode (default: dev)'
)
@ -447,6 +493,11 @@ if __name__ == "__main__":
success = run_gais_data_mode()
sys.exit(0 if success else 1)
elif args.mode == 'exam-corpus':
# Seed the public exam-paper corpus from a manifest (gated; skips if none configured)
success = run_exam_corpus_mode()
sys.exit(0 if success else 1)
elif args.mode == 'dev':
# Run development server
run_development_mode()