diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index fd2f367..913e546 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -75,6 +75,14 @@ if [ "$RUN_INIT" = "true" ]; then
                 }
                 print_success "GAIS data import completed"
                 ;;
+            "exam-corpus")
+                print_status "Seeding exam-paper corpus (manifest-gated; skips if none configured)..."
+                python3 main.py --mode exam-corpus || {
+                    print_error "Exam corpus seed failed!"
+                    exit 1
+                }
+                print_success "Exam corpus seed completed"
+                ;;
             "full")
                 print_status "Running full initialization..."
                 python3 main.py --mode infra || exit 1
diff --git a/main.py b/main.py
index 58ef975..bc113bc 100644
--- a/main.py
+++ b/main.py
@@ -323,6 +323,52 @@ def run_gais_data_mode():
 # Old clear_dev_redis_queue function removed - now handled by Redis Manager
 
 
+def run_exam_corpus_mode():
+    """Seed the public exam-paper corpus from a manifest (optional, gated).
+
+    Env controls:
+      EXAM_CORPUS_MANIFEST - path to the corpus manifest (required to do anything)
+      EXAM_CORPUS_DRY_RUN - 'true' to validate + report only
+      EXAM_CORPUS_FORCE - 'true' to re-upload/overwrite existing objects
+      EXAM_CORPUS_BOARD/_SPEC - filter to one exam_board_code / spec_code
+      EXAM_CORPUS_USER_SUBSET - 'true' to also seed a user-side test subset
+      EXAM_CORPUS_FIRST_SWEEP - 'true' to run the docling/auto-map first pass
+
+    Skips gracefully (success) when no manifest is configured/present, so it is safe
+    in a comma-mode list (e.g. INIT_MODE=infra,seed,exam-corpus) before papers exist.
+    Buckets are NOT created here — infra mode (buckets.py) owns provisioning.
+    """
+    logger.info("Running in exam-corpus seed mode")
+    manifest = os.getenv("EXAM_CORPUS_MANIFEST")
+    if not manifest or not os.path.exists(manifest):
+        logger.warning(
+            f"exam-corpus: no manifest at EXAM_CORPUS_MANIFEST={manifest!r}; skipping (nothing to seed yet)"
+        )
+        return True  # an unset or missing manifest is reported as success, not failure
+    try:
+        from run.initialization.seed_exam_corpus import load  # inside try: an import error is a seed failure
+        rep = load(
+            manifest,
+            dry_run=_truthy_env("EXAM_CORPUS_DRY_RUN"),
+            force=_truthy_env("EXAM_CORPUS_FORCE"),
+            board_filter=os.getenv("EXAM_CORPUS_BOARD") or None,  # empty string counts as unset
+            spec_filter=os.getenv("EXAM_CORPUS_SPEC") or None,  # empty string counts as unset
+            user_subset=_truthy_env("EXAM_CORPUS_USER_SUBSET"),
+            do_first_sweep=_truthy_env("EXAM_CORPUS_FIRST_SWEEP"),
+        )
+        if rep.errors:
+            logger.error(f"exam-corpus seed completed with {len(rep.errors)} error(s)")
+            return False  # any reported error makes the mode exit non-zero
+        logger.info(
+            f"exam-corpus seed ok: specs={rep.specs_upserted} papers={rep.papers_upserted} "
+            f"uploaded={rep.files_uploaded}"
+        )
+        return True
+    except Exception as e:  # top-level mode boundary: report and signal failure to the caller
+        logger.exception(f"exam-corpus seed failed: {e}")  # exception() keeps the traceback
+        return False
+
+
+
 def run_development_mode():
     """Run the server in development mode with auto-reload"""
     logger.info("Running in development mode")
@@ -411,7 +457,7 @@ Startup modes:
 
     parser.add_argument(
         '--mode', '-m',
-        choices=['infra', 'seed', 'seed-test', 'gais-data', 'dev', 'prod'],
+        choices=['infra', 'seed', 'seed-test', 'gais-data', 'exam-corpus', 'dev', 'prod'],
         default='dev',
         help='Startup mode (default: dev)'
     )
@@ -446,7 +492,12 @@ if __name__ == "__main__":
         # Run GAIS data import
         success = run_gais_data_mode()
         sys.exit(0 if success else 1)
-        
+
+    elif args.mode == 'exam-corpus':
+        # Seed the public exam-paper corpus from a manifest (gated; skips if none configured)
+        success = run_exam_corpus_mode()
+        sys.exit(0 if success else 1)
+
     elif args.mode == 'dev':
         # Run development server
         run_development_mode()