feat(seed): wire exam-corpus mode into the init entrypoint (gated)
Add 'exam-corpus' INIT_MODE: docker-entrypoint.sh case -> main.py --mode exam-corpus -> run_exam_corpus_mode() -> seed_exam_corpus.load(). Driven by EXAM_CORPUS_MANIFEST (+ DRY_RUN/FORCE/BOARD/SPEC/USER_SUBSET/FIRST_SWEEP env). Skips gracefully (success) when no manifest is configured, so it is safe in a comma list like INIT_MODE=infra,seed,exam-corpus before papers are gathered. Bucket provisioning stays in infra mode. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
parent
9aabc12062
commit
d8cf3bbc62
@ -75,6 +75,14 @@ if [ "$RUN_INIT" = "true" ]; then
|
||||
}
|
||||
print_success "GAIS data import completed"
|
||||
;;
|
||||
"exam-corpus")
|
||||
print_status "Seeding exam-paper corpus (manifest-gated; skips if none configured)..."
|
||||
python3 main.py --mode exam-corpus || {
|
||||
print_error "Exam corpus seed failed!"
|
||||
exit 1
|
||||
}
|
||||
print_success "Exam corpus seed completed"
|
||||
;;
|
||||
"full")
|
||||
print_status "Running full initialization..."
|
||||
python3 main.py --mode infra || exit 1
|
||||
|
||||
55
main.py
55
main.py
@ -323,6 +323,52 @@ def run_gais_data_mode():
|
||||
|
||||
# Old clear_dev_redis_queue function removed - now handled by Redis Manager
|
||||
|
||||
def run_exam_corpus_mode():
    """Seed the public exam-paper corpus from a manifest (optional, gated).

    Env controls:
        EXAM_CORPUS_MANIFEST    - path to the corpus manifest (required to do anything)
        EXAM_CORPUS_DRY_RUN     - 'true' to validate + report only
        EXAM_CORPUS_FORCE       - 'true' to re-upload/overwrite existing objects
        EXAM_CORPUS_BOARD/_SPEC - filter to one exam_board_code / spec_code
        EXAM_CORPUS_USER_SUBSET - 'true' to also seed a user-side test subset
        EXAM_CORPUS_FIRST_SWEEP - 'true' to run the docling/auto-map first pass

    Skips gracefully (success) when no manifest is configured/present, so it is safe
    in a comma-mode list (e.g. INIT_MODE=infra,seed,exam-corpus) before papers exist.
    Buckets are NOT created here — infra mode (buckets.py) owns provisioning.

    Returns:
        True when the seed was skipped (no manifest) or the loader reported no
        errors; False when the loader reported errors or anything raised.
    """
    logger.info("Running in exam-corpus seed mode")

    manifest = os.getenv("EXAM_CORPUS_MANIFEST")
    # Unset variable and missing file are both treated as "nothing to seed yet"
    # so this mode can sit in an INIT_MODE list ahead of the papers being gathered.
    if not manifest or not os.path.exists(manifest):
        logger.warning(
            f"exam-corpus: no manifest at EXAM_CORPUS_MANIFEST={manifest!r}; skipping (nothing to seed yet)"
        )
        return True

    try:
        # Imported lazily, inside the try, so an import failure of the seeder is
        # reported as a failed seed rather than crashing the entrypoint.
        from run.initialization.seed_exam_corpus import load

        rep = load(
            manifest,
            dry_run=_truthy_env("EXAM_CORPUS_DRY_RUN"),
            force=_truthy_env("EXAM_CORPUS_FORCE"),
            board_filter=os.getenv("EXAM_CORPUS_BOARD") or None,
            spec_filter=os.getenv("EXAM_CORPUS_SPEC") or None,
            user_subset=_truthy_env("EXAM_CORPUS_USER_SUBSET"),
            do_first_sweep=_truthy_env("EXAM_CORPUS_FIRST_SWEEP"),
        )

        if rep.errors:
            # Surface every reported problem, not just the count, so a failed
            # init run can be diagnosed from the log alone.
            for err in rep.errors:
                logger.error(f"exam-corpus seed error: {err}")
            logger.error(f"exam-corpus seed completed with {len(rep.errors)} error(s)")
            return False

        logger.info(
            f"exam-corpus seed ok: specs={rep.specs_upserted} papers={rep.papers_upserted} "
            f"uploaded={rep.files_uploaded}"
        )
        return True
    except Exception as e:
        # Top-level boundary for this mode: keep the traceback in the log and
        # report failure through the return value (the caller maps it to exit 1).
        logger.exception(f"exam-corpus seed failed: {e}")
        return False
|
||||
|
||||
|
||||
def run_development_mode():
|
||||
"""Run the server in development mode with auto-reload"""
|
||||
logger.info("Running in development mode")
|
||||
@ -411,7 +457,7 @@ Startup modes:
|
||||
|
||||
parser.add_argument(
|
||||
'--mode', '-m',
|
||||
choices=['infra', 'seed', 'seed-test', 'gais-data', 'dev', 'prod'],
|
||||
choices=['infra', 'seed', 'seed-test', 'gais-data', 'exam-corpus', 'dev', 'prod'],
|
||||
default='dev',
|
||||
help='Startup mode (default: dev)'
|
||||
)
|
||||
@ -446,7 +492,12 @@ if __name__ == "__main__":
|
||||
# Run GAIS data import
|
||||
success = run_gais_data_mode()
|
||||
sys.exit(0 if success else 1)
|
||||
|
||||
|
||||
elif args.mode == 'exam-corpus':
|
||||
# Seed the public exam-paper corpus from a manifest (gated; skips if none configured)
|
||||
success = run_exam_corpus_mode()
|
||||
sys.exit(0 if success else 1)
|
||||
|
||||
elif args.mode == 'dev':
|
||||
# Run development server
|
||||
run_development_mode()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user