"""Tests for ``auto_map`` run against the Docling exam-spike corpus.

The corpus directory is read from the ``DOCLING_SPIKE_ROOT`` environment
variable, falling back to a fixed local checkout path. Tests that need
corpus files are skipped when those files are not present on disk.
"""

import json
import os
from pathlib import Path

import pytest

from api.services.docling import FIRST_PASS_SCHEMA, auto_map

# NOTE(review): the fallback is a machine-specific absolute path; on any other
# machine set DOCLING_SPIKE_ROOT, otherwise the corpus-backed tests are skipped.
SPIKE_ROOT = Path(os.environ.get("DOCLING_SPIKE_ROOT", "/home/kcar/dev/docling-exam-spike"))

PHYSICS_PDF = SPIKE_ROOT / "samples" / "AQA-Physics-Paper-1H-2022-with-qr.pdf"
PHYSICS_TEMPLATE = SPIKE_ROOT / "results" / "template" / "physics.json"

# Corpus-relative location of the born-digital paper. Kept as a single string
# so the on-disk path, the ``source_pdf`` argument and the assertion on
# ``meta.source_pdf`` can never disagree.
BORN_DIGITAL_SOURCE_PDF = "samples/physics-p1h-2022-qp.pdf"
BORN_DIGITAL_PDF = SPIKE_ROOT / BORN_DIGITAL_SOURCE_PDF


@pytest.mark.skipif(
    not (PHYSICS_PDF.exists() and PHYSICS_TEMPLATE.exists()),
    reason="spike corpus not present",
)
def test_auto_map_matches_spike_physics_template_shape():
    """Compare ``auto_map`` output for the physics PDF with the stored template.

    Only the shape is compared: top-level keys, selected ``meta`` fields, the
    number of margins, the set of page keys, and the role and first part band
    of page "2".
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    expected = json.loads(PHYSICS_TEMPLATE.read_text(encoding="utf-8"))

    result = auto_map(PHYSICS_PDF.read_bytes(), spike_root=SPIKE_ROOT)

    assert result["meta"]["schema"] == FIRST_PASS_SCHEMA
    assert result["meta"]["schema"] == expected["meta"]["schema"]
    assert set(result.keys()) == set(expected.keys())
    assert result["meta"]["board"] == expected["meta"]["board"]
    assert result["meta"]["paper_code"] == expected["meta"]["paper_code"]
    assert len(result["margins"]) == len(expected["margins"])
    assert set(result["pages"].keys()) == set(expected["pages"].keys())
    assert result["pages"]["2"]["role"] == expected["pages"]["2"]["role"]

    part_band = result["pages"]["2"]["part_bands"][0]
    expected_band_keys = set(expected["pages"]["2"]["part_bands"][0].keys())
    # The result may carry extra keys; it must at least have the template's.
    assert expected_band_keys.issubset(part_band.keys())
    assert part_band["box"]


@pytest.mark.skipif(
    not BORN_DIGITAL_PDF.exists(),
    reason="born-digital spike PDF not present",
)
def test_auto_map_fast_path_without_cache_produces_first_pass_template():
    """Run ``auto_map`` with ``prefer_cache=False`` on the born-digital PDF.

    Checks the schema, board, paper code and echoed ``source_pdf`` in
    ``meta``, and that ``margins`` and ``pages`` are non-empty.
    """
    result = auto_map(
        BORN_DIGITAL_PDF.read_bytes(),
        source_pdf=BORN_DIGITAL_SOURCE_PDF,
        spike_root=SPIKE_ROOT,
        prefer_cache=False,
    )

    assert result["meta"]["schema"] == FIRST_PASS_SCHEMA
    assert result["meta"]["board"] == "aqa"
    assert result["meta"]["paper_code"] == "8463/1"
    assert result["meta"]["source_pdf"] == BORN_DIGITAL_SOURCE_PDF
    assert result["margins"]
    assert result["pages"]


def test_auto_map_rejects_empty_pdf_bytes():
    """``auto_map`` must raise ``ValueError`` when given empty bytes."""
    with pytest.raises(ValueError):
        auto_map(b"")