api/tests/test_docling_auto_map.py
Kevin Carter 621d283ceb S5-5: centralized part-box synthesis (band-y x content-margins)
Add synthesize_part_box() as the single authoritative S5 part-box projection
(T3 swap point): content-margin x-extent x part-band y-extent, BOTTOMLEFT
coords; label_box retained as a separate anchor. build() attaches box per part.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-07 20:38:25 +01:00

54 lines
2.1 KiB
Python

import json
import os
from pathlib import Path
import pytest
from api.services.docling import FIRST_PASS_SCHEMA, auto_map
# Root of the spike corpus; the DOCLING_SPIKE_ROOT environment variable overrides the default.
SPIKE_ROOT = Path(os.getenv("DOCLING_SPIKE_ROOT", "/home/kcar/dev/docling-exam-spike"))

# Inputs and reference output, all resolved relative to SPIKE_ROOT.
PHYSICS_PDF = SPIKE_ROOT.joinpath("samples", "AQA-Physics-Paper-1H-2022-with-qr.pdf")
PHYSICS_TEMPLATE = SPIKE_ROOT.joinpath("results", "template", "physics.json")
BORN_DIGITAL_PDF = SPIKE_ROOT.joinpath("samples", "physics-p1h-2022-qp.pdf")
@pytest.mark.skipif(not (PHYSICS_PDF.exists() and PHYSICS_TEMPLATE.exists()), reason="spike corpus not present")
def test_auto_map_matches_spike_physics_template_shape():
    """Compare the shape of auto_map's output with the stored physics template.

    The test is skipped unless both PHYSICS_PDF and PHYSICS_TEMPLATE exist.
    Only structural properties are compared (schema, top-level keys, selected
    meta fields, margin count, page keys, the role of page "2" and the keys of
    its first part band); the full payloads are not required to be equal.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale's
    # default encoding, which is what a bare read_text() would use.
    expected = json.loads(PHYSICS_TEMPLATE.read_text(encoding="utf-8"))
    result = auto_map(PHYSICS_PDF.read_bytes(), spike_root=SPIKE_ROOT)

    result_meta = result["meta"]
    expected_meta = expected["meta"]

    assert result_meta["schema"] == FIRST_PASS_SCHEMA
    assert result_meta["schema"] == expected_meta["schema"]
    assert set(result.keys()) == set(expected.keys())
    assert result_meta["board"] == expected_meta["board"]
    assert result_meta["paper_code"] == expected_meta["paper_code"]
    assert len(result["margins"]) == len(expected["margins"])
    assert set(result["pages"].keys()) == set(expected["pages"].keys())
    assert result["pages"]["2"]["role"] == expected["pages"]["2"]["role"]

    part_band = result["pages"]["2"]["part_bands"][0]
    expected_band_keys = set(expected["pages"]["2"]["part_bands"][0].keys())
    # The result may carry extra keys; every template key must still be present.
    assert expected_band_keys.issubset(part_band.keys())
    # The part band must have a non-empty "box" entry.
    assert part_band["box"]
@pytest.mark.skipif(not BORN_DIGITAL_PDF.exists(), reason="born-digital spike PDF not present")
def test_auto_map_fast_path_without_cache_produces_first_pass_template():
    """Run auto_map on the born-digital PDF with prefer_cache=False and check the template basics.

    The test is skipped when BORN_DIGITAL_PDF is absent. It checks the schema,
    board, paper code and echoed source path in ``meta``, and that ``margins``
    and ``pages`` are both non-empty.
    """
    source_label = "samples/physics-p1h-2022-qp.pdf"
    pdf_bytes = BORN_DIGITAL_PDF.read_bytes()

    result = auto_map(pdf_bytes, source_pdf=source_label, spike_root=SPIKE_ROOT, prefer_cache=False)

    meta = result["meta"]
    assert meta["schema"] == FIRST_PASS_SCHEMA
    assert meta["board"] == "aqa"
    assert meta["paper_code"] == "8463/1"
    # The source path passed in must come back unchanged in the metadata.
    assert meta["source_pdf"] == source_label
    assert result["margins"]
    assert result["pages"]
def test_auto_map_rejects_empty_pdf_bytes():
    """auto_map must raise ValueError when given an empty byte string."""
    empty_payload = b""
    with pytest.raises(ValueError):
        auto_map(empty_payload)