api/tests/test_docling_auto_map.py
kcar 5938613893
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
[verified] add docling auto-map package wrapper
2026-06-07 20:03:06 +01:00

52 lines
2.1 KiB
Python

import json
import os
from pathlib import Path
import pytest
from api.services.docling import FIRST_PASS_SCHEMA, auto_map
# Root directory of the spike corpus. The DOCLING_SPIKE_ROOT environment
# variable overrides the built-in default location.
SPIKE_ROOT = Path(os.getenv("DOCLING_SPIKE_ROOT", "/home/kcar/dev/docling-exam-spike"))

# PDF fed to auto_map by test_auto_map_matches_spike_physics_template_shape.
PHYSICS_PDF = SPIKE_ROOT.joinpath("samples", "AQA-Physics-Paper-1H-2022-with-qr.pdf")
# Stored template JSON that the shape test compares auto_map's output against.
PHYSICS_TEMPLATE = SPIKE_ROOT.joinpath("results", "template", "physics.json")
# PDF fed to auto_map by the prefer_cache=False test.
BORN_DIGITAL_PDF = SPIKE_ROOT.joinpath("samples", "physics-p1h-2022-qp.pdf")
@pytest.mark.skipif(
    not (PHYSICS_PDF.exists() and PHYSICS_TEMPLATE.exists()),
    reason="spike corpus not present",
)
def test_auto_map_matches_spike_physics_template_shape():
    """Compare the shape of ``auto_map`` output with the stored physics template.

    The test is skipped unless both the sample PDF and the expected template
    exist under ``SPIKE_ROOT``. It checks structure rather than full equality:
    schema, top-level keys, board, paper code, number of margins, the set of
    page keys, and the role and first part-band keys of page ``"2"``.
    """
    # Hand json.loads the raw bytes so it detects the UTF encoding itself;
    # read_text() with no encoding would decode using the locale default.
    expected = json.loads(PHYSICS_TEMPLATE.read_bytes())
    result = auto_map(PHYSICS_PDF.read_bytes(), spike_root=SPIKE_ROOT)

    # Top-level metadata and key set.
    assert result["meta"]["schema"] == FIRST_PASS_SCHEMA
    assert result["meta"]["schema"] == expected["meta"]["schema"]
    assert set(result.keys()) == set(expected.keys())
    assert result["meta"]["board"] == expected["meta"]["board"]
    assert result["meta"]["paper_code"] == expected["meta"]["paper_code"]

    # Collection sizes and page keys.
    assert len(result["margins"]) == len(expected["margins"])
    assert set(result["pages"].keys()) == set(expected["pages"].keys())

    # Spot-check page "2": same role and same keys on its first part band.
    assert result["pages"]["2"]["role"] == expected["pages"]["2"]["role"]
    assert (
        result["pages"]["2"]["part_bands"][0].keys()
        == expected["pages"]["2"]["part_bands"][0].keys()
    )
@pytest.mark.skipif(
    not BORN_DIGITAL_PDF.exists(),
    reason="born-digital spike PDF not present",
)
def test_auto_map_fast_path_without_cache_produces_first_pass_template():
    """Run ``auto_map`` with ``prefer_cache=False`` and check the resulting template.

    The test is skipped when the born-digital sample PDF is missing. It checks
    the schema, board, paper code, that ``source_pdf`` is echoed back in the
    metadata, and that ``margins`` and ``pages`` are both non-empty.
    """
    source_pdf = "samples/physics-p1h-2022-qp.pdf"
    pdf_bytes = BORN_DIGITAL_PDF.read_bytes()

    template = auto_map(
        pdf_bytes,
        source_pdf=source_pdf,
        spike_root=SPIKE_ROOT,
        prefer_cache=False,
    )

    assert template["meta"]["schema"] == FIRST_PASS_SCHEMA
    assert template["meta"]["board"] == "aqa"
    assert template["meta"]["paper_code"] == "8463/1"
    assert template["meta"]["source_pdf"] == source_pdf
    # Both collections must be truthy, i.e. present and non-empty.
    assert template["margins"]
    assert template["pages"]
def test_auto_map_rejects_empty_pdf_bytes():
    """``auto_map`` must raise ``ValueError`` when given an empty byte string."""
    empty_pdf = b""
    with pytest.raises(ValueError):
        auto_map(empty_pdf)