# Listing metadata: 52 lines, 2.1 KiB, Python
import json
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from api.services.docling import FIRST_PASS_SCHEMA, auto_map
|
|
|
|
|
|
# Root directory of the spike corpus; the DOCLING_SPIKE_ROOT environment
# variable overrides the built-in default location.
SPIKE_ROOT = Path(os.getenv("DOCLING_SPIKE_ROOT", "/home/kcar/dev/docling-exam-spike"))

# Fixture files located under SPIKE_ROOT that the tests in this module use.
_SAMPLES_DIR = SPIKE_ROOT / "samples"
PHYSICS_PDF = _SAMPLES_DIR / "AQA-Physics-Paper-1H-2022-with-qr.pdf"
PHYSICS_TEMPLATE = SPIKE_ROOT.joinpath("results", "template", "physics.json")
BORN_DIGITAL_PDF = _SAMPLES_DIR / "physics-p1h-2022-qp.pdf"
|
|
|
|
|
|
@pytest.mark.skipif(
    not (PHYSICS_PDF.exists() and PHYSICS_TEMPLATE.exists()),
    reason="spike corpus not present",
)
def test_auto_map_matches_spike_physics_template_shape():
    """Compare auto_map output for the physics sample with the stored template.

    Skipped unless both PHYSICS_PDF and PHYSICS_TEMPLATE exist on disk. The
    comparison is structural rather than full equality: schema, top-level
    keys, board, paper code, number of margins, page keys, and the role and
    first part-band keys of page "2".
    """
    # JSON text is UTF-8; decode explicitly so loading the template does not
    # depend on the platform's locale encoding.
    expected = json.loads(PHYSICS_TEMPLATE.read_text(encoding="utf-8"))
    result = auto_map(PHYSICS_PDF.read_bytes(), spike_root=SPIKE_ROOT)

    assert result["meta"]["schema"] == FIRST_PASS_SCHEMA
    assert result["meta"]["schema"] == expected["meta"]["schema"]
    assert set(result.keys()) == set(expected.keys())
    assert result["meta"]["board"] == expected["meta"]["board"]
    assert result["meta"]["paper_code"] == expected["meta"]["paper_code"]
    assert len(result["margins"]) == len(expected["margins"])
    assert set(result["pages"].keys()) == set(expected["pages"].keys())
    assert result["pages"]["2"]["role"] == expected["pages"]["2"]["role"]
    assert result["pages"]["2"]["part_bands"][0].keys() == expected["pages"]["2"]["part_bands"][0].keys()
|
|
|
|
|
|
@pytest.mark.skipif(
    not BORN_DIGITAL_PDF.exists(),
    reason="born-digital spike PDF not present",
)
def test_auto_map_fast_path_without_cache_produces_first_pass_template():
    """Run auto_map on the born-digital sample with prefer_cache=False.

    Skipped unless BORN_DIGITAL_PDF exists. Checks the returned metadata
    (schema, board, paper code, echoed source path) and that both the
    margins and pages entries are non-empty.
    """
    relative_source = "samples/physics-p1h-2022-qp.pdf"
    template = auto_map(
        BORN_DIGITAL_PDF.read_bytes(),
        source_pdf=relative_source,
        spike_root=SPIKE_ROOT,
        prefer_cache=False,
    )

    meta = template["meta"]
    assert meta["schema"] == FIRST_PASS_SCHEMA
    assert meta["board"] == "aqa"
    assert meta["paper_code"] == "8463/1"
    assert meta["source_pdf"] == relative_source
    assert template["margins"]
    assert template["pages"]
|
|
|
|
|
|
def test_auto_map_rejects_empty_pdf_bytes():
    """Expect auto_map to raise ValueError when handed zero bytes of input."""
    empty_payload = b""
    with pytest.raises(ValueError):
        auto_map(empty_payload)
|