def synthesize_part_box(part_band, content_x_band):
    """Return the one authoritative S5 part-box projection.

    Parts remain boxes in S5, but the box is a projection rather than intrinsic
    geometry: document content margins provide the x-extent and the part band
    provides y. The band end is already bounded by the next part in bands.py;
    the original label box remains a separate anchor for rendering/review.

    Coordinates stay in the first-pass PDF-point BOTTOMLEFT bbox shape.

    Args:
        part_band: Mapping with ``y_start`` and ``y_end`` entries.
        content_x_band: Mapping with ``x_left`` and ``x_right`` entries.

    Returns:
        A dict with ``l``/``t``/``r``/``b`` rounded to one decimal place plus
        ``coord_origin``, or ``None`` when either mapping is missing/empty or
        any of the four coordinates is absent or ``None``.
    """
    # A missing band or missing margins means there is nothing to project.
    if not part_band or not content_x_band:
        return None
    try:
        x_left = content_x_band["x_left"]
        x_right = content_x_band["x_right"]
        y_start = part_band["y_start"]
        y_end = part_band["y_end"]
    except KeyError:
        return None
    # A key that is present but None is the same "no contract" case as a
    # missing key; compare against None explicitly so 0 stays a valid value.
    if any(value is None for value in (x_left, x_right, y_start, y_end)):
        return None
    return {
        "l": round(x_left, 1),
        "t": round(y_start, 1),
        "r": round(x_right, 1),
        "b": round(y_end, 1),
        "coord_origin": "BOTTOMLEFT",
    }
def test_synthesize_part_box_uses_content_margins_and_band_y():
    """x comes from the content margins, y from the part band, rounded to 0.1."""
    margins = {"x_left": 54.04, "x_right": 521.96}
    band = {"label": "01.1", "y_start": 712.34, "y_end": 601.27}

    box = synthesize_part_box(band, margins)

    expected = {
        "l": 54.0,
        "t": 712.3,
        "r": 522.0,
        "b": 601.3,
        "coord_origin": "BOTTOMLEFT",
    }
    assert box == expected


def test_synthesize_part_box_returns_none_without_margin_contract():
    """An empty x-band, or a band lacking ``y_end``, produces no box."""
    incomplete_inputs = [
        ({"y_start": 700, "y_end": 650}, {}),
        ({"y_start": 700}, {"x_left": 50, "x_right": 520}),
    ]
    for band, margins in incomplete_inputs:
        assert synthesize_part_box(band, margins) is None


def test_build_carries_label_box_as_anchor_and_single_synthesized_part_box():
    """``build`` keeps the label bbox as ``label_box`` and adds the projected ``box``."""
    label_bbox = {"l": 40, "t": 720, "r": 70, "b": 705}
    part_entry = {"label": "01.1", "page": 2, "bbox": label_bbox}
    structured = {
        "board": "aqa",
        "paper_code": "8463/1",
        "questions": [{"question": "01", "parts": [part_entry]}],
    }

    page_two_bands = {
        "main": [{"question": "01", "y_start": 730, "y_end": 0, "is_start": True}],
        "part": [{"label": "01.1", "question": "01", "y_start": 720, "y_end": 610}],
    }
    bands = {"pages": {"2": page_two_bands}}

    content_margins = {
        "content_x_band": {"x_left": 55, "x_right": 515},
        "per_page": {"2": {"top": 760, "bottom": 40, "left": 55, "right": 515}},
    }
    furniture = {"n_pages": 2, "content_margins": content_margins, "items": []}

    template = build(structured, bands, furniture)
    first_part = template["pages"]["2"]["part_bands"][0]

    assert first_part["label_box"] == {"l": 40, "t": 720, "r": 70, "b": 705}
    assert first_part["box"] == {
        "l": 55,
        "t": 720,
        "r": 515,
        "b": 610,
        "coord_origin": "BOTTOMLEFT",
    }