S5-5: centralized part-box synthesis (band-y x content-margins)

Add synthesize_part_box() as the single authoritative S5 part-box projection
(T3 swap point): the content-margin x-extent combined with the part-band
y-extent, in BOTTOMLEFT coords. label_box is retained as a separate anchor.
build() attaches the synthesized box to each part.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Kevin Carter 2026-06-07 20:38:25 +01:00
parent 71ddceb19e
commit 621d283ceb
3 changed files with 95 additions and 5 deletions

View File

@ -44,6 +44,34 @@ def _furn_kind(it):
return "chrome_text"
def synthesize_part_box(part_band, content_x_band):
    """Return the one authoritative S5 part-box projection.

    Parts remain boxes in S5, but the box is a projection rather than intrinsic
    geometry: document content margins provide the x-extent and the part band
    provides y. The band end is already bounded by the next part in bands.py;
    the original label box remains a separate anchor for rendering/review.
    Coordinates stay in the first-pass PDF-point BOTTOMLEFT bbox shape.

    Args:
        part_band: Mapping carrying ``y_start`` and ``y_end``.
        content_x_band: Mapping carrying ``x_left`` and ``x_right``.

    Returns:
        A dict with ``l``/``t``/``r``/``b`` rounded to one decimal place plus
        ``coord_origin``, or ``None`` when either input is empty or missing,
        lacks a required key, is not a mapping, or holds a ``None`` coordinate.
    """
    if not content_x_band or not part_band:
        return None
    try:
        edges = {
            "l": content_x_band["x_left"],
            "t": part_band["y_start"],
            "r": content_x_band["x_right"],
            "b": part_band["y_end"],
        }
    except (KeyError, TypeError):
        # Missing key, or an input that cannot be subscripted by name:
        # there is no usable contract to project a box from.
        return None
    if any(value is None for value in edges.values()):
        # A null coordinate cannot be rounded; treat it like a missing key.
        return None
    box = {side: round(value, 1) for side, value in edges.items()}
    box["coord_origin"] = "BOTTOMLEFT"
    return box
def build(structured, bands, furniture, pdf=None, page_roles=None):
page_roles = page_roles or {}
part_bbox = {p["label"]: p.get("bbox")
@ -124,10 +152,15 @@ def build(structured, bands, furniture, pdf=None, page_roles=None):
main = [{"question": m["question"], "y_start": m["y_start"], "y_end": m["y_end"],
"is_start": m.get("is_start", True),
"source": "auto", "confirmed": False} for m in pb["main"]]
part = [{"label": p["label"], "question": p["question"],
"y_start": p["y_start"], "y_end": p["y_end"],
"label_box": part_bbox.get(p["label"]), # app may render a box instead of lines
"source": "auto", "confirmed": False} for p in pb["part"]]
part = []
for p in pb["part"]:
part.append({
"label": p["label"], "question": p["question"],
"y_start": p["y_start"], "y_end": p["y_end"],
"label_box": part_bbox.get(p["label"]), # anchor, not the part extent
"box": synthesize_part_box(p, xband),
"source": "auto", "confirmed": False,
})
pr = page_roles.get(pgs) or page_roles.get(pg) or {}
pages[pgs] = {
"role": pr.get("role", "question"),

View File

@ -26,7 +26,9 @@ def test_auto_map_matches_spike_physics_template_shape():
assert len(result["margins"]) == len(expected["margins"])
assert set(result["pages"].keys()) == set(expected["pages"].keys())
assert result["pages"]["2"]["role"] == expected["pages"]["2"]["role"]
assert result["pages"]["2"]["part_bands"][0].keys() == expected["pages"]["2"]["part_bands"][0].keys()
part_band = result["pages"]["2"]["part_bands"][0]
assert set(expected["pages"]["2"]["part_bands"][0].keys()).issubset(part_band.keys())
assert part_band["box"]
@pytest.mark.skipif(not BORN_DIGITAL_PDF.exists(), reason="born-digital spike PDF not present")

View File

@ -0,0 +1,55 @@
from api.services.docling.template import build, synthesize_part_box
def test_synthesize_part_box_uses_content_margins_and_band_y():
    """x-extent comes from the content margins, y-extent from the part band.

    The expected values also pin rounding to one decimal place.
    """
    band = {"label": "01.1", "y_start": 712.34, "y_end": 601.27}
    margins = {"x_left": 54.04, "x_right": 521.96}

    box = synthesize_part_box(band, margins)

    expected = dict(l=54.0, t=712.3, r=522.0, b=601.3, coord_origin="BOTTOMLEFT")
    assert box == expected
def test_synthesize_part_box_returns_none_without_margin_contract():
    """No box is produced for empty margins or for a band lacking ``y_end``."""
    complete_band = {"y_start": 700, "y_end": 650}
    band_without_end = {"y_start": 700}
    margins = {"x_left": 50, "x_right": 520}

    # Empty margin mapping: nothing to take the x-extent from.
    no_margins_result = synthesize_part_box(complete_band, {})
    assert no_margins_result is None

    # Margins present, but the band is missing its lower y bound.
    no_end_result = synthesize_part_box(band_without_end, margins)
    assert no_end_result is None
def test_build_carries_label_box_as_anchor_and_single_synthesized_part_box():
    """build() exposes the label bbox as ``label_box`` and a synthesized ``box``."""
    label_anchor = {"l": 40, "t": 720, "r": 70, "b": 705}

    # Inputs are assembled bottom-up; build() receives its own copy of the bbox
    # so the expectation is compared against an untouched literal.
    part_entry = {"label": "01.1", "page": 2, "bbox": dict(label_anchor)}
    structured = {
        "board": "aqa",
        "paper_code": "8463/1",
        "questions": [{"question": "01", "parts": [part_entry]}],
    }

    page_two_bands = {
        "main": [{"question": "01", "y_start": 730, "y_end": 0, "is_start": True}],
        "part": [{"label": "01.1", "question": "01", "y_start": 720, "y_end": 610}],
    }
    bands = {"pages": {"2": page_two_bands}}

    content_margins = {
        "content_x_band": {"x_left": 55, "x_right": 515},
        "per_page": {"2": {"top": 760, "bottom": 40, "left": 55, "right": 515}},
    }
    furniture = {"n_pages": 2, "content_margins": content_margins, "items": []}

    built = build(structured, bands, furniture)
    first_part = built["pages"]["2"]["part_bands"][0]

    assert first_part["label_box"] == label_anchor
    expected_box = {
        "l": 55,
        "t": 720,
        "r": 515,
        "b": 610,
        "coord_origin": "BOTTOMLEFT",
    }
    assert first_part["box"] == expected_box