api/tests/test_docling_extract.py
kcar 76e11b0b06
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
feat(docling): B1-2 AQA label normalization + missing-.1 inference + MCQ gap (salvaged)
(cherry picked from commit a707a5afd92c5c9fb042486229d0ef11549a3f53)
2026-06-08 04:03:17 +00:00

82 lines
2.8 KiB
Python

import json

from api.services.docling.extract import aqa_questions_rapid
def _text(raw, page, l, t, r=120, b=None):
    """Build one fixture text item carrying a single provenance box.

    Args:
        raw: Text content stored under the ``"text"`` key.
        page: Page number stored as ``"page_no"``.
        l: Left edge of the bounding box.
        t: Top edge of the bounding box.
        r: Right edge of the bounding box (defaults to 120).
        b: Bottom edge; when omitted (``None``) it is set to ``t - 14``.

    Returns:
        A dict with ``"text"`` and a one-element ``"prov"`` list whose bbox
        is tagged with ``coord_origin`` ``"BOTTOMLEFT"``.
    """
    # Only None triggers the default; an explicit 0 is kept as-is.
    bottom = t - 14 if b is None else b
    bbox = {"l": l, "t": t, "r": r, "b": bottom, "coord_origin": "BOTTOMLEFT"}
    provenance = {"page_no": page, "bbox": bbox}
    return {"text": raw, "prov": [provenance]}
def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A label containing a stray bracket ("02].3") is reported as "02.3"."""
    page_file = tmp_path / "p1.json"
    page_file.write_text(
        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    page_glob = str(tmp_path / "p*.json")
    extracted = aqa_questions_rapid(page_glob)
    assert "02.3" in extracted
def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """A lone "07.2" label yields a "07.1" entry alongside it.

    Page 1 holds only prose (left edge 114); page 2 holds the "07.2" label
    (left edge 49). The "07.1" entry must report page 2 and left edge 49,
    i.e. the values of the "07.2" label rather than of the page-1 prose.
    """
    fixtures = {
        "p1.json": '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}',
        "p2.json": '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}',
    }
    for file_name, payload in fixtures.items():
        (tmp_path / file_name).write_text(payload)

    extracted = aqa_questions_rapid(str(tmp_path / "p*.json"))

    inferred = extracted["07.1"]
    assert inferred["page"] == 2
    assert inferred["bbox"]["l"] == 49
    assert "07.2" in extracted
def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """Bare question numbers with a small gap produce a contiguous ".0" run.

    The page contains a structured "06.1" item followed by bare numbers
    07, 08, 11, 12 and 13. The result must contain "07.0" through "13.0",
    including "09.0" and "10.0", which never appear in the input.
    """
    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
    # 09 and 10 are deliberately left out; each item sits 40 units lower.
    texts.extend(
        _text(number + " Which statement is correct?", 1, 49, 780 - idx * 40)
        for idx, number in enumerate(["07", "08", "11", "12", "13"])
    )
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}), encoding="utf-8")

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    expected = ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]
    # Collect every absent label so a failure shows the full gap, not just
    # the first missing entry.
    missing = [label for label in expected if label not in parts]
    assert not missing, f"labels missing from result: {missing}"
def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A part number written as a circled digit ("01.③") is reported as "01.3"."""
    texts = [_text("01.③", 1, 49, 515, r=102, b=500)]
    # json.dumps escapes non-ASCII by default, so the circled digit reaches
    # the file as a \uXXXX escape and the payload on disk is plain ASCII.
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}), encoding="utf-8")

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "01.3" in parts
def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """Parts absent from a structured question are present in the result.

    The input only contains "05.2" and "05.3" (page 1) and "05.5" (page 2).
    The result must also contain "05.1" and "05.4", neither of which appears
    in the input, and must still contain "05.5".
    """
    # One fixture list per page file, so the file split always matches the
    # page numbers passed to _text.
    pages = {
        "p1.json": [
            _text("05.2 Some question text", 1, 49, 700),
            _text("05.3 Middle question text", 1, 49, 620),
        ],
        "p2.json": [
            _text("05.5 Later question text", 2, 49, 740),
        ],
    }
    for file_name, texts in pages.items():
        (tmp_path / file_name).write_text(json.dumps({"texts": texts}), encoding="utf-8")

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "05.1" in parts
    assert "05.4" in parts
    assert "05.5" in parts
def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A lone bare question number ("03") is reported under the key "03.0"."""
    texts = [_text("03", 1, 49, 775)]
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}), encoding="utf-8")

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "03.0" in parts