Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
(cherry picked from commit a707a5afd92c5c9fb042486229d0ef11549a3f53)
82 lines
2.8 KiB
Python
82 lines
2.8 KiB
Python
import json

from api.services.docling.extract import aqa_questions_rapid
|
|
|
|
|
|
def _text(raw, page, l, t, r=120, b=None):
|
|
return {
|
|
"text": raw,
|
|
"prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
|
|
}
|
|
|
|
|
|
def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A label captured as ``02].3`` must be reported under the key ``02.3``."""
    # Fixture is written as a raw JSON string so the exact bytes fed to the
    # extractor stay under the test's control.
    (tmp_path / "p1.json").write_text(
        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "02.3" in parts
|
|
|
|
|
|
def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """A lone ``07.2`` label must also yield an inferred ``07.1`` entry.

    Page 1 carries only unlabelled prose; page 2 carries the ``07.2`` label.
    The inferred ``07.1`` is expected to be reported on page 2 with the same
    left edge (49) as the ``07.2`` label.
    """
    (tmp_path / "p1.json").write_text(
        '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    (tmp_path / "p2.json").write_text(
        '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    inferred = parts["07.1"]
    assert inferred["page"] == 2
    assert inferred["bbox"]["l"] == 49
    assert "07.2" in parts
|
|
|
|
|
|
def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """Missing question numbers in a run of bare labels must be filled in.

    The page holds one structured part (``06.1``) followed by bare labels
    07, 08, 11, 12 and 13; the result is expected to contain ``NN.0`` keys
    for every number from 07 to 13, including the absent 09 and 10.
    """
    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
    # Stack the bare-numbered items down the page, 40 units apart.
    texts.extend(
        _text(f"{n} Which statement is correct?", 1, 49, 780 - idx * 40)
        for idx, n in enumerate(["07", "08", "11", "12", "13"])
    )
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    for label in ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]:
        assert label in parts
|
|
|
|
|
|
def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A label whose part digit is the circled ``③`` must map to ``01.3``."""
    fixture = {"texts": [_text("01.③", 1, 49, 515, r=102, b=500)]}
    # json.dumps escapes non-ASCII by default, so the file content is plain
    # ASCII regardless of the platform's default text encoding.
    (tmp_path / "p1.json").write_text(json.dumps(fixture))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "01.3" in parts
|
|
|
|
|
|
def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """Parts missing before and inside a structured question must be inferred.

    Only ``05.2`` and ``05.3`` (page 1) and ``05.5`` (page 2) are present in
    the fixtures; the result is expected to also contain ``05.1`` and
    ``05.4``.
    """
    page_one = [
        _text("05.2 Some question text", 1, 49, 700),
        _text("05.3 Middle question text", 1, 49, 620),
    ]
    page_two = [
        _text("05.5 Later question text", 2, 49, 740),
    ]
    (tmp_path / "p1.json").write_text(json.dumps({"texts": page_one}))
    (tmp_path / "p2.json").write_text(json.dumps({"texts": page_two}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "05.1" in parts
    assert "05.4" in parts
    assert "05.5" in parts
|
|
|
|
|
|
def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A bare ``03`` label with no part suffix must be kept as ``03.0``."""
    fixture = {"texts": [_text("03", 1, 49, 775)]}
    (tmp_path / "p1.json").write_text(json.dumps(fixture))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "03.0" in parts
|