"""Tests for ``aqa_questions_rapid`` run against small synthetic page JSON files."""

import json
from pathlib import Path

from api.services.docling.extract import aqa_questions_rapid


def _text(raw: str, page: int, l: float, t: float, r: float = 120, b: float | None = None) -> dict:
    """Build one ``texts`` entry with a single provenance box.

    Args:
        raw: The text content of the entry.
        page: Page number stored as ``page_no``.
        l: Left edge of the bounding box.
        t: Top edge of the bounding box.
        r: Right edge of the bounding box.
        b: Bottom edge; when omitted it is placed 14 units below ``t``.

    Returns:
        A dict with ``text`` and ``prov`` keys; the box always uses the
        ``BOTTOMLEFT`` coordinate origin.
    """
    bottom = b if b is not None else t - 14
    bbox = {"l": l, "t": t, "r": r, "b": bottom, "coord_origin": "BOTTOMLEFT"}
    return {"text": raw, "prov": [{"page_no": page, "bbox": bbox}]}


def _write_page(tmp_path: Path, name: str, texts: list) -> None:
    """Write ``{"texts": texts}`` as JSON to ``tmp_path / name``."""
    (tmp_path / name).write_text(json.dumps({"texts": texts}), encoding="utf-8")


def _page_glob(tmp_path: Path) -> str:
    """Return the glob pattern matching every ``p*.json`` file in ``tmp_path``."""
    return str(tmp_path / "p*.json")


def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A label written as ``02].3`` is reported under the key ``02.3``."""
    _write_page(tmp_path, "p1.json", [_text("02].3", 1, 49, 515, r=102, b=500)])

    parts = aqa_questions_rapid(_page_glob(tmp_path))

    assert "02.3" in parts


def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """With only a ``07.2`` label present, ``07.1`` is reported as well.

    The inferred ``07.1`` entry carries page 2 and left edge 49, which are
    the page and left edge of the ``07.2`` label written below.
    """
    _write_page(
        tmp_path,
        "p1.json",
        [_text("Question prose starts here", 1, 114, 774, r=500, b=760)],
    )
    _write_page(tmp_path, "p2.json", [_text("07.2", 2, 49, 776, r=104, b=761)])

    parts = aqa_questions_rapid(_page_glob(tmp_path))

    assert parts["07.1"]["page"] == 2
    assert parts["07.1"]["bbox"]["l"] == 49
    assert "07.2" in parts


def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """Bare numbers 07, 08, 11, 12, 13 yield ``07.0`` through ``13.0`` with no gap."""
    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
    # Each bare-numbered question sits 40 units below the previous one.
    texts.extend(
        _text(f"{n} Which statement is correct?", 1, 49, 780 - idx * 40)
        for idx, n in enumerate(["07", "08", "11", "12", "13"])
    )
    _write_page(tmp_path, "p1.json", texts)

    parts = aqa_questions_rapid(_page_glob(tmp_path))

    for label in ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]:
        assert label in parts


def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A label whose part is the circled digit ``③`` is reported as ``01.3``."""
    _write_page(tmp_path, "p1.json", [_text("01.③", 1, 49, 515, r=102, b=500)])

    parts = aqa_questions_rapid(_page_glob(tmp_path))

    assert "01.3" in parts


def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """Labels 05.2, 05.3 and 05.5 across two pages also yield ``05.1`` and ``05.4``."""
    texts = [
        _text("05.2 Some question text", 1, 49, 700),
        _text("05.3 Middle question text", 1, 49, 620),
        _text("05.5 Later question text", 2, 49, 740),
    ]
    _write_page(tmp_path, "p1.json", texts[:2])
    _write_page(tmp_path, "p2.json", texts[2:])

    parts = aqa_questions_rapid(_page_glob(tmp_path))

    assert "05.1" in parts
    assert "05.4" in parts
    assert "05.5" in parts


def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A lone bare number ``03`` is reported under the key ``03.0``."""
    _write_page(tmp_path, "p1.json", [_text("03", 1, 49, 775)])

    parts = aqa_questions_rapid(_page_glob(tmp_path))

    assert "03.0" in parts