diff --git a/api/services/docling/extract.py b/api/services/docling/extract.py
index 222dc9a..994f0cd 100755
--- a/api/services/docling/extract.py
+++ b/api/services/docling/extract.py
@@ -249,6 +249,11 @@ def extract_front_matter(lines, board, code):
 # ====================================================================== AQA
 # --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
 PART_RE = re.compile(r"^(\d{2})\.(\d)$")  # 01.2
+# OCR sometimes inserts bracket/noise glyphs inside boxed labels (e.g. "02].3").
+# Normalise only tight margin-column candidates before matching; body decimals
+# remain protected by the label-column gate below.
+AQA_LABEL_NOISE = re.compile(r"[^0-9.]+")  # any run of characters other than digits and "."
+AQA_CIRCLED_DIGITS = str.maketrans({"①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5", "⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9"})
 NUM_RE = re.compile(r"^(\d{2})$")  # 08
 DIG_RE = re.compile(r"^(\d)$")  # 4
 # A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
@@ -279,21 +284,47 @@ def _rapid_pages(rapid_glob):
         yield pg, json.load(open(fn))
 
 
+def _clean_aqa_label(raw):  # normalise one OCR token before it is matched against the label regexes
+    compact = (raw or "").strip().translate(AQA_CIRCLED_DIGITS).replace(" ", "")
+    # Do not turn prose/option OCR artifacts like "36.7Q" into labels; PART_PREFIX handles
+    # genuine glued label+prose cases from the raw text under the label-column gate.
+    if re.search(r"[A-Za-z]", compact):
+        return compact  # tokens containing ASCII letters skip the noise stripping below
+    return AQA_LABEL_NOISE.sub("", compact)
+
+
+def _synthetic_label_bbox(page_lines, fallback):
+    """Best-effort bbox for an OCR-missed AQA label, preserving the layout contract."""
+    body = [bb for bb, _ in page_lines if 90 <= bb.get("l", 999) <= 170 and bb.get("t", 0) >= FOOTER_T]
+    if body:
+        top = max(body, key=lambda b: b.get("t", 0))  # entry with the largest "t"
+        return {"l": 48.0, "r": 104.0, "t": round(top["t"], 1), "b": round(top["b"], 1),
+                "coord_origin": top.get("coord_origin", "BOTTOMLEFT")}
+    if fallback:
+        return dict(fallback)
+    return {"l": 48.0, "r": 104.0, "t": 780.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}
+
+
 def aqa_questions_rapid(rapid_glob):
     """Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
       * GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
       * A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
       * A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
     parts = {}
+    page_lines = defaultdict(list)  # page -> [(bbox, raw)] for deterministic inference
     mcq_cands = []  # (page, NN, bbox) bare top-level candidates, in order
+    top_cands = {}  # NN -> (page, bbox) explicit top-level question headers
     for pg, d in _rapid_pages(rapid_glob):
         margin = []
         for t in d.get("texts", []):
             raw = (t.get("text") or "").strip()
-            s = raw.replace(" ", "")
+            s = _clean_aqa_label(raw)
             prov = t.get("prov") or []
             bb = prov[0].get("bbox") if prov else None
-            if bb is None or bb["l"] > 140:
+            if bb is None:
+                continue
+            page_lines[pg].append((bb, raw))  # recorded for every boxed text, before the l > 140 gate
+            if bb["l"] > 140:
                 continue
             margin.append((bb, s))
             m = PART_RE.match(s)
@@ -311,21 +342,67 @@ def aqa_questions_rapid(rapid_glob):
         nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
         digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
         for nbb, nn in nums:
+            top_cands.setdefault(nn, (pg, nbb))  # setdefault: the first sighting of each bare NN is kept
             ny = (nbb["t"] + nbb["b"]) / 2
             for dbb, dd in digs:
                 dy = (dbb["t"] + dbb["b"]) / 2
                 if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
                     parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
-    # Section B: walk MCQ candidates in reading order, accept the next number in sequence only
-    structured_q = {int(lab.split(".")[0]) for lab in parts}
+    # Before Section-B handling, trim isolated high structured labels when a real MCQ run starts
+    # immediately after the core structured section. This prevents OCR option text such as "36.7Q"
+    # from moving the MCQ start from Q07 to Q37.
+    q_nums = sorted({int(lab.split(".")[0]) for lab in parts if not lab.endswith(".0")})
+    core_q = q_nums[:]
+    while len(core_q) >= 2 and core_q[-1] - core_q[-2] > 2:
+        core_q.pop()  # drop a trailing question number more than 2 above the one before it
+    mcq_nums = {int(nn) for _, nn, _ in mcq_cands}
+    if core_q and any(max(core_q) < n <= max(core_q) + 3 for n in mcq_nums):
+        core_set = set(core_q)
+        for lab in list(parts):
+            if int(lab.split(".")[0]) not in core_set and not lab.endswith(".0"):
+                parts.pop(lab, None)
+
+    # Infer an OCR-dropped leading .1 part when later structured parts for the same question are
+    # present. AQA papers overwhelmingly start structured questions at .1; this fixes pages where
+    # RapidOCR reads the prose but misses the small boxed label, without changing schemas or model paths.
+    by_q = defaultdict(list)
+    for lab, v in parts.items():
+        q, sub = lab.split(".")
+        if sub != "0":
+            by_q[q].append((int(sub), v))
+    for q, vals in list(by_q.items()):
+        if f"{q}.1" not in parts:
+            first_sub, first_v = min(vals, key=lambda x: (x[0], x[1].get("page") or 999))
+            if first_sub > 1 and first_v.get("page"):
+                pg = int(first_v["page"])
+                parts[f"{q}.1"] = {"page": pg, "bbox": _synthetic_label_bbox(page_lines.get(pg, []), first_v.get("bbox"))}
+        subs = sorted(int(sub) for lab in parts for qq, sub in [lab.split(".")] if qq == q and sub != "0")
+        # Fill only one-step internal OCR gaps with support on both sides; do not expand a lone
+        # false high subpart into a whole run of synthetic labels.
+        if len(subs) >= 3:
+            for prev_sub, next_sub in zip(subs, subs[1:]):
+                if next_sub - prev_sub == 2:
+                    missing = prev_sub + 1
+                    anchor = parts[f"{q}.{next_sub}"]
+                    parts[f"{q}.{missing}"] = {"page": anchor.get("page"), "bbox": dict(anchor.get("bbox") or {})}
+
+    # Preserve explicit one-part structured questions seen as a bare top-level header (for example
+    # GCSE Combined Chemistry 03 with no decimal sub-label) without converting ordinary question
+    # headers that already have .1/.2 children into extra .0 parts.
+    present_q = {lab.split(".")[0] for lab in parts}
+    for q, (pg, bb) in top_cands.items():
+        if q not in present_q:
+            parts.setdefault(f"{q}.0", {"page": pg, "bbox": bb})
+
+    # Section B: walk MCQ candidates in reading order, accepting a tight increasing sequence.
+    structured_q = set(core_q or [int(lab.split(".")[0]) for lab in parts])
     expect = (max(structured_q) + 1) if structured_q else 1
     mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0)))  # page, then top-down
     cand = {}  # nn -> (page, bbox), first occurrence in reading order
     for pg, nn, bb in mcq_cands:
         cand.setdefault(int(nn), (pg, bb))
-    # Walk the sequence: take the exact expected number when present; only jump a small gap
-    # (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't
-    # truncate the section. Out-of-window noise (misread "60") never enters.
+    # Take exact expected numbers; for tiny OCR gaps before the next real MCQ candidate, emit
+    # deterministic placeholders so a single garbled number does not end Section B recovery.
     seq = []
     while True:
         if expect in cand and expect not in structured_q:
@@ -334,7 +411,10 @@ def aqa_questions_rapid(rapid_glob):
             continue
         nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
         if nxt:
-            expect = min(nxt)
+            jump_to = min(nxt)
+            for missing in range(expect, jump_to):
+                seq.append((missing, cand[jump_to]))  # placeholder reuses the (page, bbox) of the next number found
+            expect = jump_to
             continue
         break
     # Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
diff --git a/tests/test_docling_extract.py b/tests/test_docling_extract.py
new file mode 100644
index 0000000..8e9d426
--- /dev/null
+++ b/tests/test_docling_extract.py
@@ -0,0 +1,81 @@
+from api.services.docling.extract import aqa_questions_rapid
+
+
+def _text(raw, page, l, t, r=120, b=None):  # build one OCR text entry; b defaults to t - 14
+    return {
+        "text": raw,
+        "prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
+    }
+
+
+def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
+    (tmp_path / "p1.json").write_text(
+        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
+    )
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "02.3" in parts
+
+
+def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
+    (tmp_path / "p1.json").write_text(
+        '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}'
+    )
+    (tmp_path / "p2.json").write_text(
+        '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}'
+    )
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert parts["07.1"]["page"] == 2
+    assert parts["07.1"]["bbox"]["l"] == 49  # page 2 has no text with l in 90..170, so the 07.2 bbox is copied
+    assert "07.2" in parts
+
+
+def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
+    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
+    for idx, n in enumerate(["07", "08", "11", "12", "13"]):
+        texts.append(_text(n + " Which statement is correct?", 1, 49, 780 - idx * 40))
+    import json
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    for label in ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]:  # 09 and 10 never appear in the input
+        assert label in parts
+
+
+def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
+    import json
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": [_text("01.③", 1, 49, 515, r=102, b=500)]}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "01.3" in parts
+
+
+def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
+    import json
+    texts = [
+        _text("05.2 Some question text", 1, 49, 700),
+        _text("05.3 Middle question text", 1, 49, 620),
+        _text("05.5 Later question text", 2, 49, 740),
+    ]
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts[:2]}))
+    (tmp_path / "p2.json").write_text(json.dumps({"texts": texts[2:]}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "05.1" in parts  # inferred: the lowest sub-part in the input is .2
+    assert "05.4" in parts  # filled: one-step gap between .3 and .5
+    assert "05.5" in parts
+
+
+def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
+    import json
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": [_text("03", 1, 49, 775)]}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    assert "03.0" in parts