feat(docling): B1-2 AQA label normalization + missing-.1 inference + MCQ gap (salvaged)

(cherry picked from commit a707a5afd92c5c9fb042486229d0ef11549a3f53)
2026-06-08 05:03:05 +01:00 · 2026-06-08 05:03:05 +01:00 · 76e11b0b06
commit 76e11b0b06
parent 52d1ece212
2 changed files with 169 additions and 8 deletions
--- a/api/services/docling/extract.py
+++ b/api/services/docling/extract.py
@ -249,6 +249,11 @@ def extract_front_matter(lines, board, code):
 # ====================================================================== AQA
 # --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
 # Full structured label: two digits, a dot, then one digit.
 PART_RE = re.compile(r"^(\d{2})\.(\d)$")     # 01.2
 # OCR sometimes inserts bracket/noise glyphs inside boxed labels (e.g. "02].3").
 # Normalise only tight margin-column candidates before matching; body decimals
 # remain protected by the label-column gate below.
 # The pattern matches any run of characters other than ASCII digits and ".".
 AQA_LABEL_NOISE = re.compile(r"[^0-9.]+")
 # Translation table (for str.translate) turning circled digits ①-⑨ into "1"-"9".
 AQA_CIRCLED_DIGITS = str.maketrans({"①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5", "⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9"})
 NUM_RE  = re.compile(r"^(\d{2})$")           # 08 (bare two-digit token)
 DIG_RE  = re.compile(r"^(\d)$")              # 4 (bare single-digit token)
 # A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
@ -279,21 +284,47 @@ def _rapid_pages(rapid_glob):
        yield pg, json.load(open(fn))
 def _clean_aqa_label(raw):
    """Normalise one raw OCR token into a candidate AQA label string.

    The token is stripped, circled digits are translated through
    ``AQA_CIRCLED_DIGITS`` and ASCII spaces are removed. A token that still
    contains an ASCII letter is returned in that compacted form; otherwise
    every run matched by ``AQA_LABEL_NOISE`` is deleted from it.
    A falsy ``raw`` (``None`` or ``""``) yields ``""``.
    """
    text = raw if raw else ""
    squeezed = text.strip().translate(AQA_CIRCLED_DIGITS).replace(" ", "")
    contains_letter = re.search(r"[A-Za-z]", squeezed) is not None
    if not contains_letter:
        # Letter-free token: safe to drop bracket/noise glyphs.
        return AQA_LABEL_NOISE.sub("", squeezed)
    # Do not turn prose/option OCR artifacts like "36.7Q" into labels; PART_PREFIX
    # handles genuine glued label+prose cases from the raw text under the
    # label-column gate.
    return squeezed
 def _synthetic_label_bbox(page_lines, fallback):
    """Best-effort bbox for an OCR-missed AQA label, preserving the layout contract."""
    body = [bb for bb, _ in page_lines if 90 <= bb.get("l", 999) <= 170 and bb.get("t", 0) >= FOOTER_T]
    if body:
        top = max(body, key=lambda b: b.get("t", 0))
        return {"l": 48.0, "r": 104.0, "t": round(top["t"], 1), "b": round(top["b"], 1),
                "coord_origin": top.get("coord_origin", "BOTTOMLEFT")}
    if fallback:
        return dict(fallback)
    return {"l": 48.0, "r": 104.0, "t": 780.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}
 def aqa_questions_rapid(rapid_glob):
    """Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
      * GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
      * A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
      * A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
    parts = {}
    page_lines = defaultdict(list)        # page -> [(bbox, raw)] for deterministic inference
    mcq_cands = []                       # (page, NN, bbox) bare top-level candidates, in order
    top_cands = {}                        # NN -> (page, bbox) explicit top-level question headers
    for pg, d in _rapid_pages(rapid_glob):
        margin = []
        for t in d.get("texts", []):
            raw = (t.get("text") or "").strip()
-            s = raw.replace(" ", "")
+            s = _clean_aqa_label(raw)
            prov = t.get("prov") or []
            bb = prov[0].get("bbox") if prov else None
-            if bb is None or bb["l"] > 140:
+            if bb is None:
                continue
            page_lines[pg].append((bb, raw))
            if bb["l"] > 140:
                continue
            margin.append((bb, s))
            m = PART_RE.match(s)
@ -311,21 +342,67 @@ def aqa_questions_rapid(rapid_glob):
        nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
        digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
        for nbb, nn in nums:
            top_cands.setdefault(nn, (pg, nbb))
            ny = (nbb["t"] + nbb["b"]) / 2
            for dbb, dd in digs:
                dy = (dbb["t"] + dbb["b"]) / 2
                if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
                    parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
-    # Section B: walk MCQ candidates in reading order, accept the next number in sequence only
+    # Before Section-B handling, trim isolated high structured labels when a real MCQ run starts
-    structured_q = {int(lab.split(".")[0]) for lab in parts}
+    # immediately after the core structured section. This prevents OCR option text such as "36.7Q"
    # from moving the MCQ start from Q07 to Q37.
    q_nums = sorted({int(lab.split(".")[0]) for lab in parts if not lab.endswith(".0")})
    core_q = q_nums[:]
    while len(core_q) >= 2 and core_q[-1] - core_q[-2] > 2:
        core_q.pop()
    mcq_nums = {int(nn) for _, nn, _ in mcq_cands}
    if core_q and any(max(core_q) < n <= max(core_q) + 3 for n in mcq_nums):
        core_set = set(core_q)
        for lab in list(parts):
            if int(lab.split(".")[0]) not in core_set and not lab.endswith(".0"):
                parts.pop(lab, None)
    # Infer an OCR-dropped leading .1 part when later structured parts for the same question are
    # present. AQA papers overwhelmingly start structured questions at .1; this fixes pages where
    # RapidOCR reads the prose but misses the small boxed label, without changing schemas or model paths.
    by_q = defaultdict(list)
    for lab, v in parts.items():
        q, sub = lab.split(".")
        if sub != "0":
            by_q[q].append((int(sub), v))
    for q, vals in list(by_q.items()):
        if f"{q}.1" not in parts:
            first_sub, first_v = min(vals, key=lambda x: (x[0], x[1].get("page") or 999))
            if first_sub > 1 and first_v.get("page"):
                pg = int(first_v["page"])
                parts[f"{q}.1"] = {"page": pg, "bbox": _synthetic_label_bbox(page_lines.get(pg, []), first_v.get("bbox"))}
        subs = sorted(int(sub) for lab in parts for qq, sub in [lab.split(".")] if qq == q and sub != "0")
        # Fill only one-step internal OCR gaps with support on both sides; do not expand a lone
        # false high subpart into a whole run of synthetic labels.
        if len(subs) >= 3:
            for prev_sub, next_sub in zip(subs, subs[1:]):
                if next_sub - prev_sub == 2:
                    missing = prev_sub + 1
                    anchor = parts[f"{q}.{next_sub}"]
                    parts[f"{q}.{missing}"] = {"page": anchor.get("page"), "bbox": dict(anchor.get("bbox") or {})}
    # Preserve explicit one-part structured questions seen as a bare top-level header (for example
    # GCSE Combined Chemistry 03 with no decimal sub-label) without converting ordinary question
    # headers that already have .1/.2 children into extra .0 parts.
    present_q = {lab.split(".")[0] for lab in parts}
    for q, (pg, bb) in top_cands.items():
        if q not in present_q:
            parts.setdefault(f"{q}.0", {"page": pg, "bbox": bb})
    # Section B: walk MCQ candidates in reading order, accepting a tight increasing sequence.
    structured_q = set(core_q or [int(lab.split(".")[0]) for lab in parts])
    expect = (max(structured_q) + 1) if structured_q else 1
    mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0)))   # page, then top-down
    cand = {}                            # nn -> (page, bbox), first occurrence in reading order
    for pg, nn, bb in mcq_cands:
        cand.setdefault(int(nn), (pg, bb))
-    # Walk the sequence: take the exact expected number when present; only jump a small gap
+    # Take exact expected numbers; for tiny OCR gaps before the next real MCQ candidate, emit
-    # (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't
+    # deterministic placeholders so a single garbled number does not end Section B recovery.
    # truncate the section. Out-of-window noise (misread "60") never enters.
    seq = []
    while True:
        if expect in cand and expect not in structured_q:
@ -334,7 +411,10 @@ def aqa_questions_rapid(rapid_glob):
            continue
        nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
        if nxt:
-            expect = min(nxt)
+            jump_to = min(nxt)
            for missing in range(expect, jump_to):
                seq.append((missing, cand[jump_to]))
            expect = jump_to
            continue
        break
    # Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
--- a/tests/test_docling_extract.py
+++ b/tests/test_docling_extract.py
@ -0,0 +1,81 @@
 from api.services.docling.extract import aqa_questions_rapid
 def _text(raw, page, l, t, r=120, b=None):
    return {
        "text": raw,
        "prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
    }
 def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A stray bracket inside a margin label ("02].3") is still recovered as 02.3."""
    dump = tmp_path / "p1.json"
    dump.write_text(
        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    pattern = str(tmp_path / "p*.json")
    recovered = aqa_questions_rapid(pattern)
    assert "02.3" in recovered
 def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """With only 07.2 read (on page 2), 07.1 is synthesised on that same page.

    Page 2 has no other lines, so the synthetic bbox is expected to carry the
    07.2 label's own left edge (49).
    """
    prose_page = tmp_path / "p1.json"
    label_page = tmp_path / "p2.json"
    prose_page.write_text(
        '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    label_page.write_text(
        '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    recovered = aqa_questions_rapid(str(tmp_path / "p*.json"))
    inferred = recovered["07.1"]
    assert inferred["page"] == 2
    assert inferred["bbox"]["l"] == 49
    assert "07.2" in recovered
 def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """MCQ numbers 09 and 10 missing from the OCR run are emitted as placeholders."""
    import json

    seen_numbers = ["07", "08", "11", "12", "13"]
    items = [_text("06.1 Structured question before Section B", 1, 49, 820)]
    items.extend(
        _text(number + " Which statement is correct?", 1, 49, 780 - row * 40)
        for row, number in enumerate(seen_numbers)
    )
    (tmp_path / "p1.json").write_text(json.dumps({"texts": items}))
    recovered = aqa_questions_rapid(str(tmp_path / "p*.json"))
    expected = ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]
    for label in expected:
        assert label in recovered
 def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A circled-digit part ("01.③") is recognised as the plain label 01.3."""
    import json

    item = _text("01.③", 1, 49, 515, r=102, b=500)
    payload = json.dumps({"texts": [item]})
    (tmp_path / "p1.json").write_text(payload)
    recovered = aqa_questions_rapid(str(tmp_path / "p*.json"))
    assert "01.3" in recovered
 def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """Given 05.2, 05.3 and 05.5, both the leading 05.1 and the internal 05.4 are inferred."""
    import json

    page_one = [
        _text("05.2 Some question text", 1, 49, 700),
        _text("05.3 Middle question text", 1, 49, 620),
    ]
    page_two = [_text("05.5 Later question text", 2, 49, 740)]
    (tmp_path / "p1.json").write_text(json.dumps({"texts": page_one}))
    (tmp_path / "p2.json").write_text(json.dumps({"texts": page_two}))
    recovered = aqa_questions_rapid(str(tmp_path / "p*.json"))
    assert "05.1" in recovered
    assert "05.4" in recovered
    assert "05.5" in recovered
 def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A lone bare top-level header ("03") with no sub-labels is kept as 03.0."""
    import json

    header = _text("03", 1, 49, 775)
    (tmp_path / "p1.json").write_text(json.dumps({"texts": [header]}))
    recovered = aqa_questions_rapid(str(tmp_path / "p*.json"))
    assert "03.0" in recovered