diff --git a/api/services/docling/extract.py b/api/services/docling/extract.py
index 222dc9a..994f0cd 100755
--- a/api/services/docling/extract.py
+++ b/api/services/docling/extract.py
@@ -249,6 +249,11 @@ def extract_front_matter(lines, board, code):
 # ====================================================================== AQA
 # --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
 PART_RE = re.compile(r"^(\d{2})\.(\d)$")     # 01.2
+# OCR sometimes inserts bracket/noise glyphs inside boxed labels (e.g. "02].3").
+# Every token is normalised before matching, but only tokens that pass the
+# label-column gate below are ever matched, so body decimals remain protected.
+AQA_LABEL_NOISE = re.compile(r"[^0-9.]+")  # runs of anything other than ASCII digits and dots
+AQA_CIRCLED_DIGITS = str.maketrans({"①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5", "⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9"})  # circled 1-9 only; circled zero is not mapped
 NUM_RE  = re.compile(r"^(\d{2})$")           # 08
 DIG_RE  = re.compile(r"^(\d)$")              # 4
 # A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
@@ -279,21 +284,47 @@ def _rapid_pages(rapid_glob):
         yield pg, json.load(open(fn))
 
 
+def _clean_aqa_label(raw):
+    """Normalise one OCR token for label matching; a token containing an ASCII letter keeps its other glyphs."""
+    token = "" if not raw else raw.strip()
+    token = token.translate(AQA_CIRCLED_DIGITS).replace(" ", "")
+    # A letter marks prose/option text such as "36.7Q": stripping it would fabricate a label.
+    has_letter = any(ch.isascii() and ch.isalpha() for ch in token)
+    return token if has_letter else AQA_LABEL_NOISE.sub("", token)
+
+
+def _synthetic_label_bbox(page_lines, fallback):
+    """Return a label-column bbox level with the page's highest-`t` body line, else a copy of `fallback`, else a fixed default."""
+    anchor = None
+    for box, _raw in page_lines:
+        # Body-line candidates: left edge within 90..170 and `t` at or above FOOTER_T; the first maximum `t` wins.
+        if 90 <= box.get("l", 999) <= 170 and box.get("t", 0) >= FOOTER_T:
+            anchor = box if anchor is None or box.get("t", 0) > anchor.get("t", 0) else anchor
+    if anchor is None:
+        return dict(fallback) if fallback else {"l": 48.0, "r": 104.0, "t": 780.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}
+    return {"l": 48.0, "r": 104.0, "t": round(anchor["t"], 1), "b": round(anchor["b"], 1), "coord_origin": anchor.get("coord_origin", "BOTTOMLEFT")}
+
+
 def aqa_questions_rapid(rapid_glob):
     """Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
       * GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
       * A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
       * A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
     parts = {}
+    page_lines = defaultdict(list)        # page -> [(bbox, raw)] for deterministic inference
     mcq_cands = []                       # (page, NN, bbox) bare top-level candidates, in order
+    top_cands = {}                        # NN -> (page, bbox) explicit top-level question headers
     for pg, d in _rapid_pages(rapid_glob):
         margin = []
         for t in d.get("texts", []):
             raw = (t.get("text") or "").strip()
-            s = raw.replace(" ", "")
+            s = _clean_aqa_label(raw)
             prov = t.get("prov") or []
             bb = prov[0].get("bbox") if prov else None
-            if bb is None or bb["l"] > 140:
+            if bb is None:
+                continue
+            page_lines[pg].append((bb, raw))
+            if bb["l"] > 140:
                 continue
             margin.append((bb, s))
             m = PART_RE.match(s)
@@ -311,21 +342,67 @@ def aqa_questions_rapid(rapid_glob):
         nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
         digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
         for nbb, nn in nums:
+            top_cands.setdefault(nn, (pg, nbb))
             ny = (nbb["t"] + nbb["b"]) / 2
             for dbb, dd in digs:
                 dy = (dbb["t"] + dbb["b"]) / 2
                 if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
                     parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
-    # Section B: walk MCQ candidates in reading order, accept the next number in sequence only
-    structured_q = {int(lab.split(".")[0]) for lab in parts}
+    # Before Section-B handling, trim isolated high structured labels when a real MCQ run starts
+    # immediately after the core structured section. This prevents OCR option text such as "36.7Q"
+    # from moving the MCQ start from Q07 to Q37.
+    q_nums = sorted({int(lab.split(".")[0]) for lab in parts if not lab.endswith(".0")})
+    core_q = q_nums[:]
+    while len(core_q) >= 2 and core_q[-1] - core_q[-2] > 2:
+        core_q.pop()
+    mcq_nums = {int(nn) for _, nn, _ in mcq_cands}
+    if core_q and any(max(core_q) < n <= max(core_q) + 3 for n in mcq_nums):
+        core_set = set(core_q)
+        for lab in list(parts):
+            if int(lab.split(".")[0]) not in core_set and not lab.endswith(".0"):
+                parts.pop(lab, None)
+
+    # Infer an OCR-dropped leading .1 part when later structured parts for the same question are
+    # present. AQA papers overwhelmingly start structured questions at .1; this fixes pages where
+    # RapidOCR reads the prose but misses the small boxed label, without changing schemas or model paths.
+    by_q = defaultdict(list)
+    for lab, v in parts.items():
+        q, sub = lab.split(".")
+        if sub != "0":
+            by_q[q].append((int(sub), v))
+    for q, vals in list(by_q.items()):
+        if f"{q}.1" not in parts:
+            first_sub, first_v = min(vals, key=lambda x: (x[0], x[1].get("page") or 999))
+            if first_sub > 1 and first_v.get("page"):
+                pg = int(first_v["page"])
+                parts[f"{q}.1"] = {"page": pg, "bbox": _synthetic_label_bbox(page_lines.get(pg, []), first_v.get("bbox"))}
+        subs = sorted(int(sub) for lab in parts for qq, sub in [lab.split(".")] if qq == q and sub != "0")
+        # Fill only one-step internal OCR gaps with support on both sides; do not expand a lone
+        # false high subpart into a whole run of synthetic labels.
+        if len(subs) >= 3:
+            for prev_sub, next_sub in zip(subs, subs[1:]):
+                if next_sub - prev_sub == 2:
+                    missing = prev_sub + 1
+                    anchor = parts[f"{q}.{next_sub}"]
+                    parts[f"{q}.{missing}"] = {"page": anchor.get("page"), "bbox": dict(anchor.get("bbox") or {})}
+
+    # Preserve explicit one-part structured questions seen as a bare top-level header (for example
+    # GCSE Combined Chemistry 03 with no decimal sub-label) without converting ordinary question
+    # headers that already have .1/.2 children into extra .0 parts.
+    present_q = {lab.split(".")[0] for lab in parts}
+    for q, (pg, bb) in top_cands.items():
+        if q not in present_q:
+            parts.setdefault(f"{q}.0", {"page": pg, "bbox": bb})
+
+    # Section B: walk MCQ candidates in reading order, accepting a tight increasing sequence.
+    structured_q = set(core_q or [int(lab.split(".")[0]) for lab in parts])
     expect = (max(structured_q) + 1) if structured_q else 1
     mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0)))   # page, then top-down
     cand = {}                            # nn -> (page, bbox), first occurrence in reading order
     for pg, nn, bb in mcq_cands:
         cand.setdefault(int(nn), (pg, bb))
-    # Walk the sequence: take the exact expected number when present; only jump a small gap
-    # (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't
-    # truncate the section. Out-of-window noise (misread "60") never enters.
+    # Take exact expected numbers; for tiny OCR gaps before the next real MCQ candidate, emit
+    # deterministic placeholders so a single garbled number does not end Section B recovery.
     seq = []
     while True:
         if expect in cand and expect not in structured_q:
@@ -334,7 +411,10 @@ def aqa_questions_rapid(rapid_glob):
             continue
         nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
         if nxt:
-            expect = min(nxt)
+            jump_to = min(nxt)
+            for missing in range(expect, jump_to):
+                seq.append((missing, cand[jump_to]))
+            expect = jump_to
             continue
         break
     # Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
diff --git a/tests/test_docling_extract.py b/tests/test_docling_extract.py
new file mode 100644
index 0000000..8e9d426
--- /dev/null
+++ b/tests/test_docling_extract.py
@@ -0,0 +1,81 @@
+from api.services.docling.extract import aqa_questions_rapid
+
+
+def _text(raw, page, l, t, r=120, b=None):
+    """Build one OCR text item (text plus a single prov bbox); `b` defaults to `t - 14`."""
+    bottom = t - 14 if b is None else b
+    bbox = {"l": l, "t": t, "r": r, "b": bottom, "coord_origin": "BOTTOMLEFT"}
+    return {"text": raw, "prov": [{"page_no": page, "bbox": bbox}]}
+
+
+def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
+    """A stray bracket glyph inside a margin label ("02].3") must still be recovered as "02.3"."""
+    page_json = (
+        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
+    )
+    tmp_path.joinpath("p1.json").write_text(page_json)
+    recovered = aqa_questions_rapid(str(tmp_path / "p*.json"))
+    assert "02.3" in recovered
+
+
+def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
+    """A lone "07.2" label must yield a synthetic "07.1" on page 2 with left edge 49."""
+    pages = {
+        "p1.json": '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}',
+        "p2.json": '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}',
+    }
+    for filename, payload in pages.items():
+        (tmp_path / filename).write_text(payload)
+
+    recovered = aqa_questions_rapid(str(tmp_path / "p*.json"))
+    inferred = recovered["07.1"]
+    assert (inferred["page"], inferred["bbox"]["l"]) == (2, 49)
+    assert "07.2" in recovered
+
+
+def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
+    """MCQ headers 07, 08, 11, 12, 13 after 06.1 must also produce entries for the skipped 09 and 10."""
+    import json
+    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
+    headers = ["07", "08", "11", "12", "13"]
+    texts += [_text(n + " Which statement is correct?", 1, 49, 780 - idx * 40) for idx, n in enumerate(headers)]
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}))
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+    expected = ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]
+    missing = [label for label in expected if label not in parts]
+    assert not missing
+
+
+def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
+    """A circled digit in the part position ("01.③") must be read as its ASCII digit ("01.3")."""
+    import json
+    circled = _text("01.③", 1, 49, 515, r=102, b=500)
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": [circled]}))
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+    assert "01.3" in parts
+
+
+def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
+    """Seeing 05.2, 05.3 and 05.5 must also yield the missing leading 05.1 and the one-step gap 05.4."""
+    import json
+    page_one = [
+        _text("05.2 Some question text", 1, 49, 700),
+        _text("05.3 Middle question text", 1, 49, 620),
+    ]
+    page_two = [_text("05.5 Later question text", 2, 49, 740)]
+    for filename, items in (("p1.json", page_one), ("p2.json", page_two)):
+        (tmp_path / filename).write_text(json.dumps({"texts": items}))
+
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+
+    for label in ("05.1", "05.4", "05.5"):
+        assert label in parts
+
+
+def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
+    """A lone bare "03" header with no decimal sub-labels must be kept as "03.0"."""
+    import json
+    header = _text("03", 1, 49, 775)
+    (tmp_path / "p1.json").write_text(json.dumps({"texts": [header]}))
+    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))
+    assert "03.0" in parts