feat(docling): B1-2 AQA label normalization + missing-.1 inference + MCQ gap (salvaged)
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled

(cherry picked from commit a707a5afd92c5c9fb042486229d0ef11549a3f53)
This commit is contained in:
kcar 2026-06-08 05:03:05 +01:00 committed by CC Worker
parent 52d1ece212
commit 76e11b0b06
2 changed files with 169 additions and 8 deletions

View File

@ -249,6 +249,11 @@ def extract_front_matter(lines, board, code):
# ====================================================================== AQA # ====================================================================== AQA
# --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) ----- # --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
# Label-shape patterns for the AQA margin column.
PART_RE = re.compile(r"^(\d{2})\.(\d)$")  # 01.2
# OCR sometimes inserts bracket/noise glyphs inside boxed labels (e.g. "02].3").
# Normalise only tight margin-column candidates before matching; body decimals
# remain protected by the label-column gate below.
AQA_LABEL_NOISE = re.compile(r"[^0-9.]+")
# Translation table mapping the circled digits U+2460..U+2468 to ASCII "1".."9".
# The table is built from code points rather than literal glyphs so it cannot be
# emptied by an encoding-lossy copy of this file (str.maketrans rejects "" keys).
# NOTE(review): glyph family reconstructed from the circled-three label case;
# confirm no other circled/dingbat digit family was intended.
AQA_CIRCLED_DIGITS = str.maketrans({chr(0x2460 + i): str(i + 1) for i in range(9)})
NUM_RE = re.compile(r"^(\d{2})$")  # 08
DIG_RE = re.compile(r"^(\d)$")  # 4
# A-level papers (7408) render the boxed label GLUED to the question text in one OCR token # A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
@ -279,21 +284,47 @@ def _rapid_pages(rapid_glob):
yield pg, json.load(open(fn)) yield pg, json.load(open(fn))
def _clean_aqa_label(raw):
    """Normalise a raw OCR token into a candidate AQA label string.

    The token is stripped, passed through the ``AQA_CIRCLED_DIGITS``
    translation table, and has its spaces removed. If any ASCII letter
    remains, the compacted text is returned untouched; otherwise every run
    matched by ``AQA_LABEL_NOISE`` is deleted. ``None`` is treated as "".
    """
    text = raw or ""
    text = text.strip().translate(AQA_CIRCLED_DIGITS)
    text = text.replace(" ", "")
    # Do not turn prose/option OCR artifacts like "36.7Q" into labels; PART_PREFIX handles
    # genuine glued label+prose cases from the raw text under the label-column gate.
    contains_letter = re.search(r"[A-Za-z]", text) is not None
    if not contains_letter:
        return AQA_LABEL_NOISE.sub("", text)
    return text
def _synthetic_label_bbox(page_lines, fallback):
    """Best-effort bbox for an OCR-missed AQA label, preserving the layout contract.

    ``page_lines`` is an iterable of ``(bbox, raw_text)`` pairs for one page.
    Among lines whose left edge lies in 90..170 and whose ``t`` is at least
    ``FOOTER_T``, the one with the largest ``t`` donates its vertical extent
    to a fixed-width box (l=48, r=104). When no line qualifies, a copy of
    ``fallback`` is returned if it is truthy, else a hard-coded default box.
    """
    anchor = None
    for box, _ in page_lines:
        qualifies = 90 <= box.get("l", 999) <= 170 and box.get("t", 0) >= FOOTER_T
        if not qualifies:
            continue
        # Strict ">" keeps the first line on ties, as max() does.
        if anchor is None or box.get("t", 0) > anchor.get("t", 0):
            anchor = box

    if anchor is not None:
        return {
            "l": 48.0,
            "r": 104.0,
            "t": round(anchor["t"], 1),
            "b": round(anchor["b"], 1),
            "coord_origin": anchor.get("coord_origin", "BOTTOMLEFT"),
        }
    if not fallback:
        return {"l": 48.0, "r": 104.0, "t": 780.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}
    return dict(fallback)
def aqa_questions_rapid(rapid_glob): def aqa_questions_rapid(rapid_glob):
"""Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts: """Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
* GCSE standalone label/number boxes (8463 v1's NN.M + NUM/DIG pairing), * GCSE standalone label/number boxes (8463 v1's NN.M + NUM/DIG pairing),
* A-level structured parts glued as a prefix ("01.1 An atom of..." label column), * A-level structured parts glued as a prefix ("01.1 An atom of..." label column),
* A-level Section-B multiple choice: bare sequential top-levels -> NN.0.""" * A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
parts = {} parts = {}
page_lines = defaultdict(list) # page -> [(bbox, raw)] for deterministic inference
mcq_cands = [] # (page, NN, bbox) bare top-level candidates, in order mcq_cands = [] # (page, NN, bbox) bare top-level candidates, in order
top_cands = {} # NN -> (page, bbox) explicit top-level question headers
for pg, d in _rapid_pages(rapid_glob): for pg, d in _rapid_pages(rapid_glob):
margin = [] margin = []
for t in d.get("texts", []): for t in d.get("texts", []):
raw = (t.get("text") or "").strip() raw = (t.get("text") or "").strip()
s = raw.replace(" ", "") s = _clean_aqa_label(raw)
prov = t.get("prov") or [] prov = t.get("prov") or []
bb = prov[0].get("bbox") if prov else None bb = prov[0].get("bbox") if prov else None
if bb is None or bb["l"] > 140: if bb is None:
continue
page_lines[pg].append((bb, raw))
if bb["l"] > 140:
continue continue
margin.append((bb, s)) margin.append((bb, s))
m = PART_RE.match(s) m = PART_RE.match(s)
@ -311,21 +342,67 @@ def aqa_questions_rapid(rapid_glob):
nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)] nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)] digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
for nbb, nn in nums: for nbb, nn in nums:
top_cands.setdefault(nn, (pg, nbb))
ny = (nbb["t"] + nbb["b"]) / 2 ny = (nbb["t"] + nbb["b"]) / 2
for dbb, dd in digs: for dbb, dd in digs:
dy = (dbb["t"] + dbb["b"]) / 2 dy = (dbb["t"] + dbb["b"]) / 2
if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]: if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb}) parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
# Section B: walk MCQ candidates in reading order, accept the next number in sequence only # Before Section-B handling, trim isolated high structured labels when a real MCQ run starts
structured_q = {int(lab.split(".")[0]) for lab in parts} # immediately after the core structured section. This prevents OCR option text such as "36.7Q"
# from moving the MCQ start from Q07 to Q37.
q_nums = sorted({int(lab.split(".")[0]) for lab in parts if not lab.endswith(".0")})
core_q = q_nums[:]
while len(core_q) >= 2 and core_q[-1] - core_q[-2] > 2:
core_q.pop()
mcq_nums = {int(nn) for _, nn, _ in mcq_cands}
if core_q and any(max(core_q) < n <= max(core_q) + 3 for n in mcq_nums):
core_set = set(core_q)
for lab in list(parts):
if int(lab.split(".")[0]) not in core_set and not lab.endswith(".0"):
parts.pop(lab, None)
# Infer an OCR-dropped leading .1 part when later structured parts for the same question are
# present. AQA papers overwhelmingly start structured questions at .1; this fixes pages where
# RapidOCR reads the prose but misses the small boxed label, without changing schemas or model paths.
by_q = defaultdict(list)
for lab, v in parts.items():
q, sub = lab.split(".")
if sub != "0":
by_q[q].append((int(sub), v))
for q, vals in list(by_q.items()):
if f"{q}.1" not in parts:
first_sub, first_v = min(vals, key=lambda x: (x[0], x[1].get("page") or 999))
if first_sub > 1 and first_v.get("page"):
pg = int(first_v["page"])
parts[f"{q}.1"] = {"page": pg, "bbox": _synthetic_label_bbox(page_lines.get(pg, []), first_v.get("bbox"))}
subs = sorted(int(sub) for lab in parts for qq, sub in [lab.split(".")] if qq == q and sub != "0")
# Fill only one-step internal OCR gaps with support on both sides; do not expand a lone
# false high subpart into a whole run of synthetic labels.
if len(subs) >= 3:
for prev_sub, next_sub in zip(subs, subs[1:]):
if next_sub - prev_sub == 2:
missing = prev_sub + 1
anchor = parts[f"{q}.{next_sub}"]
parts[f"{q}.{missing}"] = {"page": anchor.get("page"), "bbox": dict(anchor.get("bbox") or {})}
# Preserve explicit one-part structured questions seen as a bare top-level header (for example
# GCSE Combined Chemistry 03 with no decimal sub-label) without converting ordinary question
# headers that already have .1/.2 children into extra .0 parts.
present_q = {lab.split(".")[0] for lab in parts}
for q, (pg, bb) in top_cands.items():
if q not in present_q:
parts.setdefault(f"{q}.0", {"page": pg, "bbox": bb})
# Section B: walk MCQ candidates in reading order, accepting a tight increasing sequence.
structured_q = set(core_q or [int(lab.split(".")[0]) for lab in parts])
expect = (max(structured_q) + 1) if structured_q else 1 expect = (max(structured_q) + 1) if structured_q else 1
mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0))) # page, then top-down mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0))) # page, then top-down
cand = {} # nn -> (page, bbox), first occurrence in reading order cand = {} # nn -> (page, bbox), first occurrence in reading order
for pg, nn, bb in mcq_cands: for pg, nn, bb in mcq_cands:
cand.setdefault(int(nn), (pg, bb)) cand.setdefault(int(nn), (pg, bb))
# Walk the sequence: take the exact expected number when present; only jump a small gap # Take exact expected numbers; for tiny OCR gaps before the next real MCQ candidate, emit
# (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't # deterministic placeholders so a single garbled number does not end Section B recovery.
# truncate the section. Out-of-window noise (misread "60") never enters.
seq = [] seq = []
while True: while True:
if expect in cand and expect not in structured_q: if expect in cand and expect not in structured_q:
@ -334,7 +411,10 @@ def aqa_questions_rapid(rapid_glob):
continue continue
nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q] nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
if nxt: if nxt:
expect = min(nxt) jump_to = min(nxt)
for missing in range(expect, jump_to):
seq.append((missing, cand[jump_to]))
expect = jump_to
continue continue
break break
# Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a # Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a

View File

@ -0,0 +1,81 @@
from api.services.docling.extract import aqa_questions_rapid
def _text(raw, page, l, t, r=120, b=None):
return {
"text": raw,
"prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
}
def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A margin label read as "02].3" must be recovered as "02.3"."""
    page_file = tmp_path / "p1.json"
    page_file.write_text(
        '{"texts":[{"text":"02].3","prov":[{"page_no":1,"bbox":{"l":49,"t":515,"r":102,"b":500,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    pattern = str(tmp_path / "p*.json")
    labels = aqa_questions_rapid(pattern)
    assert "02.3" in labels
def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """With only "07.2" read on page 2, a "07.1" entry must be inferred.

    The inferred entry is expected on page 2 with a bbox whose left edge is 49.
    """
    prose_page = tmp_path / "p1.json"
    label_page = tmp_path / "p2.json"
    prose_page.write_text(
        '{"texts":[{"text":"Question prose starts here","prov":[{"page_no":1,"bbox":{"l":114,"t":774,"r":500,"b":760,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    label_page.write_text(
        '{"texts":[{"text":"07.2","prov":[{"page_no":2,"bbox":{"l":49,"t":776,"r":104,"b":761,"coord_origin":"BOTTOMLEFT"}}]}]}'
    )
    labels = aqa_questions_rapid(str(tmp_path / "p*.json"))
    inferred = labels["07.1"]
    assert inferred["page"] == 2
    assert inferred["bbox"]["l"] == 49
    assert "07.2" in labels
def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """A bare-number run 07, 08, 11, 12, 13 after 06.1 must yield 07.0..13.0 inclusive."""
    import json

    structured = _text("06.1 Structured question before Section B", 1, 49, 820)
    seen_numbers = ["07", "08", "11", "12", "13"]
    mcq_items = [
        _text(number + " Which statement is correct?", 1, 49, 780 - position * 40)
        for position, number in enumerate(seen_numbers)
    ]
    payload = {"texts": [structured, *mcq_items]}
    (tmp_path / "p1.json").write_text(json.dumps(payload))

    labels = aqa_questions_rapid(str(tmp_path / "p*.json"))
    expected = ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]
    absent = [label for label in expected if label not in labels]
    assert not absent
def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A label whose sub-part digit is a circled three must be recovered as "01.3"."""
    import json

    item = _text("01.③", 1, 49, 515, r=102, b=500)
    page_file = tmp_path / "p1.json"
    page_file.write_text(json.dumps({"texts": [item]}))
    labels = aqa_questions_rapid(str(tmp_path / "p*.json"))
    assert "01.3" in labels
def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """With 05.2, 05.3 (page 1) and 05.5 (page 2) read, 05.1 and 05.4 must be inferred."""
    import json

    first_page = [
        _text("05.2 Some question text", 1, 49, 700),
        _text("05.3 Middle question text", 1, 49, 620),
    ]
    second_page = [
        _text("05.5 Later question text", 2, 49, 740),
    ]
    for filename, items in (("p1.json", first_page), ("p2.json", second_page)):
        (tmp_path / filename).write_text(json.dumps({"texts": items}))

    labels = aqa_questions_rapid(str(tmp_path / "p*.json"))
    assert "05.1" in labels
    assert "05.4" in labels
    assert "05.5" in labels
def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A lone bare "03" header with no sub-labels must be kept as "03.0"."""
    import json

    header = _text("03", 1, 49, 775)
    page_file = tmp_path / "p1.json"
    page_file.write_text(json.dumps({"texts": [header]}))
    labels = aqa_questions_rapid(str(tmp_path / "p*.json"))
    assert "03.0" in labels