feat(docling): B1-2 AQA label normalization + missing-.1 inference + MCQ gap (salvaged)
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
(cherry picked from commit a707a5afd92c5c9fb042486229d0ef11549a3f53)
This commit is contained in:
parent
52d1ece212
commit
76e11b0b06
@ -249,6 +249,11 @@ def extract_front_matter(lines, board, code):
|
|||||||
# ====================================================================== AQA
|
# ====================================================================== AQA
|
||||||
# --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
|
# --- v1 path: Docling JSON + RapidOCR boxed labels (the proven 95% recovery) -----
|
||||||
PART_RE = re.compile(r"^(\d{2})\.(\d)$") # 01.2
|
PART_RE = re.compile(r"^(\d{2})\.(\d)$") # 01.2
|
||||||
|
# OCR sometimes inserts bracket/noise glyphs inside boxed labels (e.g. "02].3").
|
||||||
|
# Every token is normalised, but only tokens that pass the label-column gate
# below are matched as labels, so body decimals remain protected.
|
||||||
|
AQA_LABEL_NOISE = re.compile(r"[^0-9.]+")
|
||||||
|
AQA_CIRCLED_DIGITS = str.maketrans({"①": "1", "②": "2", "③": "3", "④": "4", "⑤": "5", "⑥": "6", "⑦": "7", "⑧": "8", "⑨": "9"})
|
||||||
NUM_RE = re.compile(r"^(\d{2})$") # 08
|
NUM_RE = re.compile(r"^(\d{2})$") # 08
|
||||||
DIG_RE = re.compile(r"^(\d)$") # 4
|
DIG_RE = re.compile(r"^(\d)$") # 4
|
||||||
# A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
|
# A-level papers (7408) render the boxed label GLUED to the question text in one OCR token
|
||||||
@ -279,21 +284,47 @@ def _rapid_pages(rapid_glob):
|
|||||||
yield pg, json.load(open(fn))
|
yield pg, json.load(open(fn))
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_aqa_label(raw):
    """Normalise a raw OCR token into a candidate AQA label string.

    The token is stripped, circled digits are mapped through
    ``AQA_CIRCLED_DIGITS`` and ASCII spaces are removed.  A token that still
    contains an ASCII letter is returned in that compacted form; otherwise
    every run matched by ``AQA_LABEL_NOISE`` is deleted.

    A falsy ``raw`` (``None`` or ``""``) yields ``""``.
    """
    token = raw if raw else ""
    token = token.strip().translate(AQA_CIRCLED_DIGITS)
    token = "".join(token.split(" "))
    # Do not turn prose/option OCR artifacts like "36.7Q" into labels; PART_PREFIX handles
    # genuine glued label+prose cases from the raw text under the label-column gate.
    contains_letter = re.search(r"[A-Za-z]", token) is not None
    return token if contains_letter else AQA_LABEL_NOISE.sub("", token)
|
||||||
|
|
||||||
|
|
||||||
|
def _synthetic_label_bbox(page_lines, fallback):
    """Best-effort bbox for an OCR-missed AQA label, preserving the layout contract.

    ``page_lines`` is an iterable of ``(bbox, raw_text)`` pairs for one page.
    Among the bboxes whose left edge ``l`` lies in 90..170 and whose ``t`` is at
    least ``FOOTER_T``, the one with the largest ``t`` is used as the vertical
    anchor (first one wins on ties).  The result sits in the fixed label column
    (``l=48.0``, ``r=104.0``) with the anchor's ``t``/``b`` rounded to one
    decimal and the anchor's ``coord_origin`` (default ``"BOTTOMLEFT"``).

    With no anchor, a shallow copy of ``fallback`` is returned when it is
    truthy; otherwise a fixed default bbox is returned.
    """
    # NOTE(review): "largest t" is the highest line on the page only under a
    # bottom-left origin - confirm behaviour for other coordinate origins.
    anchor = None
    for box, _raw in page_lines:
        if not 90 <= box.get("l", 999) <= 170:
            continue
        if not box.get("t", 0) >= FOOTER_T:
            continue
        if anchor is None or box.get("t", 0) > anchor.get("t", 0):
            anchor = box

    if anchor is not None:
        synthetic = {"l": 48.0, "r": 104.0}
        synthetic["t"] = round(anchor["t"], 1)
        synthetic["b"] = round(anchor["b"], 1)
        synthetic["coord_origin"] = anchor.get("coord_origin", "BOTTOMLEFT")
        return synthetic

    if not fallback:
        return {"l": 48.0, "r": 104.0, "t": 780.0, "b": 760.0, "coord_origin": "BOTTOMLEFT"}
    return dict(fallback)
|
||||||
|
|
||||||
|
|
||||||
def aqa_questions_rapid(rapid_glob):
|
def aqa_questions_rapid(rapid_glob):
|
||||||
"""Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
|
"""Recover question labels from per-page RapidOCR dumps. Handles three AQA layouts:
|
||||||
* GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
|
* GCSE standalone label/number boxes (8463 — v1's NN.M + NUM/DIG pairing),
|
||||||
* A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
|
* A-level structured parts glued as a prefix ("01.1 An atom of..." — label column),
|
||||||
* A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
|
* A-level Section-B multiple choice: bare sequential top-levels -> NN.0."""
|
||||||
parts = {}
|
parts = {}
|
||||||
|
page_lines = defaultdict(list) # page -> [(bbox, raw)] for deterministic inference
|
||||||
mcq_cands = [] # (page, NN, bbox) bare top-level candidates, in order
|
mcq_cands = [] # (page, NN, bbox) bare top-level candidates, in order
|
||||||
|
top_cands = {} # NN -> (page, bbox) explicit top-level question headers
|
||||||
for pg, d in _rapid_pages(rapid_glob):
|
for pg, d in _rapid_pages(rapid_glob):
|
||||||
margin = []
|
margin = []
|
||||||
for t in d.get("texts", []):
|
for t in d.get("texts", []):
|
||||||
raw = (t.get("text") or "").strip()
|
raw = (t.get("text") or "").strip()
|
||||||
s = raw.replace(" ", "")
|
s = _clean_aqa_label(raw)
|
||||||
prov = t.get("prov") or []
|
prov = t.get("prov") or []
|
||||||
bb = prov[0].get("bbox") if prov else None
|
bb = prov[0].get("bbox") if prov else None
|
||||||
if bb is None or bb["l"] > 140:
|
if bb is None:
|
||||||
|
continue
|
||||||
|
page_lines[pg].append((bb, raw))
|
||||||
|
if bb["l"] > 140:
|
||||||
continue
|
continue
|
||||||
margin.append((bb, s))
|
margin.append((bb, s))
|
||||||
m = PART_RE.match(s)
|
m = PART_RE.match(s)
|
||||||
@ -311,21 +342,67 @@ def aqa_questions_rapid(rapid_glob):
|
|||||||
nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
|
nums = [(bb, NUM_RE.match(s).group(1)) for bb, s in margin if NUM_RE.match(s)]
|
||||||
digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
|
digs = [(bb, DIG_RE.match(s).group(1)) for bb, s in margin if DIG_RE.match(s)]
|
||||||
for nbb, nn in nums:
|
for nbb, nn in nums:
|
||||||
|
top_cands.setdefault(nn, (pg, nbb))
|
||||||
ny = (nbb["t"] + nbb["b"]) / 2
|
ny = (nbb["t"] + nbb["b"]) / 2
|
||||||
for dbb, dd in digs:
|
for dbb, dd in digs:
|
||||||
dy = (dbb["t"] + dbb["b"]) / 2
|
dy = (dbb["t"] + dbb["b"]) / 2
|
||||||
if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
|
if abs(ny - dy) < 12 and dbb["l"] >= nbb["l"]:
|
||||||
parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
|
parts.setdefault(f"{nn}.{dd}", {"page": pg, "bbox": nbb})
|
||||||
# Section B: walk MCQ candidates in reading order, accept the next number in sequence only
|
# Before Section-B handling, trim isolated high structured labels when a real MCQ run starts
|
||||||
structured_q = {int(lab.split(".")[0]) for lab in parts}
|
# immediately after the core structured section. This prevents OCR option text such as "36.7Q"
|
||||||
|
# from moving the MCQ start from Q07 to Q37.
|
||||||
|
q_nums = sorted({int(lab.split(".")[0]) for lab in parts if not lab.endswith(".0")})
|
||||||
|
core_q = q_nums[:]
|
||||||
|
while len(core_q) >= 2 and core_q[-1] - core_q[-2] > 2:
|
||||||
|
core_q.pop()
|
||||||
|
mcq_nums = {int(nn) for _, nn, _ in mcq_cands}
|
||||||
|
if core_q and any(max(core_q) < n <= max(core_q) + 3 for n in mcq_nums):
|
||||||
|
core_set = set(core_q)
|
||||||
|
for lab in list(parts):
|
||||||
|
if int(lab.split(".")[0]) not in core_set and not lab.endswith(".0"):
|
||||||
|
parts.pop(lab, None)
|
||||||
|
|
||||||
|
# Infer an OCR-dropped leading .1 part when later structured parts for the same question are
|
||||||
|
# present. AQA papers overwhelmingly start structured questions at .1; this fixes pages where
|
||||||
|
# RapidOCR reads the prose but misses the small boxed label, without changing schemas or model paths.
|
||||||
|
by_q = defaultdict(list)
|
||||||
|
for lab, v in parts.items():
|
||||||
|
q, sub = lab.split(".")
|
||||||
|
if sub != "0":
|
||||||
|
by_q[q].append((int(sub), v))
|
||||||
|
for q, vals in list(by_q.items()):
|
||||||
|
if f"{q}.1" not in parts:
|
||||||
|
first_sub, first_v = min(vals, key=lambda x: (x[0], x[1].get("page") or 999))
|
||||||
|
if first_sub > 1 and first_v.get("page"):
|
||||||
|
pg = int(first_v["page"])
|
||||||
|
parts[f"{q}.1"] = {"page": pg, "bbox": _synthetic_label_bbox(page_lines.get(pg, []), first_v.get("bbox"))}
|
||||||
|
subs = sorted(int(sub) for lab in parts for qq, sub in [lab.split(".")] if qq == q and sub != "0")
|
||||||
|
# Fill only one-step internal OCR gaps with support on both sides; do not expand a lone
|
||||||
|
# false high subpart into a whole run of synthetic labels.
|
||||||
|
if len(subs) >= 3:
|
||||||
|
for prev_sub, next_sub in zip(subs, subs[1:]):
|
||||||
|
if next_sub - prev_sub == 2:
|
||||||
|
missing = prev_sub + 1
|
||||||
|
anchor = parts[f"{q}.{next_sub}"]
|
||||||
|
parts[f"{q}.{missing}"] = {"page": anchor.get("page"), "bbox": dict(anchor.get("bbox") or {})}
|
||||||
|
|
||||||
|
# Preserve explicit one-part structured questions seen as a bare top-level header (for example
|
||||||
|
# GCSE Combined Chemistry 03 with no decimal sub-label) without converting ordinary question
|
||||||
|
# headers that already have .1/.2 children into extra .0 parts.
|
||||||
|
present_q = {lab.split(".")[0] for lab in parts}
|
||||||
|
for q, (pg, bb) in top_cands.items():
|
||||||
|
if q not in present_q:
|
||||||
|
parts.setdefault(f"{q}.0", {"page": pg, "bbox": bb})
|
||||||
|
|
||||||
|
# Section B: walk MCQ candidates in reading order, accepting a tight increasing sequence.
|
||||||
|
structured_q = set(core_q or [int(lab.split(".")[0]) for lab in parts])
|
||||||
expect = (max(structured_q) + 1) if structured_q else 1
|
expect = (max(structured_q) + 1) if structured_q else 1
|
||||||
mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0))) # page, then top-down
|
mcq_cands.sort(key=lambda c: (c[0], -(c[2] or {}).get("t", 0))) # page, then top-down
|
||||||
cand = {} # nn -> (page, bbox), first occurrence in reading order
|
cand = {} # nn -> (page, bbox), first occurrence in reading order
|
||||||
for pg, nn, bb in mcq_cands:
|
for pg, nn, bb in mcq_cands:
|
||||||
cand.setdefault(int(nn), (pg, bb))
|
cand.setdefault(int(nn), (pg, bb))
|
||||||
# Walk the sequence: take the exact expected number when present; only jump a small gap
|
# Take exact expected numbers; for tiny OCR gaps before the next real MCQ candidate, emit
|
||||||
# (<=3) when it's genuinely absent (OCR-garbled, e.g. "09"->"60") so one bad number doesn't
|
# deterministic placeholders so a single garbled number does not end Section B recovery.
|
||||||
# truncate the section. Out-of-window noise (misread "60") never enters.
|
|
||||||
seq = []
|
seq = []
|
||||||
while True:
|
while True:
|
||||||
if expect in cand and expect not in structured_q:
|
if expect in cand and expect not in structured_q:
|
||||||
@ -334,7 +411,10 @@ def aqa_questions_rapid(rapid_glob):
|
|||||||
continue
|
continue
|
||||||
nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
|
nxt = [n for n in cand if expect < n <= expect + 3 and n not in structured_q]
|
||||||
if nxt:
|
if nxt:
|
||||||
expect = min(nxt)
|
jump_to = min(nxt)
|
||||||
|
for missing in range(expect, jump_to):
|
||||||
|
seq.append((missing, cand[jump_to]))
|
||||||
|
expect = jump_to
|
||||||
continue
|
continue
|
||||||
break
|
break
|
||||||
# Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
|
# Only commit if this is a real Section-B MCQ run, not a few stray page/figure numbers on a
|
||||||
|
|||||||
81
tests/test_docling_extract.py
Normal file
81
tests/test_docling_extract.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
import json

from api.services.docling.extract import aqa_questions_rapid
|
||||||
|
|
||||||
|
|
||||||
|
def _text(raw, page, l, t, r=120, b=None):
|
||||||
|
return {
|
||||||
|
"text": raw,
|
||||||
|
"prov": [{"page_no": page, "bbox": {"l": l, "t": t, "r": r, "b": b if b is not None else t - 14, "coord_origin": "BOTTOMLEFT"}}],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_cleans_noisy_margin_label(tmp_path):
    """A stray bracket inside a margin label ("02].3") is stripped, giving "02.3"."""
    # Same item as the previous hand-written JSON, built with the shared helper.
    noisy = _text("02].3", 1, 49, 515, r=102, b=500)
    (tmp_path / "p1.json").write_text(json.dumps({"texts": [noisy]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "02.3" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_infers_missing_leading_part_from_next_part(tmp_path):
    """When only "07.2" is read, "07.1" is inferred on the page of "07.2".

    The inferred label is expected to carry the left edge of the "07.2" box
    (49), which is the only item on page 2.
    """
    prose = _text("Question prose starts here", 1, 114, 774, r=500, b=760)
    label = _text("07.2", 2, 49, 776, r=104, b=761)
    (tmp_path / "p1.json").write_text(json.dumps({"texts": [prose]}))
    (tmp_path / "p2.json").write_text(json.dumps({"texts": [label]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert parts["07.1"]["page"] == 2
    assert parts["07.1"]["bbox"]["l"] == 49
    assert "07.2" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_fills_small_mcq_gaps(tmp_path):
    """A Section-B run 07, 08, 11, 12, 13 also yields entries for the unread 09 and 10."""
    texts = [_text("06.1 Structured question before Section B", 1, 49, 820)]
    # One MCQ stem every 40 units down the page, starting at t=780.
    texts.extend(
        _text(nn + " Which statement is correct?", 1, 49, 780 - row * 40)
        for row, nn in enumerate(["07", "08", "11", "12", "13"])
    )
    (tmp_path / "p1.json").write_text(json.dumps({"texts": texts}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    expected = ["07.0", "08.0", "09.0", "10.0", "11.0", "12.0", "13.0"]
    # Report every missing label at once rather than stopping at the first.
    missing = [label for label in expected if label not in parts]
    assert not missing, f"labels not recovered: {missing}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_maps_circled_digit_labels(tmp_path):
    """A circled digit in the sub-part position ("01.③") is read as "01.3"."""
    circled = _text("01.③", 1, 49, 515, r=102, b=500)
    (tmp_path / "p1.json").write_text(json.dumps({"texts": [circled]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "01.3" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_infers_internal_structured_gap(tmp_path):
    """With 05.2, 05.3 and 05.5 read, the leading 05.1 and the internal 05.4 are inferred."""
    page_one = [
        _text("05.2 Some question text", 1, 49, 700),
        _text("05.3 Middle question text", 1, 49, 620),
    ]
    page_two = [_text("05.5 Later question text", 2, 49, 740)]
    (tmp_path / "p1.json").write_text(json.dumps({"texts": page_one}))
    (tmp_path / "p2.json").write_text(json.dumps({"texts": page_two}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "05.1" in parts
    assert "05.4" in parts
    assert "05.5" in parts
|
||||||
|
|
||||||
|
|
||||||
|
def test_aqa_rapid_keeps_bare_single_part_question(tmp_path):
    """A lone bare top-level header ("03") with no sub-parts is kept as "03.0"."""
    header = _text("03", 1, 49, 775)
    (tmp_path / "p1.json").write_text(json.dumps({"texts": [header]}))

    parts = aqa_questions_rapid(str(tmp_path / "p*.json"))

    assert "03.0" in parts
|
||||||
Loading…
x
Reference in New Issue
Block a user