[verified] generalize B1 response regions and marks gap fill
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled

This commit is contained in:
kcar 2026-06-08 04:49:21 +01:00
parent 69d9c46abe
commit 52d1ece212
4 changed files with 179 additions and 15 deletions

View File

@ -40,6 +40,10 @@ try:
from . import tables as tbl_mod
except ImportError: # pragma: no cover - CLI execution
import tables as tbl_mod
try:
from . import regions as region_mod
except ImportError: # pragma: no cover - CLI execution
import regions as region_mod
# ----------------------------------------------------------------- line model
Line = namedtuple("Line", "text page bbox") # bbox is None for text-only sources
@ -521,6 +525,11 @@ def docling_regions(doc):
return regions
def _norm_region_type(kind):
kind = (kind or "answer_lines").strip().lower().replace("-", "_")
return kind if kind in {"answer_lines", "answer_box", "working_space"} else "working_space"
def merge_gemma(parts, gemma_dir):
"""Attach gemma4:e4b answer_regions (#3) to parts by for_part; gap-fill missing marks."""
n_reg = n_fill = 0
@ -529,8 +538,9 @@ def merge_gemma(parts, gemma_dir):
for r in d.get("answer_regions", []):
lab = _norm_label(r.get("for_part", ""))
if lab in parts:
parts[lab]["regions"].append({"type": r.get("kind", "answer_lines"),
"source": "gemma"})
parts[lab]["regions"].append({"type": _norm_region_type(r.get("kind", "answer_lines")),
"source": "gemma",
**({"bbox": r.get("bbox")} if r.get("bbox") else {})})
n_reg += 1
for qp in d.get("question_parts", []):
lab = _norm_label(qp.get("label", ""))
@ -548,6 +558,70 @@ def _norm_label(s):
return s
def attach_detected_response_regions(parts, pdf_path):
    """Detect response regions in a PDF with OpenCV and attach each to a known part.

    This is the deterministic answer-region backbone used before/alongside gemma. Attached
    entries use the same answer_lines / answer_box / working_space taxonomy, so the mapper
    schema is unchanged. Per the detector's contract, candidate bboxes are rendered-page
    TOPLEFT pixels; they are stored on the part as-is.

    Matching is per page. Only parts that carry both a page number and a bbox compete.
    The candidate's vertical centre is converted to a y value that grows upwards, using the
    page heights in ``meta``, and compared with each part's bbox midpoint.
    NOTE(review): this assumes part bboxes use ``t``/``b`` in PDF units with ``t`` above
    ``b`` -- confirm against the producer of ``parts``.

    Args:
        parts: mapping of part label -> part dict; mutated in place (``regions`` list).
        pdf_path: path of the PDF to scan. Falsy or non-existent paths are a no-op.

    Returns:
        ``(attached, candidates)``: the number of candidates attached to a part, and the
        full candidate list returned by the detector. ``(0, [])`` when the PDF is missing
        or detection raised.
    """
    if not (pdf_path and os.path.exists(pdf_path)):
        return 0, []

    # Detection is best-effort: report why it was skipped and carry on without regions.
    try:
        candidates = region_mod.detect_response_regions_from_pdf(pdf_path, min_confidence=0.32)
    except RuntimeError as exc:
        print(f"response-regions : unavailable ({exc})")
        return 0, []
    except Exception as exc:
        print(f"response-regions : failed ({exc})")
        return 0, []

    # Index the parts that can take part in matching by their page number.
    positioned = defaultdict(list)
    for label, info in parts.items():
        if info.get("page") is None or not info.get("bbox"):
            continue
        positioned[int(info["page"])].append((label, info))

    n_attached = 0
    for candidate in candidates:
        # regions.py page_index is zero-based; extraction/template parts are one-based.
        page_no = int(candidate.get("page_index", 0)) + 1
        contenders = positioned.get(page_no) or []
        if not contenders:
            continue

        box = candidate.get("bbox") or {}
        meta = candidate.get("meta") or {}
        centre_px = float(box.get("y", 0)) + float(box.get("h", 0)) / 2
        height_px = float(meta.get("page_height_px") or 0)
        height_pdf = float(meta.get("page_height_pdf") or 0)
        if height_px > 0 and height_pdf > 0:
            # Flip the top-left pixel offset into a y that grows upwards, in PDF units.
            region_y = (1.0 - centre_px / height_px) * height_pdf
        else:
            # No page dimensions: fall back to the negated pixel offset.
            region_y = -centre_px

        winner = None
        winner_score = 1e9
        for label, info in contenders:
            anchor = info.get("bbox") or {}
            top = float(anchor.get("t", 0))
            label_mid = (top + float(anchor.get("b", 0))) / 2
            distance = abs(label_mid - region_y)
            # Prefer the nearest label above/near the response area; a small penalty keeps
            # previous-part assignment stable when regions sit between two labels.
            if region_y <= top + 18:
                score = distance
            else:
                score = distance + 120
            if score < winner_score:
                winner, winner_score = label, score

        if not winner:
            continue

        bucket = parts[winner].setdefault("regions", [])
        entry = {
            "type": _norm_region_type(candidate.get("region_type")),
            "source": "opencv",
            "confidence": candidate.get("confidence"),
            "bbox": box,
            "detection_method": candidate.get("detection_method"),
        }
        line_count = candidate.get("line_count")
        if line_count is not None:
            entry["line_count"] = line_count
        bucket.append(entry)
        n_attached += 1

    return n_attached, candidates
def extract_tables(parts, doc, granite="off", pdf=None, cache_glob=None):
"""Selective table-cell extraction (PLAN.md §B): standard TableFormer grids always; Granite
<otsl> on router-flagged pages when granite!='off'. Returns (data_tables, all_tables).
@ -626,7 +700,7 @@ GT_PARTS_PHYSICS = ["01.1","01.2","01.3","01.4","02.1","02.2","02.3","02.4","03.
"10.1","10.2","10.3","11.1","11.2","11.3","11.4"]
# official paper maxima — the strongest grammar sanity check (marks_sum should match)
EXPECTED_MAX = {"8463": 100, "7408": 85, "8461": 100, "1MA1": 80, "H556": 70}
EXPECTED_MAX = {"8463": 100, "7408": 85, "7402": 91, "7405": 105, "8461": 100, "8462": 100, "8464": 70, "1MA1": 80, "H556": 70}
def expected_max(code):
@ -666,6 +740,7 @@ def main():
ap.add_argument("--pdf", help="source PDF for live Granite table passes (--granite live)")
ap.add_argument("--rapid", help="AQA RapidOCR per-page glob (the v1 95% path)")
ap.add_argument("--gemma", help="gemma sweep dir with p*.json answer_regions")
ap.add_argument("--response-regions", dest="response_regions_pdf", help="PDF to scan with deterministic response-region detector and attach to parts")
ap.add_argument("--marks-fill", dest="marks_fill",
help="gemma_marks.py fills JSON: fill marks=None parts (Edexcel/OCR (N)/[N] gap-fill)")
ap.add_argument("--granite", default="off", choices=["off", "cached", "live"],
@ -673,6 +748,7 @@ def main():
ap.add_argument("--granite-cache", default="results/VLM_granite_p*.doctags",
help="glob of cached *.doctags for --granite cached / live fallback")
ap.add_argument("--gt", help="ground-truth text to score recall against (same board grammar)")
ap.add_argument("--expected-max", type=int, help="authoritative paper max marks for OCR eval harnesses when front matter/code OCR is missing")
ap.add_argument("--board", default="auto", choices=["auto", "aqa", "edexcel", "ocr"])
ap.add_argument("--out", default="results/structured.json")
a = ap.parse_args()
@ -751,6 +827,11 @@ def main():
n_reg = n_fill = 0
if a.gemma and os.path.isdir(a.gemma):
n_reg, n_fill = merge_gemma(parts, a.gemma)
n_cv_regions = 0
cv_region_candidates = []
response_pdf = a.response_regions_pdf or a.pdf or a.ocr
if response_pdf:
n_cv_regions, cv_region_candidates = attach_detected_response_regions(parts, response_pdf)
n_marks_fill = 0
if a.marks_fill and os.path.exists(a.marks_fill):
fills = json.load(open(a.marks_fill)).get("fills", {})
@ -758,6 +839,20 @@ def main():
if lab in parts and parts[lab].get("marks") is None:
parts[lab]["marks"] = int(mk); n_marks_fill += 1
exp_max_override = a.expected_max
# Targeted marks gap-fill: if OCR recovered all but one mark and the authoritative
# paper max leaves a small plausible residual, attach that residual to the lone
# missing part. This keeps the deterministic label backbone and only fills the
# narrow low-confidence gap instead of using gemma/full extraction as source of truth.
n_residual_marks_fill = 0
if exp_max_override:
missing_labs = [lab for lab, part in parts.items() if part.get("marks") is None]
known_sum = sum(part["marks"] for part in parts.values() if part.get("marks") is not None)
residual = exp_max_override - known_sum
if len(missing_labs) == 1 and 1 <= residual <= 9:
parts[missing_labs[0]]["marks"] = residual
n_residual_marks_fill = 1
questions = build_questions(parts)
# --- coverage ------------------------------------------------------------------------
@ -774,7 +869,7 @@ def main():
marks_known = sum(1 for v in parts.values() if v.get("marks") is not None)
marks_sum = sum(v["marks"] for v in parts.values() if v.get("marks") is not None)
exp_max = expected_max(code) or fm.get("max_marks") # code-based, else front-matter total
exp_max = exp_max_override or expected_max(code) or fm.get("max_marks") # harness override, code-based, else front-matter total
marks_check = (None if exp_max is None else
{"sum": marks_sum, "expected_max": exp_max,
"pct": round(marks_sum / exp_max * 100, 1)})
@ -791,6 +886,9 @@ def main():
"marks_check": marks_check,
"gemma_answer_regions": n_reg, "gemma_marks_filled": n_fill,
"gemma_marks_gapfilled": n_marks_fill,
"residual_marks_gapfilled": n_residual_marks_fill,
"opencv_answer_regions": n_cv_regions,
"opencv_answer_region_candidates": len(cv_region_candidates),
"n_data_tables": len(data_tables),
"n_furniture_tables": sum(1 for t in all_tables if t["is_furniture"]),
"table_sources": {s: sum(1 for t in data_tables if t["source"] == s)
@ -810,7 +908,10 @@ def main():
print(f"marks : {marks_known}/{len(parts)} parts known (sum {marks_sum}){mc}"
+ (f"; +{n_mark_geo} by geometry" if n_mark_geo else ""))
print(f"gemma regions : {n_reg} answer_regions, {n_fill} marks gap-filled"
+ (f"; +{n_marks_fill} marks via --marks-fill" if n_marks_fill else ""))
+ (f"; +{n_marks_fill} marks via --marks-fill" if n_marks_fill else "")
+ (f"; +{n_residual_marks_fill} residual marks gap-fill" if n_residual_marks_fill else ""))
if response_pdf:
print(f"opencv regions : {n_cv_regions} attached / {len(cv_region_candidates)} candidates")
print(f"tables : {len(data_tables)} data table(s) "
f"{result['stats']['table_sources']} on pages {tbl_pages}; "
f"{result['stats']['n_furniture_tables']} furniture filtered; {n_tbl} parts flagged")

View File

@ -67,49 +67,49 @@ B1_GEOMETRY = [
pdf="samples/b1/aqa-biology-7402-1-2023jun.pdf",
docling="results/b1_rapid/b1-aqa-biology-7402-1-2023jun/merged.json",
rapid="results/b1_rapid/b1-aqa-biology-7402-1-2023jun/p*.json",
gt_key="b1-aqa-biology-7402-1-2023jun"),
gt_key="b1-aqa-biology-7402-1-2023jun", expected_max=91),
dict(slug="b1-aqa-chemistry-7405-1-2022jun", title="AQA A-level Chemistry 7405/1 2022 Jun (image-only OCR baseline)",
board="aqa", level="A-level", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/chemistry/7405/1/2022-jun/qp.pdf",
pdf="samples/b1/aqa-chemistry-7405-1-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-chemistry-7405-1-2022jun/p*.json",
gt_key="b1-aqa-chemistry-7405-1-2022jun"),
gt_key="b1-aqa-chemistry-7405-1-2022jun", expected_max=105),
dict(slug="b1-aqa-physics-7408-1-2022jun", title="AQA A-level Physics 7408/1 2022 Jun (image-only OCR baseline)",
board="aqa", level="A-level", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/physics/7408/1/2022-jun/qp.pdf",
pdf="samples/b1/aqa-physics-7408-1-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-physics-7408-1-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-physics-7408-1-2022jun/p*.json",
gt_key="b1-aqa-physics-7408-1-2022jun"),
gt_key="b1-aqa-physics-7408-1-2022jun", expected_max=85),
dict(slug="b1-aqa-biology-8461-1h-2022jun", title="AQA GCSE Biology 8461/1H 2022 Jun (image-only OCR baseline)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/biology/8461/1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-biology-8461-1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-biology-8461-1h-2022jun/p*.json",
gt_key="b1-aqa-biology-8461-1h-2022jun"),
gt_key="b1-aqa-biology-8461-1h-2022jun", expected_max=100),
dict(slug="b1-aqa-chemistry-8462-1h-2022jun", title="AQA GCSE Chemistry 8462/1H 2022 Jun (image-only OCR baseline)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/chemistry/8462/1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-chemistry-8462-1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-chemistry-8462-1h-2022jun/p*.json",
gt_key="b1-aqa-chemistry-8462-1h-2022jun"),
gt_key="b1-aqa-chemistry-8462-1h-2022jun", expected_max=100),
dict(slug="b1-aqa-combined-8464-b1h-2022jun", title="AQA GCSE Combined Science Trilogy 8464/B/1H 2022 Jun (image-only OCR baseline)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/combined-science-trilogy/8464/b-1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-combined-8464-b1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-combined-8464-b1h-2022jun/p*.json",
gt_key="b1-aqa-combined-8464-b1h-2022jun"),
gt_key="b1-aqa-combined-8464-b1h-2022jun", expected_max=70),
dict(slug="b1-aqa-combined-8464-c1h-2022jun", title="AQA GCSE Combined Science Trilogy 8464/C/1H 2022 Jun (image-only OCR baseline; 8465 not present in dev catalogue)",
board="aqa", level="GCSE", path="B1 image-only OCR (RapidOCR margin-pass)",
storage_loc="cc.examboards/aqa/combined-science-trilogy/8464/c-1h/2022-jun/qp.pdf",
pdf="samples/b1/aqa-combined-8464-c1h-2022jun.pdf",
docling="results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/merged.json",
rapid="results/b1_rapid/b1-aqa-combined-8464-c1h-2022jun/p*.json",
gt_key="b1-aqa-combined-8464-c1h-2022jun"),
gt_key="b1-aqa-combined-8464-c1h-2022jun", expected_max=70),
]
GT_LABELS_PATH = "fixtures/b1_gt_labels.json"
@ -209,6 +209,9 @@ def stats_from(struct, val, gt_labels=None):
"coverage_pct": cov.get("coverage_pct"), "coverage_recovered": cov.get("recovered"),
"coverage_total": cov.get("total"), "coverage_source": cov.get("source"),
"coverage_missed": cov.get("missed", []), "answer_regions": answer_region_count(struct),
"opencv_answer_regions": st.get("opencv_answer_regions"),
"opencv_answer_region_candidates": st.get("opencv_answer_region_candidates"),
"residual_marks_gapfilled": st.get("residual_marks_gapfilled"),
"validate_verdict": (val.get("summary") or {}).get("worst_severity"),
"validate_flags": val.get("flags", []),
"questions_expected": (val.get("summary") or {}).get("questions_expected"),
@ -226,6 +229,10 @@ def do_geometry(p, overlays, gt_labels=None, prepare_ocr=False):
raise RuntimeError(f"unable to prepare B1 OCR cache for {p['slug']}")
extract_args = p.get("extract") or ["--docling", p["docling"], "--rapid", p["rapid"], "--board", p.get("board", "aqa")]
ex = ["extract.py"] + extract_args + ["--out", S]
if p.get("pdf"):
ex += ["--response-regions", p["pdf"]]
if p.get("expected_max"):
ex += ["--expected-max", str(p["expected_max"])]
if p.get("gt"):
ex += ["--gt", p["gt"]]
run(ex)
@ -272,6 +279,8 @@ def per_paper_report(p, s, d, kind):
if s['coverage_pct'] is not None else "- **coverage vs GT:** n/a",
f"- **G6 verdict:** {s['validate_verdict']}",
f"- **answer-region count:** {s.get('answer_regions')}",
f"- **opencv response regions:** {s.get('opencv_answer_regions')} attached / "
f"{s.get('opencv_answer_region_candidates')} candidates",
]
if s["validate_flags"]:
lines += ["", "**Flags (human-review hints):**"] + [f"- {f}" for f in s["validate_flags"]]

View File

@ -162,7 +162,16 @@ def detect_response_regions_from_pdf(
page_index=page_index,
min_confidence=min_confidence,
)
candidates.extend(candidate.to_mapper_dict() for candidate in page_candidates)
for candidate in page_candidates:
item = candidate.to_mapper_dict()
item.setdefault("meta", {}).update({
"page_width_px": pix.width,
"page_height_px": pix.height,
"page_width_pdf": float(doc[page_index].rect.width),
"page_height_pdf": float(doc[page_index].rect.height),
"render_dpi": dpi,
})
candidates.append(item)
return candidates
finally:
doc.close()
@ -280,7 +289,7 @@ def _detect_answer_lines(binary: np.ndarray, *, page_index: int, width: int, hei
span_ratio = box_w / max(width, 1)
count_bonus = min(0.2, max(0, line_count - 1) * 0.05)
confidence = min(0.92, 0.42 + span_ratio * 0.35 + count_bonus)
region_type = "answer_lines" if line_count > 1 else "working_space"
region_type = "answer_lines"
candidates.append(
RegionCandidate(
page_index=page_index,
@ -351,6 +360,7 @@ def _detect_answer_boxes(binary: np.ndarray, *, page_index: int, width: int, hei
if rectangularity < 0.03:
continue
confidence = min(0.88, 0.46 + min(0.24, w / width * 0.24) + min(0.18, h / height * 0.5))
region_type = "working_space" if (h > height * 0.12 and rectangularity < 0.18) else "answer_box"
padded_x = max(0, x - 2)
padded_y = max(0, y - 2)
padded_right = min(width, x + w + 2)
@ -362,7 +372,7 @@ def _detect_answer_boxes(binary: np.ndarray, *, page_index: int, width: int, hei
y=padded_y,
w=padded_right - padded_x,
h=padded_bottom - padded_y,
region_type="answer_box",
region_type=region_type,
confidence=confidence,
detection_method="opencv_contour_box",
meta={"rectangularity": round(float(rectangularity), 3)},

View File

@ -2,6 +2,7 @@ from __future__ import annotations
from PIL import Image, ImageDraw
from api.services.docling import extract
from api.services.docling.regions import detect_response_regions_from_image
@ -37,3 +38,46 @@ def test_detects_answer_box() -> None:
assert boxes
assert boxes[0]["bbox"]["w"] > 600
assert boxes[0]["bbox"]["h"] > 200
def test_detect_response_region_taxonomy_for_lines_and_boxes():
    """A page with three ruled lines and one outlined rectangle yields both region types."""
    canvas = Image.new("RGB", (800, 1000), "white")
    pen = ImageDraw.Draw(canvas)

    # Three horizontal rules: the answer-lines stimulus.
    for rule_y in (220, 260, 300):
        pen.line((120, rule_y, 680, rule_y), fill="black", width=2)
    # One outlined rectangle further down the page: the answer-box stimulus.
    pen.rectangle((140, 520, 660, 640), outline="black", width=3)

    detected = detect_response_regions_from_image(canvas, min_confidence=0.1)
    seen_types = {region.region_type for region in detected}

    assert "answer_lines" in seen_types
    assert "answer_box" in seen_types
def test_attach_detected_response_regions_normalizes_taxonomy_and_uses_pdf_y(monkeypatch, tmp_path):
    """A stubbed candidate is attached to the nearest part with a normalised type.

    The stub box is centred at y=355px on a 1000px-tall page, i.e. y=516 on the 800-unit
    PDF page, which is closest to part 01.2. Its hyphenated "answer-box" type must come
    back as "answer_box".
    """
    placeholder = tmp_path / "paper.pdf"
    placeholder.write_bytes(b"%PDF test placeholder")

    def make_part(top, bottom):
        return {"q": "01", "page": 1, "bbox": {"l": 50, "r": 80, "t": top, "b": bottom}, "regions": []}

    parts = {
        "01.1": make_part(700, 680),
        "01.2": make_part(500, 480),
    }

    def fake_detect(path, min_confidence=0.32):
        stub = {
            "page_index": 0,
            "region_type": "answer-box",
            "confidence": 0.77,
            "bbox": {"x": 100, "y": 335, "w": 500, "h": 40},
            "detection_method": "test",
            "meta": {"page_height_px": 1000, "page_height_pdf": 800},
        }
        return [stub]

    monkeypatch.setattr(extract.region_mod, "detect_response_regions_from_pdf", fake_detect)

    attached, candidates = extract.attach_detected_response_regions(parts, str(placeholder))

    assert attached == 1
    assert len(candidates) == 1
    assert parts["01.1"]["regions"] == []
    hit = parts["01.2"]["regions"][0]
    assert hit["type"] == "answer_box"
    assert hit["source"] == "opencv"