# api/api/services/docling/page_roles.py

#!/usr/bin/env python3
"""
page_roles.py — tag every page with a structural role (the first-pass page-layout pass).

Roles: cover / question / continuation / blank / appendix. Drives two things in the template:
  * the human sees the paper's shape (which pages are non-question), and
  * MARGINS are disabled on pages that have no content column (cover, blank) — the override the
    user asked for ("the front page doesn't have margins").

Signals (deterministic, no GPU): per-page non-space char count, cover/boilerplate keywords, and
whether the page carries a question band. Output feeds template.py via --page-roles.

Usage:
  python page_roles.py <docling_doc.json> --bands <bands.json> [--out results/page_roles/x.json]
"""
import json, argparse
from collections import defaultdict

# A page holding this many non-space characters or fewer counts as blank
# (boilerplate only).
BLANK_MAX = 130

# Keyword lists below are substring-matched against a page's lower-cased text.

# Phrases that mark a cover / instructions page.
COVER_KW = (
    "time allowed",
    "instructions",
    "materials",
    "information for",
)

# Phrases printed on intentionally empty pages.
BLANK_KW = (
    "blank page",
    "no questions printed",
    "no questions are printed",
)

# Phrases that mark reference / end-matter pages.
APPENDIX_KW = (
    "data sheet",
    "formula",
    "periodic table",
    "insert",
    "resource booklet",
)

# Roles with no content column: margins do not apply on these pages
# (the user's override case).
NO_MARGIN_ROLES = {"cover", "blank"}


def page_text(doc):
    """Gather per-page text statistics from a docling document dict.

    Every entry of ``doc["texts"]`` is attributed to the ``page_no`` of its
    first ``prov`` record. Entries with no ``prov`` record, or whose page
    number is missing or falsy, are ignored.

    Returns a pair ``(chars, blob)``:
      * ``chars`` -- ``defaultdict(int)``: page -> number of non-whitespace chars
      * ``blob``  -- ``dict``: page -> lower-cased texts joined by single spaces
    """
    counts = defaultdict(int)
    pieces = defaultdict(list)
    for item in doc.get("texts", []):
        provenance = item.get("prov") or []
        if not provenance:
            continue
        page = provenance[0].get("page_no")
        if not page:
            continue
        text = item.get("text") or ""
        # non-whitespace characters = total length minus whitespace characters
        counts[page] += len(text) - sum(map(str.isspace, text))
        pieces[page].append(text.lower())

    joined = {}
    for page, parts in pieces.items():
        joined[page] = " ".join(parts)
    return counts, joined


def tag(doc, qpages):
    """Assign a structural role to every page from 1 to the last known page.

    ``doc`` is the docling document dict handed to ``page_text``; ``qpages``
    is the collection of page numbers that carry a question band (``main``
    passes a set of ints).

    Roles are decided in priority order:
      1. ``question``     -- the page is in ``qpages``
      2. ``cover``        -- before the first question page and mentions a cover keyword
      3. ``blank``        -- at most ``BLANK_MAX`` chars, or a blank-page phrase
                             with fewer than 300 chars
      4. ``appendix``     -- mentions an appendix keyword, or lies outside the
                             question page range
      5. ``continuation`` -- anything else inside the question page range

    Returns a dict: page -> {"role", "chars", "margins_enabled", "source", "confirmed"}.
    """
    def _mentions(text, keywords):
        # True when any keyword occurs as a substring of the page text.
        return any(kw in text for kw in keywords)

    char_counts, page_blobs = page_text(doc)
    # The trailing 1 guarantees at least page 1 exists even with no input at all.
    last_page = max([*char_counts, *qpages, 1])
    if qpages:
        first_question, last_question = min(qpages), max(qpages)
    else:
        # No question pages: every page is "before the first question" and
        # the question range is empty.
        first_question, last_question = last_page + 1, 0

    result = {}
    for page in range(1, last_page + 1):
        text = page_blobs.get(page, "")
        count = char_counts[page]
        if page in qpages:
            role = "question"
        elif page < first_question and _mentions(text, COVER_KW):
            # Checked before "blank": the cover's instructions mention "blank".
            role = "cover"
        elif count <= BLANK_MAX or (count < 300 and _mentions(text, BLANK_KW)):
            role = "blank"
        elif _mentions(text, APPENDIX_KW) or not (first_question <= page <= last_question):
            # Either explicit appendix wording, or content outside the
            # question range (end-matter / insert).
            role = "appendix"
        else:
            # No question label, but inside the question range.
            role = "continuation"
        result[page] = {
            "role": role,
            "chars": count,
            "margins_enabled": role not in NO_MARGIN_ROLES,
            "source": "auto",
            "confirmed": False,
        }
    return result


def main():
    """CLI entry point: tag the pages of a docling JSON and write the roles file.

    Arguments (parsed from ``sys.argv``):
      * ``doc``     -- path to the docling document JSON
      * ``--bands`` -- path to the bands JSON; iterating its ``"pages"`` entry
                       must yield values convertible to ``int`` page numbers
      * ``--out``   -- output path (default ``results/page_roles.json``)

    Writes ``{"pages": roles}`` to ``--out`` (creating its parent directory if
    needed) and prints a role summary plus one line per non-question page.
    """
    import os
    from collections import Counter

    ap = argparse.ArgumentParser()
    ap.add_argument("doc")
    ap.add_argument("--bands", required=True)
    ap.add_argument("--out", default="results/page_roles.json")
    a = ap.parse_args()

    # Context managers close every handle deterministically; JSON is UTF-8.
    with open(a.bands, encoding="utf-8") as fh:
        bands = json.load(fh)
    qpages = {int(p) for p in bands["pages"]}
    with open(a.doc, encoding="utf-8") as fh:
        doc = json.load(fh)
    roles = tag(doc, qpages)

    # The default output lives under results/, which may not exist yet.
    out_dir = os.path.dirname(a.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(a.out, "w", encoding="utf-8") as fh:
        json.dump({"pages": roles}, fh, indent=2)

    c = Counter(v["role"] for v in roles.values())
    print(f"roles: {dict(c)}")
    for pg in sorted(roles):
        r = roles[pg]
        flag = "" if r["margins_enabled"] else "  (no margins)"
        if r["role"] != "question":
            print(f"  p{pg:2d}: {r['role']:12s} chars={r['chars']}{flag}")
    print(f"-> wrote {a.out}")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()