api/api/services/docling/page_roles.py
kcar 5938613893
Some checks failed
api-ci-deploy / test-build-deploy (push) Has been cancelled
[verified] add docling auto-map package wrapper
2026-06-07 20:03:06 +01:00

89 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
page_roles.py — tag every page with a structural role (the first page-layout pass).

Roles: cover / question / continuation / blank / appendix. The roles drive two things in the template:
  * the human sees the paper's shape (which pages are non-question), and
  * MARGINS are disabled on pages that have no content column (cover, blank) — the override the
    user asked for ("the front page doesn't have margins").

Signals (deterministic, no GPU): per-page non-space char count, cover/boilerplate keywords, and
whether the page carries a question band. Output feeds template.py via --page-roles.

Usage:
    python page_roles.py <docling_doc.json> --bands <bands.json> [--out results/page_roles/x.json]
"""
import json, argparse
from collections import defaultdict
# A page holding this many non-space characters or fewer is treated as
# boilerplate-only, i.e. blank.
BLANK_MAX = 130

# Keyword groups; each phrase is matched as a substring of a page's
# lower-cased text.
COVER_KW = (
    "time allowed",
    "instructions",
    "materials",
    "information for",
)
BLANK_KW = (
    "blank page",
    "no questions printed",
    "no questions are printed",
)
APPENDIX_KW = (
    "data sheet",
    "formula",
    "periodic table",
    "insert",
    "resource booklet",
)

# Roles whose pages have no content column, so margins do not apply
# (the user's override case).
NO_MARGIN_ROLES = {"cover", "blank"}
def page_text(doc):
    """Gather per-page text statistics from a docling document dict.

    Walks ``doc["texts"]`` (treated as empty when the key is absent). An entry
    is attributed to the ``page_no`` of its first ``prov`` record; entries with
    no ``prov`` or a falsy ``page_no`` are skipped.

    Returns a pair ``(counts, text)``:
      * ``counts`` -- ``defaultdict(int)`` mapping page number to the number of
        non-whitespace characters seen on that page;
      * ``text`` -- plain dict mapping page number to that page's strings,
        lower-cased and joined with single spaces.
    """
    counts = defaultdict(int)
    pieces = defaultdict(list)

    for item in doc.get("texts", []):
        provenance = item.get("prov") or []
        if not provenance:
            continue
        page = provenance[0].get("page_no")
        if not page:
            continue

        raw = item.get("text") or ""
        # bools sum as 0/1, so this counts the characters that are not whitespace
        counts[page] += sum(not ch.isspace() for ch in raw)
        pieces[page].append(raw.lower())

    joined = {}
    for page, parts in pieces.items():
        joined[page] = " ".join(parts)
    return counts, joined
def tag(doc, qpages):
    """Assign a structural role to every page from 1 to the highest page known.

    ``doc`` is passed to ``page_text`` for per-page character counts and
    lower-cased text; ``qpages`` is the collection of page numbers that carry a
    question. The highest page is the largest page number found in either
    source (at least 1).

    Role precedence, first match wins:
      1. ``question``     -- the page is in ``qpages``
      2. ``cover``        -- before the first question page and has a cover keyword
      3. ``blank``        -- at most ``BLANK_MAX`` chars, or a blank-page keyword
                             with fewer than 300 chars
      4. ``appendix``     -- has an appendix keyword
      5. ``continuation`` -- unlabelled page inside the question range
      6. ``appendix``     -- any other content outside the question range

    Returns a dict mapping page number to a record with keys ``role``,
    ``chars``, ``margins_enabled``, ``source`` and ``confirmed``.
    """
    chars, blob = page_text(doc)
    page_count = max([1, *chars, *qpages])

    if qpages:
        first_q, last_q = min(qpages), max(qpages)
    else:
        # no question pages: every page counts as "before the first question"
        # and none falls inside the (empty) question range
        first_q, last_q = page_count + 1, 0

    def classify(pg, text, count):
        if pg in qpages:
            return "question"
        # cover is tested before blank: the cover's instructions mention "blank"
        if pg < first_q and any(kw in text for kw in COVER_KW):
            return "cover"
        if count <= BLANK_MAX:
            return "blank"
        if count < 300 and any(kw in text for kw in BLANK_KW):
            return "blank"
        if any(kw in text for kw in APPENDIX_KW):
            return "appendix"
        if first_q <= pg <= last_q:
            return "continuation"  # no question label but inside the question range
        return "appendix"  # content outside the question range (end-matter/insert)

    roles = {}
    for pg in range(1, page_count + 1):
        count = chars[pg]
        role = classify(pg, blob.get(pg, ""), count)
        roles[pg] = {
            "role": role,
            "chars": count,
            "margins_enabled": role not in NO_MARGIN_ROLES,
            "source": "auto",
            "confirmed": False,
        }
    return roles
def main():
    """Command-line entry point: tag the pages of a docling JSON and write the result.

    Arguments (parsed from ``sys.argv``):
      * ``doc``     -- path to the docling document JSON
      * ``--bands`` -- path to a JSON file whose ``"pages"`` entry lists the
                       question pages (each item is converted with ``int``)
      * ``--out``   -- output path, default ``results/page_roles.json``

    Writes ``{"pages": roles}`` as indented JSON to ``--out`` and prints a
    per-role count followed by one line for every non-question page.
    """
    from collections import Counter
    from pathlib import Path

    ap = argparse.ArgumentParser()
    ap.add_argument("doc")
    ap.add_argument("--bands", required=True)
    ap.add_argument("--out", default="results/page_roles.json")
    a = ap.parse_args()

    # JSON is UTF-8; read it explicitly rather than relying on the locale
    # default, and close every handle deterministically.
    with open(a.bands, encoding="utf-8") as fh:
        bands = json.load(fh)
    qpages = {int(p) for p in bands["pages"]}
    with open(a.doc, encoding="utf-8") as fh:
        roles = tag(json.load(fh), qpages)

    # The default output lives under results/, which may not exist yet.
    Path(a.out).parent.mkdir(parents=True, exist_ok=True)
    with open(a.out, "w", encoding="utf-8") as fh:
        json.dump({"pages": roles}, fh, indent=2)

    c = Counter(v["role"] for v in roles.values())
    print(f"roles: {dict(c)}")
    for pg in sorted(roles):
        r = roles[pg]
        if r["role"] == "question":
            continue  # only non-question pages are listed individually
        flag = "" if r["margins_enabled"] else " (no margins)"
        print(f" p{pg:2d}: {r['role']:12s} chars={r['chars']}{flag}")
    print(f"-> wrote {a.out}")
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()