89 lines
3.5 KiB
Python
89 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
page_roles.py — tag every page with a structural role (the first page-layout pass).

Roles: cover / question / continuation / blank / appendix. Drives two things in the template:
  * the human sees the paper's shape (which pages are non-question), and
  * MARGINS are disabled on pages that have no content column (cover, blank) — the override the
    user asked for ("the front page doesn't have margins").

Signals (deterministic, no GPU): per-page non-space char count, cover/boilerplate keywords, and
whether the page carries a question band. Output feeds template.py via --page-roles.

Usage:
  python page_roles.py <docling_doc.json> --bands <bands.json> [--out results/page_roles/x.json]

If --out is omitted, the result is written to results/page_roles.json.
|
|
"""
|
|
import json, argparse
|
|
from collections import defaultdict
|
|
|
|
# A page holding at most this many non-space characters is treated as
# boilerplate-only and tagged "blank".
BLANK_MAX = 130

# Keyword groups. Each entry is matched as a substring of the page's
# lower-cased text, so every entry must itself be lower case.
COVER_KW = (
    "time allowed",
    "instructions",
    "materials",
    "information for",
)
BLANK_KW = (
    "blank page",
    "no questions printed",
    "no questions are printed",
)
APPENDIX_KW = (
    "data sheet",
    "formula",
    "periodic table",
    "insert",
    "resource booklet",
)

# Roles whose pages have no content column, so margins do not apply to them
# (the user's override case).
NO_MARGIN_ROLES = {"cover", "blank"}
|
|
|
|
|
|
def page_text(doc):
    """Collect per-page character counts and lower-cased text from a document dict.

    Each entry of ``doc["texts"]`` is attributed to the page number found in
    its first ``prov`` record. Entries with no ``prov`` records, or whose page
    number is missing or otherwise falsy, are ignored.

    Returns a ``(chars, blob)`` pair:
      * ``chars`` — ``defaultdict(int)`` mapping page number to the number of
        non-whitespace characters on that page (unknown pages read as 0);
      * ``blob``  — plain dict mapping page number to the page's text
        fragments, lower-cased and joined with single spaces.
    """
    char_counts = defaultdict(int)
    fragments = defaultdict(list)

    for item in doc.get("texts", []):
        provenance = item.get("prov") or []
        if not provenance:
            continue
        page = provenance[0].get("page_no")
        if not page:
            continue
        text = item.get("text") or ""
        # Each non-whitespace character contributes True (== 1) to the sum.
        char_counts[page] += sum(not ch.isspace() for ch in text)
        fragments[page].append(text.lower())

    joined = {}
    for page, parts in fragments.items():
        joined[page] = " ".join(parts)
    return char_counts, joined
|
|
|
|
|
|
def tag(doc, qpages):
    """Assign a structural role to every page from 1 up to the last known page.

    ``doc`` is the document dict handed to ``page_text``; ``qpages`` is the
    collection of page numbers that carry a question band. The last page is
    the largest page number seen in either source (at least 1).

    For each page the first matching rule wins:
      1. the page is in ``qpages``                          -> "question"
      2. it precedes the first question page and contains
         a cover keyword                                    -> "cover"
      3. it has at most ``BLANK_MAX`` characters, or has a
         blank-page keyword and fewer than 300 characters   -> "blank"
      4. it contains an appendix keyword                    -> "appendix"
      5. it lies within the first..last question page range -> "continuation"
      6. anything else                                      -> "appendix"

    Returns a dict mapping page number to a record with the keys ``role``,
    ``chars``, ``margins_enabled``, ``source`` and ``confirmed``.
    """
    counts, text_by_page = page_text(doc)
    last_page = max([*counts, *qpages, 1])
    if qpages:
        q_start, q_end = min(qpages), max(qpages)
    else:
        # No question pages: every page counts as "before the first question"
        # and none falls inside the (empty) question range.
        q_start, q_end = last_page + 1, 0

    def mentions(text, keywords):
        return any(kw in text for kw in keywords)

    tagged = {}
    for page in range(1, last_page + 1):
        text = text_by_page.get(page, "")
        n_chars = counts[page]

        if page in qpages:
            role = "question"
        elif page < q_start and mentions(text, COVER_KW):
            # Checked ahead of "blank": the cover's instructions mention "blank".
            role = "cover"
        elif n_chars <= BLANK_MAX or (mentions(text, BLANK_KW) and n_chars < 300):
            role = "blank"
        elif mentions(text, APPENDIX_KW):
            role = "appendix"
        elif q_start <= page <= q_end:
            # No question label, but inside the question range.
            role = "continuation"
        else:
            # Content outside the question range (end-matter / insert).
            role = "appendix"

        tagged[page] = {
            "role": role,
            "chars": n_chars,
            "margins_enabled": role not in NO_MARGIN_ROLES,
            "source": "auto",
            "confirmed": False,
        }
    return tagged
|
|
|
|
|
|
def main():
    """Command-line entry point: tag the pages of a document and write the result.

    Arguments (parsed from ``sys.argv``):
      doc      path to the document JSON passed to ``tag``
      --bands  path to a JSON file whose ``"pages"`` entry yields the page
               numbers that carry a question band (required)
      --out    output path (default ``results/page_roles.json``)

    Writes ``{"pages": roles}`` as indented JSON to ``--out``, creating the
    parent directory when it does not exist, then prints a role histogram and
    one line per non-question page.

    Raises ``KeyError`` if the bands file has no ``"pages"`` entry, and the
    usual ``OSError`` / ``json.JSONDecodeError`` for unreadable or invalid input.
    """
    from collections import Counter
    from pathlib import Path

    ap = argparse.ArgumentParser()
    ap.add_argument("doc")
    ap.add_argument("--bands", required=True)
    ap.add_argument("--out", default="results/page_roles.json")
    a = ap.parse_args()

    # Context managers close every handle promptly; JSON is read as UTF-8
    # rather than the platform's default encoding.
    with open(a.bands, encoding="utf-8") as fh:
        bands = json.load(fh)
    qpages = {int(p) for p in bands["pages"]}
    with open(a.doc, encoding="utf-8") as fh:
        doc = json.load(fh)
    roles = tag(doc, qpages)

    # The default output lives under results/, which may not exist yet.
    out_path = Path(a.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump({"pages": roles}, fh, indent=2)

    c = Counter(v["role"] for v in roles.values())
    print(f"roles: {dict(c)}")
    for pg in sorted(roles):
        r = roles[pg]
        if r["role"] == "question":
            continue  # only the non-question pages are listed
        flag = "" if r["margins_enabled"] else " (no margins)"
        print(f" p{pg:2d}: {r['role']:12s} chars={r['chars']}{flag}")
    print(f"-> wrote {a.out}")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|