#!/usr/bin/env python3
"""
page_roles.py — tag every page with a structural role (the first-pass page-layout pass).

Roles: cover / question / continuation / blank / appendix. Drives two things in the template:
  * the human sees the paper's shape (which pages are non-question), and
  * MARGINS are disabled on pages that have no content column (cover, blank) — the override
    the user asked for ("the front page doesn't have margins").

Signals (deterministic, no GPU): per-page non-space char count, cover/boilerplate keywords,
and whether the page carries a question band. Output feeds template.py via --page-roles.

Usage:
  python page_roles.py <doc.json> --bands <bands.json> [--out results/page_roles/x.json]
"""
import argparse
import json
import os
from collections import Counter, defaultdict

BLANK_MAX = 130  # non-space chars at/below which a page is boilerplate-only (blank)
COVER_KW = ("time allowed", "instructions", "materials", "information for")
BLANK_KW = ("blank page", "no questions printed", "no questions are printed")
APPENDIX_KW = ("data sheet", "formula", "periodic table", "insert", "resource booklet")

# pages where there is no content column -> margins do not apply (the user's override case)
NO_MARGIN_ROLES = {"cover", "blank"}


def page_text(doc):
    """Collect per-page text statistics from a parsed document dict.

    Each entry of ``doc["texts"]`` contributes to the page named by the
    ``page_no`` of its first ``prov`` item; entries without provenance or with
    a falsy page number are skipped.

    Returns:
        A ``(chars, blob)`` pair where ``chars`` maps page number to the count
        of non-whitespace characters (a ``defaultdict(int)``, so unseen pages
        read as 0) and ``blob`` maps page number to the page's lower-cased
        text joined with single spaces.
    """
    chars, blob = defaultdict(int), defaultdict(list)
    for t in doc.get("texts", []):
        prov = t.get("prov") or []
        pg = prov[0].get("page_no") if prov else None
        if pg:
            s = t.get("text") or ""
            chars[pg] += sum(1 for c in s if not c.isspace())
            blob[pg].append(s.lower())
    return chars, {pg: " ".join(v) for pg, v in blob.items()}


def tag(doc, qpages):
    """Assign a structural role to every page from 1 to the highest known page.

    Args:
        doc: Parsed document dict, as consumed by ``page_text``.
        qpages: Collection of page numbers that carry a question band.

    Returns:
        Dict mapping page number to a record with keys ``role``, ``chars``,
        ``margins_enabled``, ``source`` (always ``"auto"``) and ``confirmed``
        (always ``False``).
    """
    chars, blob = page_text(doc)
    # The trailing 1 guarantees at least one page even when both inputs are empty.
    n = max([*chars, *qpages, 1])
    first_q = min(qpages) if qpages else n + 1
    last_q = max(qpages) if qpages else 0
    roles = {}
    for pg in range(1, n + 1):
        b = blob.get(pg, "")
        if pg in qpages:
            role = "question"
        elif pg < first_q and any(k in b for k in COVER_KW):
            role = "cover"  # before blank: the cover's instructions mention "blank"
        elif chars[pg] <= BLANK_MAX or (any(k in b for k in BLANK_KW) and chars[pg] < 300):
            role = "blank"
        elif any(k in b for k in APPENDIX_KW):
            role = "appendix"
        elif first_q <= pg <= last_q:
            role = "continuation"  # no question label but inside the question range
        else:
            role = "appendix"  # content outside the question range (end-matter/insert)
        roles[pg] = {
            "role": role,
            "chars": chars[pg],
            "margins_enabled": role not in NO_MARGIN_ROLES,
            "source": "auto",
            "confirmed": False,
        }
    return roles


def main(argv=None):
    """Command-line entry point: read inputs, tag pages, write JSON, print a summary.

    Args:
        argv: Optional argument list; ``None`` makes argparse read ``sys.argv``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("doc")
    ap.add_argument("--bands", required=True)
    ap.add_argument("--out", default="results/page_roles.json")
    a = ap.parse_args(argv)

    # Context managers close every handle deterministically; explicit UTF-8
    # keeps non-ASCII document text readable regardless of the platform locale.
    with open(a.bands, encoding="utf-8") as fh:
        bands = json.load(fh)
    qpages = {int(p) for p in bands["pages"]}
    with open(a.doc, encoding="utf-8") as fh:
        doc = json.load(fh)
    roles = tag(doc, qpages)

    # The default output lives under results/, which may not exist yet.
    out_dir = os.path.dirname(a.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(a.out, "w", encoding="utf-8") as fh:
        json.dump({"pages": roles}, fh, indent=2)

    c = Counter(v["role"] for v in roles.values())
    print(f"roles: {dict(c)}")
    for pg in sorted(roles):
        r = roles[pg]
        flag = "" if r["margins_enabled"] else " (no margins)"
        if r["role"] != "question":
            print(f" p{pg:2d}: {r['role']:12s} chars={r['chars']}{flag}")
    print(f"-> wrote {a.out}")


if __name__ == "__main__":
    main()