#!/usr/bin/env python3
"""Create a lightweight structural audit for a manuscript/template DOCX pair.

This script intentionally uses only the Python standard library. It inspects the
OOXML package directly and reports headings, paragraph-style usage, section-like
paragraphs, table count, and image count. It does not modify documents.
"""

from __future__ import annotations

import argparse
import collections
import json
import re
import sys
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET

NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
}

SECTION_PATTERNS = [
    "abstract",
    "keywords",
    "introduction",
    "results",
    "discussion",
    "conclusion",
    "experimental",
    "methods",
    "acknowledgements",
    "acknowledgments",
    "conflict of interest",
    "data availability",
    "references",
]

# Markup Compatibility namespace (mc:AlternateContent / mc:Choice / mc:Fallback).
_MC_NS = "http://schemas.openxmlformats.org/markup-compatibility/2006"

_WHITESPACE_RE = re.compile(r"\s+")
_HEADING_STYLE_PREFIXES = ("heading", "title")

# Report size limits.
_PREVIEW_CHARS = 160  # characters of paragraph text kept per listed entry
_MAX_LISTED = 80  # maximum headings / section hits included in the report
_TOP_STYLES = 20  # number of most frequent paragraph styles reported


def read_xml(zf: zipfile.ZipFile, name: str) -> ET.Element | None:
    """Parse the archive member *name* and return its root element.

    Returns ``None`` when the archive has no member called *name*. XML parse
    errors are not caught here and propagate as ``ET.ParseError``.
    """
    try:
        return ET.fromstring(zf.read(name))
    except KeyError:
        # ZipFile.read raises KeyError for a member that does not exist.
        return None


def text_from_paragraph(p: ET.Element) -> str:
    """Return the concatenated text of every ``w:t`` descendant of *p*.

    Runs are joined without a separator and the result is stripped of leading
    and trailing whitespace.
    """
    return "".join(t.text for t in p.findall(".//w:t", NS) if t.text).strip()


def style_from_paragraph(p: ET.Element) -> str:
    """Return the paragraph style id referenced by *p*.

    Returns ``"(none)"`` when the paragraph has no ``w:pPr/w:pStyle`` element
    and ``"(unknown)"`` when that element lacks a ``w:val`` attribute.
    """
    style = p.find("./w:pPr/w:pStyle", NS)
    if style is None:
        return "(none)"
    return style.attrib.get(f"{{{NS['w']}}}val", "(unknown)")


def _drop_compat_fallbacks(root: ET.Element) -> None:
    """Remove ``mc:Fallback`` branches that sit next to an ``mc:Choice``.

    An ``mc:AlternateContent`` element carries the same content twice: once in
    ``mc:Choice`` and once in ``mc:Fallback``. Counting both would report the
    paragraphs and tables inside such content (typically text boxes) twice.
    The tree is modified in place; the file on disk is never touched.
    """
    choice_tag = f"{{{_MC_NS}}}Choice"
    fallback_tag = f"{{{_MC_NS}}}Fallback"
    # Snapshot the matches first because the tree is mutated during the walk.
    for alternate in list(root.iter(f"{{{_MC_NS}}}AlternateContent")):
        if alternate.find(choice_tag) is None:
            # No preferred branch: keep the fallback so no content is lost.
            continue
        for fallback in alternate.findall(fallback_tag):
            alternate.remove(fallback)


def _is_section_title(normalized: str) -> bool:
    """Tell whether *normalized* equals a section pattern or starts with ``pattern:``."""
    return any(
        normalized == pattern or normalized.startswith(pattern + ":")
        for pattern in SECTION_PATTERNS
    )


def inspect_docx(path: Path) -> dict:
    """Collect structural statistics for the DOCX file at *path*.

    The returned dict has the keys ``path``, ``paragraphs``,
    ``nonempty_paragraphs``, ``tables``, ``drawings``, ``top_styles`` (the 20
    most common style ids with their counts), ``headings`` and
    ``section_hits``. The last two are lists of at most 80 entries, each with
    the 1-based paragraph ``index``, its ``style`` and the first 160 characters
    of its ``text``.

    Raises:
        ValueError: the package does not contain ``word/document.xml``.
        zipfile.BadZipFile: *path* is not a valid zip archive.
        OSError: *path* cannot be opened.
        ET.ParseError: ``word/document.xml`` is not well-formed XML.
    """
    with zipfile.ZipFile(path) as zf:
        document = read_xml(zf, "word/document.xml")
    if document is None:
        raise ValueError(f"{path} does not contain word/document.xml")

    # Avoid double counting content duplicated for backward compatibility.
    _drop_compat_fallbacks(document)

    paragraphs = document.findall(".//w:p", NS)
    tables = document.findall(".//w:tbl", NS)
    drawings = document.findall(".//w:drawing", NS)

    style_counts: collections.Counter[str] = collections.Counter()
    headings = []
    section_hits = []
    nonempty_count = 0

    for index, p in enumerate(paragraphs, start=1):
        text = text_from_paragraph(p)
        style = style_from_paragraph(p)
        # Style usage counts every paragraph, including empty ones.
        style_counts[style] += 1
        if not text:
            continue
        nonempty_count += 1

        entry = {"index": index, "style": style, "text": text[:_PREVIEW_CHARS]}
        if style.lower().startswith(_HEADING_STYLE_PREFIXES):
            headings.append(entry)
        normalized = _WHITESPACE_RE.sub(" ", text).strip().lower()
        if _is_section_title(normalized):
            # Separate dict so the two report lists never share an object.
            section_hits.append(dict(entry))

    return {
        "path": str(path),
        "paragraphs": len(paragraphs),
        "nonempty_paragraphs": nonempty_count,
        "tables": len(tables),
        "drawings": len(drawings),
        "top_styles": style_counts.most_common(_TOP_STYLES),
        "headings": headings[:_MAX_LISTED],
        "section_hits": section_hits[:_MAX_LISTED],
    }


def main(argv: list[str] | None = None) -> int:
    """Run the command-line audit and return the process exit status.

    *argv* is the argument list without the program name; ``None`` makes
    argparse read ``sys.argv``. The JSON report is written to ``--out`` when
    given, otherwise printed to stdout. Returns 0 on success and 1 when a
    document cannot be read or the report cannot be written; the reason is
    printed to stderr.
    """
    parser = argparse.ArgumentParser(
        description="Audit a manuscript DOCX against a template DOCX."
    )
    parser.add_argument("--manuscript", required=True, type=Path)
    parser.add_argument("--template", required=True, type=Path)
    parser.add_argument("--out", type=Path, help="Optional JSON output path.")
    args = parser.parse_args(argv)

    try:
        report = {
            "manuscript": inspect_docx(args.manuscript),
            "template": inspect_docx(args.template),
        }
    except (OSError, ValueError, zipfile.BadZipFile, ET.ParseError) as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 1

    payload = json.dumps(report, ensure_ascii=False, indent=2)
    if args.out is None:
        print(payload)
        return 0

    try:
        args.out.write_text(payload, encoding="utf-8")
    except OSError as exc:
        print(f"error: cannot write {args.out}: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())