QQwrite-skill/scripts/docx_template_audit.py

#!/usr/bin/env python3
"""Create a lightweight structural audit for a manuscript/template DOCX pair.

This script intentionally uses only the Python standard library. It inspects the
OOXML package directly and reports headings, paragraph-style usage, section-like
paragraphs, table count, and image count. It does not modify documents.
"""

from __future__ import annotations

import argparse
import collections
import json
import re
import sys
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET


# Namespace prefix map handed to ElementTree ``find``/``findall`` so that the
# path expressions in this file can use the short ``w:`` prefix instead of the
# full namespace URI. The "w" entry is also used to build the qualified
# ``{uri}val`` attribute name when reading a paragraph's style id.
NS = {
    # WordprocessingML main namespace (paragraphs, runs, tables, styles).
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    # DrawingML main namespace; declared here but not referenced by any query
    # visible in this file.
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
}


# Lower-case section names used to spot "section-like" paragraphs. The audit
# compares each entry against paragraph text that has been lower-cased and had
# its whitespace runs collapsed to single spaces; a paragraph counts as a hit
# when it equals an entry exactly or starts with the entry followed by a colon
# (for example "keywords: ..."). Both spellings of "acknowledg(e)ments" are
# listed so either variant is recognised.
SECTION_PATTERNS = [
    "abstract",
    "keywords",
    "introduction",
    "results",
    "discussion",
    "conclusion",
    "experimental",
    "methods",
    "acknowledgements",
    "acknowledgments",
    "conflict of interest",
    "data availability",
    "references",
]


def read_xml(zf: zipfile.ZipFile, name: str) -> ET.Element | None:
    """Parse one member of an open ZIP archive as XML.

    Args:
        zf: An already-open archive to read from.
        name: Path of the member inside the archive.

    Returns:
        The root element of the parsed member, or ``None`` when the archive
        has no member called *name*.
    """
    try:
        raw = zf.read(name)
    except KeyError:
        # ZipFile.read raises KeyError for a member that is not in the archive.
        return None
    return ET.fromstring(raw)


def text_from_paragraph(p: ET.Element) -> str:
    """Return the visible text of a paragraph element.

    Every ``w:t`` descendant of *p* (at any depth) contributes its text, in
    document order; the pieces are concatenated without a separator and
    leading/trailing whitespace is removed from the final string.

    Args:
        p: The paragraph element to read.

    Returns:
        The joined text, or an empty string when no ``w:t`` node has text.
    """
    pieces = [node.text for node in p.iterfind(".//w:t", NS) if node.text]
    return "".join(pieces).strip()


def style_from_paragraph(p: ET.Element) -> str:
    """Return the style identifier attached to a paragraph element.

    The identifier is the ``w:val`` attribute of the paragraph's
    ``w:pPr/w:pStyle`` child.

    Args:
        p: The paragraph element to read.

    Returns:
        The style id; ``"(none)"`` when the paragraph has no ``w:pStyle``
        element, or ``"(unknown)"`` when that element lacks a ``w:val``
        attribute.
    """
    style_node = p.find("./w:pPr/w:pStyle", NS)
    if style_node is not None:
        # Attributes are namespace-qualified, hence the "{uri}val" key.
        return style_node.get(f"{{{NS['w']}}}val", "(unknown)")
    return "(none)"


def inspect_docx(path: Path) -> dict:
    """Summarise the structure of the DOCX file at *path*.

    Only ``word/document.xml`` is examined. Every ``w:p`` element found at any
    depth is visited in document order and numbered from 1.

    Args:
        path: Location of the DOCX (ZIP) file to inspect.

    Returns:
        A dict with these keys:

        * ``path`` - *path* as a string.
        * ``paragraphs`` - number of ``w:p`` elements.
        * ``nonempty_paragraphs`` - how many of those have text.
        * ``tables`` - number of ``w:tbl`` elements.
        * ``drawings`` - number of ``w:drawing`` elements.
        * ``top_styles`` - the 20 most common paragraph style ids with counts
          (empty paragraphs included).
        * ``headings`` - up to 80 records for non-empty paragraphs whose style
          id starts with "heading" or "title" (case-insensitive).
        * ``section_hits`` - up to 80 records for non-empty paragraphs whose
          normalised text equals a ``SECTION_PATTERNS`` entry or starts with
          one followed by a colon.

        Each record holds ``index``, ``style`` and ``text`` (first 160
        characters).

    Raises:
        ValueError: If the archive has no ``word/document.xml`` member.
    """
    # The parsed tree lives entirely in memory, so the archive only needs to
    # stay open for the read itself.
    with zipfile.ZipFile(path) as archive:
        root = read_xml(archive, "word/document.xml")
    if root is None:
        raise ValueError(f"{path} does not contain word/document.xml")

    all_paragraphs = root.findall(".//w:p", NS)
    table_total = len(root.findall(".//w:tbl", NS))
    drawing_total = len(root.findall(".//w:drawing", NS))

    # Loop-invariant prefix tuples for str.startswith.
    heading_prefixes = ("heading", "title")
    labelled_prefixes = tuple(f"{name}:" for name in SECTION_PATTERNS)

    usage: collections.Counter[str] = collections.Counter()
    heading_rows = []
    section_rows = []
    with_text = 0

    for position, para in enumerate(all_paragraphs, start=1):
        content = text_from_paragraph(para)
        style_id = style_from_paragraph(para)
        # Style usage is tallied for every paragraph, including empty ones.
        usage[style_id] += 1
        if content:
            with_text += 1
            collapsed = re.sub(r"\s+", " ", content).strip().lower()
            if style_id.lower().startswith(heading_prefixes):
                heading_rows.append(
                    {"index": position, "style": style_id, "text": content[:160]}
                )
            if collapsed in SECTION_PATTERNS or collapsed.startswith(labelled_prefixes):
                section_rows.append(
                    {"index": position, "style": style_id, "text": content[:160]}
                )

    summary = {
        "path": str(path),
        "paragraphs": len(all_paragraphs),
        "nonempty_paragraphs": with_text,
        "tables": table_total,
        "drawings": drawing_total,
        "top_styles": usage.most_common(20),
        "headings": heading_rows[:80],
        "section_hits": section_rows[:80],
    }
    return summary


def main(argv: list[str] | None = None) -> int:
    """Audit a manuscript/template DOCX pair and emit the report as JSON.

    The report is a JSON object with a ``manuscript`` and a ``template`` key,
    each holding the result of ``inspect_docx`` for the corresponding file. It
    is written to ``--out`` (UTF-8) when given, otherwise printed to stdout.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Returns:
        ``0`` on success; ``1`` when a document cannot be opened or parsed, or
        when the report cannot be written. Invalid command-line usage still
        exits through argparse (``SystemExit`` with code 2).
    """
    parser = argparse.ArgumentParser(description="Audit a manuscript DOCX against a template DOCX.")
    parser.add_argument("--manuscript", required=True, type=Path)
    parser.add_argument("--template", required=True, type=Path)
    parser.add_argument("--out", type=Path, help="Optional JSON output path.")
    args = parser.parse_args(argv)

    report = {}
    for label, docx_path in (("manuscript", args.manuscript), ("template", args.template)):
        try:
            report[label] = inspect_docx(docx_path)
        except (OSError, zipfile.BadZipFile, ET.ParseError, ValueError) as exc:
            # Missing/unreadable file, not a ZIP archive, malformed XML, or no
            # word/document.xml: report which input failed instead of dumping
            # a traceback on the user.
            print(f"error: cannot audit {label} '{docx_path}': {exc}", file=sys.stderr)
            return 1

    payload = json.dumps(report, ensure_ascii=False, indent=2)
    if args.out is not None:
        try:
            args.out.write_text(payload, encoding="utf-8")
        except OSError as exc:
            print(f"error: cannot write report to '{args.out}': {exc}", file=sys.stderr)
            return 1
    else:
        print(payload)
    return 0


if __name__ == "__main__":
    sys.exit(main())