QQwrite-skill/scripts/docx_template_audit.py

128 lines
3.8 KiB
Python

#!/usr/bin/env python3
"""Create a lightweight structural audit for a manuscript/template DOCX pair.
This script intentionally uses only the Python standard library. It inspects the
OOXML package directly and reports headings, paragraph-style usage, section-like
paragraphs, table count, and image count. It does not modify documents.
"""
from __future__ import annotations
import argparse
import collections
import json
import re
import sys
import zipfile
from pathlib import Path
from xml.etree import ElementTree as ET
# Prefix -> namespace URI map handed to ElementTree's find/findall calls so
# that XPath expressions in this file can use short prefixes such as "w:p".
NS = {
    # WordprocessingML: paragraphs, runs, tables, styles.
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    # DrawingML.
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
}
# Lower-case section titles looked for by inspect_docx. A paragraph counts as
# a hit when its whitespace-normalized, lower-cased text equals one of these
# entries or starts with an entry immediately followed by a colon.
SECTION_PATTERNS = [
    # Front matter.
    "abstract",
    "keywords",
    # Main body.
    "introduction",
    "results",
    "discussion",
    "conclusion",
    "experimental",
    "methods",
    # Back matter; both spellings of "acknowledg(e)ments" are accepted.
    "acknowledgements",
    "acknowledgments",
    "conflict of interest",
    "data availability",
    "references",
]
def read_xml(zf: zipfile.ZipFile, name: str) -> ET.Element | None:
    """Parse one member of an open ZIP archive as XML.

    Args:
        zf: Open archive to read from.
        name: Path of the member inside the archive, e.g.
            ``word/document.xml``.

    Returns:
        The root element of the parsed member, or ``None`` when the archive
        has no member called *name*.
    """
    try:
        raw = zf.read(name)
    except KeyError:
        # ZipFile.read signals a missing member with KeyError.
        return None
    return ET.fromstring(raw)
def text_from_paragraph(p: ET.Element) -> str:
    """Return the text carried by a paragraph element.

    The text of every ``w:t`` descendant of *p* is concatenated in document
    order (elements without text are skipped) and the result is stripped of
    leading and trailing whitespace.
    """
    run_texts = (node.text for node in p.iterfind(".//w:t", NS))
    return "".join(chunk for chunk in run_texts if chunk).strip()
def style_from_paragraph(p: ET.Element) -> str:
    """Return the value of a paragraph's ``w:pStyle`` reference.

    Returns:
        The ``w:val`` attribute of ``w:pPr/w:pStyle``; ``"(unknown)"`` when
        the element exists without that attribute; ``"(none)"`` when the
        paragraph has no ``w:pStyle`` element at all.
    """
    style_node = p.find("./w:pPr/w:pStyle", NS)
    # Compare against None explicitly: an Element without children is falsy.
    if style_node is not None:
        # Attributes are keyed in Clark notation: "{namespace-uri}local".
        val_key = "{" + NS["w"] + "}val"
        return style_node.get(val_key, "(unknown)")
    return "(none)"
def _style_names(zf: zipfile.ZipFile) -> dict[str, str]:
    """Return a style-ID -> style-name map read from ``word/styles.xml``."""
    try:
        styles = read_xml(zf, "word/styles.xml")
    except ET.ParseError:
        # Best effort only: a damaged styles part must not abort the audit,
        # which worked without it before names were resolved.
        return {}
    if styles is None:
        return {}
    id_attr = f"{{{NS['w']}}}styleId"
    val_attr = f"{{{NS['w']}}}val"
    names: dict[str, str] = {}
    for style in styles.iterfind("./w:style", NS):
        style_id = style.get(id_attr)
        name = style.find("./w:name", NS)
        if style_id and name is not None:
            names[style_id] = name.get(val_attr, "")
    return names


def _is_heading_label(label: str) -> bool:
    """Tell whether a style ID or style name denotes a heading or title."""
    return label.lower().startswith(("heading", "title"))


def inspect_docx(path: Path) -> dict:
    """Collect structural statistics for one DOCX file without modifying it.

    Args:
        path: Location of the DOCX (ZIP) package to inspect.

    Returns:
        A JSON-serializable dict with the keys ``path``, ``paragraphs``,
        ``nonempty_paragraphs``, ``tables``, ``drawings``, ``top_styles``
        (the 20 most common paragraph style IDs with their counts),
        ``headings`` and ``section_hits`` (each capped at 80 entries of
        ``{"index", "style", "text"}``, where ``index`` is the 1-based
        paragraph position and ``text`` is cut to 160 characters).

    Raises:
        ValueError: If the package has no ``word/document.xml`` part.
    """
    with zipfile.ZipFile(path) as zf:
        document = read_xml(zf, "word/document.xml")
        if document is None:
            raise ValueError(f"{path} does not contain word/document.xml")
        # w:pStyle holds a style ID, which need not look like "Heading1"
        # (IDs such as "1" or "a3" occur). Resolve IDs to style names so
        # headings are still recognized in such documents.
        style_names = _style_names(zf)

    paragraphs = document.findall(".//w:p", NS)
    tables = document.findall(".//w:tbl", NS)
    drawings = document.findall(".//w:drawing", NS)

    # Loop-invariant: "<title>:" prefixes accepted as section starts.
    section_prefixes = tuple(pattern + ":" for pattern in SECTION_PATTERNS)

    style_counts: collections.Counter[str] = collections.Counter()
    headings = []
    section_hits = []
    nonempty_count = 0
    for index, p in enumerate(paragraphs, start=1):
        text = text_from_paragraph(p)
        style = style_from_paragraph(p)
        # Style usage counts every paragraph, including empty ones.
        style_counts[style] += 1
        if not text:
            continue
        nonempty_count += 1
        # text is already stripped, so collapsing whitespace runs is enough.
        normalized = re.sub(r"\s+", " ", text).lower()
        if _is_heading_label(style) or _is_heading_label(style_names.get(style, "")):
            headings.append({"index": index, "style": style, "text": text[:160]})
        if normalized in SECTION_PATTERNS or normalized.startswith(section_prefixes):
            section_hits.append({"index": index, "style": style, "text": text[:160]})

    return {
        "path": str(path),
        "paragraphs": len(paragraphs),
        "nonempty_paragraphs": nonempty_count,
        "tables": len(tables),
        "drawings": len(drawings),
        "top_styles": style_counts.most_common(20),
        "headings": headings[:80],
        "section_hits": section_hits[:80],
    }
def main(argv: list[str] | None = None) -> int:
    """Run the audit from the command line.

    Inspects the manuscript and the template DOCX and emits a JSON report
    with one entry for each, either to ``--out`` or to standard output.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        ``0`` on success, ``1`` when an input cannot be read or parsed or the
        report cannot be written.
    """
    parser = argparse.ArgumentParser(description="Audit a manuscript DOCX against a template DOCX.")
    parser.add_argument("--manuscript", required=True, type=Path)
    parser.add_argument("--template", required=True, type=Path)
    parser.add_argument("--out", type=Path, help="Optional JSON output path.")
    args = parser.parse_args(argv)

    try:
        report = {
            "manuscript": inspect_docx(args.manuscript),
            "template": inspect_docx(args.template),
        }
    except (OSError, ValueError, zipfile.BadZipFile, ET.ParseError) as exc:
        # Missing file, not a ZIP, no word/document.xml, or malformed XML:
        # report the problem instead of dumping a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    payload = json.dumps(report, ensure_ascii=False, indent=2)
    if args.out:
        try:
            args.out.write_text(payload, encoding="utf-8")
        except OSError as exc:
            print(f"error: cannot write {args.out}: {exc}", file=sys.stderr)
            return 1
    else:
        print(payload)
    return 0


if __name__ == "__main__":
    sys.exit(main())