128 lines
3.8 KiB
Python
128 lines
3.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Create a lightweight structural audit for a manuscript/template DOCX pair.
|
|
|
|
This script intentionally uses only the Python standard library. It inspects the
|
|
OOXML package directly and reports headings, paragraph-style usage, section-like
|
|
paragraphs, table count, and drawing (``w:drawing``) count. It does not modify documents.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import collections
|
|
import json
|
|
import re
|
|
import sys
|
|
import zipfile
|
|
from pathlib import Path
|
|
from xml.etree import ElementTree as ET
|
|
|
|
|
|
# Prefix -> namespace URI map handed to ElementTree find()/findall() so that
# XPath expressions in this file can use short prefixes such as "w:p".
NS = dict(
    # WordprocessingML main namespace.
    w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    # DrawingML main namespace.
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
)
|
|
|
|
|
|
# Lower-case section titles. inspect_docx() reports a paragraph as a section
# hit when its whitespace-normalized, lower-cased text equals one of these
# entries or starts with an entry immediately followed by a colon.
SECTION_PATTERNS = [
    "abstract", "keywords", "introduction",
    "results", "discussion", "conclusion",
    "experimental", "methods",
    # Both spellings are listed.
    "acknowledgements", "acknowledgments",
    "conflict of interest", "data availability", "references",
]
|
|
|
|
|
|
def read_xml(zf: zipfile.ZipFile, name: str) -> ET.Element | None:
    """Parse one member of an open zip archive as XML.

    Args:
        zf: An already-open archive.
        name: Member path inside the archive, e.g. ``word/document.xml``.

    Returns:
        The root element of the parsed member, or ``None`` when the lookup
        raises ``KeyError`` (which ``ZipFile.read`` does for an absent member).
    """
    try:
        payload = zf.read(name)
        root = ET.fromstring(payload)
    except KeyError:
        return None
    return root
|
|
|
|
|
|
def text_from_paragraph(p: ET.Element) -> str:
    """Return the concatenated text of a paragraph element.

    The text of every descendant ``w:t`` element is joined in document order
    with no separator; elements whose text is missing or empty are skipped.
    Leading and trailing whitespace is stripped from the joined result.
    """
    chunks = (node.text for node in p.iterfind(".//w:t", NS))
    return "".join(chunk for chunk in chunks if chunk).strip()
|
|
|
|
|
|
def style_from_paragraph(p: ET.Element) -> str:
    """Return the style identifier recorded on a paragraph element.

    Looks for ``w:pPr/w:pStyle`` directly under the paragraph and returns its
    ``w:val`` attribute.

    Returns:
        The attribute value; ``"(unknown)"`` when the ``w:pStyle`` element
        exists but has no ``w:val`` attribute; ``"(none)"`` when there is no
        ``w:pStyle`` element at all.
    """
    node = p.find("./w:pPr/w:pStyle", NS)
    if node is not None:
        # Attributes are keyed by their fully qualified "{namespace}name".
        return node.get(f"{{{NS['w']}}}val", "(unknown)")
    return "(none)"
|
|
|
|
|
|
def inspect_docx(path: Path) -> dict:
    """Summarize the structure of a single DOCX file.

    Opens ``path`` as a zip archive, parses ``word/document.xml`` and walks
    every ``w:p`` element found anywhere beneath the root.

    Args:
        path: Location of the DOCX file to inspect.

    Returns:
        A dict with these keys:

        * ``path``: ``str(path)``.
        * ``paragraphs``: number of ``w:p`` elements.
        * ``nonempty_paragraphs``: how many of them have non-blank text.
        * ``tables``: number of ``w:tbl`` elements.
        * ``drawings``: number of ``w:drawing`` elements.
        * ``top_styles``: up to 20 ``(style, count)`` pairs, most common
          first; empty paragraphs are included in the counts.
        * ``headings``: up to 80 entries for non-empty paragraphs whose style
          starts with "heading" or "title" (case-insensitive).
        * ``section_hits``: up to 80 entries for non-empty paragraphs whose
          normalized text matches ``SECTION_PATTERNS``.

        Each entry holds the 1-based paragraph ``index``, its ``style`` and
        the first 160 characters of its ``text``.

    Raises:
        ValueError: If the archive has no ``word/document.xml`` member.
    """
    with zipfile.ZipFile(path) as archive:
        root = read_xml(archive, "word/document.xml")
    if root is None:
        raise ValueError(f"{path} does not contain word/document.xml")

    all_paragraphs = root.findall(".//w:p", NS)
    table_total = len(root.findall(".//w:tbl", NS))
    drawing_total = len(root.findall(".//w:drawing", NS))

    usage: collections.Counter[str] = collections.Counter()
    heading_rows = []
    section_rows = []
    with_text = 0

    for position, para in enumerate(all_paragraphs, start=1):
        style_id = style_from_paragraph(para)
        usage[style_id] += 1

        content = text_from_paragraph(para)
        if not content:
            # Empty paragraphs count toward style usage only.
            continue
        with_text += 1

        row = {"index": position, "style": style_id, "text": content[:160]}

        if style_id.lower().startswith(("heading", "title")):
            heading_rows.append(dict(row))

        # Collapse whitespace runs so multi-word patterns compare reliably.
        folded = re.sub(r"\s+", " ", content).strip().lower()
        is_section = any(
            folded == name or folded.startswith(name + ":")
            for name in SECTION_PATTERNS
        )
        if is_section:
            section_rows.append(dict(row))

    summary = {
        "path": str(path),
        "paragraphs": len(all_paragraphs),
        "nonempty_paragraphs": with_text,
        "tables": table_total,
        "drawings": drawing_total,
        "top_styles": usage.most_common(20),
        "headings": heading_rows[:80],
        "section_hits": section_rows[:80],
    }
    return summary
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: audit a manuscript/template pair as JSON.

    Both documents are inspected with ``inspect_docx`` and the combined report
    is written to ``--out`` when given, otherwise printed to stdout.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        ``0`` on success. ``1`` when a document cannot be opened or parsed,
        or when the report cannot be written; a one-line message is printed
        to stderr in that case instead of a traceback.
    """
    parser = argparse.ArgumentParser(description="Audit a manuscript DOCX against a template DOCX.")
    parser.add_argument("--manuscript", required=True, type=Path)
    parser.add_argument("--template", required=True, type=Path)
    parser.add_argument("--out", type=Path, help="Optional JSON output path.")
    args = parser.parse_args(argv)

    try:
        report = {
            "manuscript": inspect_docx(args.manuscript),
            "template": inspect_docx(args.template),
        }
    except (OSError, zipfile.BadZipFile, ET.ParseError, ValueError) as exc:
        # Expected user-facing failures: missing/unreadable file, not a zip
        # archive, malformed XML, or no word/document.xml in the package.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    payload = json.dumps(report, ensure_ascii=False, indent=2)
    if args.out is not None:
        try:
            args.out.write_text(payload, encoding="utf-8")
        except OSError as exc:
            print(f"error: cannot write {args.out}: {exc}", file=sys.stderr)
            return 1
    else:
        print(payload)
    return 0
|
|
|
|
|
|
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|