Clean invalid Unicode before LLM requests

2026-05-22 16:49:40 +08:00 · 2026-05-22 16:49:40 +08:00 · 5630118f7d
parent dd6e06665c
commit 5630118f7d
1 changed file with 11 additions and 1 deletion
--- a/scripts/generate_zotero_ai_note.py
+++ b/scripts/generate_zotero_ai_note.py
@@ -52,6 +52,16 @@ def fail(message: str) -> None:
    raise SystemExit(1)


+def clean_text(value: Any) -> Any:  # Recursively strip text that cannot be UTF-8 encoded from str/list/dict values.
+    if isinstance(value, str):
+        return value.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")  # errors="ignore" drops unencodable code points (e.g. lone surrogates)
+    if isinstance(value, list):
+        return [clean_text(item) for item in value]
+    if isinstance(value, dict):
+        return {clean_text(key): clean_text(item) for key, item in value.items()}  # keys are cleaned too; keys that become equal collapse into one entry
+    return value  # any other type (including tuple) is returned unchanged
+
+
 def load_dotenv(path: Path) -> None:
    if not path.exists():
        return
@@ -473,7 +483,7 @@ def build_prompt(item: dict[str, Any], bibtex: str, fulltext: str, vault: Path,
        [
            "只输出最终 Markdown 笔记，不要输出解释、判断过程或代码围栏。",
            "文献材料如下：",
-            json.dumps(source, ensure_ascii=False, indent=2),
+            json.dumps(clean_text(source), ensure_ascii=False, indent=2),
        ]
    )
    return "\n\n".join(instructions)