Clean invalid Unicode before LLM requests

This commit is contained in:
qyh15 2026-05-22 16:49:40 +08:00
parent dd6e06665c
commit 5630118f7d
1 changed file with 11 additions and 1 deletion

View File

@@ -52,6 +52,16 @@ def fail(message: str) -> None:
raise SystemExit(1) raise SystemExit(1)
def clean_text(value: Any) -> Any:
    """Return a copy of *value* with un-encodable characters stripped from every string.

    Strings are round-tripped through UTF-8 with ``errors="ignore"``, which
    drops code points that cannot be encoded (e.g. lone surrogates) and leaves
    all other text untouched. Lists, tuples and dicts are rebuilt recursively
    (dict keys are cleaned too); any other value is returned unchanged.

    Note: if two dict keys differ only by characters that get dropped, they
    collapse into one key and the later entry wins.
    """
    if isinstance(value, str):
        # Encoding with "ignore" silently discards anything UTF-8 cannot
        # represent; the result is valid UTF-8 by construction.
        return value.encode("utf-8", errors="ignore").decode("utf-8", errors="ignore")
    if isinstance(value, list):
        return [clean_text(item) for item in value]
    if isinstance(value, tuple):
        # json.dumps serializes tuples as arrays, so strings nested inside
        # them must be cleaned as well.
        cleaned = [clean_text(item) for item in value]
        if hasattr(value, "_fields"):
            # Namedtuple: rebuild through its own constructor to keep field access.
            return type(value)._make(cleaned)
        return tuple(cleaned)
    if isinstance(value, dict):
        return {clean_text(key): clean_text(item) for key, item in value.items()}
    return value
def load_dotenv(path: Path) -> None: def load_dotenv(path: Path) -> None:
if not path.exists(): if not path.exists():
return return
@@ -473,7 +483,7 @@ def build_prompt(item: dict[str, Any], bibtex: str, fulltext: str, vault: Path,
[ [
"只输出最终 Markdown 笔记,不要输出解释、判断过程或代码围栏。", "只输出最终 Markdown 笔记,不要输出解释、判断过程或代码围栏。",
"文献材料如下:", "文献材料如下:",
json.dumps(source, ensure_ascii=False, indent=2), json.dumps(clean_text(source), ensure_ascii=False, indent=2),
] ]
) )
return "\n\n".join(instructions) return "\n\n".join(instructions)