Skip bad PDF pages during text extraction

This commit is contained in:
qyh15 2026-05-22 22:51:21 +08:00
parent 5630118f7d
commit 69d175c684
1 changed file with 8 additions and 2 deletions

View File

@@ -409,7 +409,10 @@ def extract_pdf_text(path: Path, max_chars: int) -> str:
reader = PdfReader(str(path)) reader = PdfReader(str(path))
chunks = [] chunks = []
for page in reader.pages[:max_pages]: for page in reader.pages[:max_pages]:
try:
chunks.append(page.extract_text() or "") chunks.append(page.extract_text() or "")
except Exception:
continue
if sum(len(chunk) for chunk in chunks) >= max_chars: if sum(len(chunk) for chunk in chunks) >= max_chars:
break break
return "\n".join(chunks)[:max_chars] return "\n".join(chunks)[:max_chars]
@@ -419,7 +422,10 @@ def extract_pdf_text(path: Path, max_chars: int) -> str:
reader = PdfReader(str(path)) reader = PdfReader(str(path))
chunks = [] chunks = []
for page in reader.pages[:max_pages]: for page in reader.pages[:max_pages]:
try:
chunks.append(page.extract_text() or "") chunks.append(page.extract_text() or "")
except Exception:
continue
if sum(len(chunk) for chunk in chunks) >= max_chars: if sum(len(chunk) for chunk in chunks) >= max_chars:
break break
return "\n".join(chunks)[:max_chars] return "\n".join(chunks)[:max_chars]