Skip bad PDF pages during text extraction
This commit is contained in:
parent
5630118f7d
commit
69d175c684
|
|
@@ -409,7 +409,10 @@ def extract_pdf_text(path: Path, max_chars: int) -> str:
|
||||||
reader = PdfReader(str(path))
|
reader = PdfReader(str(path))
|
||||||
chunks = []
|
chunks = []
|
||||||
for page in reader.pages[:max_pages]:
|
for page in reader.pages[:max_pages]:
|
||||||
chunks.append(page.extract_text() or "")
|
try:
|
||||||
|
chunks.append(page.extract_text() or "")
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
if sum(len(chunk) for chunk in chunks) >= max_chars:
|
if sum(len(chunk) for chunk in chunks) >= max_chars:
|
||||||
break
|
break
|
||||||
return "\n".join(chunks)[:max_chars]
|
return "\n".join(chunks)[:max_chars]
|
||||||
|
|
@@ -419,7 +422,10 @@ def extract_pdf_text(path: Path, max_chars: int) -> str:
|
||||||
reader = PdfReader(str(path))
|
reader = PdfReader(str(path))
|
||||||
chunks = []
|
chunks = []
|
||||||
for page in reader.pages[:max_pages]:
|
for page in reader.pages[:max_pages]:
|
||||||
chunks.append(page.extract_text() or "")
|
try:
|
||||||
|
chunks.append(page.extract_text() or "")
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
if sum(len(chunk) for chunk in chunks) >= max_chars:
|
if sum(len(chunk) for chunk in chunks) >= max_chars:
|
||||||
break
|
break
|
||||||
return "\n".join(chunks)[:max_chars]
|
return "\n".join(chunks)[:max_chars]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue