Replace PyPDF2 with pymupdf for PDF text extraction

pymupdf produces cleaner text (e.g. no concatenated words in headers),
handles encrypted and malformed PDFs that PyPDF2 failed on, and
supports non-Latin scripts.
This commit is contained in:
oobabooga 2026-03-04 06:43:37 -08:00
parent f4d787ab8d
commit f010aa1612
15 changed files with 18 additions and 20 deletions

View file

@@ -657,15 +657,13 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
def extract_pdf_text(pdf_path):
"""Extract text from a PDF file"""
import PyPDF2
import pymupdf
text = ""
try:
with open(pdf_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
text += page.extract_text() + "\n\n"
with pymupdf.open(pdf_path) as doc:
for page in doc:
text += page.get_text() + "\n\n"
return text.strip()
except Exception as e: