Replace PyPDF2 with pymupdf for PDF text extraction

pymupdf produces cleaner text (e.g. no concatenated words in headers),
handles encrypted and malformed PDFs that PyPDF2 failed on, and
supports non-Latin scripts.
This commit is contained in:
oobabooga 2026-03-04 06:43:37 -08:00
parent f4d787ab8d
commit f010aa1612
15 changed files with 18 additions and 20 deletions

View file

@@ -657,15 +657,13 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
def extract_pdf_text(pdf_path):
"""Extract text from a PDF file"""
import PyPDF2
import pymupdf
text = ""
try:
with open(pdf_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
text += page.extract_text() + "\n\n"
with pymupdf.open(pdf_path) as doc:
for page in doc:
text += page.get_text() + "\n\n"
return text.strip()
except Exception as e:

View file

@@ -17,7 +17,7 @@ peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
Pillow>=9.5.0
psutil
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
markdown
numpy==2.2.*
pydantic==2.11.0
PyPDF2==3.0.1
pymupdf==1.27.1
python-docx==1.1.2
pyyaml
requests