Replace PyPDF2 with pymupdf for PDF text extraction

pymupdf produces cleaner text (e.g. no concatenated words in headers),
handles encrypted and malformed PDFs that PyPDF2 failed on, and
supports non-Latin scripts.
This commit is contained in:
oobabooga 2026-03-04 06:43:37 -08:00
parent f4d787ab8d
commit f010aa1612
15 changed files with 18 additions and 20 deletions

View file

@@ -17,7 +17,7 @@ peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -15,7 +15,7 @@ peft==0.18.*
 Pillow>=9.5.0
 psutil
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests

View file

@@ -6,7 +6,7 @@ jinja2==3.1.6
 markdown
 numpy==2.2.*
 pydantic==2.11.0
-PyPDF2==3.0.1
+pymupdf==1.27.1
 python-docx==1.1.2
 pyyaml
 requests