mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-06 05:33:50 +01:00
Replace PyPDF2 with pymupdf for PDF text extraction
pymupdf produces cleaner text (e.g. no concatenated words in headers), handles encrypted and malformed PDFs that PyPDF2 failed on, and supports non-Latin scripts.
This commit is contained in:
parent
f4d787ab8d
commit
f010aa1612
|
|
@@ -657,15 +657,13 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
|
|||
|
||||
def extract_pdf_text(pdf_path):
|
||||
"""Extract text from a PDF file"""
|
||||
import PyPDF2
|
||||
import pymupdf
|
||||
|
||||
text = ""
|
||||
try:
|
||||
with open(pdf_path, 'rb') as file:
|
||||
pdf_reader = PyPDF2.PdfReader(file)
|
||||
for page_num in range(len(pdf_reader.pages)):
|
||||
page = pdf_reader.pages[page_num]
|
||||
text += page.extract_text() + "\n\n"
|
||||
with pymupdf.open(pdf_path) as doc:
|
||||
for page in doc:
|
||||
text += page.get_text() + "\n\n"
|
||||
|
||||
return text.strip()
|
||||
except Exception as e:
|
||||
|
|
|
|||
|
|
@@ -17,7 +17,7 @@ peft==0.18.*
|
|||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -15,7 +15,7 @@ peft==0.18.*
|
|||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -15,7 +15,7 @@ peft==0.18.*
|
|||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -15,7 +15,7 @@ peft==0.18.*
|
|||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -15,7 +15,7 @@ peft==0.18.*
|
|||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -15,7 +15,7 @@ peft==0.18.*
|
|||
Pillow>=9.5.0
|
||||
psutil
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ jinja2==3.1.6
|
|||
markdown
|
||||
numpy==2.2.*
|
||||
pydantic==2.11.0
|
||||
PyPDF2==3.0.1
|
||||
pymupdf==1.27.1
|
||||
python-docx==1.1.2
|
||||
pyyaml
|
||||
requests
|
||||
|
|
|
|||
Loading…
Reference in a new issue