mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-04-05 14:45:28 +00:00
Replace PyPDF2 with pymupdf for PDF text extraction
pymupdf produces cleaner text (e.g. no concatenated words in headers), handles encrypted and malformed PDFs that PyPDF2 failed on, and supports non-Latin scripts.
This commit is contained in:
parent
f4d787ab8d
commit
f010aa1612
15 changed files with 18 additions and 20 deletions
|
|
@ -657,15 +657,13 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):
|
|||
|
||||
def extract_pdf_text(pdf_path):
|
||||
"""Extract text from a PDF file"""
|
||||
import PyPDF2
|
||||
import pymupdf
|
||||
|
||||
text = ""
|
||||
try:
|
||||
with open(pdf_path, 'rb') as file:
|
||||
pdf_reader = PyPDF2.PdfReader(file)
|
||||
for page_num in range(len(pdf_reader.pages)):
|
||||
page = pdf_reader.pages[page_num]
|
||||
text += page.extract_text() + "\n\n"
|
||||
with pymupdf.open(pdf_path) as doc:
|
||||
for page in doc:
|
||||
text += page.get_text() + "\n\n"
|
||||
|
||||
return text.strip()
|
||||
except Exception as e:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue