Replace PyPDF2 with pymupdf for PDF text extraction

pymupdf produces cleaner text (e.g. no concatenated words in headers), handles encrypted and malformed PDFs that PyPDF2 failed on, and supports non-Latin scripts.
2026-04-05 14:45:28 +00:00 · 2026-03-04 06:43:37 -08:00 · 2026-03-04 06:43:37 -08:00 · f010aa1612
commit f010aa1612
parent f4d787ab8d
15 changed files with 18 additions and 20 deletions
--- a/modules/chat.py
+++ b/modules/chat.py
@@ -657,15 +657,13 @@ def add_message_attachment(history, row_idx, file_path, is_user=True):

 def extract_pdf_text(pdf_path):
    """Extract text from a PDF file"""
-    import PyPDF2
+    import pymupdf

    text = ""
    try:
-        with open(pdf_path, 'rb') as file:
-            pdf_reader = PyPDF2.PdfReader(file)
-            for page_num in range(len(pdf_reader.pages)):
-                page = pdf_reader.pages[page_num]
-                text += page.extract_text() + "\n\n"
+        with pymupdf.open(pdf_path) as doc:
+            for page in doc:
+                text += page.get_text() + "\n\n"

        return text.strip()
    except Exception as e: