From c908ac00d76d263c202a1b0cd2ed48c6f369f5e5 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sat, 14 Mar 2026 09:29:17 -0700 Subject: [PATCH] Replace html2text with trafilatura for better web content extraction After this change a lot of boilerplate is removed from web pages, saving tokens on agentic loops. --- modules/web_search.py | 21 ++++++++----------- requirements/full/requirements.txt | 1 - requirements/full/requirements_amd.txt | 1 - .../full/requirements_apple_intel.txt | 1 - .../full/requirements_apple_silicon.txt | 1 - requirements/full/requirements_cpu_only.txt | 1 - requirements/full/requirements_nowheels.txt | 1 - requirements/portable/requirements.txt | 2 +- requirements/portable/requirements_amd.txt | 2 +- .../portable/requirements_apple_intel.txt | 2 +- .../portable/requirements_apple_silicon.txt | 2 +- .../portable/requirements_cpu_only.txt | 2 +- .../portable/requirements_cuda131.txt | 2 +- .../portable/requirements_nowheels.txt | 2 +- requirements/portable/requirements_vulkan.txt | 2 +- 15 files changed, 17 insertions(+), 26 deletions(-) diff --git a/modules/web_search.py b/modules/web_search.py index 216d7933..a4424ee3 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -42,9 +42,9 @@ def get_current_timestamp(): def download_web_page(url, timeout=10, include_links=False): """ - Download a web page and convert its HTML content to structured Markdown text. + Download a web page and extract its main content as Markdown text. """ - import html2text + import trafilatura try: _validate_url(url) @@ -62,16 +62,13 @@ def download_web_page(url, timeout=10, include_links=False): response.raise_for_status() - # Initialize the HTML to Markdown converter - h = html2text.HTML2Text() - h.body_width = 0 - h.ignore_images = True - h.ignore_links = not include_links - - # Convert the HTML to Markdown - markdown_text = h.handle(response.text) - - return markdown_text + result = trafilatura.extract( + response.text, + include_links=include_links, + output_format='markdown', + url=url + ) + return result or "" except requests.exceptions.RequestException as e: logger.error(f"Error downloading {url}: {e}") return "" diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 8a0802f7..e493d83d 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -6,7 +6,6 @@ diffusers==0.37.* einops fastapi==0.112.4 flash-linear-attention==0.4.* -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index 9b31d668..48cace33 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -4,7 +4,6 @@ datasets diffusers==0.37.* einops fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 138639e5..f9132f2e 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -4,7 +4,6 @@ datasets diffusers==0.37.* einops fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index f3ebd171..e4b2882d 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -4,7 +4,6 @@ datasets diffusers==0.37.* einops fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index e32a2ed1..1b42737b 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -4,7 +4,6 @@ datasets diffusers==0.37.* einops fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index 6128c0ed..ea9ad2c7 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -4,7 +4,6 @@ datasets diffusers==0.37.* einops fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 93eb3b85..0471cc73 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio diff --git a/requirements/portable/requirements_amd.txt b/requirements/portable/requirements_amd.txt index 36e0e4d9..dfefce20 100644 --- a/requirements/portable/requirements_amd.txt +++ b/requirements/portable/requirements_amd.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index 495bd5fa..5c032e6b 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 7e82f68d..385ecedf 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 046619e1..d8f7d494 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio diff --git a/requirements/portable/requirements_cuda131.txt b/requirements/portable/requirements_cuda131.txt index 590562f8..adc6a065 100644 --- a/requirements/portable/requirements_cuda131.txt +++ b/requirements/portable/requirements_cuda131.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index 8c3e2aac..942f7a2a 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index bf80deb0..fca722fd 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,6 +1,5 @@ audioop-lts<1.0; python_version >= "3.13" fastapi==0.112.4 -html2text==2025.4.15 huggingface-hub==1.5.* jinja2==3.1.6 markdown @@ -11,6 +10,7 @@ python-docx==1.1.2 pyyaml requests rich +trafilatura==2.0.0 tqdm # Gradio