From 263b5d5557efd7632f0d02adb6f7f44020f19b41 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Mon, 9 Jun 2025 17:55:26 -0700 Subject: [PATCH] Use html2text to extract the text of web searches without losing formatting --- modules/web_search.py | 36 ++++++++++--------- requirements/full/requirements.txt | 2 +- requirements/full/requirements_amd.txt | 2 +- requirements/full/requirements_amd_noavx2.txt | 2 +- .../full/requirements_apple_intel.txt | 2 +- .../full/requirements_apple_silicon.txt | 2 +- requirements/full/requirements_cpu_only.txt | 2 +- .../full/requirements_cpu_only_noavx2.txt | 2 +- requirements/full/requirements_cuda128.txt | 2 +- .../full/requirements_cuda128_noavx2.txt | 2 +- requirements/full/requirements_noavx2.txt | 2 +- requirements/full/requirements_nowheels.txt | 2 +- requirements/portable/requirements.txt | 2 +- .../portable/requirements_apple_intel.txt | 2 +- .../portable/requirements_apple_silicon.txt | 2 +- .../portable/requirements_cpu_only.txt | 2 +- .../portable/requirements_cpu_only_noavx2.txt | 2 +- requirements/portable/requirements_noavx2.txt | 2 +- .../portable/requirements_nowheels.txt | 2 +- requirements/portable/requirements_vulkan.txt | 2 +- .../portable/requirements_vulkan_noavx2.txt | 2 +- 21 files changed, 39 insertions(+), 37 deletions(-) diff --git a/modules/web_search.py b/modules/web_search.py index 2b6c6c40..ffd7e483 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -3,8 +3,6 @@ from concurrent.futures import as_completed from datetime import datetime import requests -from bs4 import BeautifulSoup -from duckduckgo_search import DDGS from modules.logging_colors import logger @@ -14,35 +12,39 @@ def get_current_timestamp(): return datetime.now().strftime('%b %d, %Y %H:%M') -def download_web_page(url, timeout=5): - """Download and extract text from a web page""" +def download_web_page(url, timeout=10): + """ + Download a web page and convert its HTML content to structured Markdown text. + """ + import html2text + try: headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } response = requests.get(url, headers=headers, timeout=timeout) - response.raise_for_status() + response.raise_for_status() # Raise an exception for bad status codes - soup = BeautifulSoup(response.content, 'html.parser') + # Initialize the HTML to Markdown converter + h = html2text.HTML2Text() + h.body_width = 0 - # Remove script and style elements - for script in soup(["script", "style"]): - script.decompose() + # Convert the HTML to Markdown + markdown_text = h.handle(response.text) - # Get text and clean it up - text = soup.get_text() - lines = (line.strip() for line in text.splitlines()) - chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) - text = ' '.join(chunk for chunk in chunks if chunk) - - return text - except Exception as e: + return markdown_text + except requests.exceptions.RequestException as e: logger.error(f"Error downloading {url}: {e}") return "" + except Exception as e: + logger.error(f"An unexpected error occurred: {e}") + return "" def perform_web_search(query, num_pages=3, max_workers=5): """Perform web search and return results with content""" + from duckduckgo_search import DDGS + try: with DDGS() as ddgs: results = list(ddgs.text(query, max_results=num_pages)) diff --git a/requirements/full/requirements.txt b/requirements/full/requirements.txt index 277f8249..b751482a 100644 --- a/requirements/full/requirements.txt +++ b/requirements/full/requirements.txt @@ -1,5 +1,4 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets @@ -7,6 +6,7 @@ duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_amd.txt b/requirements/full/requirements_amd.txt index dbf35c34..11bacf97 100644 --- a/requirements/full/requirements_amd.txt +++ b/requirements/full/requirements_amd.txt @@ -1,11 +1,11 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 colorama datasets duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_amd_noavx2.txt b/requirements/full/requirements_amd_noavx2.txt index 2e5eb6c9..a64a93f0 100644 --- a/requirements/full/requirements_amd_noavx2.txt +++ b/requirements/full/requirements_amd_noavx2.txt @@ -1,11 +1,11 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 colorama datasets duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_apple_intel.txt b/requirements/full/requirements_apple_intel.txt index 9a19ab29..62747ac4 100644 --- a/requirements/full/requirements_apple_intel.txt +++ b/requirements/full/requirements_apple_intel.txt @@ -1,11 +1,11 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 colorama datasets duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_apple_silicon.txt b/requirements/full/requirements_apple_silicon.txt index 973d9bfb..bc82f07a 100644 --- a/requirements/full/requirements_apple_silicon.txt +++ b/requirements/full/requirements_apple_silicon.txt @@ -1,11 +1,11 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 colorama datasets duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_cpu_only.txt b/requirements/full/requirements_cpu_only.txt index 4a48a51f..f880f40a 100644 --- a/requirements/full/requirements_cpu_only.txt +++ b/requirements/full/requirements_cpu_only.txt @@ -1,11 +1,11 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 colorama datasets duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_cpu_only_noavx2.txt b/requirements/full/requirements_cpu_only_noavx2.txt index 76bde864..6d8875cb 100644 --- a/requirements/full/requirements_cpu_only_noavx2.txt +++ b/requirements/full/requirements_cpu_only_noavx2.txt @@ -1,11 +1,11 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 colorama datasets duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_cuda128.txt b/requirements/full/requirements_cuda128.txt index 9fc99606..b2bcf91c 100644 --- a/requirements/full/requirements_cuda128.txt +++ b/requirements/full/requirements_cuda128.txt @@ -1,5 +1,4 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets @@ -7,6 +6,7 @@ duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* diff --git a/requirements/full/requirements_cuda128_noavx2.txt b/requirements/full/requirements_cuda128_noavx2.txt index ff34673a..54496cd7 100644 --- a/requirements/full/requirements_cuda128_noavx2.txt +++ b/requirements/full/requirements_cuda128_noavx2.txt @@ -1,5 +1,4 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets @@ -7,6 +6,7 @@ duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==2.2.* diff --git a/requirements/full/requirements_noavx2.txt b/requirements/full/requirements_noavx2.txt index 6cd0fa65..eabcdbd0 100644 --- a/requirements/full/requirements_noavx2.txt +++ b/requirements/full/requirements_noavx2.txt @@ -1,5 +1,4 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 bitsandbytes==0.45.* colorama datasets @@ -7,6 +6,7 @@ duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/full/requirements_nowheels.txt b/requirements/full/requirements_nowheels.txt index a412367c..d26663a7 100644 --- a/requirements/full/requirements_nowheels.txt +++ b/requirements/full/requirements_nowheels.txt @@ -1,11 +1,11 @@ accelerate==1.5.* -beautifulsoup4==4.13.4 colorama datasets duckduckgo_search==8.0.2 einops fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements.txt b/requirements/portable/requirements.txt index 60ce941e..5e5d4ba5 100644 --- a/requirements/portable/requirements.txt +++ b/requirements/portable/requirements.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_apple_intel.txt b/requirements/portable/requirements_apple_intel.txt index b1649bc9..4909f5a2 100644 --- a/requirements/portable/requirements_apple_intel.txt +++ b/requirements/portable/requirements_apple_intel.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_apple_silicon.txt b/requirements/portable/requirements_apple_silicon.txt index 571eba52..e54b2593 100644 --- a/requirements/portable/requirements_apple_silicon.txt +++ b/requirements/portable/requirements_apple_silicon.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_cpu_only.txt b/requirements/portable/requirements_cpu_only.txt index 88170cf3..74c0c5a7 100644 --- a/requirements/portable/requirements_cpu_only.txt +++ b/requirements/portable/requirements_cpu_only.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_cpu_only_noavx2.txt b/requirements/portable/requirements_cpu_only_noavx2.txt index e96cef49..264bc378 100644 --- a/requirements/portable/requirements_cpu_only_noavx2.txt +++ b/requirements/portable/requirements_cpu_only_noavx2.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_noavx2.txt b/requirements/portable/requirements_noavx2.txt index 78f94aa5..fcb8f05e 100644 --- a/requirements/portable/requirements_noavx2.txt +++ b/requirements/portable/requirements_noavx2.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_nowheels.txt b/requirements/portable/requirements_nowheels.txt index f6c866cf..3d30e6d6 100644 --- a/requirements/portable/requirements_nowheels.txt +++ b/requirements/portable/requirements_nowheels.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_vulkan.txt b/requirements/portable/requirements_vulkan.txt index 3e41427d..395f225f 100644 --- a/requirements/portable/requirements_vulkan.txt +++ b/requirements/portable/requirements_vulkan.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.* diff --git a/requirements/portable/requirements_vulkan_noavx2.txt b/requirements/portable/requirements_vulkan_noavx2.txt index 022ebb61..0d41f541 100644 --- a/requirements/portable/requirements_vulkan_noavx2.txt +++ b/requirements/portable/requirements_vulkan_noavx2.txt @@ -1,7 +1,7 @@ -beautifulsoup4==4.13.4 duckduckgo_search==8.0.2 fastapi==0.112.4 gradio==4.37.* +html2text==2025.4.15 jinja2==3.1.6 markdown numpy==1.26.*