Use html2text to extract the text of web searches without losing formatting

This commit is contained in:
oobabooga 2025-06-09 17:55:26 -07:00
parent f5a5d0c0cb
commit 263b5d5557
21 changed files with 39 additions and 37 deletions

View file

@@ -3,8 +3,6 @@ from concurrent.futures import as_completed
 from datetime import datetime
 import requests
-from bs4 import BeautifulSoup
-from duckduckgo_search import DDGS
 from modules.logging_colors import logger
@@ -14,35 +12,39 @@ def get_current_timestamp():
     return datetime.now().strftime('%b %d, %Y %H:%M')
-def download_web_page(url, timeout=5):
-    """Download and extract text from a web page"""
+def download_web_page(url, timeout=10):
+    """
+    Download a web page and convert its HTML content to structured Markdown text.
+    """
+    import html2text
     try:
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
         response = requests.get(url, headers=headers, timeout=timeout)
-        response.raise_for_status()
+        response.raise_for_status() # Raise an exception for bad status codes
-        soup = BeautifulSoup(response.content, 'html.parser')
+        # Initialize the HTML to Markdown converter
+        h = html2text.HTML2Text()
+        h.body_width = 0
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
+        # Convert the HTML to Markdown
+        markdown_text = h.handle(response.text)
-        # Get text and clean it up
-        text = soup.get_text()
-        lines = (line.strip() for line in text.splitlines())
-        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-        text = ' '.join(chunk for chunk in chunks if chunk)
-        return text
-    except Exception as e:
+        return markdown_text
+    except requests.exceptions.RequestException as e:
         logger.error(f"Error downloading {url}: {e}")
         return ""
+    except Exception as e:
+        logger.error(f"An unexpected error occurred: {e}")
+        return ""
 def perform_web_search(query, num_pages=3, max_workers=5):
     """Perform web search and return results with content"""
+    from duckduckgo_search import DDGS
     try:
         with DDGS() as ddgs:
             results = list(ddgs.text(query, max_results=num_pages))

View file

@@ -1,5 +1,4 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 bitsandbytes==0.45.*
 colorama
 datasets
@@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,11 +1,11 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 colorama
 datasets
 duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,11 +1,11 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 colorama
 datasets
 duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,11 +1,11 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 colorama
 datasets
 duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,11 +1,11 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 colorama
 datasets
 duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,11 +1,11 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 colorama
 datasets
 duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,11 +1,11 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 colorama
 datasets
 duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,5 +1,4 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 bitsandbytes==0.45.*
 colorama
 datasets
@@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*

View file

@@ -1,5 +1,4 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 bitsandbytes==0.45.*
 colorama
 datasets
@@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==2.2.*

View file

@@ -1,5 +1,4 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 bitsandbytes==0.45.*
 colorama
 datasets
@@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,11 +1,11 @@
 accelerate==1.5.*
-beautifulsoup4==4.13.4
 colorama
 datasets
 duckduckgo_search==8.0.2
 einops
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*

View file

@@ -1,7 +1,7 @@
-beautifulsoup4==4.13.4
 duckduckgo_search==8.0.2
 fastapi==0.112.4
 gradio==4.37.*
+html2text==2025.4.15
 jinja2==3.1.6
 markdown
 numpy==1.26.*