mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-01-05 00:01:01 +01:00
Use html2text to extract the text of web searches without losing formatting
This commit is contained in:
parent
f5a5d0c0cb
commit
263b5d5557
|
|
@ -3,8 +3,6 @@ from concurrent.futures import as_completed
|
|||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from duckduckgo_search import DDGS
|
||||
|
||||
from modules.logging_colors import logger
|
||||
|
||||
|
|
@ -14,35 +12,39 @@ def get_current_timestamp():
|
|||
return datetime.now().strftime('%b %d, %Y %H:%M')
|
||||
|
||||
|
||||
def download_web_page(url, timeout=5):
|
||||
"""Download and extract text from a web page"""
|
||||
def download_web_page(url, timeout=10):
|
||||
"""
|
||||
Download a web page and convert its HTML content to structured Markdown text.
|
||||
"""
|
||||
import html2text
|
||||
|
||||
try:
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
||||
}
|
||||
response = requests.get(url, headers=headers, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
response.raise_for_status() # Raise an exception for bad status codes
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
# Initialize the HTML to Markdown converter
|
||||
h = html2text.HTML2Text()
|
||||
h.body_width = 0
|
||||
|
||||
# Remove script and style elements
|
||||
for script in soup(["script", "style"]):
|
||||
script.decompose()
|
||||
# Convert the HTML to Markdown
|
||||
markdown_text = h.handle(response.text)
|
||||
|
||||
# Get text and clean it up
|
||||
text = soup.get_text()
|
||||
lines = (line.strip() for line in text.splitlines())
|
||||
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
||||
text = ' '.join(chunk for chunk in chunks if chunk)
|
||||
|
||||
return text
|
||||
except Exception as e:
|
||||
return markdown_text
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Error downloading {url}: {e}")
|
||||
return ""
|
||||
except Exception as e:
|
||||
logger.error(f"An unexpected error occurred: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
def perform_web_search(query, num_pages=3, max_workers=5):
|
||||
"""Perform web search and return results with content"""
|
||||
from duckduckgo_search import DDGS
|
||||
|
||||
try:
|
||||
with DDGS() as ddgs:
|
||||
results = list(ddgs.text(query, max_results=num_pages))
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
bitsandbytes==0.45.*
|
||||
colorama
|
||||
datasets
|
||||
|
|
@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
|
|||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
colorama
|
||||
datasets
|
||||
duckduckgo_search==8.0.2
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
colorama
|
||||
datasets
|
||||
duckduckgo_search==8.0.2
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
colorama
|
||||
datasets
|
||||
duckduckgo_search==8.0.2
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
colorama
|
||||
datasets
|
||||
duckduckgo_search==8.0.2
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
colorama
|
||||
datasets
|
||||
duckduckgo_search==8.0.2
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
colorama
|
||||
datasets
|
||||
duckduckgo_search==8.0.2
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
bitsandbytes==0.45.*
|
||||
colorama
|
||||
datasets
|
||||
|
|
@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
|
|||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==2.2.*
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
bitsandbytes==0.45.*
|
||||
colorama
|
||||
datasets
|
||||
|
|
@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
|
|||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==2.2.*
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
bitsandbytes==0.45.*
|
||||
colorama
|
||||
datasets
|
||||
|
|
@ -7,6 +6,7 @@ duckduckgo_search==8.0.2
|
|||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
accelerate==1.5.*
|
||||
beautifulsoup4==4.13.4
|
||||
colorama
|
||||
datasets
|
||||
duckduckgo_search==8.0.2
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
beautifulsoup4==4.13.4
|
||||
duckduckgo_search==8.0.2
|
||||
fastapi==0.112.4
|
||||
gradio==4.37.*
|
||||
html2text==2025.4.15
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
numpy==1.26.*
|
||||
|
|
|
|||
Loading…
Reference in a new issue