text-generation-webui/modules/web_search.py
import concurrent.futures
import html
import random
import re
import urllib.request
from concurrent.futures import as_completed
from datetime import datetime
from urllib.parse import quote_plus, urlparse, parse_qs, unquote

import requests

from modules import shared
from modules.logging_colors import logger


def get_current_timestamp():
    """Returns the current date and time, with the time in 24-hour format"""
    return datetime.now().strftime('%b %d, %Y %H:%M')


def download_web_page(url, timeout=10):
    """
    Download a web page and convert its HTML content to Markdown text,
    handling Brotli/gzip and non-HTML content robustly.
    """
    logger.info(f"Downloading {url}")

    # --- soft deps
    try:
        import html2text
    except Exception:
        logger.exception("html2text import failed")
        html2text = None

    try:
        from readability import Document
    except Exception:
        Document = None

    try:
        import brotli as _brotli
        have_brotli = True
    except Exception:
        _brotli = None
        have_brotli = False

    import gzip, zlib, re, html as _html
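    # Browser-like request headers; some sites block or downgrade responses for
    # the default python-requests User-Agent.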
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        # IMPORTANT: only advertise br if brotli is installed
        "Accept-Encoding": "gzip, deflate" + (", br" if have_brotli else ""),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()

        # --- bail out early if it's not HTML
        ctype = resp.headers.get("Content-Type", "").lower()
        if not any(t in ctype for t in ("text/html", "application/xhtml+xml")):
            logger.warning("Non-HTML content-type %r at %s", ctype, url)
            return ""

        # --- get raw bytes then decompress if server didn't/requests couldn't
        raw = resp.content  # bytes
        enc_hdr = resp.headers.get("Content-Encoding", "").lower()

        # If requests didn't decode (it normally does gzip/deflate), handle manually.
        if "br" in enc_hdr and have_brotli:
            try:
                raw = _brotli.decompress(raw)
            except Exception:
                # it may already be decoded; ignore
                pass
        elif "gzip" in enc_hdr:
            try:
                raw = gzip.decompress(raw)
            except Exception:
                pass
        elif "deflate" in enc_hdr:
            try:
                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
            except Exception:
                pass

        # --- decode text with a robust charset guess
        # use HTTP charset if present
        charset = None
        if "charset=" in ctype:
            charset = ctype.split("charset=")[-1].split(";")[0].strip()
        if not charset:
            # requests detector
            charset = resp.apparent_encoding or "utf-8"
        try:
            html_text = raw.decode(charset, errors="replace")
        except Exception:
            html_text = raw.decode("utf-8", errors="replace")

        # anti-bot shells (avoid empty output surprises)
        if re.search(r"(cf-chl|Just a moment|enable JavaScript)", html_text, re.I):
            logger.warning("Possible anti-bot/challenge page at %s", url)

        # --- extract readable text (readability -> html2text -> fallback)
        md_readability = ""
        if Document is not None:
            try:
                doc = Document(html_text)
                title = (doc.short_title() or "").strip()
                main_html = doc.summary(html_partial=True)
                main_text = re.sub(r"<[^>]+>", " ", main_html, flags=re.S)
                main_text = re.sub(r"\s+", " ", main_text).strip()
                if title:
                    md_readability = f"# {title}\n\n{main_text}".strip()
                else:
                    md_readability = main_text
            except Exception:
                logger.exception("readability failed on %s", url)

        md_html2text = ""
        if html2text is not None:
            try:
                h = html2text.HTML2Text()
                h.body_width = 0
                h.ignore_images = True
                h.ignore_links = True
                h.single_line_break = True
                md_html2text = (h.handle(html_text) or "").strip()
            except Exception:
                logger.exception("html2text failed on %s", url)
        def _clean(s):
            s = re.sub(r"<[^>]+>", " ", s, flags=re.S)
            return _html.unescape(re.sub(r"\s+", " ", s)).strip()

        # fallback: meta/title/headers/paragraphs + noscript
        parts = []
        t = re.search(r"<title[^>]*>(.*?)</title>", html_text, re.I | re.S)
        if t:
            parts.append(f"# {_clean(t.group(1))}")
        for pat in [
            r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\'](.*?)["\']',
            r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\'](.*?)["\']',
            r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']',
            r'<meta[^>]+name=["\']twitter:description["\'][^>]+content=["\'](.*?)["\']',
        ]:
            m = re.search(pat, html_text, re.I | re.S)
            if m:
                parts.append(_clean(m.group(1)))
        parts += [f"## {_clean(h)}" for h in re.findall(r"<h[1-3][^>]*>(.*?)</h[1-3]>", html_text, re.I | re.S)[:4] if _clean(h)]
        parts += [_clean(p) for p in re.findall(r"<p[^>]*>(.*?)</p>", html_text, re.I | re.S)[:8] if _clean(p)]
        for n in re.findall(r"<noscript[^>]*>(.*?)</noscript>", html_text, re.I | re.S):
            c = _clean(n)
            if c:
                parts.append(c)
        md_fallback = "\n\n".join([p for p in parts if p]).strip()
        best = max([md_readability, md_html2text, md_fallback], key=lambda s: len(s or ""))
        if not best.strip():
            logger.warning("Empty content extracted from %s", url)
        return best
    except requests.exceptions.RequestException as e:
        logger.error(f"Error downloading {url}: {e}")
        return ""
    except Exception:
        logger.exception("Unexpected error while downloading %s", url)
        return ""


def _extract_results_from_duckduckgo(response_text, num_pages):
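    """
    Parse DuckDuckGo's HTML results page into a list of (url, title) tuples,
    resolving redirect links and keeping at most `num_pages` entries.
    """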
    # 1) Grab the title anchors (they carry the real clickable href)
    # We capture both the inner text (title) and href.
    anchor_pattern = re.compile(
        r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
        re.DOTALL | re.IGNORECASE
    )
    matches = anchor_pattern.findall(response_text)

    results = []
    for href, title_html in matches:
        # 2) Resolve DuckDuckGo redirect: ?uddg=<encoded_target>
        parsed = urlparse(href)
        target_url = href
        if parsed.netloc.endswith("duckduckgo.com"):
            qs = parse_qs(parsed.query)
            if "uddg" in qs and qs["uddg"]:
                target_url = unquote(qs["uddg"][0])

        # 3) Clean title
        title_text = re.sub(r'<[^>]+>', '', title_html).strip()
        title_text = html.unescape(title_text)

        # 4) Basic normalization: add scheme if missing
        if target_url.startswith("//"):
            target_url = "https:" + target_url
        elif not re.match(r'^https?://', target_url, flags=re.I):
            target_url = "https://" + target_url

        results.append((target_url, title_text))
        if len(results) >= num_pages:
            break

    return results


def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
    """Perform web search and return results with content"""
    try:
        search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
        agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        ]
        req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            response_text = response.read().decode('utf-8', errors='replace')

        # Extract (url, title) pairs from the proper anchors
        download_tasks = _extract_results_from_duckduckgo(response_text, num_pages)
        if not download_tasks:
            return []
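
        # Pre-allocate so results keep the original search order even though
        # downloads can finish out of order.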
        search_results = [None] * len(download_tasks)

        # Download pages in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_index = {
                executor.submit(download_web_page, url, timeout): (i, url, title)
                for i, (url, title) in enumerate(download_tasks)
            }

            for future in as_completed(future_to_index):
                i, url, title = future_to_index[future]
                try:
                    content = future.result()
                except Exception:
                    content = ""

                search_results[i] = {
                    "title": title,
                    "url": url,
                    "content": content or ""
                }

        return search_results
    except Exception as e:
        logger.error(f"Error performing web search: {e}")
        return []


def truncate_content_by_tokens(content, max_tokens=8192):
    """Truncate content to fit within token limit using binary search"""
    if len(shared.tokenizer.encode(content)) <= max_tokens:
        return content
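
    # Binary-search the longest prefix of `content` whose token count still fits.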
    left, right = 0, len(content)
    while left < right:
        mid = (left + right + 1) // 2
        if len(shared.tokenizer.encode(content[:mid])) <= max_tokens:
            left = mid
        else:
            right = mid - 1

    return content[:left]


def add_web_search_attachments(history, row_idx, user_message, search_query, state):
    """Perform web search and add results as attachments"""
    if not search_query:
        logger.warning("No search query provided")
        return

    try:
        # Perform web search
        num_pages = int(state.get('web_search_pages', 3))
        search_results = perform_web_search(search_query, num_pages)

        if not search_results:
            logger.warning("No search results found")
            return

        # Filter out failed downloads before adding attachments
        successful_results = [result for result in search_results if result['content'].strip()]

        if not successful_results:
            logger.warning("No successful downloads to add as attachments")
            return
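
        # History metadata is keyed per message, e.g. "user_{row_idx}" for user turns.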
        # Add search results as attachments
        key = f"user_{row_idx}"
        if key not in history['metadata']:
            history['metadata'][key] = {"timestamp": get_current_timestamp()}

        if "attachments" not in history['metadata'][key]:
            history['metadata'][key]["attachments"] = []

        for result in successful_results:
            attachment = {
                "name": result['title'],
                "type": "text/html",
                "url": result['url'],
                "content": truncate_content_by_tokens(result['content'])
            }
            history['metadata'][key]["attachments"].append(attachment)

        logger.info(f"Added {len(successful_results)} successful web search results as attachments.")
    except Exception as e:
        logger.error(f"Error in web search: {e}")