import concurrent.futures
import html
import random
import re
import urllib.request
from concurrent.futures import as_completed
from datetime import datetime
from urllib.parse import quote_plus, urlparse, parse_qs, unquote

import requests

from modules import shared
from modules.logging_colors import logger


def get_current_timestamp():
    """Returns the current date and time (24-hour clock)"""
    return datetime.now().strftime('%b %d, %Y %H:%M')


def download_web_page(url, timeout=10):
    """
    Download a web page and convert its HTML content to Markdown text,
    handling Brotli/gzip responses and non-HTML content robustly.
    """
    logger.info(f"Downloading {url}")

    # --- soft dependencies (the function degrades gracefully without them)
    try:
        import html2text
    except Exception:
        logger.exception("html2text import failed")
        html2text = None

    try:
        from readability import Document
    except Exception:
        Document = None

    try:
        import brotli as _brotli
        have_brotli = True
    except Exception:
        _brotli = None
        have_brotli = False

    import gzip
    import zlib
    import html as _html

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        # IMPORTANT: only advertise br if brotli is installed
        "Accept-Encoding": "gzip, deflate" + (", br" if have_brotli else ""),
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()

        # --- bail out early if it's not HTML
        ctype = resp.headers.get("Content-Type", "").lower()
        if not any(t in ctype for t in ("text/html", "application/xhtml+xml")):
            logger.warning("Non-HTML content-type %r at %s", ctype, url)
            return ""

        # --- get raw bytes, then decompress if the server didn't / requests couldn't
        raw = resp.content  # bytes
        enc_hdr = resp.headers.get("Content-Encoding", "").lower()

        # If requests didn't decode (it normally does gzip/deflate), handle manually.
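        # Note: requests only decodes Brotli ("br") transparently when the optional
        # brotli/brotlicffi package is installed, which is why the Accept-Encoding
        # header above only advertises br when the import succeeded.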
if "br" in enc_hdr and have_brotli: try: raw = _brotli.decompress(raw) except Exception: # it may already be decoded; ignore pass elif "gzip" in enc_hdr: try: raw = gzip.decompress(raw) except Exception: pass elif "deflate" in enc_hdr: try: raw = zlib.decompress(raw, -zlib.MAX_WBITS) except Exception: pass # --- decode text with a robust charset guess # use HTTP charset if present charset = None if "charset=" in ctype: charset = ctype.split("charset=")[-1].split(";")[0].strip() if not charset: # requests’ detector charset = resp.apparent_encoding or "utf-8" try: html_text = raw.decode(charset, errors="replace") except Exception: html_text = raw.decode("utf-8", errors="replace") # anti-bot shells (avoid empty output surprises) if re.search(r"(cf-chl|Just a moment|enable JavaScript)", html_text, re.I): logger.warning("Possible anti-bot/challenge page at %s", url) # --- extract readable text (readability -> html2text -> fallback) md_readability = "" if Document is not None: try: doc = Document(html_text) title = (doc.short_title() or "").strip() main_html = doc.summary(html_partial=True) main_text = re.sub(r"<[^>]+>", " ", main_html, flags=re.S) main_text = re.sub(r"\s+", " ", main_text).strip() if title: md_readability = f"# {title}\n\n{main_text}".strip() else: md_readability = main_text except Exception: logger.exception("readability failed on %s", url) md_html2text = "" if html2text is not None: try: h = html2text.HTML2Text() h.body_width = 0 h.ignore_images = True h.ignore_links = True h.single_line_break = True md_html2text = (h.handle(html_text) or "").strip() except Exception: logger.exception("html2text failed on %s", url) def _clean(s): s = re.sub(r"<[^>]+>", " ", s, flags=re.S) return _html.unescape(re.sub(r"\s+", " ", s)).strip() # fallback: meta/title/headers/paragraphs + noscript parts = [] t = re.search(r"]*>(.*?)", html_text, re.I | re.S) if t: parts.append(f"# {_clean(t.group(1))}") for pat in [ r']+property=["\']og:title["\'][^>]+content=["\'](.*?)["\']', r']+property=["\']og:description["\'][^>]+content=["\'](.*?)["\']', r']+name=["\']description["\'][^>]+content=["\'](.*?)["\']', r']+name=["\']twitter:description["\'][^>]+content=["\'](.*?)["\']', ]: m = re.search(pat, html_text, re.I | re.S) if m: parts.append(_clean(m.group(1))) parts += [f"## {_clean(h)}" for h in re.findall(r"]*>(.*?)", html_text, re.I | re.S)[:4] if _clean(h)] parts += [_clean(p) for p in re.findall(r"]*>(.*?)

", html_text, re.I | re.S)[:8] if _clean(p)] for n in re.findall(r"]*>(.*?)", html_text, re.I | re.S): c = _clean(n) if c: parts.append(c) md_fallback = "\n\n".join([p for p in parts if p]).strip() best = max([md_readability, md_html2text, md_fallback], key=lambda s: len(s or "")) if not best.strip(): logger.warning("Empty content extracted from %s", url) return best except requests.exceptions.RequestException as e: logger.error(f"Error downloading {url}: {e}") return "" except Exception: logger.exception("Unexpected error while downloading %s", url) return "" def _extract_results_from_duckduckgo(response_text, num_pages): # 1) Grab the title anchors (they carry the real clickable href) # We capture both the inner text (title) and href. anchor_pattern = re.compile( r']*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>(.*?)', re.DOTALL | re.IGNORECASE ) matches = anchor_pattern.findall(response_text) results = [] for href, title_html in matches: # 2) Resolve DuckDuckGo redirect: ?uddg= parsed = urlparse(href) target_url = href if parsed.netloc.endswith("duckduckgo.com"): qs = parse_qs(parsed.query) if "uddg" in qs and qs["uddg"]: target_url = unquote(qs["uddg"][0]) # 3) Clean title title_text = re.sub(r'<[^>]+>', '', title_html).strip() title_text = html.unescape(title_text) # 4) Basic normalization: add scheme if missing if target_url.startswith("//"): target_url = "https:" + target_url elif not re.match(r'^https?://', target_url, flags=re.I): target_url = "https://" + target_url results.append((target_url, title_text)) if len(results) >= num_pages: break return results def perform_web_search(query, num_pages=3, max_workers=5, timeout=10): """Perform web search and return results with content""" try: search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}" agents = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" ] req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)}) with urllib.request.urlopen(req, timeout=timeout) as response: response_text = response.read().decode('utf-8', errors='replace') # Extract (url, title) pairs from the proper anchors download_tasks = _extract_results_from_duckduckgo(response_text, num_pages) if not download_tasks: return [] search_results = [None] * len(download_tasks) # Download pages in parallel with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_index = { executor.submit(download_web_page, url, timeout): (i, url, title) for i, (url, title) in enumerate(download_tasks) } for future in as_completed(future_to_index): i, url, title = future_to_index[future] try: content = future.result() except Exception: content = "" search_results[i] = { "title": title, "url": url, "content": content or "" } return search_results except Exception as e: logger.error(f"Error performing web search: {e}") return [] def truncate_content_by_tokens(content, max_tokens=8192): """Truncate content to fit within token limit using binary search""" if len(shared.tokenizer.encode(content)) <= max_tokens: return content left, right = 0, len(content) while left < right: mid = (left + right + 1) // 2 if len(shared.tokenizer.encode(content[:mid])) <= max_tokens: left = mid else: right = mid - 1 return content[:left] def add_web_search_attachments(history, row_idx, user_message, search_query, state): """Perform web search and add results as attachments""" if not search_query: logger.warning("No search query provided") 
        return

    try:
        # Perform web search
        num_pages = int(state.get('web_search_pages', 3))
        search_results = perform_web_search(search_query, num_pages)

        if not search_results:
            logger.warning("No search results found")
            return

        # Filter out failed downloads before adding attachments
        successful_results = [result for result in search_results if result['content'].strip()]

        if not successful_results:
            logger.warning("No successful downloads to add as attachments")
            return

        # Add search results as attachments
        key = f"user_{row_idx}"
        if key not in history['metadata']:
            history['metadata'][key] = {"timestamp": get_current_timestamp()}
        if "attachments" not in history['metadata'][key]:
            history['metadata'][key]["attachments"] = []

        for result in successful_results:
            attachment = {
                "name": result['title'],
                "type": "text/html",
                "url": result['url'],
                "content": truncate_content_by_tokens(result['content'])
            }
            history['metadata'][key]["attachments"].append(attachment)

        logger.info(f"Added {len(successful_results)} successful web search results as attachments.")

    except Exception as e:
        logger.error(f"Error in web search: {e}")
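

# Minimal manual smoke test: a sketch, assuming this file sits inside the project
# tree (so `modules.shared` and `modules.logging_colors` are importable) and that
# outbound network access is available. The query string is only an example.
if __name__ == "__main__":
    for r in perform_web_search("duckduckgo html search", num_pages=2):
        print(r["url"], "->", len(r["content"]), "chars")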