text-generation-webui/modules/web_search.py

import concurrent.futures
import html
import random
import re
import urllib.request
from concurrent.futures import as_completed
from datetime import datetime
from urllib.parse import quote_plus

import requests

from modules import shared
from modules.logging_colors import logger


def get_current_timestamp():
    """Returns the current date and time in 24-hour format"""
    return datetime.now().strftime('%b %d, %Y %H:%M')


def download_web_page(url, timeout=10):
    """
    Download a web page and convert its HTML content to structured Markdown text.
    """
    import html2text

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Initialize the HTML to Markdown converter
        h = html2text.HTML2Text()
        h.body_width = 0
        h.ignore_images = True
        h.ignore_links = True

        # Convert the HTML to Markdown
        markdown_text = h.handle(response.text)

        return markdown_text
    except requests.exceptions.RequestException as e:
        logger.error(f"Error downloading {url}: {e}")
        return ""
    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}")
        return ""


def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
    """Perform web search and return results with content"""
    try:
        search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
        agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        ]

        response_text = ""
        req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            response_text = response.read().decode('utf-8')

        # Extract results with regex
        titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
        urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)

        # Prepare download tasks
        download_tasks = []
        for i in range(min(len(titles), len(urls), num_pages)):
            url = f"https://{urls[i].strip()}"
            title = re.sub(r'<[^>]+>', '', titles[i]).strip()
            title = html.unescape(title)
            download_tasks.append((url, title, i))

        search_results = [None] * len(download_tasks)  # Pre-allocate to maintain order

        # Download pages in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all download tasks
            future_to_task = {
                executor.submit(download_web_page, task[0]): task
                for task in download_tasks
            }

            # Collect results as they complete
            for future in as_completed(future_to_task):
                url, title, index = future_to_task[future]
                try:
                    content = future.result()
                    search_results[index] = {
                        'title': title,
                        'url': url,
                        'content': content
                    }
                except Exception:
                    search_results[index] = {
                        'title': title,
                        'url': url,
                        'content': ''
                    }

        return search_results

    except Exception as e:
        logger.error(f"Error performing web search: {e}")
        return []


def truncate_content_by_tokens(content, max_tokens=8192):
    """Truncate content to fit within token limit using binary search"""
    if len(shared.tokenizer.encode(content)) <= max_tokens:
        return content

    # Binary search for the longest character prefix that still encodes to at
    # most max_tokens tokens, so the tokenizer runs only O(log n) times.
    left, right = 0, len(content)
    while left < right:
        mid = (left + right + 1) // 2
        if len(shared.tokenizer.encode(content[:mid])) <= max_tokens:
            left = mid
        else:
            right = mid - 1

    return content[:left]


def add_web_search_attachments(history, row_idx, user_message, search_query, state):
    """Perform web search and add results as attachments"""
    if not search_query:
        logger.warning("No search query provided")
        return

    try:
        logger.info(f"Using search query: {search_query}")

        # Perform web search
        num_pages = int(state.get('web_search_pages', 3))
        search_results = perform_web_search(search_query, num_pages)

        if not search_results:
            logger.warning("No search results found")
            return

        # Filter out failed downloads before adding attachments
        successful_results = [result for result in search_results if result['content'].strip()]

        if not successful_results:
            logger.warning("No successful downloads to add as attachments")
            return

        # Add search results as attachments
        key = f"user_{row_idx}"
        if key not in history['metadata']:
            history['metadata'][key] = {"timestamp": get_current_timestamp()}
        if "attachments" not in history['metadata'][key]:
            history['metadata'][key]["attachments"] = []

        for result in successful_results:
            attachment = {
                "name": result['title'],
                "type": "text/html",
                "url": result['url'],
                "content": truncate_content_by_tokens(result['content'])
            }
            history['metadata'][key]["attachments"].append(attachment)

        logger.info(f"Added {len(successful_results)} successful web search results as attachments.")

    except Exception as e:
        logger.error(f"Error in web search: {e}")