web_search: Return all results and improve URL extraction

oobabooga 2026-03-15 13:14:53 -07:00
parent f6a749a151
commit 92d376e420
2 changed files with 20 additions and 17 deletions


@@ -4,10 +4,9 @@ import ipaddress
 import random
 import re
 import socket
-import urllib.request
 from concurrent.futures import as_completed
 from datetime import datetime
-from urllib.parse import quote_plus, urljoin, urlparse
+from urllib.parse import parse_qs, quote_plus, urljoin, urlparse
 
 import requests
@@ -87,22 +86,28 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10, fetch_cont
         "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
     ]
 
-    response_text = ""
-    req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
-    with urllib.request.urlopen(req, timeout=timeout) as response:
-        response_text = response.read().decode('utf-8')
+    response = requests.get(search_url, headers={'User-Agent': random.choice(agents)}, timeout=timeout)
+    response.raise_for_status()
+    response_text = response.text
 
-    # Extract results with regex
-    titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
-    urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+    # Extract results - title and URL come from the same <a class="result__a"> element
+    result_links = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+    result_tags = re.findall(r'<a([^>]*class="[^"]*result__a[^"]*"[^>]*)>', response_text, re.DOTALL)
 
     # Prepare download tasks
     download_tasks = []
-    for i in range(min(len(titles), len(urls), num_pages)):
-        url = f"https://{urls[i].strip()}"
-        title = re.sub(r'<[^>]+>', '', titles[i]).strip()
-        title = html.unescape(title)
-        download_tasks.append((url, title, i))
+    for i, (tag_attrs, raw_title) in enumerate(zip(result_tags, result_links)):
+        if num_pages is not None and i >= num_pages:
+            break
+        # Extract href and resolve the actual URL from DuckDuckGo's redirect link
+        href_match = re.search(r'href="([^"]*)"', tag_attrs)
+        if not href_match:
+            continue
+        uddg = parse_qs(urlparse(html.unescape(href_match.group(1))).query).get('uddg', [''])[0]
+        if not uddg:
+            continue
+        title = html.unescape(re.sub(r'<[^>]+>', '', raw_title).strip())
+        download_tasks.append((uddg, title, len(download_tasks)))
 
     search_results = [None] * len(download_tasks)  # Pre-allocate to maintain order
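
To see what the new extraction does end to end, here is a standalone sketch that runs the same regexes and redirect-decoding chain as the hunk above; the sample result__a markup is fabricated for illustration and DuckDuckGo's real HTML may differ.

import html
import re
from urllib.parse import parse_qs, urlparse

# Fabricated sample of one DuckDuckGo HTML result (illustrative only).
response_text = (
    '<a rel="nofollow" class="result__a" '
    'href="//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&amp;rut=abc123">'
    'Example <b>Page</b> Title</a>'
)

# Same patterns as perform_web_search: the title text and the tag attributes
# both come from the single <a class="result__a"> element.
result_links = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
result_tags = re.findall(r'<a([^>]*class="[^"]*result__a[^"]*"[^>]*)>', response_text, re.DOTALL)

for tag_attrs, raw_title in zip(result_tags, result_links):
    href_match = re.search(r'href="([^"]*)"', tag_attrs)
    if not href_match:
        continue
    # html.unescape turns &amp; back into &; parse_qs then percent-decodes
    # the uddg parameter, recovering the destination URL.
    uddg = parse_qs(urlparse(html.unescape(href_match.group(1))).query).get('uddg', [''])[0]
    title = html.unescape(re.sub(r'<[^>]+>', '', raw_title).strip())
    print(uddg, '|', title)  # https://example.com/page | Example Page Title

The old code read the display text of <a class="result__url"> and prefixed it with https://, which could break when the displayed URL was truncated or the page used a different scheme; decoding uddg from the result__a redirect link yields the exact destination.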


@@ -9,7 +9,6 @@ tool = {
         "type": "object",
         "properties": {
             "query": {"type": "string", "description": "The search query."},
-            "num_pages": {"type": "integer", "description": "Number of search results to return (default: 3)."},
         },
         "required": ["query"]
     }
@@ -19,8 +18,7 @@ tool = {
 
 def execute(arguments):
     query = arguments.get("query", "")
-    num_pages = arguments.get("num_pages", 3)
-    results = perform_web_search(query, num_pages=num_pages, fetch_content=False)
+    results = perform_web_search(query, num_pages=None, fetch_content=False)
     output = []
     for r in results:
         if r:
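
With num_pages dropped from the schema, a tool call now carries only the query, and execute forwards num_pages=None so the loop in perform_web_search never breaks early. A minimal usage sketch, with an illustrative query value:

# Illustrative call; "num_pages" is no longer part of the schema.
arguments = {"query": "duckduckgo redirect uddg"}
results = perform_web_search(arguments.get("query", ""), num_pages=None, fetch_content=False)

# With num_pages=None, `num_pages is not None and i >= num_pages` is always
# false, so every parsed result__a link is turned into a task and returned.
for r in results:
    if r:
        print(r)  # result contents depend on perform_web_search's return shape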