diff --git a/modules/web_search.py b/modules/web_search.py
index 6d005496..9bebc846 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -4,10 +4,9 @@ import ipaddress
import random
import re
import socket
-import urllib.request
from concurrent.futures import as_completed
from datetime import datetime
-from urllib.parse import quote_plus, urljoin, urlparse
+from urllib.parse import parse_qs, quote_plus, urljoin, urlparse
import requests
@@ -87,22 +86,28 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10, fetch_cont
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
]
- response_text = ""
- req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
- with urllib.request.urlopen(req, timeout=timeout) as response:
- response_text = response.read().decode('utf-8')
+ response = requests.get(search_url, headers={'User-Agent': random.choice(agents)}, timeout=timeout)
+ response.raise_for_status()
+ response_text = response.text
- # Extract results with regex
- titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
- urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+ # Extract results - title and URL come from the same element
+ result_links = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+ result_tags = re.findall(r'<a([^>]*class="[^"]*result__a[^"]*"[^>]*)>', response_text, re.DOTALL)
# Prepare download tasks
download_tasks = []
- for i in range(min(len(titles), len(urls), num_pages)):
- url = f"https://{urls[i].strip()}"
- title = re.sub(r'<[^>]+>', '', titles[i]).strip()
- title = html.unescape(title)
- download_tasks.append((url, title, i))
+ for i, (tag_attrs, raw_title) in enumerate(zip(result_tags, result_links)):
+ if num_pages is not None and i >= num_pages:
+ break
+ # Extract href and resolve the actual URL from DuckDuckGo's redirect link
+ href_match = re.search(r'href="([^"]*)"', tag_attrs)
+ if not href_match:
+ continue
+ uddg = parse_qs(urlparse(html.unescape(href_match.group(1))).query).get('uddg', [''])[0]
+ if not uddg:
+ continue
+ title = html.unescape(re.sub(r'<[^>]+>', '', raw_title).strip())
+ download_tasks.append((uddg, title, len(download_tasks)))
search_results = [None] * len(download_tasks) # Pre-allocate to maintain order
diff --git a/user_data/tools/web_search.py b/user_data/tools/web_search.py
index 30d13473..6c2b0f0b 100644
--- a/user_data/tools/web_search.py
+++ b/user_data/tools/web_search.py
@@ -9,7 +9,6 @@ tool = {
"type": "object",
"properties": {
"query": {"type": "string", "description": "The search query."},
- "num_pages": {"type": "integer", "description": "Number of search results to return (default: 3)."},
},
"required": ["query"]
}
@@ -19,8 +18,7 @@ tool = {
def execute(arguments):
query = arguments.get("query", "")
- num_pages = arguments.get("num_pages", 3)
- results = perform_web_search(query, num_pages=num_pages, fetch_content=False)
+ results = perform_web_search(query, num_pages=None, fetch_content=False)
output = []
for r in results:
if r: