From d771ca4a13b9837e169cd44815bb3a86bc6c8a4b Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 14 Aug 2025 12:02:30 -0700 Subject: [PATCH] Fix web search (attempt) --- modules/web_search.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/modules/web_search.py b/modules/web_search.py index 3b1f6e18..597af4b2 100644 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -1,6 +1,8 @@ import concurrent.futures import html +import random import re +import urllib.request from concurrent.futures import as_completed from datetime import datetime from urllib.parse import quote_plus @@ -50,16 +52,21 @@ def download_web_page(url, timeout=10): def perform_web_search(query, num_pages=3, max_workers=5, timeout=10): """Perform web search and return results with content""" try: - # Use DuckDuckGo HTML search endpoint search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}" - headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} - response = requests.get(search_url, headers=headers, timeout=timeout) - response.raise_for_status() + agents = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" + ] + + response_text = "" + req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)}) + with urllib.request.urlopen(req, timeout=timeout) as response: + response_text = response.read().decode('utf-8') # Extract results with regex - titles = re.findall(r']*class="[^"]*result__a[^"]*"[^>]*>(.*?)', response.text, re.DOTALL) - urls = re.findall(r']*class="[^"]*result__url[^"]*"[^>]*>(.*?)', response.text, re.DOTALL) + titles = re.findall(r']*class="[^"]*result__a[^"]*"[^>]*>(.*?)', response_text, re.DOTALL) + urls = re.findall(r']*class="[^"]*result__url[^"]*"[^>]*>(.*?)', response_text, re.DOTALL) # Prepare download tasks download_tasks = []