From d771ca4a13b9837e169cd44815bb3a86bc6c8a4b Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 14 Aug 2025 12:02:30 -0700
Subject: [PATCH] Fix web search (attempt)
---
modules/web_search.py | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/modules/web_search.py b/modules/web_search.py
index 3b1f6e18..597af4b2 100644
--- a/modules/web_search.py
+++ b/modules/web_search.py
@@ -1,6 +1,8 @@
import concurrent.futures
import html
+import random
import re
+import urllib.request
from concurrent.futures import as_completed
from datetime import datetime
from urllib.parse import quote_plus
@@ -50,16 +52,21 @@ def download_web_page(url, timeout=10):
def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
"""Perform web search and return results with content"""
try:
- # Use DuckDuckGo HTML search endpoint
search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
- headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
- response = requests.get(search_url, headers=headers, timeout=timeout)
- response.raise_for_status()
+ agents = [
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
+ ]
+
+ response_text = ""
+ req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
+ with urllib.request.urlopen(req, timeout=timeout) as response:
+ response_text = response.read().decode('utf-8')
# Extract results with regex
- titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
- urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
+ titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+ urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
# Prepare download tasks
download_tasks = []