mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2025-12-06 07:12:10 +01:00
Fix web search (attempt)
This commit is contained in:
parent
73a8a737b2
commit
d771ca4a13
|
|
@ -1,6 +1,8 @@
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import html
|
import html
|
||||||
|
import random
|
||||||
import re
|
import re
|
||||||
|
import urllib.request
|
||||||
from concurrent.futures import as_completed
|
from concurrent.futures import as_completed
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from urllib.parse import quote_plus
|
from urllib.parse import quote_plus
|
||||||
|
|
@ -50,16 +52,21 @@ def download_web_page(url, timeout=10):
|
||||||
def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
|
def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
|
||||||
"""Perform web search and return results with content"""
|
"""Perform web search and return results with content"""
|
||||||
try:
|
try:
|
||||||
# Use DuckDuckGo HTML search endpoint
|
|
||||||
search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
|
search_url = f"https://html.duckduckgo.com/html/?q={quote_plus(query)}"
|
||||||
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
|
|
||||||
|
|
||||||
response = requests.get(search_url, headers=headers, timeout=timeout)
|
agents = [
|
||||||
response.raise_for_status()
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
|
||||||
|
]
|
||||||
|
|
||||||
|
response_text = ""
|
||||||
|
req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
|
||||||
|
with urllib.request.urlopen(req, timeout=timeout) as response:
|
||||||
|
response_text = response.read().decode('utf-8')
|
||||||
|
|
||||||
# Extract results with regex
|
# Extract results with regex
|
||||||
titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
|
titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
|
||||||
urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response.text, re.DOTALL)
|
urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
|
||||||
|
|
||||||
# Prepare download tasks
|
# Prepare download tasks
|
||||||
download_tasks = []
|
download_tasks = []
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue