mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-18 03:14:39 +01:00
web_search: Return all results and improve URL extraction
This commit is contained in:
parent
f6a749a151
commit
92d376e420
|
|
@ -4,10 +4,9 @@ import ipaddress
|
|||
import random
|
||||
import re
|
||||
import socket
|
||||
import urllib.request
|
||||
from concurrent.futures import as_completed
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote_plus, urljoin, urlparse
|
||||
from urllib.parse import parse_qs, quote_plus, urljoin, urlparse
|
||||
|
||||
import requests
|
||||
|
||||
|
|
@ -87,22 +86,28 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10, fetch_cont
|
|||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
|
||||
]
|
||||
|
||||
response_text = ""
|
||||
req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
|
||||
with urllib.request.urlopen(req, timeout=timeout) as response:
|
||||
response_text = response.read().decode('utf-8')
|
||||
response = requests.get(search_url, headers={'User-Agent': random.choice(agents)}, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
response_text = response.text
|
||||
|
||||
# Extract results with regex
|
||||
titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
|
||||
urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
|
||||
# Extract results - title and URL come from the same <a class="result__a"> element
|
||||
result_links = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
|
||||
result_tags = re.findall(r'<a([^>]*class="[^"]*result__a[^"]*"[^>]*)>', response_text, re.DOTALL)
|
||||
|
||||
# Prepare download tasks
|
||||
download_tasks = []
|
||||
for i in range(min(len(titles), len(urls), num_pages)):
|
||||
url = f"https://{urls[i].strip()}"
|
||||
title = re.sub(r'<[^>]+>', '', titles[i]).strip()
|
||||
title = html.unescape(title)
|
||||
download_tasks.append((url, title, i))
|
||||
for i, (tag_attrs, raw_title) in enumerate(zip(result_tags, result_links)):
|
||||
if num_pages is not None and i >= num_pages:
|
||||
break
|
||||
# Extract href and resolve the actual URL from DuckDuckGo's redirect link
|
||||
href_match = re.search(r'href="([^"]*)"', tag_attrs)
|
||||
if not href_match:
|
||||
continue
|
||||
uddg = parse_qs(urlparse(html.unescape(href_match.group(1))).query).get('uddg', [''])[0]
|
||||
if not uddg:
|
||||
continue
|
||||
title = html.unescape(re.sub(r'<[^>]+>', '', raw_title).strip())
|
||||
download_tasks.append((uddg, title, len(download_tasks)))
|
||||
|
||||
search_results = [None] * len(download_tasks) # Pre-allocate to maintain order
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,6 @@ tool = {
|
|||
"type": "object",
|
||||
"properties": {
|
||||
"query": {"type": "string", "description": "The search query."},
|
||||
"num_pages": {"type": "integer", "description": "Number of search results to return (default: 3)."},
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
|
|
@ -19,8 +18,7 @@ tool = {
|
|||
|
||||
def execute(arguments):
|
||||
query = arguments.get("query", "")
|
||||
num_pages = arguments.get("num_pages", 3)
|
||||
results = perform_web_search(query, num_pages=num_pages, fetch_content=False)
|
||||
results = perform_web_search(query, num_pages=None, fetch_content=False)
|
||||
output = []
|
||||
for r in results:
|
||||
if r:
|
||||
|
|
|
|||
Loading…
Reference in a new issue