mirror of
https://github.com/oobabooga/text-generation-webui.git
synced 2026-03-17 19:04:39 +01:00
Replace html2text with trafilatura for better web content extraction
After this change, a lot of boilerplate is removed from web pages, saving tokens in agentic loops.
This commit is contained in:
parent
8bff331893
commit
c908ac00d7
|
|
@ -42,9 +42,9 @@ def get_current_timestamp():
|
|||
|
||||
def download_web_page(url, timeout=10, include_links=False):
|
||||
"""
|
||||
Download a web page and convert its HTML content to structured Markdown text.
|
||||
Download a web page and extract its main content as Markdown text.
|
||||
"""
|
||||
import html2text
|
||||
import trafilatura
|
||||
|
||||
try:
|
||||
_validate_url(url)
|
||||
|
|
@ -62,16 +62,13 @@ def download_web_page(url, timeout=10, include_links=False):
|
|||
|
||||
response.raise_for_status()
|
||||
|
||||
# Initialize the HTML to Markdown converter
|
||||
h = html2text.HTML2Text()
|
||||
h.body_width = 0
|
||||
h.ignore_images = True
|
||||
h.ignore_links = not include_links
|
||||
|
||||
# Convert the HTML to Markdown
|
||||
markdown_text = h.handle(response.text)
|
||||
|
||||
return markdown_text
|
||||
result = trafilatura.extract(
|
||||
response.text,
|
||||
include_links=include_links,
|
||||
output_format='markdown',
|
||||
url=url
|
||||
)
|
||||
return result or ""
|
||||
except requests.exceptions.RequestException as e:
|
||||
logger.error(f"Error downloading {url}: {e}")
|
||||
return ""
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ diffusers==0.37.*
|
|||
einops
|
||||
fastapi==0.112.4
|
||||
flash-linear-attention==0.4.*
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ datasets
|
|||
diffusers==0.37.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ datasets
|
|||
diffusers==0.37.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ datasets
|
|||
diffusers==0.37.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ datasets
|
|||
diffusers==0.37.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ datasets
|
|||
diffusers==0.37.*
|
||||
einops
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
audioop-lts<1.0; python_version >= "3.13"
|
||||
fastapi==0.112.4
|
||||
html2text==2025.4.15
|
||||
huggingface-hub==1.5.*
|
||||
jinja2==3.1.6
|
||||
markdown
|
||||
|
|
@ -11,6 +10,7 @@ python-docx==1.1.2
|
|||
pyyaml
|
||||
requests
|
||||
rich
|
||||
trafilatura==2.0.0
|
||||
tqdm
|
||||
|
||||
# Gradio
|
||||
|
|
|
|||
Loading…
Reference in a new issue