Replace html2text with trafilatura for better web content extraction

After this change, much of the boilerplate (navigation, ads, footers) is stripped from downloaded web pages, saving tokens in agentic loops.
This commit is contained in:
oobabooga 2026-03-14 09:29:17 -07:00
parent 8bff331893
commit c908ac00d7
15 changed files with 17 additions and 26 deletions

View file

@ -42,9 +42,9 @@ def get_current_timestamp():
def download_web_page(url, timeout=10, include_links=False):
"""
Download a web page and convert its HTML content to structured Markdown text.
Download a web page and extract its main content as Markdown text.
"""
import html2text
import trafilatura
try:
_validate_url(url)
@ -62,16 +62,13 @@ def download_web_page(url, timeout=10, include_links=False):
response.raise_for_status()
# Initialize the HTML to Markdown converter
h = html2text.HTML2Text()
h.body_width = 0
h.ignore_images = True
h.ignore_links = not include_links
# Convert the HTML to Markdown
markdown_text = h.handle(response.text)
return markdown_text
result = trafilatura.extract(
response.text,
include_links=include_links,
output_format='markdown',
url=url
)
return result or ""
except requests.exceptions.RequestException as e:
logger.error(f"Error downloading {url}: {e}")
return ""

View file

@ -6,7 +6,6 @@ diffusers==0.37.*
einops
fastapi==0.112.4
flash-linear-attention==0.4.*
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown

View file

@ -4,7 +4,6 @@ datasets
diffusers==0.37.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown

View file

@ -4,7 +4,6 @@ datasets
diffusers==0.37.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown

View file

@ -4,7 +4,6 @@ datasets
diffusers==0.37.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown

View file

@ -4,7 +4,6 @@ datasets
diffusers==0.37.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown

View file

@ -4,7 +4,6 @@ datasets
diffusers==0.37.*
einops
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio

View file

@ -1,6 +1,5 @@
audioop-lts<1.0; python_version >= "3.13"
fastapi==0.112.4
html2text==2025.4.15
huggingface-hub==1.5.*
jinja2==3.1.6
markdown
@ -11,6 +10,7 @@ python-docx==1.1.2
pyyaml
requests
rich
trafilatura==2.0.0
tqdm
# Gradio