"""Web fetch tool."""
import logging
from pydantic import BaseModel, Field
from ..bot import BotContext
from .base import ToolResult
from .registry import folder_bot
logger = logging.getLogger(__name__)
# Probe for the optional web-fetch dependencies (httpx and BeautifulSoup).
# If either import fails, both names are bound to None and the flag is False,
# so the rest of the module can test _FETCH_AVAILABLE instead of re-importing.
try:
    import httpx
    from bs4 import BeautifulSoup
except ImportError:
    _FETCH_AVAILABLE = False
    httpx = None  # type: ignore[assignment]
    BeautifulSoup = None  # type: ignore[assignment, misc]
else:
    _FETCH_AVAILABLE = True
[docs]
class WebFetchRequest(BaseModel, frozen=True):
    """Request for fetching content from a URL."""

    # Instances are immutable (frozen=True).
    # NOTE(review): the docstring and Field descriptions are presumably
    # surfaced through the generated schema -- confirm before rewording them.

    # Required: the address whose content should be retrieved.
    url: str = Field(..., description="URL to fetch content from")

    # Optional cap on the amount of extracted text handed back to the caller.
    max_chars: int = Field(
        description="Maximum characters to return from the page content",
        default=10000,
    )
[docs]
def _truncate(text: str, max_chars: int) -> str:
    """Cut *text* to at most *max_chars* characters, appending a notice if cut.

    A negative *max_chars* is treated as 0 so the slice never counts from the
    end of the string.
    """
    limit = max(max_chars, 0)
    if len(text) <= limit:
        return text
    return text[:limit] + f"\n\n[Truncated at {limit} characters]"


@folder_bot.tool(
    name="web_fetch",
    request_type=WebFetchRequest,
    response_type=ToolResult,
)
async def web_fetch(
    request: WebFetchRequest, _context: BotContext | None = None
) -> ToolResult:
    """Fetch and extract text content from a URL.
    Returns the main text content of the page, stripping HTML.
    Use this to read articles, documentation, or other web pages.
    """
    # NOTE(review): the docstring above is presumably used by the tool
    # registry as the tool description -- confirm before rewording it.
    #
    # Parameters:
    #   request  -- URL to fetch plus the maximum number of characters to return.
    #   _context -- accepted for the tool interface; not used by this function.
    # Returns:
    #   A ToolResult holding the extracted text, or one with is_error=True
    #   when the optional dependencies are missing, the server answers with an
    #   error status, or any other exception occurs. Nothing is re-raised.
    if not _FETCH_AVAILABLE:
        return ToolResult(
            content="Web fetch not available. Install with: pip install folderbot[web]",
            is_error=True,
        )

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; Folderbot/1.0; "
            "+https://gitlab.com/jorgeecardona/folderbot)"
        )
    }

    try:
        # Use the async client: this is a coroutine, and a synchronous request
        # would block the event loop for up to the 30 s timeout.
        # NOTE(review): request.url is passed through unchanged, so nothing
        # here prevents fetching internal/private addresses -- confirm whether
        # callers need an allow-list.
        async with httpx.AsyncClient(
            timeout=30.0, follow_redirects=True
        ) as client:
            response = await client.get(request.url, headers=headers)
            response.raise_for_status()

        # Header values are case-insensitive, so normalise before matching.
        content_type = response.headers.get("content-type", "").lower()

        # Plain text needs no parsing; return it (possibly truncated) as-is.
        if "text/plain" in content_type:
            return ToolResult(content=_truncate(response.text, request.max_chars))

        # Everything else is parsed as HTML.
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop elements whose text is not part of the main content.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Collapse the extracted text to its non-empty, stripped lines.
        raw_text = soup.get_text(separator="\n", strip=True)
        text = "\n".join(
            stripped for line in raw_text.split("\n") if (stripped := line.strip())
        )

        # Truncate the body first so the title line below is never cut off.
        text = _truncate(text, request.max_chars)

        # <title> may be missing, or have no single string child (-> None).
        title_tag = soup.title
        title = None
        if title_tag is not None and title_tag.string:
            title = title_tag.string.strip()
        if title:
            text = f"Title: {title}\n\n{text}"

        return ToolResult(content=text)
    except Exception as e:
        # Tool boundary: every failure becomes an error result instead of
        # propagating. Exceptions carrying a response with a status code
        # (e.g. from raise_for_status) get a dedicated HTTP error message.
        if hasattr(e, "response") and hasattr(e.response, "status_code"):
            return ToolResult(
                content=f"HTTP error {e.response.status_code}: {request.url}",
                is_error=True,
            )
        logger.exception("Web fetch error")
        return ToolResult(content=f"Fetch error: {e}", is_error=True)
[docs]
def is_available() -> bool:
    """Report whether the web fetch tool can be used.

    Returns:
        The module-level ``_FETCH_AVAILABLE`` flag.
    """
    available: bool = _FETCH_AVAILABLE
    return available