Source code for folderbot.tools.web_fetch

"""Web fetch tool."""

import logging

from pydantic import BaseModel, Field

from ..bot import BotContext
from .base import ToolResult
from .registry import folder_bot

logger = logging.getLogger(__name__)

# Optional dependencies: httpx performs the request and BeautifulSoup parses
# the HTML. If either import fails, both names are bound to None and the
# availability flag stays False so callers can degrade gracefully.
try:
    import httpx
    from bs4 import BeautifulSoup
except ImportError:
    httpx = None  # type: ignore[assignment]
    BeautifulSoup = None  # type: ignore[assignment, misc]
    _FETCH_AVAILABLE = False
else:
    _FETCH_AVAILABLE = True

class WebFetchRequest(BaseModel, frozen=True):
    """Request for fetching content from a URL."""

    # NOTE: the docstring and the ``description`` strings below are part of the
    # model's generated schema, so they are runtime text, not just comments.

    # Required: address of the page to download.
    url: str = Field(..., description="URL to fetch content from")

    # Optional: upper bound on the number of characters of page text returned.
    max_chars: int = Field(
        10000,
        description="Maximum characters to return from the page content",
    )
def _truncate(text: str, max_chars: int) -> str:
    """Cut *text* to at most *max_chars* characters, appending a notice if cut."""
    # A negative limit would make the slice count from the END of the string
    # and silently drop trailing characters; treat it as zero instead.
    limit = max(max_chars, 0)
    if len(text) <= limit:
        return text
    return text[:limit] + f"\n\n[Truncated at {limit} characters]"


# web_fetch: download ``request.url`` and return its readable text.
#
# Parameters:
#   request  -- WebFetchRequest carrying ``url`` and ``max_chars``.
#   _context -- accepted for the tool calling convention; not used here.
# Returns:
#   ToolResult whose ``content`` is the page text (prefixed with
#   "Title: ..." when the HTML has a title), or an error message with
#   ``is_error=True``. Failures are reported through the result; the
#   function does not raise.
#
# NOTE(review): the docstring below may be surfaced by the tool registry as
# the tool description -- confirm before rewording it. Implementation notes
# are therefore kept in comments rather than in the docstring.
@folder_bot.tool(
    name="web_fetch",
    request_type=WebFetchRequest,
    response_type=ToolResult,
)
async def web_fetch(
    request: WebFetchRequest, _context: BotContext | None = None
) -> ToolResult:
    """Fetch and extract text content from a URL.

    Returns the main text content of the page, stripping HTML.
    Use this to read articles, documentation, or other web pages.
    """
    if not _FETCH_AVAILABLE:
        return ToolResult(
            content="Web fetch not available. Install with: pip install folderbot[web]",
            is_error=True,
        )

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; Folderbot/1.0; "
            "+https://gitlab.com/jorgeecardona/folderbot)"
        )
    }

    try:
        # Use the async client: a synchronous httpx.Client call inside this
        # coroutine would block the event loop for up to the full timeout.
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            response = await client.get(request.url, headers=headers)
        response.raise_for_status()

        content_type = response.headers.get("content-type", "")

        # Plain text needs no HTML parsing; return it (possibly truncated).
        if "text/plain" in content_type:
            return ToolResult(content=_truncate(response.text, request.max_chars))

        # Everything else is treated as HTML.
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop non-content elements before extracting text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        raw_text = soup.get_text(separator="\n", strip=True)

        # Collapse blank lines and surrounding whitespace.
        lines = [line.strip() for line in raw_text.split("\n") if line.strip()]
        text = _truncate("\n".join(lines), request.max_chars)

        # The title is prepended after truncation so it is never cut off.
        title = soup.title.string if soup.title else None
        if title:
            text = f"Title: {title}\n\n{text}"

        return ToolResult(content=text)
    except httpx.HTTPStatusError as e:
        # Raised by raise_for_status() for 4xx/5xx responses.
        return ToolResult(
            content=f"HTTP error {e.response.status_code}: {request.url}",
            is_error=True,
        )
    except Exception as e:
        # Tool boundary: deliberately broad so that any failure (network,
        # decoding, parsing) is reported to the caller instead of raised.
        logger.exception("Web fetch error")
        return ToolResult(content=f"Fetch error: {e}", is_error=True)
def is_available() -> bool:
    """Report whether the web fetch tool can be used.

    Returns the module-level ``_FETCH_AVAILABLE`` flag, which is set when the
    optional ``httpx`` and ``bs4`` imports at the top of this module succeed.
    """
    return _FETCH_AVAILABLE