# Source code for folderbot.tools.web_fetch

"""Web fetch tool."""

import logging

from pydantic import BaseModel, Field

from ..bot import BotContext
from .base import ToolResult
from .registry import folder_bot

logger = logging.getLogger(__name__)

# Optional dependencies: the web fetch tool needs both httpx and
# BeautifulSoup. If either import fails, both names are bound to None and
# the availability flag is False, so the tool can report a helpful error
# instead of crashing at import time.
try:
    import httpx
    from bs4 import BeautifulSoup
except ImportError:
    httpx = None  # type: ignore[assignment]
    BeautifulSoup = None  # type: ignore[assignment, misc]
    _FETCH_AVAILABLE = False
else:
    _FETCH_AVAILABLE = True



# [docs]
# Input model for the ``web_fetch`` tool; it is registered below as the
# tool's ``request_type``. ``frozen=True`` is pydantic's configuration flag
# for immutable instances.
class WebFetchRequest(BaseModel, frozen=True):
    """Request for fetching content from a URL."""

    # Address of the page to download; passed to the HTTP client unchanged.
    url: str = Field(description="URL to fetch content from")
    # Upper bound on the length of the returned page text. Longer content is
    # cut and a "[Truncated at N characters]" notice is appended by the tool.
    # No bounds are declared on this field, so the model itself accepts zero
    # or negative values.
    max_chars: int = Field(
        default=10000,
        description="Maximum characters to return from the page content",
    )




# [docs]
def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to at most *limit* characters.

    A ``[Truncated at N characters]`` notice is appended when content was
    dropped. Negative limits are treated as zero; otherwise slicing would
    silently remove characters from the end of the text.
    """
    limit = max(limit, 0)
    if len(text) <= limit:
        return text
    return f"{text[:limit]}\n\n[Truncated at {limit} characters]"


@folder_bot.tool(
    name="web_fetch",
    request_type=WebFetchRequest,
    response_type=ToolResult,
)
async def web_fetch(
    request: WebFetchRequest, _context: BotContext | None = None
) -> ToolResult:
    """Fetch and extract text content from a URL.

    Returns the main text content of the page, stripping HTML.
    Use this to read articles, documentation, or other web pages.
    """
    # NOTE(review): the docstring above is kept verbatim because the tool
    # registry may surface it as the tool description -- confirm before
    # editing it. Developer-facing documentation therefore lives here.
    #
    # Args:
    #     request: URL to download and the character budget for the result.
    #     _context: Not used by this tool.
    #
    # Returns:
    #     A ToolResult holding the extracted text (prefixed with
    #     "Title: ..." for HTML pages that have a title). Missing optional
    #     dependencies, HTTP error statuses and any other Exception are
    #     reported as a ToolResult with is_error=True instead of being
    #     raised.
    if not _FETCH_AVAILABLE:
        return ToolResult(
            content="Web fetch not available. Install with: pip install folderbot[web]",
            is_error=True,
        )

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; Folderbot/1.0; "
            "+https://gitlab.com/jorgeecardona/folderbot)"
        )
    }

    try:
        # Use the async client: a synchronous request inside this coroutine
        # would block the event loop for up to the full 30 s timeout.
        async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
            response = await client.get(request.url, headers=headers)
            response.raise_for_status()

        content_type = response.headers.get("content-type", "")

        # Plain text needs no parsing, only truncation.
        if "text/plain" in content_type:
            return ToolResult(content=_truncate(response.text, request.max_chars))

        # Everything else is treated as HTML.
        soup = BeautifulSoup(response.text, "html.parser")

        # Drop non-content elements: scripts, styles and page chrome
        # (navigation, footer, header).
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Extract the text and collapse blank or whitespace-only lines.
        raw_text = soup.get_text(separator="\n", strip=True)
        lines = [
            stripped for line in raw_text.split("\n") if (stripped := line.strip())
        ]
        text = _truncate("\n".join(lines), request.max_chars)

        # The title is added after truncation so it is never cut off.
        # ``.string`` is None when <title> has nested markup; the strip
        # removes the newlines and indentation many pages put around it.
        title = soup.title.string if soup.title else None
        title = title.strip() if title else None
        if title:
            text = f"Title: {title}\n\n{text}"

        return ToolResult(content=text)

    except httpx.HTTPStatusError as e:
        # Raised by raise_for_status() for 4xx/5xx responses.
        return ToolResult(
            content=f"HTTP error {e.response.status_code}: {request.url}",
            is_error=True,
        )
    except Exception as e:
        # Tool boundary: log the traceback and hand the failure back to the
        # caller as an error result rather than propagating it.
        logger.exception("Web fetch error")
        return ToolResult(content=f"Fetch error: {e}", is_error=True)




# [docs]
def is_available() -> bool:
    """Check if web fetch is available.

    Returns:
        ``True`` when both optional dependencies (``httpx`` and ``bs4``)
        were imported successfully when this module was loaded, ``False``
        otherwise.
    """
    return _FETCH_AVAILABLE