import requests
from bs4 import BeautifulSoup

# Tags whose content is boilerplate/noise rather than readable page text.
_NOISE_TAGS = ["script", "style", "nav", "footer", "header", "aside"]


def _extract_title_and_text(
    markup: "str | bytes",
    fallback_title: str,
    from_encoding: "str | None" = None,
) -> tuple[str, str]:
    """Parse HTML markup and return (title, plain_text).

    ``fallback_title`` is returned as the title when the document has no
    ``<title>``, or its title is empty / not a single plain string.
    ``from_encoding`` is an optional encoding hint used only when ``markup``
    is bytes.
    """
    if isinstance(markup, bytes):
        soup = BeautifulSoup(markup, "html.parser", from_encoding=from_encoding)
    else:
        soup = BeautifulSoup(markup, "html.parser")

    # Remove script/style and page-chrome noise before extracting text.
    for tag in soup(_NOISE_TAGS):
        tag.decompose()

    # ``.string`` is None when <title> has no single string child.
    raw_title = soup.title.string if soup.title else None
    title = raw_title.strip() if raw_title else ""

    # Extract readable text, one string per line.
    text = soup.get_text(separator="\n", strip=True)
    # Individual strings may still contain blank lines internally; drop them.
    lines = [line for line in text.splitlines() if line.strip()]
    return title or fallback_title, "\n".join(lines)


def scrape_url(url: str, timeout: int = 15) -> tuple[str, str]:
    """
    Fetch a URL and return (title, plain_text).

    The title falls back to ``url`` when the page has no usable ``<title>``.

    Raises requests.RequestException on network errors (including HTTP
    error statuses, via ``raise_for_status``).
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; KIContextTool/1.0; "
            "+https://github.com/user/ki-context-tool)"
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    # requests falls back to ISO-8859-1 for text/* responses that carry no
    # charset, which garbles UTF-8 pages that only declare their encoding in
    # a <meta> tag. Hand the raw bytes to BeautifulSoup so it can detect the
    # encoding itself, and only forward the HTTP encoding when the server
    # explicitly declared one.
    content_type = resp.headers.get("Content-Type", "")
    declared = resp.encoding if "charset" in content_type.lower() else None

    return _extract_title_and_text(resp.content, url, from_encoding=declared)