new file: .env.example new file: Dockerfile new file: app.py new file: blueprints/__init__.py new file: blueprints/auth.py new file: blueprints/chat.py new file: blueprints/context.py new file: blueprints/documents.py new file: blueprints/main.py new file: config.py new file: docker-compose.yml new file: models/__init__.py new file: models/chat_session.py new file: models/document.py new file: models/user.py new file: requirements.txt new file: services/__init__.py new file: services/document_parser.py new file: services/llm_service.py new file: services/rag_service.py new file: services/url_scraper.py new file: static/css/style.css new file: static/js/chat.js new file: static/js/inline_chat.js new file: static/js/main.js new file: templates/base.html new file: templates/document_view.html new file: templates/index.html new file: templates/login.html new file: templates/register.html
32 lines
984 B
Python
32 lines
984 B
Python
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def scrape_url(url: str, timeout: int = 15) -> tuple[str, str]:
    """
    Fetch a URL and return (title, plain_text).

    The response body is parsed with BeautifulSoup's "html.parser". Script,
    style, nav, footer, header and aside elements are removed before the
    text is extracted.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``(title, plain_text)`` tuple. ``title`` is the stripped content
        of the page's ``<title>`` element, or ``url`` when the page has no
        usable (non-empty) title. ``plain_text`` is the remaining text with
        blank lines removed, joined by newlines.

    Raises:
        requests.RequestException: on network errors, and on HTTP error
            statuses (raised by ``raise_for_status``).
    """
    # NOTE(review): the URL is fetched exactly as given; no scheme or host
    # validation happens here. If callers pass user-supplied URLs, confirm
    # they are vetted upstream (server-side request forgery risk).
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; KIContextTool/1.0; "
            "+https://github.com/user/ki-context-tool)"
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    # requests falls back to ISO-8859-1 for text/* responses whose
    # Content-Type carries no charset, so resp.text garbles UTF-8 pages that
    # only declare their encoding in a <meta> tag. Parse the raw bytes
    # instead so BeautifulSoup can detect the encoding itself, and forward
    # the HTTP charset only when the server actually declared one.
    content_type = resp.headers.get("Content-Type", "")
    declared_encoding = resp.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(
        resp.content, "html.parser", from_encoding=declared_encoding
    )

    # Remove script/style noise and page chrome. "header" is the <header>
    # element, not <head>, so the <title> read below is still present.
    for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
        tag.decompose()

    # Fall back to the URL when the title is missing, has nested markup
    # (``.string`` is None), or is empty after stripping whitespace.
    title = url
    if soup.title and soup.title.string:
        title = soup.title.string.strip() or url

    # Extract readable text
    text = soup.get_text(separator="\n", strip=True)
    # Collapse excessive blank lines
    lines = [line for line in text.splitlines() if line.strip()]
    return title, "\n".join(lines)
|