new file: .dockerignore

new file:   .env.example
	new file:   Dockerfile
	new file:   app.py
	new file:   blueprints/__init__.py
	new file:   blueprints/auth.py
	new file:   blueprints/chat.py
	new file:   blueprints/context.py
	new file:   blueprints/documents.py
	new file:   blueprints/main.py
	new file:   config.py
	new file:   docker-compose.yml
	new file:   models/__init__.py
	new file:   models/chat_session.py
	new file:   models/document.py
	new file:   models/user.py
	new file:   requirements.txt
	new file:   services/__init__.py
	new file:   services/document_parser.py
	new file:   services/llm_service.py
	new file:   services/rag_service.py
	new file:   services/url_scraper.py
	new file:   static/css/style.css
	new file:   static/js/chat.js
	new file:   static/js/inline_chat.js
	new file:   static/js/main.js
	new file:   templates/base.html
	new file:   templates/document_view.html
	new file:   templates/index.html
	new file:   templates/login.html
	new file:   templates/register.html
This commit is contained in:
SimolZimol
2026-05-22 16:03:50 +02:00
commit 939cc13689
31 changed files with 2025 additions and 0 deletions

31
services/url_scraper.py Normal file
View File

@@ -0,0 +1,31 @@
import requests
from bs4 import BeautifulSoup
def scrape_url(url: str, timeout: int = 15) -> tuple[str, str]:
    """
    Fetch a URL and return (title, plain_text).

    The page is downloaded with an HTTP GET, parsed as HTML, stripped of
    non-content elements (scripts, styles and page chrome such as nav bars
    and footers), and flattened to newline-separated text.

    Args:
        url: Address of the page to download.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``(title, plain_text)`` tuple. ``title`` is the stripped content of
        the page's ``<title>`` element, or ``url`` when the page has no
        usable title. ``plain_text`` is the visible text with blank lines
        removed.

    Raises:
        requests.RequestException: on network errors, and on HTTP error
            statuses (via ``raise_for_status``).
    """
    # NOTE(review): the URL is fetched as given; if it can come from an
    # untrusted user, the caller should restrict which hosts are reachable.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (compatible; KIContextTool/1.0; "
            "+https://github.com/user/ki-context-tool)"
        )
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()

    # Parse the raw bytes rather than resp.text: when the server sends no
    # charset, Requests falls back to ISO-8859-1 for text/* responses, which
    # garbles UTF-8 pages. Only an explicitly declared HTTP charset is passed
    # on; otherwise BeautifulSoup detects the encoding from the document
    # itself (e.g. a <meta charset> tag).
    content_type = resp.headers.get("Content-Type", "")
    declared_encoding = resp.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(
        resp.content, "html.parser", from_encoding=declared_encoding
    )

    # Remove script/style noise and page chrome that is not article content.
    for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
        tag.decompose()

    # Fall back to the URL when <title> is missing, has nested markup
    # (``.string`` is None), or contains only whitespace.
    raw_title = soup.title.string if soup.title else None
    title = (raw_title or "").strip() or url

    # Extract readable text, one text fragment per line.
    text = soup.get_text(separator="\n", strip=True)

    # Drop blank lines left over from newlines inside text nodes.
    lines = [line for line in text.splitlines() if line.strip()]
    return title, "\n".join(lines)