new file: .env.example new file: Dockerfile new file: app.py new file: blueprints/__init__.py new file: blueprints/auth.py new file: blueprints/chat.py new file: blueprints/context.py new file: blueprints/documents.py new file: blueprints/main.py new file: config.py new file: docker-compose.yml new file: models/__init__.py new file: models/chat_session.py new file: models/document.py new file: models/user.py new file: requirements.txt new file: services/__init__.py new file: services/document_parser.py new file: services/llm_service.py new file: services/rag_service.py new file: services/url_scraper.py new file: static/css/style.css new file: static/js/chat.js new file: static/js/inline_chat.js new file: static/js/main.js new file: templates/base.html new file: templates/document_view.html new file: templates/index.html new file: templates/login.html new file: templates/register.html
33 lines
838 B
Python
33 lines
838 B
Python
import os
|
|
import io
|
|
|
|
|
|
def parse_document(file_path: str, file_type: str) -> str:
    """Extract plain text from a document file.

    Args:
        file_path: Path of the file to read.
        file_type: File extension identifying the format. Matching is
            case-insensitive and leading dots are ignored, so ``"PDF"``,
            ``".pdf"`` and ``"pdf"`` are equivalent.

    Returns:
        The text content of the document.

    Raises:
        ValueError: If ``file_type`` is not one of ``txt``, ``md``, ``pdf``
            or ``docx``.
    """
    ext = file_type.lower().lstrip(".")

    if ext in ("txt", "md"):
        # "utf-8-sig" decodes ordinary UTF-8 and additionally drops a leading
        # byte-order mark, so a BOM written by some editors does not end up
        # as a stray "\ufeff" at the start of the extracted text.
        # Undecodable bytes are replaced rather than aborting the parse.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()

    if ext == "pdf":
        return _parse_pdf(file_path)

    if ext == "docx":
        return _parse_docx(file_path)

    raise ValueError(f"Unsupported file type: {ext}")
|
|
|
|
|
|
def _parse_pdf(file_path: str) -> str:
    """Return the text pdfminer extracts from the PDF at ``file_path``.

    An empty string is returned when the extraction result is falsy.
    """
    # Imported lazily, inside the function, as in the original code.
    from pdfminer.high_level import extract_text

    extracted = extract_text(file_path)
    if not extracted:
        return ""
    return extracted
|
|
|
|
|
|
def _parse_docx(file_path: str) -> str:
    """Return the paragraph text of the DOCX file at ``file_path``.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraphs are joined with newline characters.
    """
    # Imported lazily, inside the function, as in the original code.
    from docx import Document

    kept = []
    for para in Document(file_path).paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)
|