"""Helpers for extracting plain text from txt, md, pdf and docx files."""

import os
import io


def parse_document(file_path: str, file_type: str) -> str:
    """Extract plain text from a document file.

    Args:
        file_path: Path of the file to read.
        file_type: File type or extension. Case, surrounding whitespace and
            leading dots are ignored, so ``"pdf"``, ``".PDF"`` and ``" pdf "``
            are equivalent. Supported values: ``txt``, ``md``, ``pdf``,
            ``docx``.

    Returns:
        The text content of the document.

    Raises:
        ValueError: If ``file_type`` is not one of the supported types.
    """
    ext = file_type.strip().lower().lstrip(".")

    if ext in ("txt", "md"):
        # "utf-8-sig" decodes ordinary UTF-8 as well, but drops a leading
        # byte-order mark so it does not leak into the text as "\ufeff".
        # Undecodable bytes are replaced rather than raising.
        with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
            return f.read()
    if ext == "pdf":
        return _parse_pdf(file_path)
    if ext == "docx":
        return _parse_docx(file_path)

    raise ValueError(f"Unsupported file type: {ext}")


def _parse_pdf(file_path: str) -> str:
    """Return the text of the PDF at ``file_path`` (empty string if none)."""
    # Imported here rather than at module level, so pdfminer is only needed
    # when a PDF is actually parsed.
    from pdfminer.high_level import extract_text

    text = extract_text(file_path)
    # Normalise any falsy result to an empty string.
    return text or ""


def _parse_docx(file_path: str) -> str:
    """Return the non-blank paragraphs of a .docx file, one per line.

    Only ``doc.paragraphs`` is read; paragraphs whose text is empty or
    whitespace-only are skipped.
    """
    # Imported here rather than at module level, so python-docx is only
    # needed when a .docx file is actually parsed.
    from docx import Document

    doc = Document(file_path)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n".join(paragraphs)