# notes/services/document_parser.py

import os
import io


def parse_document(file_path: str, file_type: str) -> str:
    """Return the plain-text content of the document at ``file_path``.

    ``file_type`` selects the parser. It is lower-cased and stripped of
    leading dots before matching, so ``"pdf"``, ``".pdf"`` and ``".PDF"``
    all select the same parser.

    Supported types:
        * ``txt`` / ``md`` -- the file is read as UTF-8 text; bytes that
          cannot be decoded are replaced instead of raising.
        * ``pdf`` -- handled by ``_parse_pdf``.
        * ``docx`` -- handled by ``_parse_docx``.

    Raises:
        ValueError: if the normalized type is none of the above.
    """
    kind = file_type.lower().lstrip(".")

    # Reject anything we have no parser for before touching the file.
    if kind not in ("txt", "md", "pdf", "docx"):
        raise ValueError(f"Unsupported file type: {kind}")

    if kind == "pdf":
        return _parse_pdf(file_path)

    if kind == "docx":
        return _parse_docx(file_path)

    # Remaining kinds ("txt", "md") are plain text and need no parser.
    with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
        contents = handle.read()
    return contents


def _parse_pdf(file_path: str) -> str:
    """Extract the text of the PDF at ``file_path`` using pdfminer.

    Returns the text produced by ``pdfminer.high_level.extract_text``, or an
    empty string when that call yields a falsy result.
    """
    # Imported at call time, so importing this module does not itself
    # require pdfminer.
    from pdfminer import high_level

    extracted = high_level.extract_text(file_path)
    if not extracted:
        return ""
    return extracted


def _parse_docx(file_path: str) -> str:
    """Extract the paragraph text of the .docx file at ``file_path``.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with a single newline.

    NOTE(review): only ``paragraphs`` of the document object is read, so
    text stored elsewhere (e.g. in tables) is presumably not included --
    confirm whether callers need it.
    """
    # Imported at call time, so importing this module does not itself
    # require python-docx.
    import docx

    document = docx.Document(file_path)
    kept = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)
    return "\n".join(kept)