From 32a2ce26b87c7ba918c8b75bfc489610417c1342 Mon Sep 17 00:00:00 2001
From: SimolZimol <70102430+SimolZimol@users.noreply.github.com>
Date: Sat, 23 May 2026 12:27:28 +0200
Subject: [PATCH] modified: config.py modified: services/rag_service.py

---
 config.py               |  6 ++--
 services/rag_service.py | 73 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 67 insertions(+), 12 deletions(-)

diff --git a/config.py b/config.py
index 0237ca6..b0cca86 100644
--- a/config.py
+++ b/config.py
@@ -30,6 +30,6 @@ class Config:
     OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
     OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
 
-    RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "5"))
-    RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "500"))
-    RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "50"))
+    RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "6"))
+    RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "300"))
+    RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "75"))
diff --git a/services/rag_service.py b/services/rag_service.py
index 9a1da67..23904b5 100644
--- a/services/rag_service.py
+++ b/services/rag_service.py
@@ -49,14 +49,63 @@ def _get_collection():
 
 
 def chunk_text(text: str, chunk_size: int, overlap: int) -> list[str]:
-    words = text.split()
-    chunks = []
-    start = 0
-    while start < len(words):
-        end = start + chunk_size
-        chunks.append(" ".join(words[start:end]))
-        start += chunk_size - overlap
-    return [c for c in chunks if c.strip()]
+    """
+    Sentence-aware chunking:
+    1. Split on blank lines, then split each paragraph at sentence boundaries.
+    2. Pack consecutive sentences into chunks of at most chunk_size words.
+    3. Sentences longer than chunk_size are split by word.
+    Overlap is applied by re-including the last words of the previous chunk;
+    it is shortened when needed to keep a sentence whole and is never emitted
+    as a chunk of its own. Chunks under 10 words are dropped unless that would
+    discard the entire text.
+    """
+    import re as _re
+
+    # Clamp so that every chunk contributes at least one new word
+    chunk_size = max(1, chunk_size)
+    overlap = max(0, min(overlap, chunk_size - 1))
+
+    # Split into paragraphs (blank lines, possibly containing whitespace)
+    paragraphs = [p.strip() for p in _re.split(r'\n\s*\n', text) if p.strip()]
+
+    # Split paragraphs at sentence boundaries
+    sentences: list[str] = []
+    for para in paragraphs:
+        # Split after ".", "!" or "?" when an uppercase/accented letter or quote follows
+        parts = _re.split(r'(?<=[.!?])\s+(?=[A-ZÜÖÄ\u00C0-\u00FF"])', para)
+        sentences.extend([s.strip() for s in parts if s.strip()])
+
+    chunks: list[str] = []
+    current_words: list[str] = []
+    fresh = 0  # words in current_words that no emitted chunk contains yet
+
+    for sentence in sentences:
+        pending = sentence.split()
+        while pending:
+            room = chunk_size - len(current_words)
+            if len(pending) > room and fresh:
+                # Sentence does not fit: flush and keep the tail as context
+                chunks.append(" ".join(current_words))
+                current_words = current_words[-overlap:] if overlap else []
+                fresh = 0
+                continue
+            if room < len(pending) <= chunk_size:
+                # Only carried-over context is in the way: shorten it so the
+                # sentence stays in one piece
+                current_words = current_words[len(pending) - room:]
+                room = len(pending)
+            # Take what fits; an oversized sentence continues in the next chunk
+            current_words.extend(pending[:room])
+            fresh += min(room, len(pending))
+            pending = pending[room:]
+
+    # Flush the remainder only if it holds words not emitted yet
+    if fresh:
+        chunks.append(" ".join(current_words))
+
+    # Drop near-empty chunks, but never the whole text
+    kept = [c for c in chunks if len(c.split()) >= 10]
+    return kept or chunks
 
 
 def index_source(
@@ -129,7 +178,13 @@ def similarity_search(
             query_texts=[query],
             n_results=top_k,
             where=where,
+            include=["documents", "distances"],
         )
-        return results["documents"][0] if results["documents"] else []
+        docs = results["documents"][0] if results["documents"] else []
+        distances = results["distances"][0] if results.get("distances") else []
+        # Assumes cosine distance (0 = identical, 2 = opposite) - confirm collection metric. Keep only matches below 0.6.
+        if distances:
+            docs = [d for d, dist in zip(docs, distances) if dist < 0.6]
+        return docs
     except Exception:
         return []