From 32a2ce26b87c7ba918c8b75bfc489610417c1342 Mon Sep 17 00:00:00 2001
From: SimolZimol <70102430+SimolZimol@users.noreply.github.com>
Date: Sat, 23 May 2026 12:27:28 +0200
Subject: [PATCH] Tune RAG defaults; sentence-aware chunking and distance filter

---
 config.py               |  6 ++---
 services/rag_service.py | 55 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 49 insertions(+), 12 deletions(-)
diff --git a/config.py b/config.py
index 0237ca6..b0cca86 100644
--- a/config.py
+++ b/config.py
@@ -30,6 +30,6 @@ class Config:
     OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
     OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
 
-    RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "5"))
-    RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "500"))
-    RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "50"))
+    RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "6"))
+    RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "300"))
+    RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "75"))
diff --git a/services/rag_service.py b/services/rag_service.py
index 9a1da67..23904b5 100644
--- a/services/rag_service.py
+++ b/services/rag_service.py
@@ -49,14 +49,45 @@ def _get_collection():
 
 
 def chunk_text(text: str, chunk_size: int, overlap: int) -> list[str]:
-    words = text.split()
-    chunks = []
-    start = 0
-    while start < len(words):
-        end = start + chunk_size
-        chunks.append(" ".join(words[start:end]))
-        start += chunk_size - overlap
-    return [c for c in chunks if c.strip()]
+    """Split text into sentence-aligned chunks of at most chunk_size words.
+
+    Paragraphs (blank-line separated) are split into sentences, which are
+    packed greedily; a sentence that does not fit starts a new chunk and is
+    split by words if it alone exceeds chunk_size. Each later chunk begins
+    with the last `overlap` words of its predecessor (overlap is clamped to
+    0..chunk_size-1, chunk_size to >= 1). Chunks under 10 words are dropped
+    unless that would drop every chunk.
+    """
+    import re as _re
+    chunk_size = max(1, chunk_size)
+    overlap = min(max(0, overlap), chunk_size - 1)
+    sentences: list[str] = []
+    for para in _re.split(r'\n{2,}', text):
+        # Sentence break: . ! ? then whitespace then an uppercase letter or quote.
+        parts = _re.split(r'(?<=[.!?])\s+(?=[A-ZÜÖÄ\u00C0-\u00FF"])', para.strip())
+        sentences.extend(s.strip() for s in parts if s.strip())
+
+    chunks: list[str] = []
+    current: list[str] = []
+    fresh = 0  # words in current that no emitted chunk contains yet
+    for sentence in sentences:
+        words = sentence.split()
+        # Keep sentences whole where possible: flush first if this one won't fit.
+        must_flush = fresh > 0 and len(current) + len(words) > chunk_size
+        while words:
+            room = chunk_size - len(current)
+            if must_flush or room <= 0:
+                chunks.append(" ".join(current))
+                current, fresh = (current[-overlap:] if overlap else []), 0
+                must_flush = False
+                continue
+            current.extend(words[:room])
+            fresh += min(room, len(words))
+            words = words[room:]
+    if fresh:  # an overlap-only remainder would just duplicate the last chunk
+        chunks.append(" ".join(current))
+    kept = [c for c in chunks if len(c.split()) >= 10]
+    return kept or chunks  # never discard a short document entirely
 
 
 def index_source(
@@ -129,7 +160,13 @@ def similarity_search(
             query_texts=[query],
             n_results=top_k,
             where=where,
+            include=["documents", "distances"],
         )
-        return results["documents"][0] if results["documents"] else []
+        docs = results["documents"][0] if results["documents"] else []
+        distances = results["distances"][0] if results.get("distances") else []
+        # Assumes cosine distance (0 = identical, 2 = opposite): keep only matches < 0.6.
+        if distances:
+            docs = [d for d, dist in zip(docs, distances) if dist < 0.6]
+        return docs
     except Exception:
         return []