modified: config.py

modified:   services/rag_service.py
This commit is contained in:
SimolZimol
2026-05-23 12:27:28 +02:00
parent 552f8bc5d3
commit 32a2ce26b8
2 changed files with 49 additions and 12 deletions

View File

@@ -30,6 +30,6 @@ class Config:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "") OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o") OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "5")) RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "6"))
RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "500")) RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "300"))
RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "50")) RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "75"))

View File

@@ -49,14 +49,45 @@ def _get_collection():
def chunk_text(
    text: str,
    chunk_size: int,
    overlap: int,
    *,
    min_words: int = 10,
) -> list[str]:
    """Split *text* into overlapping, sentence-aware chunks of words.

    The text is split into paragraphs on blank lines, and each paragraph is
    split into sentences. Sentences are packed greedily into chunks of at
    most ``chunk_size`` words. A sentence is kept intact whenever it fits in
    a chunk; a sentence longer than ``chunk_size`` is split on word
    boundaries. Each chunk after the first starts with up to ``overlap``
    trailing words of the previous chunk (fewer when that is needed to keep
    the next sentence intact).

    Args:
        text: Raw text to chunk.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of trailing words carried into the next chunk.
            Values outside ``0 .. chunk_size - 1`` are clamped to that range.
        min_words: Chunks with fewer words than this are dropped.

    Returns:
        The chunks in document order, each a single-space-joined string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    import re as _re

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # The overlap must be strictly smaller than a chunk, otherwise a chunk
    # could consist of carried-over words only and packing would not advance.
    overlap = max(0, min(overlap, chunk_size - 1))

    # Split into paragraphs (one or more blank lines).
    paragraphs = [p.strip() for p in _re.split(r'\n{2,}', text) if p.strip()]
    # Sentence boundary: ". ", "! " or "? " followed by an uppercase letter
    # (including accented capitals) or an opening double quote.
    sentence_end = _re.compile(r'(?<=[.!?])\s+(?=[A-ZÜÖÄ\u00C0-\u00FF"])')
    sentences = [
        part.split()
        for para in paragraphs
        for part in sentence_end.split(para)
        if part.strip()
    ]

    chunks: list[str] = []
    current: list[str] = []
    # Number of words in `current` that have not been emitted in any chunk
    # yet. When it is 0, `current` holds only overlap carried from the
    # previous chunk and must not be emitted on its own (it would duplicate
    # text that is already indexed).
    fresh = 0

    for words in sentences:
        # Flush first if this sentence would push the chunk over the limit.
        if fresh and len(current) + len(words) > chunk_size:
            chunks.append(" ".join(current))
            current = current[-overlap:] if overlap else []
            fresh = 0
        # Starting a new chunk with a sentence that fits on its own: shrink
        # the carried overlap as needed so the sentence stays in one piece.
        if not fresh and len(words) <= chunk_size:
            keep = chunk_size - len(words)
            current = current[-keep:] if keep else []
        # Append the sentence; only oversized sentences need more than one
        # pass, in which case they are cut on word boundaries.
        while words:
            room = chunk_size - len(current)
            current.extend(words[:room])
            fresh += min(room, len(words))
            words = words[room:]
            if len(current) >= chunk_size:
                chunks.append(" ".join(current))
                current = current[-overlap:] if overlap else []
                fresh = 0

    if fresh:
        chunks.append(" ".join(current))
    # Drop near-empty chunks.
    return [c for c in chunks if len(c.split()) >= min_words]
def index_source( def index_source(
@@ -129,7 +160,13 @@ def similarity_search(
query_texts=[query], query_texts=[query],
n_results=top_k, n_results=top_k,
where=where, where=where,
include=["documents", "distances"],
) )
return results["documents"][0] if results["documents"] else [] docs = results["documents"][0] if results["documents"] else []
distances = results["distances"][0] if results.get("distances") else []
# Cosine distance: 0 = identical, 2 = opposite. Filter out poor matches (> 0.6).
if distances:
docs = [d for d, dist in zip(docs, distances) if dist < 0.6]
return docs
except Exception: except Exception:
return [] return []