modified: config.py
modified: services/rag_service.py
This commit is contained in:
@@ -30,6 +30,6 @@ class Config:
|
|||||||
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
|
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
|
||||||
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
|
OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o")
|
||||||
|
|
||||||
RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "5"))
|
RAG_TOP_K = int(os.environ.get("RAG_TOP_K", "6"))
|
||||||
RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "500"))
|
RAG_CHUNK_SIZE = int(os.environ.get("RAG_CHUNK_SIZE", "300"))
|
||||||
RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "50"))
|
RAG_CHUNK_OVERLAP = int(os.environ.get("RAG_CHUNK_OVERLAP", "75"))
|
||||||
|
|||||||
@@ -49,14 +49,45 @@ def _get_collection():
|
|||||||
|
|
||||||
|
|
||||||
def chunk_text(text: str, chunk_size: int, overlap: int) -> list[str]:
    """Split *text* into overlapping, sentence-aware chunks of words.

    The text is split into paragraphs on blank lines and each paragraph into
    sentences. Sentences are then packed greedily into chunks:

    1. A chunk is closed at a sentence boundary whenever the next sentence
       would push it past ``chunk_size`` words.
    2. A sentence that still does not fit into the room left in a new chunk
       is split by word, so no chunk ever exceeds ``chunk_size`` words.
    3. Every chunk after the first starts with the last ``overlap`` words of
       the previous chunk.

    Args:
        text: Raw text to chunk.
        chunk_size: Maximum number of whitespace-separated words per chunk.
            Values below 1 are treated as 1.
        overlap: Number of trailing words of a chunk repeated at the start of
            the next one. Clamped to the range ``0 .. chunk_size - 1`` so
            every chunk is guaranteed to contain new words.

    Returns:
        The chunks in document order, words joined by single spaces. Chunks
        with fewer than 10 words are dropped, so a text shorter than 10 words
        yields an empty list.
    """
    import re as _re

    min_chunk_words = 10  # chunks below this size are treated as near-empty

    # Sanitize the window parameters: an overlap >= chunk_size would carry the
    # whole buffer forward forever, and a negative one would slice from the
    # wrong end of the buffer.
    chunk_size = max(1, chunk_size)
    overlap = max(0, min(overlap, chunk_size - 1))

    # Blank lines (optionally containing whitespace, e.g. CRLF files)
    # separate paragraphs.
    paragraphs = [p.strip() for p in _re.split(r"\n\s*\n", text) if p.strip()]

    # A sentence ends at ".", "!" or "?" followed by whitespace and an
    # uppercase letter (ASCII or Latin-1, excluding the multiplication sign
    # U+00D7) or an opening double quote.
    sentence_break = _re.compile(
        r'(?<=[.!?])\s+(?=[A-Z\u00C0-\u00D6\u00D8-\u00DE"])'
    )
    sentences: list[list[str]] = []
    for para in paragraphs:
        for part in sentence_break.split(para):
            words = part.split()
            if words:
                sentences.append(words)

    chunks: list[str] = []
    buffer: list[str] = []  # words of the chunk being built (carry + new)
    fresh = 0  # how many words in ``buffer`` are not part of any chunk yet

    def _flush() -> None:
        """Emit the buffer as a chunk and keep its tail as overlap."""
        nonlocal buffer, fresh
        chunks.append(" ".join(buffer))
        buffer = buffer[-overlap:] if overlap else []
        fresh = 0

    for words in sentences:
        # Prefer to break at the sentence boundary. Only flush when the buffer
        # holds new words, otherwise we would emit a chunk that merely
        # repeats the tail of the previous one.
        if fresh and len(buffer) + len(words) > chunk_size:
            _flush()

        # Add the sentence, falling back to a word-level split when it does
        # not fit into the remaining room.
        pos = 0
        while pos < len(words):
            if len(buffer) >= chunk_size:
                # Only reachable with fresh > 0, because the carried overlap
                # alone is always shorter than chunk_size.
                _flush()
            room = chunk_size - len(buffer)
            piece = words[pos:pos + room]
            buffer.extend(piece)
            fresh += len(piece)
            pos += len(piece)

    # Emit the remainder only if it contains words not yet in any chunk.
    if fresh:
        chunks.append(" ".join(buffer))

    return [c for c in chunks if len(c.split()) >= min_chunk_words]
|
||||||
|
|
||||||
|
|
||||||
def index_source(
|
def index_source(
|
||||||
@@ -129,7 +160,13 @@ def similarity_search(
|
|||||||
query_texts=[query],
|
query_texts=[query],
|
||||||
n_results=top_k,
|
n_results=top_k,
|
||||||
where=where,
|
where=where,
|
||||||
|
include=["documents", "distances"],
|
||||||
)
|
)
|
||||||
return results["documents"][0] if results["documents"] else []
|
docs = results["documents"][0] if results["documents"] else []
|
||||||
|
distances = results["distances"][0] if results.get("distances") else []
|
||||||
|
# Cosine distance: 0 = identical, 2 = opposite. Filter out poor matches (> 0.6).
|
||||||
|
if distances:
|
||||||
|
docs = [d for d, dist in zip(docs, distances) if dist < 0.6]
|
||||||
|
return docs
|
||||||
except Exception:
|
except Exception:
|
||||||
return []
|
return []
|
||||||
|
|||||||
Reference in New Issue
Block a user