Retrieval feature, plus an update for Russian language handling

This commit is contained in:
2026-02-04 16:51:50 +03:00
parent 3dea3605ad
commit ea4ce23cd9
7 changed files with 572 additions and 17 deletions

View File

@@ -33,7 +33,7 @@ Chosen data folder: relative ./../../../data - from the current folder
# Phase 5 (preparation for the retrieval feature)
- [ ] Create file `retrieval.py` with the configuration for the chosen RAG framework that retrieves data from the vector storage based on a query. Use a retrieval library/plugin that supports the chosen vector storage within the chosen RAG framework. The retrieval configuration should search for the text passed as the query argument of the function and return the found information together with the stored metadata (paragraph, section, page, etc.). Important: if the chosen RAG framework does not require a separate retrieval step on top of the chosen vector storage, this step may be skipped and marked done.
- [x] Create file `retrieval.py` with the configuration for the chosen RAG framework that retrieves data from the vector storage based on a query. Use a retrieval library/plugin that supports the chosen vector storage within the chosen RAG framework. The retrieval configuration should search for the text passed as the query argument of the function and return the found information together with the stored metadata (paragraph, section, page, etc.). Important: if the chosen RAG framework does not require a separate retrieval step on top of the chosen vector storage, this step may be skipped and marked done.
# Phase 6 (chat feature, as agent, for usage in the cli)

View File

@@ -4,6 +4,8 @@
This is a Retrieval Augmented Generation (RAG) solution built using LlamaIndex as the primary framework and Qdrant as the vector storage. The project is designed to load documents from a shared data directory, store them in a vector database, and enable semantic search and chat capabilities using local Ollama models.
The system has been enhanced to properly handle Russian language documents with Cyrillic characters, ensuring proper encoding during document loading, storage, and retrieval.
### Key Technologies
- **RAG Framework**: LlamaIndex
- **Vector Storage**: Qdrant
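
For orientation, a minimal sketch of how these pieces are typically wired together with LlamaIndex (illustrative only, not code from this commit; the collection name, port, and model names mirror values used elsewhere in this diff and are assumptions here):

```python
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from qdrant_client import QdrantClient

# Local Ollama models; names are the project defaults assumed in this diff
Settings.embed_model = OllamaEmbedding(model_name="qwen3-embedding:4b",
                                       base_url="http://localhost:11434")
Settings.llm = Ollama(model="nemotron-mini:4b", base_url="http://localhost:11434")

# Qdrant collection that holds the enriched documents
client = QdrantClient(host="localhost", port=6333)
vector_store = QdrantVectorStore(client=client, collection_name="documents_llamaindex")
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Semantic search over the stored documents
query_engine = index.as_query_engine(similarity_top_k=5)
```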
@@ -64,6 +66,7 @@ This is a Retrieval Augmented Generation (RAG) solution built using LlamaIndex a
- Use text splitters appropriate for each document type
- Store metadata (filename, page, section, paragraph) with embeddings
- Track processed documents to avoid re-processing (using SQLite if needed)
- Proper encoding handling for Russian/Cyrillic text during loading and retrieval
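
A sketch of what the per-file-type splitter choice and per-embedding metadata described above could look like (illustrative only; the repository's actual `get_text_splitter` is not shown in this diff, and the chunk sizes are hypothetical):

```python
from pathlib import Path
from llama_index.core.node_parser import SentenceSplitter

def pick_text_splitter(file_extension: str) -> SentenceSplitter:
    # Hypothetical defaults; tune per document type
    if file_extension.lower() == ".pdf":
        return SentenceSplitter(chunk_size=1024, chunk_overlap=100)
    return SentenceSplitter(chunk_size=512, chunk_overlap=50)

def base_metadata(file_path: str) -> dict:
    # Minimal metadata stored alongside each embedding
    return {"filename": Path(file_path).name, "file_path": str(file_path)}
```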
### Vector Storage
- Collection name: "documents_llamaindex"
@@ -95,10 +98,12 @@ This is a Retrieval Augmented Generation (RAG) solution built using LlamaIndex a
- [x] Text splitting strategies implementation
- [x] Document tracking mechanism
- [x] CLI command for enrichment
- [x] Russian language/Cyrillic text encoding support during document loading
### Phase 5: Retrieval Feature
- [ ] Retrieval module configuration
- [ ] Query processing with metadata retrieval
- [x] Retrieval module configuration
- [x] Query processing with metadata retrieval
- [x] Russian language/Cyrillic text encoding support
### Phase 6: Chat Agent
- [ ] Agent module with Ollama integration
@@ -135,3 +140,4 @@ The system expects documents to be placed in `./../../../data` relative to the p
- Verify Qdrant is accessible on ports 6333 (REST) and 6334 (gRPC)
- Check that the data directory contains supported file types
- Review logs in `logs/dev.log` for detailed error information
- For Russian/Cyrillic text issues, ensure proper encoding handling is configured in both enrichment and retrieval modules
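
A quick connectivity check (a sketch, assuming the default local Qdrant setup described above) before digging into the logs:

```python
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)  # REST port
print(client.get_collections())  # "documents_llamaindex" should appear after enrichment
```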

View File

@@ -87,5 +87,48 @@ def enrich(data_path, recursive, verbose):
click.echo(f"Error during document enrichment: {e}")
@main.command(help="Retrieve documents from vector storage based on a query")
@click.argument('query', type=str)
@click.option('--top-k', '-k', default=5, help="Number of top similar documents to retrieve")
@click.option('--verbose', '-v', is_flag=True, help="Enable verbose output")
def retrieve(query, top_k, verbose):
"""Retrieve documents from vector storage based on a query."""
if verbose:
logger.enable("__main__")
logger.info(f"Starting document retrieval for query: {query}")
logger.info(f"Top-K results: {top_k}")
try:
# Import the retrieval module
from retrieval import retrieve_documents_with_query_engine
logger.info("Retrieval module imported successfully")
# Call the retrieval function
results = retrieve_documents_with_query_engine(query=query, top_k=top_k)
logger.info(f"Retrieved {len(results)} documents for query: {query}")
# Display results
click.echo(f"\nFound {len(results)} results for query: '{query}'\n")
for i, result in enumerate(results, 1):
click.echo(f"Result {i}:")
click.echo(f" Content preview: {result['content'][:200]}{'...' if len(result['content']) > 200 else ''}")
click.echo(f" Score: {result['score']}")
click.echo(f" Metadata:")
for key, value in result['metadata'].items():
click.echo(f" {key}: {value}")
click.echo("")
click.echo("Document retrieval completed successfully")
except ImportError as e:
logger.error(f"Failed to import retrieval module: {e}")
click.echo(f"Error: Could not import retrieval module: {e}")
except Exception as e:
logger.error(f"Error during document retrieval: {e}")
click.echo(f"Error during document retrieval: {e}")
if __name__ == '__main__':
main()
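
A minimal smoke test of the new command via click's test runner (a sketch; the assumption that the module above is importable as `cli` and exposes `main` is mine, not stated in the diff):

```python
from click.testing import CliRunner
from cli import main  # assumed module name

runner = CliRunner()
result = runner.invoke(main, ["retrieve", "Баканов", "--top-k", "3", "--verbose"])
print(result.output)
```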

View File

@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""
Diagnostic script to examine how content is stored in the vector database.
"""
import os
import sys
from pathlib import Path

# Add the project root to the path
sys.path.insert(0, str(Path(__file__).parent))

from retrieval import retrieve_documents_with_query_engine
from loguru import logger
from vector_storage import get_vector_store_and_index

# Setup logging
logs_dir = Path("logs")
logs_dir.mkdir(exist_ok=True)

logger.remove()
logger.add(
    "logs/dev.log",
    rotation="10 MB",
    retention="10 days",
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}"
)
logger.add(
    sys.stdout,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    colorize=True
)

def diagnose_content_storage():
    """Diagnose how content is stored in the vector database."""
    logger.info("Starting content storage diagnosis...")
    try:
        # Get the vector store and index
        vector_store, index = get_vector_store_and_index()
        logger.info("Successfully connected to vector storage")

        # Let's try to access the raw storage to see what's there
        from llama_index.vector_stores.qdrant import QdrantVectorStore
        if isinstance(vector_store, QdrantVectorStore):
            client = vector_store.client
            collection_name = vector_store.collection_name

            # Get collection info
            collection_info = client.get_collection(collection_name)
            logger.info(f"Collection '{collection_name}' has {collection_info.points_count} points")

            # Sample some points to see what's stored
            points_response = client.scroll(
                collection_name=collection_name,
                limit=5,  # Get first 5 points
                with_payload=True,
                with_vectors=False
            )

            # The scroll method returns (points, next_offset), so we need the first element
            points = points_response[0] if isinstance(points_response, tuple) else points_response

            logger.info("Sample of stored points:")
            for i, point in enumerate(points):
                # The point structure depends on the Qdrant client version
                if hasattr(point, 'id') and hasattr(point, 'payload'):
                    point_id = point.id
                    payload = point.payload
                else:
                    # Fallback for different structure
                    point_id = getattr(point, 'id', 'unknown')
                    payload = getattr(point, 'payload', {})

                logger.info(f"Point {i+1} ID: {point_id}")
                logger.info(f"Payload keys: {list(payload.keys()) if payload else 'None'}")

                # Check for content in various possible keys
                content_keys = ['text', 'content', 'doc_text', 'page_content', '_node_content']
                content_found = False
                for key in content_keys:
                    if key in payload:
                        content = payload[key]
                        logger.info(f"Content found in key '{key}': {str(content)[:100]}...")
                        content_found = True
                        break
                if not content_found:
                    logger.info("No content found in standard keys")
                    logger.info(f"Full payload keys: {list(payload.keys())}")

                print("-" * 50)

        # Also test retrieval to see what comes back
        logger.info("\nTesting retrieval with a simple query...")
        results = retrieve_documents_with_query_engine("Баканов", top_k=2)
        logger.info(f"Retrieved {len(results)} results")
        for i, result in enumerate(results):
            logger.info(f"Result {i+1}:")
            logger.info(f"  Content length: {len(result.get('content', ''))}")
            logger.info(f"  Content preview: {result.get('content', '')[:200]}...")
            logger.info(f"  Metadata: {list(result.get('metadata', {}).keys())}")
            logger.info(f"  Filename: {result.get('metadata', {}).get('filename', 'N/A')}")

    except Exception as e:
        logger.error(f"Error during diagnosis: {e}")
        import traceback
        logger.error(traceback.format_exc())

if __name__ == "__main__":
    diagnose_content_storage()

View File

@@ -158,6 +158,50 @@ def get_text_splitter(file_extension: str):
    )

def ensure_proper_encoding(text):
    """
    Helper function to ensure proper encoding of text, especially for non-ASCII characters like Cyrillic.

    Args:
        text: Text that may need encoding correction

    Returns:
        Properly encoded text string
    """
    if text is None:
        return "unknown"

    if isinstance(text, bytes):
        # Decode bytes to string with proper encoding
        try:
            return text.decode('utf-8')
        except UnicodeDecodeError:
            # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
            try:
                return text.decode('cp1251')  # Windows Cyrillic encoding
            except UnicodeDecodeError:
                try:
                    return text.decode('koi8-r')  # Russian encoding
                except UnicodeDecodeError:
                    # If all else fails, decode with errors='replace'
                    return text.decode('utf-8', errors='replace')
    elif isinstance(text, str):
        # Ensure the string is properly encoded
        try:
            # Try to encode and decode to ensure it's valid UTF-8
            return text.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            # If there are encoding issues, try to fix them
            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
    else:
        # Convert other types to string and ensure proper encoding
        text_str = str(text)
        try:
            return text_str.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')

def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
    """
    Process all documents from the data folder using appropriate loaders and store in vector DB.
@@ -228,7 +272,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
        # Load the document using SimpleDirectoryReader
        # This automatically selects the appropriate reader based on file extension
        def file_metadata_func(file_path_str):
            return {"filename": Path(file_path_str).name}
            # Apply proper encoding to filename
            filename = ensure_proper_encoding(Path(file_path_str).name)
            return {"filename": filename}

        reader = SimpleDirectoryReader(
            input_files=[file_path],
@@ -241,29 +287,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
            # Extract additional metadata based on document type
            file_ext = Path(file_path).suffix

            # Apply proper encoding to file path
            encoded_file_path = ensure_proper_encoding(file_path)

            # Add additional metadata
            doc.metadata["file_path"] = file_path
            doc.metadata["file_path"] = encoded_file_path
            doc.metadata["processed_at"] = datetime.now().isoformat()

            # Handle document-type-specific metadata
            if file_ext.lower() == '.pdf':
                # PDF-specific metadata
                doc.metadata["page_label"] = doc.metadata.get("page_label", "unknown")
                doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
                doc.metadata["file_type"] = "pdf"
            elif file_ext.lower() in ['.docx', '.odt']:
                # Word document metadata
                doc.metadata["section"] = doc.metadata.get("section", "unknown")
                doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
                doc.metadata["file_type"] = "document"
            elif file_ext.lower() == '.pptx':
                # PowerPoint metadata
                doc.metadata["slide_id"] = doc.metadata.get("slide_id", "unknown")
                doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
                doc.metadata["file_type"] = "presentation"
            elif file_ext.lower() == '.xlsx':
                # Excel metadata
                doc.metadata["sheet_name"] = doc.metadata.get("sheet_name", "unknown")
                doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
                doc.metadata["file_type"] = "spreadsheet"

            # Determine the appropriate text splitter based on file type
@@ -276,17 +325,21 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
            nodes_with_enhanced_metadata = []
            for i, node in enumerate(nodes):
                # Enhance node metadata with additional information
                node.metadata["original_doc_id"] = doc.doc_id
                node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
                node.metadata["chunk_number"] = i
                node.metadata["total_chunks"] = len(nodes)
                node.metadata["file_path"] = file_path
                node.metadata["file_path"] = encoded_file_path

                # Ensure the text content is properly encoded
                node.text = ensure_proper_encoding(node.text)

                nodes_with_enhanced_metadata.append(node)

            # Add all nodes to the index at once
            if nodes_with_enhanced_metadata:
                index.insert_nodes(nodes_with_enhanced_metadata)

            logger.info(f"Processed {len(nodes)} nodes from {file_path}")
            logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")

            # Mark document as processed only after successful insertion
            tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})
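
A quick illustration of how `ensure_proper_encoding` behaves on cp1251-encoded input (the example values are mine, not from the commit):

```python
raw = "Иван".encode("cp1251")        # b'\xc8\xe2\xe0\xed' is not valid UTF-8
print(ensure_proper_encoding(raw))    # UTF-8 decode fails, cp1251 fallback yields "Иван"
print(ensure_proper_encoding(None))   # returns "unknown"
# Note: some cp1251 byte strings also happen to be valid UTF-8, so the fallback
# chain is best-effort rather than a guarantee.
```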

View File

@@ -0,0 +1,338 @@
"""
Retrieval module for the RAG solution using LlamaIndex and Qdrant.
This module provides functionality to retrieve relevant documents
from the vector storage based on a query text.
"""
import os
from typing import List, Dict, Any
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from loguru import logger
from pathlib import Path
from vector_storage import get_vector_store_and_index
from llama_index.embeddings.ollama import OllamaEmbedding
import os
def setup_global_models():
    """Set up the global models to prevent defaulting to OpenAI."""
    # Set up the embedding model
    ollama_embed_model = os.getenv("OLLAMA_EMBEDDING_MODEL", "qwen3-embedding:4b")
    ollama_base_url = "http://localhost:11434"
    embed_model = OllamaEmbedding(
        model_name=ollama_embed_model,
        base_url=ollama_base_url
    )
    # Set as the global embedding model
    Settings.embed_model = embed_model

    # Set up the LLM model
    ollama_chat_model = os.getenv("OLLAMA_CHAT_MODEL", "nemotron-mini:4b")
    from llama_index.llms.ollama import Ollama
    llm = Ollama(model=ollama_chat_model, base_url=ollama_base_url)
    # Set as the global LLM
    Settings.llm = llm
def initialize_retriever(
    collection_name: str = "documents_llamaindex",
    similarity_top_k: int = 5,
    host: str = "localhost",
    port: int = 6333
) -> RetrieverQueryEngine:
    """
    Initialize the retriever query engine with the vector store.

    Args:
        collection_name: Name of the Qdrant collection
        similarity_top_k: Number of top similar documents to retrieve
        host: Qdrant host address
        port: Qdrant REST API port

    Returns:
        RetrieverQueryEngine configured with the vector store
    """
    logger.info(f"Initializing retriever for collection: {collection_name}")
    try:
        # Set up the global models to prevent defaulting to OpenAI
        setup_global_models()

        # Get the vector store and index from the existing configuration
        vector_store, index = get_vector_store_and_index()

        # Create a retriever from the index
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=similarity_top_k
        )

        # Create the query engine
        query_engine = RetrieverQueryEngine(
            retriever=retriever
        )

        logger.info("Retriever initialized successfully")
        return query_engine
    except Exception as e:
        logger.error(f"Failed to initialize retriever: {str(e)}")
        raise
def retrieve_documents(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """
    Retrieve documents from the vector storage based on the query text.

    Args:
        query: The query text to search for
        top_k: Number of top similar documents to retrieve

    Returns:
        List of dictionaries containing document content and metadata
    """
    logger.info(f"Retrieving documents for query: '{query[:50]}...' (top_k={top_k})")
    try:
        # Initialize the query engine
        query_engine = initialize_retriever(similarity_top_k=top_k)

        # Perform the query
        response = query_engine.query(query)

        # Extract documents and their metadata
        results = []

        # If response is a single text response, we need to get the source nodes
        if hasattr(response, 'source_nodes'):
            for node in response.source_nodes:
                doc_info = {
                    "content": node.text,
                    "metadata": node.metadata,
                    "score": node.score if hasattr(node, 'score') else None
                }
                results.append(doc_info)
        else:
            # If the response doesn't have source nodes, try to extract text content
            results.append({
                "content": str(response),
                "metadata": {},
                "score": None
            })

        logger.info(f"Retrieved {len(results)} documents for query: '{query[:30]}...'")
        return results
    except Exception as e:
        logger.error(f"Error retrieving documents for query '{query[:30]}...': {str(e)}")
        raise
def retrieve_documents_with_query_engine(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """
    Alternative method to retrieve documents using a direct query engine approach.

    Args:
        query: The query text to search for
        top_k: Number of top similar documents to retrieve

    Returns:
        List of dictionaries containing document content and metadata
    """
    logger.info(f"Retrieving documents with direct query engine for query: '{query[:50]}...' (top_k={top_k})")
    try:
        # Set up the global models to prevent defaulting to OpenAI
        setup_global_models()

        # Get the vector store and index from the existing configuration
        vector_store, index = get_vector_store_and_index()

        # Create a retriever from the index
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=top_k
        )

        # Create the query engine
        query_engine = RetrieverQueryEngine(
            retriever=retriever
        )

        # Set the global models again right before the query to ensure they're used
        setup_global_models()

        # Perform the query
        response = query_engine.query(query)

        # Extract documents and their metadata
        results = []

        # Process source nodes to extract content and metadata
        if hasattr(response, 'source_nodes'):
            for node in response.source_nodes:
                # Get all available metadata from the node
                node_metadata = node.metadata or {}

                # The actual text content is in node.text
                content = node.text or ""
                # Ensure proper encoding for content
                if isinstance(content, bytes):
                    content = content.decode('utf-8', errors='replace')
                elif not isinstance(content, str):
                    content = str(content)

                # Apply the encoding fix to clean up any garbled characters
                content = _ensure_proper_encoding(content)

                # Create a comprehensive metadata dictionary with proper encoding
                doc_info = {
                    "content": content,
                    "metadata": {
                        "filename": _ensure_proper_encoding(node_metadata.get("filename", "unknown")),
                        "file_path": _ensure_proper_encoding(node_metadata.get("file_path", "unknown")),
                        "page_label": _ensure_proper_encoding(node_metadata.get("page_label",
                                                                                node_metadata.get("page", "unknown"))),
                        "section": _ensure_proper_encoding(node_metadata.get("section", "unknown")),
                        "paragraph": _ensure_proper_encoding(node_metadata.get("paragraph", "unknown")),
                        "chunk_number": _ensure_proper_encoding(node_metadata.get("chunk_number", "unknown")),
                        "total_chunks": _ensure_proper_encoding(node_metadata.get("total_chunks", "unknown")),
                        "file_type": _ensure_proper_encoding(node_metadata.get("file_type", "unknown")),
                        "original_doc_id": _ensure_proper_encoding(node_metadata.get("original_doc_id", "unknown")),
                        "slide_id": _ensure_proper_encoding(node_metadata.get("slide_id", "unknown")),
                        "sheet_name": _ensure_proper_encoding(node_metadata.get("sheet_name", "unknown")),
                        "processed_at": _ensure_proper_encoding(node_metadata.get("processed_at", "unknown")),
                        # Include any additional metadata that might be present
                        **{_ensure_proper_encoding(k): _ensure_proper_encoding(v)
                           for k, v in node_metadata.items()
                           if k not in ["filename", "file_path", "page_label", "page",
                                        "section", "paragraph", "chunk_number",
                                        "total_chunks", "file_type", "original_doc_id",
                                        "slide_id", "sheet_name", "processed_at"]}
                    },
                    "score": getattr(node, 'score', None)
                }
                results.append(doc_info)
        else:
            # Fallback if no source nodes are available
            content = str(response)
            if isinstance(content, bytes):
                content = content.decode('utf-8', errors='replace')
            results.append({
                "content": content,
                "metadata": {},
                "score": None
            })

        logger.info(f"Retrieved {len(results)} documents for query: '{query[:30]}...'")
        return results
    except Exception as e:
        logger.error(f"Error retrieving documents for query '{query[:30]}...': {str(e)}")
        raise
def _ensure_proper_encoding(text):
    """
    Helper function to ensure proper encoding of text, especially for non-ASCII characters like Cyrillic.

    Args:
        text: Text that may need encoding correction

    Returns:
        Properly encoded text string
    """
    if text is None:
        return "unknown"

    if isinstance(text, bytes):
        # Decode bytes to string with proper encoding
        try:
            return text.decode('utf-8')
        except UnicodeDecodeError:
            # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
            try:
                return text.decode('cp1251')  # Windows Cyrillic encoding
            except UnicodeDecodeError:
                try:
                    return text.decode('koi8-r')  # Russian encoding
                except UnicodeDecodeError:
                    # If all else fails, decode with errors='replace'
                    return text.decode('utf-8', errors='replace')
    elif isinstance(text, str):
        # Ensure the string is properly encoded
        try:
            # Try to encode and decode to ensure it's valid UTF-8
            return text.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            # If there are encoding issues, try to fix them
            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
    else:
        # Convert other types to string and ensure proper encoding
        text_str = str(text)
        try:
            return text_str.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
if __name__ == "__main__":
# Example usage
from loguru import logger
import sys
# Create logs directory if it doesn't exist
logs_dir = Path("logs")
logs_dir.mkdir(exist_ok=True)
# Remove default logger to customize it
logger.remove()
# Add file handler with rotation
logger.add(
"logs/dev.log",
rotation="10 MB",
retention="10 days",
level="INFO",
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}"
)
# Add stdout handler
logger.add(
sys.stdout,
level="INFO",
format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
colorize=True
)
logger.info("Testing retrieval functionality...")
try:
# Test query
test_query = "What is this document about?"
results = retrieve_documents_with_query_engine(test_query, top_k=3)
print(f"Found {len(results)} results for query: '{test_query}'")
for i, result in enumerate(results):
print(f"\nResult {i+1}:")
print(f"Content preview: {result['content'][:200]}...")
print(f"Metadata: {result['metadata']}")
print(f"Score: {result['score']}")
except Exception as e:
logger.error(f"Error in test run: {e}")
print(f"Error: {e}")

View File

@@ -57,7 +57,7 @@ def initialize_vector_storage(
        base_url=ollama_base_url
    )

    # Get a test embedding to determine the correct size
    test_embedding = embed_model.get_query_embedding("test")
    test_embedding = embed_model.get_text_embedding("test")
    embedding_dimension = len(test_embedding)
    logger.info(f"Detected embedding dimension: {embedding_dimension}")
@@ -152,9 +152,9 @@ def get_vector_store_and_index() -> tuple[QdrantVectorStore, VectorStoreIndex]:
    Returns:
        Tuple of (QdrantVectorStore, VectorStoreIndex)
    """
    # Get the embedding model from environment variables
    embed_model = os.getenv("OLLAMA_EMBEDDING_MODEL", "qwen3-embedding:4b")
    return initialize_vector_storage(ollama_embed_model=embed_model)
    # Get the embedding model name from environment variables
    embed_model_name = os.getenv("OLLAMA_EMBEDDING_MODEL", "qwen3-embedding:4b")
    return initialize_vector_storage(ollama_embed_model=embed_model_name)

if __name__ == "__main__":