Add retrieval feature and update Russian language handling
@@ -33,7 +33,7 @@ Chosen data folder: relative ./../../../data - from the current folder

# Phase 5 (preparation for the retrieval feature)

- - [ ] Create file `retrieval.py` with the configuration for the chosen RAG framework that retrieves data from the vector storage based on a query. Use a retrieval library/plugin that supports the chosen vector storage within the chosen RAG framework. The retrieval configuration should search for the text passed as the query argument and return the found information together with the stored metadata (paragraph, section, page, etc.). Important: if the chosen RAG framework does not require a separate search/retrieval step for the chosen vector storage, this step may be skipped and marked done.
+ - [x] Create file `retrieval.py` with the configuration for the chosen RAG framework that retrieves data from the vector storage based on a query. Use a retrieval library/plugin that supports the chosen vector storage within the chosen RAG framework. The retrieval configuration should search for the text passed as the query argument and return the found information together with the stored metadata (paragraph, section, page, etc.). Important: if the chosen RAG framework does not require a separate search/retrieval step for the chosen vector storage, this step may be skipped and marked done.

# Phase 6 (chat feature, as agent, for usage in the cli)

@@ -4,6 +4,8 @@

This is a Retrieval Augmented Generation (RAG) solution built using LlamaIndex as the primary framework and Qdrant as the vector storage. The project is designed to load documents from a shared data directory, store them in a vector database, and enable semantic search and chat capabilities using local Ollama models.

+ The system has been enhanced to properly handle Russian language documents with Cyrillic characters, ensuring proper encoding during document loading, storage, and retrieval.

### Key Technologies
- **RAG Framework**: LlamaIndex
- **Vector Storage**: Qdrant
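
For orientation, here is a minimal sketch of how these pieces fit together. This is an editor's illustration rather than code from this commit; it assumes Qdrant and Ollama on their default local ports and the collection name used in this project.

```python
# Sketch: connect LlamaIndex to an existing Qdrant collection with Ollama embeddings.
from qdrant_client import QdrantClient
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Embedding model name taken from this project's environment defaults.
Settings.embed_model = OllamaEmbedding(
    model_name="qwen3-embedding:4b",
    base_url="http://localhost:11434",
)

client = QdrantClient(host="localhost", port=6333)
vector_store = QdrantVectorStore(client=client, collection_name="documents_llamaindex")
index = VectorStoreIndex.from_vector_store(vector_store)
```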
@@ -64,6 +66,7 @@ This is a Retrieval Augmented Generation (RAG) solution built using LlamaIndex a

- Use text splitters appropriate for each document type (see the sketch after this list)
- Store metadata (filename, page, section, paragraph) with embeddings
- Track processed documents to avoid re-processing (using SQLite if needed)
+ - Proper encoding handling for Russian/Cyrillic text during loading and retrieval
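
As a concrete illustration of the splitting and metadata conventions above (an editor's sketch; the chunk sizes here are assumptions, not values taken from this commit):

```python
# Sketch: split a document into nodes and attach tracking metadata.
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=100)  # assumed sizes
doc = Document(text="...", metadata={"filename": "report.pdf"})

nodes = splitter.get_nodes_from_documents([doc])
for i, node in enumerate(nodes):
    node.metadata["chunk_number"] = i        # position of this chunk
    node.metadata["total_chunks"] = len(nodes)
```
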
### Vector Storage
- Collection name: "documents_llamaindex"
@@ -95,10 +98,12 @@ This is a Retrieval Augmented Generation (RAG) solution built using LlamaIndex a

- [x] Text splitting strategies implementation
- [x] Document tracking mechanism
- [x] CLI command for enrichment
+ - [x] Russian language/Cyrillic text encoding support during document loading

### Phase 5: Retrieval Feature
- - [ ] Retrieval module configuration
- - [ ] Query processing with metadata retrieval
+ - [x] Retrieval module configuration
+ - [x] Query processing with metadata retrieval
+ - [x] Russian language/Cyrillic text encoding support

### Phase 6: Chat Agent
- [ ] Agent module with Ollama integration
@@ -134,4 +139,5 @@ The system expects documents to be placed in `./../../../data` relative to the p

- Ensure Ollama is running on port 11434
- Verify Qdrant is accessible on ports 6333 (REST) and 6334 (gRPC) (a connectivity sketch follows this list)
- Check that the data directory contains supported file types
- Review logs in `logs/dev.log` for detailed error information
+ - For Russian/Cyrillic text issues, ensure proper encoding handling is configured in both enrichment and retrieval modules
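
A quick way to verify the first two items (an editor's sketch; the endpoints are the defaults named above, and `requests` plus `qdrant-client` are assumed to be installed):

```python
# Sketch: confirm Ollama and Qdrant are reachable before digging deeper.
import requests
from qdrant_client import QdrantClient

# Ollama's model-listing endpoint; HTTP 200 means the server is up.
r = requests.get("http://localhost:11434/api/tags", timeout=5)
print("Ollama OK:", r.status_code == 200)

# Listing collections confirms the Qdrant REST port is reachable.
client = QdrantClient(host="localhost", port=6333)
print("Qdrant collections:", [c.name for c in client.get_collections().collections])
```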
@@ -87,5 +87,48 @@ def enrich(data_path, recursive, verbose):
        click.echo(f"Error during document enrichment: {e}")


@main.command(help="Retrieve documents from vector storage based on a query")
@click.argument('query', type=str)
@click.option('--top-k', '-k', default=5, help="Number of top similar documents to retrieve")
@click.option('--verbose', '-v', is_flag=True, help="Enable verbose output")
def retrieve(query, top_k, verbose):
    """Retrieve documents from vector storage based on a query."""
    if verbose:
        logger.enable("__main__")

    logger.info(f"Starting document retrieval for query: {query}")
    logger.info(f"Top-K results: {top_k}")

    try:
        # Import the retrieval module
        from retrieval import retrieve_documents_with_query_engine
        logger.info("Retrieval module imported successfully")

        # Call the retrieval function
        results = retrieve_documents_with_query_engine(query=query, top_k=top_k)

        logger.info(f"Retrieved {len(results)} documents for query: {query}")

        # Display results
        click.echo(f"\nFound {len(results)} results for query: '{query}'\n")

        for i, result in enumerate(results, 1):
            click.echo(f"Result {i}:")
            click.echo(f"  Content preview: {result['content'][:200]}{'...' if len(result['content']) > 200 else ''}")
            click.echo(f"  Score: {result['score']}")
            click.echo(f"  Metadata:")
            for key, value in result['metadata'].items():
                click.echo(f"    {key}: {value}")
            click.echo("")

        click.echo("Document retrieval completed successfully")
    except ImportError as e:
        logger.error(f"Failed to import retrieval module: {e}")
        click.echo(f"Error: Could not import retrieval module: {e}")
    except Exception as e:
        logger.error(f"Error during document retrieval: {e}")
        click.echo(f"Error during document retrieval: {e}")


if __name__ == '__main__':
    main()
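
The command can be exercised in-process with click's test runner, which avoids shell quoting issues with Cyrillic queries (an editor's sketch; the module name `cli` is an assumption):

```python
# Sketch: invoke the new retrieve command through click's testing utilities.
from click.testing import CliRunner

from cli import main  # assumed module name for this CLI

runner = CliRunner()
result = runner.invoke(main, ["retrieve", "Баканов", "--top-k", "3", "--verbose"])
print(result.output)
```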
services/rag/llamaindex/diagnose_content.py (new file, 115 lines)
@@ -0,0 +1,115 @@
#!/usr/bin/env python3
"""
Diagnostic script to examine how content is stored in the vector database.
"""

import os
import sys
from pathlib import Path

# Add the project root to the path
sys.path.insert(0, str(Path(__file__).parent))

from retrieval import retrieve_documents_with_query_engine
from loguru import logger
from vector_storage import get_vector_store_and_index

# Setup logging
logs_dir = Path("logs")
logs_dir.mkdir(exist_ok=True)

logger.remove()
logger.add(
    "logs/dev.log",
    rotation="10 MB",
    retention="10 days",
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}"
)

logger.add(
    sys.stdout,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    colorize=True
)


def diagnose_content_storage():
    """Diagnose how content is stored in the vector database."""
    logger.info("Starting content storage diagnosis...")

    try:
        # Get the vector store and index
        vector_store, index = get_vector_store_and_index()
        logger.info("Successfully connected to vector storage")

        # Try to access the raw storage to see what's there
        from llama_index.vector_stores.qdrant import QdrantVectorStore
        if isinstance(vector_store, QdrantVectorStore):
            client = vector_store.client
            collection_name = vector_store.collection_name

            # Get collection info
            collection_info = client.get_collection(collection_name)
            logger.info(f"Collection '{collection_name}' has {collection_info.points_count} points")

            # Sample some points to see what's stored
            points_response = client.scroll(
                collection_name=collection_name,
                limit=5,  # Get first 5 points
                with_payload=True,
                with_vectors=False
            )

            # The scroll method returns (points, next_offset), so we need the first element
            points = points_response[0] if isinstance(points_response, tuple) else points_response

            logger.info("Sample of stored points:")
            for i, point in enumerate(points):
                # The point structure depends on the Qdrant client version
                if hasattr(point, 'id') and hasattr(point, 'payload'):
                    point_id = point.id
                    payload = point.payload
                else:
                    # Fallback for a different structure
                    point_id = getattr(point, 'id', 'unknown')
                    payload = getattr(point, 'payload', {})

                logger.info(f"Point {i+1} ID: {point_id}")
                logger.info(f"Payload keys: {list(payload.keys()) if payload else 'None'}")

                # Check for content in various possible keys
                content_keys = ['text', 'content', 'doc_text', 'page_content', '_node_content']
                content_found = False
                for key in content_keys:
                    if key in payload:
                        content = payload[key]
                        logger.info(f"Content found in key '{key}': {str(content)[:100]}...")
                        content_found = True
                        break

                if not content_found:
                    logger.info("No content found in standard keys")
                    logger.info(f"Full payload keys: {list(payload.keys())}")

                print("-" * 50)

        # Also test retrieval to see what comes back
        logger.info("\nTesting retrieval with a simple query...")
        results = retrieve_documents_with_query_engine("Баканов", top_k=2)

        logger.info(f"Retrieved {len(results)} results")
        for i, result in enumerate(results):
            logger.info(f"Result {i+1}:")
            logger.info(f"  Content length: {len(result.get('content', ''))}")
            logger.info(f"  Content preview: {result.get('content', '')[:200]}...")
            logger.info(f"  Metadata: {list(result.get('metadata', {}).keys())}")
            logger.info(f"  Filename: {result.get('metadata', {}).get('filename', 'N/A')}")

    except Exception as e:
        logger.error(f"Error during diagnosis: {e}")
        import traceback
        logger.error(traceback.format_exc())


if __name__ == "__main__":
    diagnose_content_storage()
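
The script above samples only the first five points. Since `client.scroll` returns a `(points, next_offset)` tuple, the cursor can be fed back in to walk the whole collection (an editor's sketch using the same collection name):

```python
# Sketch: page through every point in the collection via the scroll cursor.
from qdrant_client import QdrantClient

client = QdrantClient(host="localhost", port=6333)
offset = None
while True:
    points, offset = client.scroll(
        collection_name="documents_llamaindex",
        limit=100,
        offset=offset,
        with_payload=True,
        with_vectors=False,
    )
    for point in points:
        print(point.id, list((point.payload or {}).keys()))
    if offset is None:  # no further pages
        break
```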
@@ -158,6 +158,50 @@ def get_text_splitter(file_extension: str):
    )


def ensure_proper_encoding(text):
    """
    Helper function to ensure proper encoding of text, especially for non-ASCII characters like Cyrillic.

    Args:
        text: Text that may need encoding correction

    Returns:
        Properly encoded text string
    """
    if text is None:
        return "unknown"

    if isinstance(text, bytes):
        # Decode bytes to string with proper encoding
        try:
            return text.decode('utf-8')
        except UnicodeDecodeError:
            # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
            try:
                return text.decode('cp1251')  # Windows Cyrillic encoding
            except UnicodeDecodeError:
                try:
                    return text.decode('koi8-r')  # Russian encoding
                except UnicodeDecodeError:
                    # If all else fails, decode with errors='replace'
                    return text.decode('utf-8', errors='replace')
    elif isinstance(text, str):
        # Ensure the string is properly encoded
        try:
            # Try to encode and decode to ensure it's valid UTF-8
            return text.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            # If there are encoding issues, try to fix them
            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
    else:
        # Convert other types to string and ensure proper encoding
        text_str = str(text)
        try:
            return text_str.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')


def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
    """
    Process all documents from the data folder using appropriate loaders and store in vector DB.
@@ -228,7 +272,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
        # Load the document using SimpleDirectoryReader
        # This automatically selects the appropriate reader based on file extension
        def file_metadata_func(file_path_str):
-            return {"filename": Path(file_path_str).name}
+            # Apply proper encoding to filename
+            filename = ensure_proper_encoding(Path(file_path_str).name)
+            return {"filename": filename}

        reader = SimpleDirectoryReader(
            input_files=[file_path],
@@ -241,29 +287,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
            # Extract additional metadata based on document type
            file_ext = Path(file_path).suffix

+            # Apply proper encoding to file path
+            encoded_file_path = ensure_proper_encoding(file_path)

            # Add additional metadata
-            doc.metadata["file_path"] = file_path
+            doc.metadata["file_path"] = encoded_file_path
            doc.metadata["processed_at"] = datetime.now().isoformat()

            # Handle document-type-specific metadata
            if file_ext.lower() == '.pdf':
                # PDF-specific metadata
-                doc.metadata["page_label"] = doc.metadata.get("page_label", "unknown")
+                doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
                doc.metadata["file_type"] = "pdf"

            elif file_ext.lower() in ['.docx', '.odt']:
                # Word document metadata
-                doc.metadata["section"] = doc.metadata.get("section", "unknown")
+                doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
                doc.metadata["file_type"] = "document"

            elif file_ext.lower() == '.pptx':
                # PowerPoint metadata
-                doc.metadata["slide_id"] = doc.metadata.get("slide_id", "unknown")
+                doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
                doc.metadata["file_type"] = "presentation"

            elif file_ext.lower() == '.xlsx':
                # Excel metadata
-                doc.metadata["sheet_name"] = doc.metadata.get("sheet_name", "unknown")
+                doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
                doc.metadata["file_type"] = "spreadsheet"

            # Determine the appropriate text splitter based on file type
@@ -276,17 +325,21 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
            nodes_with_enhanced_metadata = []
            for i, node in enumerate(nodes):
                # Enhance node metadata with additional information
-                node.metadata["original_doc_id"] = doc.doc_id
+                node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
                node.metadata["chunk_number"] = i
                node.metadata["total_chunks"] = len(nodes)
-                node.metadata["file_path"] = file_path
+                node.metadata["file_path"] = encoded_file_path

+                # Ensure the text content is properly encoded
+                node.text = ensure_proper_encoding(node.text)

                nodes_with_enhanced_metadata.append(node)

            # Add all nodes to the index at once
            if nodes_with_enhanced_metadata:
                index.insert_nodes(nodes_with_enhanced_metadata)

-            logger.info(f"Processed {len(nodes)} nodes from {file_path}")
+            logger.info(f"Processed {len(nodes)} nodes from {encoded_file_path}")

        # Mark document as processed only after successful insertion
        tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})
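
To make the fallback chain in `ensure_proper_encoding` concrete (an editor's sketch, not part of the commit): bytes that fail UTF-8 decoding are retried as cp1251, which is how Russian text saved on Windows is recovered.

```python
# Sketch: cp1251-encoded Cyrillic bytes fail UTF-8 decoding, then the cp1251 fallback succeeds.
raw = "Привет".encode("cp1251")     # b'\xcf\xf0\xe8\xe2\xe5\xf2'
try:
    raw.decode("utf-8")
except UnicodeDecodeError:
    print("not valid UTF-8")        # this branch is taken

print(ensure_proper_encoding(raw))  # -> "Привет", via the cp1251 fallback
```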
services/rag/llamaindex/retrieval.py (new file, 338 lines)
@@ -0,0 +1,338 @@
"""
Retrieval module for the RAG solution using LlamaIndex and Qdrant.

This module provides functionality to retrieve relevant documents
from the vector storage based on a query text.
"""

import os
from typing import List, Dict, Any
from pathlib import Path

from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from loguru import logger

from vector_storage import get_vector_store_and_index


def setup_global_models():
    """Set up the global models to prevent defaulting to OpenAI."""
    # Set up the embedding model
    ollama_embed_model = os.getenv("OLLAMA_EMBEDDING_MODEL", "qwen3-embedding:4b")
    ollama_base_url = "http://localhost:11434"

    embed_model = OllamaEmbedding(
        model_name=ollama_embed_model,
        base_url=ollama_base_url
    )

    # Set as the global embedding model
    Settings.embed_model = embed_model

    # Set up the LLM model
    ollama_chat_model = os.getenv("OLLAMA_CHAT_MODEL", "nemotron-mini:4b")

    from llama_index.llms.ollama import Ollama
    llm = Ollama(model=ollama_chat_model, base_url=ollama_base_url)

    # Set as the global LLM
    Settings.llm = llm


def initialize_retriever(
    collection_name: str = "documents_llamaindex",
    similarity_top_k: int = 5,
    host: str = "localhost",
    port: int = 6333
) -> RetrieverQueryEngine:
    """
    Initialize the retriever query engine with the vector store.

    Args:
        collection_name: Name of the Qdrant collection
        similarity_top_k: Number of top similar documents to retrieve
        host: Qdrant host address
        port: Qdrant REST API port

    Returns:
        RetrieverQueryEngine configured with the vector store
    """
    logger.info(f"Initializing retriever for collection: {collection_name}")

    try:
        # Set up the global models to prevent defaulting to OpenAI
        setup_global_models()

        # Get the vector store and index from the existing configuration
        vector_store, index = get_vector_store_and_index()

        # Create a retriever from the index
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=similarity_top_k
        )

        # Create the query engine
        query_engine = RetrieverQueryEngine(
            retriever=retriever
        )

        logger.info("Retriever initialized successfully")
        return query_engine

    except Exception as e:
        logger.error(f"Failed to initialize retriever: {str(e)}")
        raise


def retrieve_documents(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """
    Retrieve documents from the vector storage based on the query text.

    Args:
        query: The query text to search for
        top_k: Number of top similar documents to retrieve

    Returns:
        List of dictionaries containing document content and metadata
    """
    logger.info(f"Retrieving documents for query: '{query[:50]}...' (top_k={top_k})")

    try:
        # Initialize the query engine
        query_engine = initialize_retriever(similarity_top_k=top_k)

        # Perform the query
        response = query_engine.query(query)

        # Extract documents and their metadata
        results = []

        # If the response carries source nodes, report each of them
        if hasattr(response, 'source_nodes'):
            for node in response.source_nodes:
                doc_info = {
                    "content": node.text,
                    "metadata": node.metadata,
                    "score": node.score if hasattr(node, 'score') else None
                }
                results.append(doc_info)
        else:
            # If the response doesn't have source nodes, fall back to its text content
            results.append({
                "content": str(response),
                "metadata": {},
                "score": None
            })

        logger.info(f"Retrieved {len(results)} documents for query: '{query[:30]}...'")
        return results

    except Exception as e:
        logger.error(f"Error retrieving documents for query '{query[:30]}...': {str(e)}")
        raise


def retrieve_documents_with_query_engine(query: str, top_k: int = 5) -> List[Dict[str, Any]]:
    """
    Alternative method to retrieve documents using a direct query engine approach.

    Args:
        query: The query text to search for
        top_k: Number of top similar documents to retrieve

    Returns:
        List of dictionaries containing document content and metadata
    """
    logger.info(f"Retrieving documents with direct query engine for query: '{query[:50]}...' (top_k={top_k})")

    try:
        # Set up the global models to prevent defaulting to OpenAI
        setup_global_models()

        # Get the vector store and index from the existing configuration
        vector_store, index = get_vector_store_and_index()

        # Create a retriever from the index
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=top_k
        )

        # Create the query engine
        query_engine = RetrieverQueryEngine(
            retriever=retriever
        )

        # Set the global models again right before the query to ensure they're used
        setup_global_models()

        # Perform the query
        response = query_engine.query(query)

        # Extract documents and their metadata
        results = []

        # Process source nodes to extract content and metadata
        if hasattr(response, 'source_nodes'):
            for node in response.source_nodes:
                # Get all available metadata from the node
                node_metadata = node.metadata or {}

                # The actual text content is in node.text
                content = node.text or ""

                # Ensure proper encoding for content
                if isinstance(content, bytes):
                    content = content.decode('utf-8', errors='replace')
                elif not isinstance(content, str):
                    content = str(content)

                # Apply the encoding fix to clean up any garbled characters
                content = _ensure_proper_encoding(content)

                # Create a comprehensive metadata dictionary with proper encoding
                doc_info = {
                    "content": content,
                    "metadata": {
                        "filename": _ensure_proper_encoding(node_metadata.get("filename", "unknown")),
                        "file_path": _ensure_proper_encoding(node_metadata.get("file_path", "unknown")),
                        "page_label": _ensure_proper_encoding(node_metadata.get("page_label",
                                                                                node_metadata.get("page", "unknown"))),
                        "section": _ensure_proper_encoding(node_metadata.get("section", "unknown")),
                        "paragraph": _ensure_proper_encoding(node_metadata.get("paragraph", "unknown")),
                        "chunk_number": _ensure_proper_encoding(node_metadata.get("chunk_number", "unknown")),
                        "total_chunks": _ensure_proper_encoding(node_metadata.get("total_chunks", "unknown")),
                        "file_type": _ensure_proper_encoding(node_metadata.get("file_type", "unknown")),
                        "original_doc_id": _ensure_proper_encoding(node_metadata.get("original_doc_id", "unknown")),
                        "slide_id": _ensure_proper_encoding(node_metadata.get("slide_id", "unknown")),
                        "sheet_name": _ensure_proper_encoding(node_metadata.get("sheet_name", "unknown")),
                        "processed_at": _ensure_proper_encoding(node_metadata.get("processed_at", "unknown")),
                        # Include any additional metadata that might be present
                        **{_ensure_proper_encoding(k): _ensure_proper_encoding(v) for k, v in node_metadata.items()
                           if k not in ["filename", "file_path", "page_label", "page",
                                        "section", "paragraph", "chunk_number",
                                        "total_chunks", "file_type", "original_doc_id",
                                        "slide_id", "sheet_name", "processed_at"]}
                    },
                    "score": getattr(node, 'score', None)
                }
                results.append(doc_info)
        else:
            # Fallback if no source nodes are available
            results.append({
                "content": str(response),
                "metadata": {},
                "score": None
            })

        logger.info(f"Retrieved {len(results)} documents for query: '{query[:30]}...'")
        return results

    except Exception as e:
        logger.error(f"Error retrieving documents for query '{query[:30]}...': {str(e)}")
        raise


def _ensure_proper_encoding(text):
    """
    Helper function to ensure proper encoding of text, especially for non-ASCII characters like Cyrillic.

    Args:
        text: Text that may need encoding correction

    Returns:
        Properly encoded text string
    """
    if text is None:
        return "unknown"

    if isinstance(text, bytes):
        # Decode bytes to string with proper encoding
        try:
            return text.decode('utf-8')
        except UnicodeDecodeError:
            # If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
            try:
                return text.decode('cp1251')  # Windows Cyrillic encoding
            except UnicodeDecodeError:
                try:
                    return text.decode('koi8-r')  # Russian encoding
                except UnicodeDecodeError:
                    # If all else fails, decode with errors='replace'
                    return text.decode('utf-8', errors='replace')
    elif isinstance(text, str):
        # Ensure the string is properly encoded
        try:
            # Try to encode and decode to ensure it's valid UTF-8
            return text.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            # If there are encoding issues, try to fix them
            return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
    else:
        # Convert other types to string and ensure proper encoding
        text_str = str(text)
        try:
            return text_str.encode('utf-8').decode('utf-8')
        except UnicodeEncodeError:
            return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')


if __name__ == "__main__":
    # Example usage
    import sys

    # Create logs directory if it doesn't exist
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)

    # Remove default logger to customize it
    logger.remove()

    # Add file handler with rotation
    logger.add(
        "logs/dev.log",
        rotation="10 MB",
        retention="10 days",
        level="INFO",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}"
    )

    # Add stdout handler
    logger.add(
        sys.stdout,
        level="INFO",
        format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
        colorize=True
    )

    logger.info("Testing retrieval functionality...")

    try:
        # Test query
        test_query = "What is this document about?"
        results = retrieve_documents_with_query_engine(test_query, top_k=3)

        print(f"Found {len(results)} results for query: '{test_query}'")
        for i, result in enumerate(results):
            print(f"\nResult {i+1}:")
            print(f"Content preview: {result['content'][:200]}...")
            print(f"Metadata: {result['metadata']}")
            print(f"Score: {result['score']}")

    except Exception as e:
        logger.error(f"Error in test run: {e}")
        print(f"Error: {e}")
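
One design note (an editor's observation, not part of the commit): `RetrieverQueryEngine.query()` runs LLM answer synthesis even though only the source nodes are consumed above. When the goal is purely document retrieval, calling the retriever directly skips the LLM round-trip:

```python
# Sketch: fetch scored nodes without LLM answer synthesis.
setup_global_models()  # still needed so query embeddings use the Ollama model
_, index = get_vector_store_and_index()

retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
nodes = retriever.retrieve("Баканов")  # List[NodeWithScore]; no LLM call
for n in nodes:
    print(n.score, n.metadata.get("filename", "unknown"), n.text[:80])
```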
@@ -57,7 +57,7 @@ def initialize_vector_storage(
        base_url=ollama_base_url
    )
    # Get a test embedding to determine the correct size
-    test_embedding = embed_model.get_query_embedding("test")
+    test_embedding = embed_model.get_text_embedding("test")
    embedding_dimension = len(test_embedding)
    logger.info(f"Detected embedding dimension: {embedding_dimension}")
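
The detected dimension is what sizes the Qdrant collection. Roughly, assuming qdrant-client's standard collection API and cosine distance (both assumptions of this editor's sketch, with `embedding_dimension` as computed above):

```python
# Sketch: create the collection sized to the detected embedding dimension.
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

client = QdrantClient(host="localhost", port=6333)
if not client.collection_exists("documents_llamaindex"):
    client.create_collection(
        collection_name="documents_llamaindex",
        vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE),
    )
```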
@@ -152,9 +152,9 @@ def get_vector_store_and_index() -> tuple[QdrantVectorStore, VectorStoreIndex]:
    Returns:
        Tuple of (QdrantVectorStore, VectorStoreIndex)
    """
-    # Get the embedding model from environment variables
-    embed_model = os.getenv("OLLAMA_EMBEDDING_MODEL", "qwen3-embedding:4b")
-    return initialize_vector_storage(ollama_embed_model=embed_model)
+    # Get the embedding model name from environment variables
+    embed_model_name = os.getenv("OLLAMA_EMBEDDING_MODEL", "qwen3-embedding:4b")
+    return initialize_vector_storage(ollama_embed_model=embed_model_name)

if __name__ == "__main__":