Retrieval and also update on Russian language
This commit is contained in:
115
services/rag/llamaindex/diagnose_content.py
Normal file
115
services/rag/llamaindex/diagnose_content.py
Normal file
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Diagnostic script to examine how content is stored in the vector database.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add this script's own directory to sys.path so the imports below (retrieval, vector_storage) can resolve
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from retrieval import retrieve_documents_with_query_engine
|
||||
from loguru import logger
|
||||
from vector_storage import get_vector_store_and_index
|
||||
|
||||
# Logging configuration.
# Ensure the relative "logs" directory exists before a file sink is attached.
logs_dir = Path("logs")
logs_dir.mkdir(exist_ok=True)

# Remove previously registered handlers so only the two sinks below are active.
logger.remove()

# File sink: INFO and above, including source file and line number in each record.
logger.add(
    "logs/dev.log",
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}",
    rotation="10 MB",
    retention="10 days",
)

# Console sink: INFO and above, shorter colourised format without source location.
logger.add(
    sys.stdout,
    level="INFO",
    colorize=True,
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)
|
||||
|
||||
def diagnose_content_storage():
    """Log a diagnostic snapshot of how content is stored in the vector database.

    The function:

    1. Obtains the vector store via ``get_vector_store_and_index()``.
    2. If the store is a ``QdrantVectorStore``, logs the collection's point
       count and scrolls up to five points (payload only, no vectors),
       reporting for each point its ID, its payload keys, and the first
       payload key from a fixed candidate list that holds content.
    3. Runs a sample query through ``retrieve_documents_with_query_engine``
       and logs content length, a preview, metadata keys and filename for
       each result.

    Returns:
        None. All findings are emitted through ``logger`` (plus a dashed
        separator printed to stdout between sampled points).

    Any exception raised along the way is caught, logged together with its
    traceback, and not re-raised.
    """
    logger.info("Starting content storage diagnosis...")

    try:
        # Only the vector store is inspected here; the index is not needed.
        vector_store, _ = get_vector_store_and_index()
        logger.info("Successfully connected to vector storage")

        # Imported here, as in the original, so the dependency is only
        # required when the diagnosis actually runs.
        from llama_index.vector_stores.qdrant import QdrantVectorStore

        if isinstance(vector_store, QdrantVectorStore):
            client = vector_store.client
            collection_name = vector_store.collection_name

            # Collection-level summary
            collection_info = client.get_collection(collection_name)
            logger.info(f"Collection '{collection_name}' has {collection_info.points_count} points")

            # Sample the first few points, with payloads but without vectors
            points_response = client.scroll(
                collection_name=collection_name,
                limit=5,
                with_payload=True,
                with_vectors=False,
            )

            # scroll() returns (points, next_offset); tolerate a bare list too
            points = points_response[0] if isinstance(points_response, tuple) else points_response

            # Candidate payload keys, checked in this order; the first match wins
            content_keys = ('text', 'content', 'doc_text', 'page_content', '_node_content')

            logger.info("Sample of stored points:")
            for position, point in enumerate(points, start=1):
                point_id = getattr(point, 'id', 'unknown')
                # A point may carry no payload at all (None). Normalise to an
                # empty dict so the membership tests and .keys() calls below
                # cannot raise and abort the rest of the diagnosis.
                payload = getattr(point, 'payload', None) or {}

                logger.info(f"Point {position} ID: {point_id}")
                logger.info(f"Payload keys: {list(payload.keys()) if payload else 'None'}")

                found_key = next((key for key in content_keys if key in payload), None)
                if found_key is not None:
                    content = payload[found_key]
                    logger.info(f"Content found in key '{found_key}': {str(content)[:100]}...")
                else:
                    logger.info("No content found in standard keys")
                    logger.info(f"Full payload keys: {list(payload.keys())}")

                print("-" * 50)

        # Also exercise the retrieval path to see what comes back end to end
        logger.info("\nTesting retrieval with a simple query...")
        results = retrieve_documents_with_query_engine("Баканов", top_k=2)

        logger.info(f"Retrieved {len(results)} results")
        for position, result in enumerate(results, start=1):
            logger.info(f"Result {position}:")
            logger.info(f"  Content length: {len(result.get('content', ''))}")
            logger.info(f"  Content preview: {result.get('content', '')[:200]}...")
            logger.info(f"  Metadata: {list(result.get('metadata', {}).keys())}")
            logger.info(f"  Filename: {result.get('metadata', {}).get('filename', 'N/A')}")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on
        # rather than crash, but keep the full traceback in the log.
        logger.error(f"Error during diagnosis: {e}")
        import traceback
        logger.error(traceback.format_exc())
|
||||
|
||||
# Script entry point: run the diagnosis only when this file is executed directly.
if __name__ == "__main__":
    diagnose_content_storage()
|
||||
Reference in New Issue
Block a user