rag-solution/services/rag/llamaindex/diagnose_content.py

#!/usr/bin/env python3
"""
Diagnostic script to examine how content is stored in the vector database.
"""
import sys
from pathlib import Path

# Add the script's directory to the path so the local modules resolve
sys.path.insert(0, str(Path(__file__).parent))

from loguru import logger

from retrieval import retrieve_documents_with_query_engine
from vector_storage import get_vector_store_and_index
# Set up logging: remove loguru's default stderr sink, then add a rotating
# file sink and a colorized stdout sink
logs_dir = Path("logs")
logs_dir.mkdir(exist_ok=True)
logger.remove()
logger.add(
    logs_dir / "dev.log",
    rotation="10 MB",
    retention="10 days",
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}",
)
logger.add(
    sys.stdout,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    colorize=True,
)
def diagnose_content_storage():
    """Diagnose how content is stored in the vector database."""
    logger.info("Starting content storage diagnosis...")
    try:
        # Get the vector store and index
        vector_store, index = get_vector_store_and_index()
        logger.info("Successfully connected to vector storage")

        # Access the raw storage to see what is actually persisted
        from llama_index.vector_stores.qdrant import QdrantVectorStore

        if isinstance(vector_store, QdrantVectorStore):
            client = vector_store.client
            collection_name = vector_store.collection_name

            # Get collection info
            collection_info = client.get_collection(collection_name)
            logger.info(f"Collection '{collection_name}' has {collection_info.points_count} points")

            # Sample a few points to see what is stored
            points_response = client.scroll(
                collection_name=collection_name,
                limit=5,  # Inspect only the first 5 points
                with_payload=True,
                with_vectors=False,
            )
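            # scroll() paginates: pass the returned next_page_offset as `offset`
            # on a follow-up call if the whole collection needs to be walked.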
            # scroll() returns a (points, next_page_offset) tuple, so take the
            # first element
            points = points_response[0] if isinstance(points_response, tuple) else points_response

            logger.info("Sample of stored points:")
            for i, point in enumerate(points):
                # Point structure varies across qdrant-client versions, so read
                # the attributes defensively; payload may also be None
                point_id = getattr(point, 'id', 'unknown')
                payload = getattr(point, 'payload', None) or {}

                logger.info(f"Point {i+1} ID: {point_id}")
                logger.info(f"Payload keys: {list(payload.keys()) if payload else 'None'}")
                # Check for content in various possible keys
                content_keys = ['text', 'content', 'doc_text', 'page_content', '_node_content']
                content_found = False
                for key in content_keys:
                    if key in payload:
                        content = payload[key]
                        logger.info(f"Content found in key '{key}': {str(content)[:100]}...")
                        content_found = True
                        break
                if not content_found:
                    logger.info("No content found in standard keys")
                    logger.info(f"Full payload keys: {list(payload.keys())}")
                print("-" * 50)
        # Also test retrieval to see what comes back
        logger.info("\nTesting retrieval with a simple query...")
        results = retrieve_documents_with_query_engine("Баканов", top_k=2)
        logger.info(f"Retrieved {len(results)} results")
        for i, result in enumerate(results):
            logger.info(f"Result {i+1}:")
            logger.info(f"  Content length: {len(result.get('content', ''))}")
            logger.info(f"  Content preview: {result.get('content', '')[:200]}...")
            logger.info(f"  Metadata keys: {list(result.get('metadata', {}).keys())}")
            logger.info(f"  Filename: {result.get('metadata', {}).get('filename', 'N/A')}")
    except Exception:
        # logger.exception logs the message together with the full traceback
        logger.exception("Error during diagnosis")

if __name__ == "__main__":
    diagnose_content_storage()
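
# To run this diagnostic (assuming a reachable Qdrant instance and an already
# ingested collection), execute it from the llamaindex service directory:
#   python diagnose_content.py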