#!/usr/bin/env python3 """ Diagnostic script to examine how content is stored in the vector database. """ import os import sys from pathlib import Path # Add the project root to the path sys.path.insert(0, str(Path(__file__).parent)) from retrieval import retrieve_documents_with_query_engine from loguru import logger from vector_storage import get_vector_store_and_index # Setup logging logs_dir = Path("logs") logs_dir.mkdir(exist_ok=True) logger.remove() logger.add( "logs/dev.log", rotation="10 MB", retention="10 days", level="INFO", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}" ) logger.add( sys.stdout, level="INFO", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}", colorize=True ) def diagnose_content_storage(): """Diagnose how content is stored in the vector database.""" logger.info("Starting content storage diagnosis...") try: # Get the vector store and index vector_store, index = get_vector_store_and_index() logger.info("Successfully connected to vector storage") # Let's try to access the raw storage to see what's there from llama_index.vector_stores.qdrant import QdrantVectorStore if isinstance(vector_store, QdrantVectorStore): client = vector_store.client collection_name = vector_store.collection_name # Get collection info collection_info = client.get_collection(collection_name) logger.info(f"Collection '{collection_name}' has {collection_info.points_count} points") # Sample some points to see what's stored points_response = client.scroll( collection_name=collection_name, limit=5, # Get first 5 points with_payload=True, with_vectors=False ) # The scroll method returns (points, next_offset), so we need the first element points = points_response[0] if isinstance(points_response, tuple) else points_response logger.info("Sample of stored points:") for i, point in enumerate(points): # The point structure depends on the Qdrant client version if hasattr(point, 'id') and hasattr(point, 'payload'): point_id = point.id payload = point.payload else: # Fallback for different structure point_id = getattr(point, 'id', 'unknown') payload = getattr(point, 'payload', {}) logger.info(f"Point {i+1} ID: {point_id}") logger.info(f"Payload keys: {list(payload.keys()) if payload else 'None'}") # Check for content in various possible keys content_keys = ['text', 'content', 'doc_text', 'page_content', '_node_content'] content_found = False for key in content_keys: if key in payload: content = payload[key] logger.info(f"Content found in key '{key}': {str(content)[:100]}...") content_found = True break if not content_found: logger.info("No content found in standard keys") logger.info(f"Full payload keys: {list(payload.keys())}") print("-" * 50) # Also test retrieval to see what comes back logger.info("\nTesting retrieval with a simple query...") results = retrieve_documents_with_query_engine("Баканов", top_k=2) logger.info(f"Retrieved {len(results)} results") for i, result in enumerate(results): logger.info(f"Result {i+1}:") logger.info(f" Content length: {len(result.get('content', ''))}") logger.info(f" Content preview: {result.get('content', '')[:200]}...") logger.info(f" Metadata: {list(result.get('metadata', {}).keys())}") logger.info(f" Filename: {result.get('metadata', {}).get('filename', 'N/A')}") except Exception as e: logger.error(f"Error during diagnosis: {e}") import traceback logger.error(traceback.format_exc()) if __name__ == "__main__": diagnose_content_storage()