#!/usr/bin/env python3
"""
Diagnostic script to examine how content is stored in the vector database.
"""

import sys
from pathlib import Path

# Add the project root to the path so the project-local modules resolve
sys.path.insert(0, str(Path(__file__).parent))

from loguru import logger

from retrieval import retrieve_documents_with_query_engine
from vector_storage import get_vector_store_and_index

# Set up logging: a rotating file sink plus a colorized stdout sink
logs_dir = Path("logs")
logs_dir.mkdir(exist_ok=True)

logger.remove()
logger.add(
    logs_dir / "dev.log",
    rotation="10 MB",
    retention="10 days",
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {file}:{line} | {message}",
)

logger.add(
    sys.stdout,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    colorize=True,
)


def diagnose_content_storage():
    """Diagnose how content is stored in the vector database."""
    logger.info("Starting content storage diagnosis...")

    try:
        # Get the vector store and index
        vector_store, index = get_vector_store_and_index()
        logger.info("Successfully connected to vector storage")

        # Access the raw storage to see what is actually stored
        from llama_index.vector_stores.qdrant import QdrantVectorStore

        if isinstance(vector_store, QdrantVectorStore):
            client = vector_store.client
            collection_name = vector_store.collection_name

            # Get collection info
            collection_info = client.get_collection(collection_name)
            logger.info(f"Collection '{collection_name}' has {collection_info.points_count} points")

            # Sample a few points to see what is stored
            points_response = client.scroll(
                collection_name=collection_name,
                limit=5,  # get the first 5 points
                with_payload=True,
                with_vectors=False,
            )

            # scroll() returns a (points, next_offset) tuple, so take the first element
            points = points_response[0] if isinstance(points_response, tuple) else points_response

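            # (The second element of the scroll tuple is the next-page offset;
            # pagination is ignored here because we only want a small sample.)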
logger.info("Sample of stored points:")
|
|
for i, point in enumerate(points):
|
|
# The point structure depends on the Qdrant client version
|
|
if hasattr(point, 'id') and hasattr(point, 'payload'):
|
|
point_id = point.id
|
|
payload = point.payload
|
|
else:
|
|
# Fallback for different structure
|
|
point_id = getattr(point, 'id', 'unknown')
|
|
payload = getattr(point, 'payload', {})
|
|
|
|
logger.info(f"Point {i+1} ID: {point_id}")
|
|
logger.info(f"Payload keys: {list(payload.keys()) if payload else 'None'}")
|
|
|
|
# Check for content in various possible keys
|
|
content_keys = ['text', 'content', 'doc_text', 'page_content', '_node_content']
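                # Note: LlamaIndex's Qdrant integration typically serializes the
                # whole node as JSON under the '_node_content' payload key, so the
                # raw text may be nested inside that JSON rather than stored as a
                # flat string. (An observation about typical behavior, not this
                # project's guaranteed schema.)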
                content_found = False
                for key in content_keys:
                    if key in payload:
                        content = payload[key]
                        logger.info(f"Content found in key '{key}': {str(content)[:100]}...")
                        content_found = True
                        break

                if not content_found:
                    logger.info("No content found in standard keys")
                    logger.info(f"Full payload keys: {list(payload.keys())}")

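                # A possible follow-up (sketch, not executed here): when the text
                # is serialized under '_node_content', it can usually be recovered
                # with something like:
                #     import json
                #     node = json.loads(payload["_node_content"])
                #     text = node.get("text", "")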
print("-" * 50)
|
|
|
|
# Also test retrieval to see what comes back
|
|
logger.info("\nTesting retrieval with a simple query...")
|
|
results = retrieve_documents_with_query_engine("Баканов", top_k=2)
|
|
|
|
logger.info(f"Retrieved {len(results)} results")
|
|
for i, result in enumerate(results):
|
|
logger.info(f"Result {i+1}:")
|
|
logger.info(f" Content length: {len(result.get('content', ''))}")
|
|
logger.info(f" Content preview: {result.get('content', '')[:200]}...")
|
|
logger.info(f" Metadata: {list(result.get('metadata', {}).keys())}")
|
|
logger.info(f" Filename: {result.get('metadata', {}).get('filename', 'N/A')}")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error during diagnosis: {e}")
|
|
import traceback
|
|
logger.error(traceback.format_exc())
|
|
|
|
if __name__ == "__main__":
|
|
diagnose_content_storage() |
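
# Usage (the filename below is hypothetical; this script assumes the
# project-local `retrieval` and `vector_storage` modules sit next to it and
# that the configured vector store is reachable):
#
#     python diagnose_content_storage.py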