118 lines
3.5 KiB
Python
118 lines
3.5 KiB
Python
import os
|
|
from pathlib import Path
|
|
|
|
import click
|
|
from loguru import logger
|
|
|
|
|
|
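# Configure logging: setup_logging() adds a rotating log file next to loguru's
# default console sink, as specified in requirements.
# NOTE(review): loguru's default console sink writes to stderr, not stdout —
# confirm that satisfies the "stdout" wording of the requirement.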
|
|
def setup_logging():
    """Attach a rotating file sink at ``logs/dev.log`` to the loguru logger.

    Creates the ``logs`` directory (relative to the current working
    directory) when it is missing, then registers a file sink that rotates
    at 10 MB and retains rotated files for 10 days.

    The function is idempotent: only the first call registers the sink.
    Every ``logger.add`` call installs an additional handler, so without the
    guard a second call would write each log record to the file twice.
    """
    if getattr(setup_logging, "_configured", False):
        return

    # Create logs directory if it doesn't exist
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)

    # Add file logging with rotation; the path is derived from logs_dir so the
    # directory name lives in exactly one place.
    logger.add(logs_dir / "dev.log", rotation="10 MB", retention="10 days")

    # Remember on the function object itself that the sink is installed.
    setup_logging._configured = True
|
|
|
|
|
|
@click.group()
def cli():
    """Main CLI group"""
    # The docstring above is what click displays as the group's help text.
    # Logging is configured here, in the group callback, so that it is set up
    # before the selected subcommand runs.
    setup_logging()
|
|
|
|
|
|
@cli.command(
    name="ping",
    help="Ping command that outputs pong",
)
def ping():
    """Log that the command ran, then print ``pong``."""
    logger.info("Ping command executed")
    click.echo("pong")
|
|
|
|
|
|
@cli.command(
    name="enrich",
    help="Load documents from data directory and store in vector database",
)
@click.option("--data-dir", default="../../../data", help="Path to the data directory")
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
def enrich(data_dir, collection_name):
    """Load documents from ``data_dir`` into a vector store collection.

    Args:
        data_dir: Value of ``--data-dir``; forwarded unchanged as the
            ``data_dir`` keyword of ``run_enrichment_process``.
        collection_name: Value of ``--collection-name``; forwarded to
            ``initialize_vector_store``.

    Raises:
        click.ClickException: If importing the helpers, initializing the
            vector store, or running the enrichment fails. Click reports it
            as ``Error: <message>`` on stderr and exits with a non-zero
            status, so shells and CI can detect the failure.
    """
    logger.info(f"Starting enrichment process for directory: {data_dir}")

    try:
        # Import here to avoid circular dependencies
        from enrichment import run_enrichment_process
        from vector_storage import initialize_vector_store

        # Initialize vector store
        vector_store = initialize_vector_store(collection_name=collection_name)

        # Run enrichment process
        run_enrichment_process(vector_store, data_dir=data_dir)
    except Exception as e:
        # logger.exception keeps the traceback in the log; the user only sees
        # the short message produced by ClickException.
        logger.exception(f"Error during enrichment process: {e}")
        raise click.ClickException(str(e)) from e

    logger.info("Enrichment process completed successfully!")
    click.echo("Documents have been successfully loaded into the vector store.")
|
|
|
|
|
|
@cli.command(
    name="retrieve",
    help="Retrieve documents from vector database based on a query",
)
@click.argument("query")
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--top-k",
    default=5,
    help="Number of documents to retrieve",
)
def retrieve(query, collection_name, top_k):
    """Search the vector store for ``query`` and print the matching documents.

    Args:
        query: Positional ``QUERY`` argument; forwarded to
            ``search_documents_with_metadata``.
        collection_name: Value of ``--collection-name``; forwarded to the
            search call.
        top_k: Value of ``--top-k`` (defaults to 5); forwarded to the search
            call.

    Raises:
        click.ClickException: If the import, the search, or rendering a
            result fails. Click reports it as ``Error: <message>`` on stderr
            and exits with a non-zero status.
    """
    logger.info(f"Starting retrieval process for query: {query}")

    try:
        # Import here to avoid circular dependencies
        from retrieval import search_documents_with_metadata

        # Perform retrieval
        results = search_documents_with_metadata(
            query=query,
            collection_name=collection_name,
            top_k=top_k,
        )

        if not results:
            click.echo("No relevant documents found for the query.")
            return

        click.echo(f"Found {len(results)} relevant documents:\n")

        # Rendering stays inside the try so a result that lacks an expected
        # key is reported through the same error path as a failed search.
        for i, result in enumerate(results, 1):
            content = result['content']
            # Only mark the preview as truncated when it really was cut.
            ellipsis = "..." if len(content) > 200 else ""

            click.echo(f"{i}. Source: {result['source']}")
            click.echo(f" Filename: {result['filename']}")
            click.echo(f" Page: {result['page_number']}")
            click.echo(f" File Extension: {result['file_extension']}")
            click.echo(f" Content Preview: {content[:200]}{ellipsis}")
            click.echo(f" Metadata: {result['metadata']}\n")

        logger.info("Retrieval process completed successfully!")

    except Exception as e:
        # logger.exception keeps the traceback in the log; the user only sees
        # the short message produced by ClickException.
        logger.exception(f"Error during retrieval process: {e}")
        raise click.ClickException(str(e)) from e
|
|
|
|
|
|
# Script entry point: hand control to the click command group when this file
# is executed directly rather than imported.
if __name__ == "__main__":
    cli()
|