# rag-solution/services/rag/langchain/cli.py

import os
from pathlib import Path

import click
from loguru import logger


# Configure file logging; console output is left to the handlers loguru
# already has registered (by default it pre-installs a stderr handler).
def setup_logging():
    """Attach a rotating ``logs/dev.log`` file sink to the loguru logger.

    The ``logs`` directory is created relative to the current working
    directory if it does not exist yet. The sink rotates at 10 MB and
    rotated files are retained for 10 days.

    The function is idempotent within a process: each ``logger.add`` call
    registers an additional sink, so calling it more than once would
    otherwise write every log line to the file multiple times.
    """
    # The handler id returned by ``logger.add`` is remembered on the function
    # object itself, so the guard needs no module-level state.
    if getattr(setup_logging, "_file_sink_id", None) is not None:
        return

    # Create logs directory if it doesn't exist
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)

    # Add file logging with rotation; the path is derived from ``logs_dir``
    # so the directory name is defined in exactly one place.
    setup_logging._file_sink_id = logger.add(
        logs_dir / "dev.log", rotation="10 MB", retention="10 days"
    )


@click.group()
def cli():
    """Main CLI group"""
    # NOTE: the docstring above doubles as the group's --help text, because
    # no explicit ``help=`` is passed to ``click.group``.
    # Logging is configured here, in the group callback, so that it is set up
    # before whichever subcommand was requested starts running.
    setup_logging()


@cli.command(
    name="ping",
    help="Ping command that outputs pong",
)
def ping():
    """Log that the command ran, then echo ``pong``."""
    logger.info("Ping command executed")
    click.echo("pong")


@cli.command(
    name="enrich",
    help="Load documents from data directory and store in vector database",
)
@click.option("--data-dir", default="../../../data", help="Path to the data directory")
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
def enrich(data_dir, collection_name):
    """Load documents from ``data_dir`` into a vector store collection.

    Args:
        data_dir: Path passed through to ``run_enrichment_process``. The
            default is a relative path; presumably it is resolved against the
            current working directory -- confirm in ``enrichment``.
        collection_name: Name of the collection passed to
            ``initialize_vector_store``.

    Raises:
        SystemExit: With exit status 1 when initialization or enrichment
            fails, so shells and calling scripts can detect the failure.
    """
    logger.info(f"Starting enrichment process for directory: {data_dir}")

    try:
        # Import here to avoid circular dependencies
        from enrichment import run_enrichment_process
        from vector_storage import initialize_vector_store

        # Initialize vector store
        vector_store = initialize_vector_store(collection_name=collection_name)

        # Run enrichment process
        run_enrichment_process(vector_store, data_dir=data_dir)
    except Exception as e:
        # Top-level CLI boundary: report the problem, keep the traceback in
        # the log, and exit non-zero instead of pretending success.
        logger.exception(f"Error during enrichment process: {e}")
        click.echo(f"Error: {e}")
        raise SystemExit(1) from e
    else:
        logger.info("Enrichment process completed successfully!")
        click.echo("Documents have been successfully loaded into the vector store.")


@cli.command(
    name="retrieve",
    help="Retrieve documents from vector database based on a query",
)
@click.argument("query")
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--top-k",
    default=5,
    # Reject zero/negative counts at the CLI boundary with a usage error
    # rather than forwarding a meaningless value to the retrieval layer.
    type=click.IntRange(min=1),
    help="Number of documents to retrieve",
)
def retrieve(query, collection_name, top_k):
    """Search a vector store collection and print the matching documents.

    Args:
        query: Free-text query passed to ``search_documents_with_metadata``.
        collection_name: Name of the collection to search.
        top_k: Maximum number of documents requested (must be >= 1).

    Each result is expected to be a mapping with the keys ``source``,
    ``filename``, ``page_number``, ``file_extension``, ``content`` and
    ``metadata``; a missing key is reported like any other failure.

    Raises:
        SystemExit: With exit status 1 when the search or the rendering of
            results fails, so shells and calling scripts can detect it.
    """
    logger.info(f"Starting retrieval process for query: {query}")

    try:
        # Import here to avoid circular dependencies
        from retrieval import search_documents_with_metadata

        # Perform retrieval
        results = search_documents_with_metadata(
            query=query,
            collection_name=collection_name,
            top_k=top_k,
        )

        if not results:
            click.echo("No relevant documents found for the query.")
            return

        click.echo(f"Found {len(results)} relevant documents:\n")

        for i, result in enumerate(results, 1):
            click.echo(f"{i}. Source: {result['source']}")
            click.echo(f"   Filename: {result['filename']}")
            click.echo(f"   Page: {result['page_number']}")
            click.echo(f"   File Extension: {result['file_extension']}")
            # Only the first 200 characters of the content are shown.
            click.echo(f"   Content Preview: {result['content'][:200]}...")
            click.echo(f"   Metadata: {result['metadata']}\n")

        logger.info("Retrieval process completed successfully!")

    except Exception as e:
        # Top-level CLI boundary: report the problem, keep the traceback in
        # the log, and exit non-zero instead of pretending success.
        logger.exception(f"Error during retrieval process: {e}")
        click.echo(f"Error: {e}")
        raise SystemExit(1) from e


# Run the click group only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()