2026-02-03 19:51:35 +03:00
|
|
|
import os
|
2026-03-11 22:30:02 +03:00
|
|
|
import csv
|
|
|
|
|
import json
|
2026-02-03 19:51:35 +03:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
2026-02-03 22:55:12 +03:00
|
|
|
import click
|
2026-02-09 21:17:42 +03:00
|
|
|
from dotenv import load_dotenv
|
2026-02-03 22:55:12 +03:00
|
|
|
from loguru import logger
|
|
|
|
|
|
2026-02-05 00:08:59 +03:00
|
|
|
# Load environment variables from a local .env file into os.environ
# (e.g. ENRICHMENT_SOURCE, read later by the `enrich` command).
load_dotenv()
|
|
|
|
|
|
2026-02-03 19:51:35 +03:00
|
|
|
|
|
|
|
|
# Configure logging to output to both file and stdout as specified in requirements
def setup_logging() -> None:
    """Add a rotating file sink to the loguru logger.

    loguru keeps its default console sink, so messages still appear on the
    terminal in addition to the file added here.
    """
    # Create logs directory if it doesn't exist
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)

    # Reuse logs_dir instead of re-hardcoding the "logs" prefix, so the
    # directory created above and the sink path cannot drift apart.
    logger.add(logs_dir / "dev.log", rotation="10 MB", retention="10 days")
|
2026-02-03 20:52:08 +03:00
|
|
|
|
2026-02-03 19:51:35 +03:00
|
|
|
|
|
|
|
|
@click.group()
def cli():
    """Root command group for the CLI.

    Runs before every subcommand; its only job is to initialize logging so
    all subcommands write to the configured sinks.
    """
    # The original body ended with a redundant `pass` after this call;
    # removed, since the function already has a statement and a docstring.
    setup_logging()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli.command(name="ping", help="Ping command that outputs pong")
def ping():
    """Liveness check: record the invocation in the log and reply with pong."""
    reply = "pong"
    logger.info("Ping command executed")
    click.echo(reply)
|
|
|
|
|
|
|
|
|
|
|
2026-02-03 22:55:12 +03:00
|
|
|
@cli.command(
    name="enrich",
    help="Load documents from data directory and store in vector database",
)
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
def enrich(collection_name):
    """Load documents from the data directory and store them in the vector database.

    Args:
        collection_name: Name of the vector store collection to populate.

    Raises:
        click.ClickException: If any step of the enrichment process fails,
            so the CLI exits with a non-zero status code.
    """
    logger.info(
        f"Starting enrichment process. Enrichment source: {os.getenv('ENRICHMENT_SOURCE')}"
    )

    try:
        # Import here to avoid circular dependencies
        from enrichment import run_enrichment_process
        from vector_storage import initialize_vector_store

        # Initialize vector store
        vector_store = initialize_vector_store(collection_name=collection_name)

        # Run enrichment process
        run_enrichment_process(vector_store)

        logger.info("Enrichment process completed successfully!")
        click.echo("Documents have been successfully loaded into the vector store.")

    except Exception as e:
        logger.error(f"Error during enrichment process: {str(e)}")
        # Fix: previously the error was only echoed and the command exited
        # with status 0, so callers could not detect failure. ClickException
        # prints "Error: <message>" (same text as before) and exits non-zero.
        raise click.ClickException(str(e)) from e
|
|
|
|
|
|
|
|
|
|
|
2026-02-03 23:25:24 +03:00
|
|
|
@cli.command(
    name="retrieve",
    help="Retrieve documents from vector database based on a query",
)
@click.argument("query")
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--top-k",
    default=5,
    help="Number of documents to retrieve",
)
def retrieve(query, collection_name, top_k):
    """Log the requested query and print a notice that retrieval is disabled.

    The collection_name and top_k options are accepted for interface
    compatibility but are currently unused.
    """
    logger.info(f"Starting retrieval process for query: {query}")

    disabled_notice = "WARNING: Retrieval disabled, since it is no longer relevant for the testing of the retrieving feature. Use chat with agent instead. xoxo"
    click.echo(disabled_notice)
|
2026-02-03 23:25:24 +03:00
|
|
|
|
|
|
|
|
|
2026-02-04 00:02:53 +03:00
|
|
|
@cli.command(
    name="chat",
    help="Start an interactive chat session with the RAG agent",
)
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--model",
    default=None,
    help="Name of the Ollama model to use for chat",
)
def chat(collection_name, model):
    """Start an interactive chat session with the RAG agent.

    Args:
        collection_name: Name of the vector store collection to query.
        model: Ollama model name to use; None lets the agent pick its default.

    Raises:
        click.ClickException: If the chat loop fails, so the CLI exits with a
            non-zero status code.
    """
    logger.info("Starting chat session with RAG agent")

    try:
        # Import here to avoid circular dependencies and only when needed
        from agent import run_chat_loop

        click.echo("Initializing chat agent...")
        click.echo("Type 'quit' or 'exit' to end the conversation.\n")

        # Run the interactive chat loop (blocks until the user quits)
        run_chat_loop(collection_name=collection_name, llm_model=model)

        logger.info("Chat session ended")

    except Exception as e:
        logger.error(f"Error during chat session: {str(e)}")
        # Fix: previously the error was only echoed and the command exited
        # with status 0, so callers could not detect failure. ClickException
        # prints "Error: <message>" (same text as before) and exits non-zero.
        raise click.ClickException(str(e)) from e
|
|
|
|
|
|
|
|
|
|
|
2026-03-11 22:30:02 +03:00
|
|
|
@cli.command(
    name="export-supported-paths",
    help="Filter JSON paths by enrichment-supported extensions and export JSON/CSV",
)
@click.argument("input_json", type=click.Path(exists=True, dir_okay=False, path_type=Path))
def export_supported_paths(input_json: Path):
    """Export supported document paths into yadisk_imported_paths.json and yadisk_imported_paths.csv.

    Reads a JSON array of path strings from INPUT_JSON, keeps only unique
    entries whose extension is in ``enrichment.SUPPORTED_EXTENSIONS``
    (preserving input order), and writes the result to both a JSON array and
    a one-column CSV in the current working directory.

    Raises:
        click.ClickException: If the input is unreadable, not a JSON array,
            or an output file cannot be written — so the CLI exits non-zero.
    """
    logger.info(f"Filtering supported paths from input file: {input_json}")

    try:
        # Imported lazily so the command definition doesn't pull in the
        # enrichment module (and its dependencies) at CLI startup.
        from enrichment import SUPPORTED_EXTENSIONS

        with input_json.open("r", encoding="utf-8") as source_file:
            raw_data = json.load(source_file)

        if not isinstance(raw_data, list):
            raise ValueError("Input JSON must contain an array of file paths")

        filtered_paths = []
        seen_paths = set()
        for item in raw_data:
            path_str = str(item).strip()
            if not path_str:
                continue
            # Only supported paths enter seen_paths, so unsupported
            # duplicates are re-filtered below — output is unaffected.
            if path_str in seen_paths:
                continue

            # Extension comparison is case-insensitive (suffix lowered).
            extension = Path(path_str).suffix.lower()
            if extension in SUPPORTED_EXTENSIONS:
                filtered_paths.append(path_str)
                seen_paths.add(path_str)

        output_json = Path.cwd() / "yadisk_imported_paths.json"
        output_csv = Path.cwd() / "yadisk_imported_paths.csv"

        with output_json.open("w", encoding="utf-8") as output_json_file:
            json.dump(filtered_paths, output_json_file, ensure_ascii=False, indent=2)

        # newline="" per the csv module docs, so the writer controls line endings.
        with output_csv.open("w", encoding="utf-8", newline="") as output_csv_file:
            writer = csv.writer(output_csv_file)
            writer.writerow(["path"])
            for path_item in filtered_paths:
                writer.writerow([path_item])

        click.echo(
            f"Export complete: {len(filtered_paths)} supported paths saved to {output_json.name} and {output_csv.name}"
        )
        logger.info(
            f"Exported {len(filtered_paths)} supported paths to {output_json} and {output_csv}"
        )
    except Exception as error:
        logger.error(f"Failed to export supported paths: {error}")
        # Fix: previously the error was only echoed and the command exited
        # with status 0, so callers could not detect failure. ClickException
        # prints "Error: <message>" (same text as before) and exits non-zero.
        raise click.ClickException(str(error)) from error
|
|
|
|
|
|
|
|
|
|
|
2026-02-03 19:51:35 +03:00
|
|
|
# Script entry point: dispatch to the click command group.
if __name__ == "__main__":
    cli()
|