import csv
import json
import os
from pathlib import Path

import click
from dotenv import load_dotenv
from loguru import logger

# Load environment variables from a .env file, if present.
load_dotenv()


def setup_logging() -> None:
    """Configure logging to output to both file and stdout.

    Loguru logs to stderr by default; this adds a file sink under ./logs
    with size-based rotation and time-based retention.
    """
    # Create logs directory if it doesn't exist
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)
    # Add file logging with rotation
    logger.add("logs/dev.log", rotation="10 MB", retention="10 days")


@click.group()
def cli():
    """Main CLI group."""
    setup_logging()


@cli.command(name="ping", help="Ping command that outputs pong")
def ping():
    """Ping command that outputs pong."""
    logger.info("Ping command executed")
    click.echo("pong")


@cli.command(
    name="enrich",
    help="Load documents from data directory and store in vector database",
)
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
def enrich(collection_name):
    """Load documents from data directory and store in vector database.

    Reads the ENRICHMENT_SOURCE environment variable for logging only;
    the actual source selection happens inside ``run_enrichment_process``.
    """
    logger.info(
        "Starting enrichment process. "
        f"Enrichment source: {os.getenv('ENRICHMENT_SOURCE')}"
    )
    try:
        # Import here to avoid circular dependencies
        from enrichment import run_enrichment_process
        from vector_storage import initialize_vector_store

        # Initialize vector store, then run the enrichment pipeline into it.
        vector_store = initialize_vector_store(collection_name=collection_name)
        run_enrichment_process(vector_store)

        logger.info("Enrichment process completed successfully!")
        click.echo("Documents have been successfully loaded into the vector store.")
    except Exception as e:
        # logger.exception captures the full traceback (logger.error(str(e)) loses it).
        logger.exception(f"Error during enrichment process: {e}")
        click.echo(f"Error: {e}")


@cli.command(
    name="retrieve",
    help="Retrieve documents from vector database based on a query",
)
@click.argument("query")
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--top-k",
    default=5,
    help="Number of documents to retrieve",
)
def retrieve(query, collection_name, top_k):
    """Retrieve documents from vector database based on a query.

    NOTE(review): retrieval is intentionally disabled; the command only
    logs the query and prints a warning. Options are kept for interface
    compatibility.
    """
    logger.info(f"Starting retrieval process for query: {query}")
    click.echo(
        "WARNING: Retrieval disabled, since it is no longer relevant for the "
        "testing of the retrieving feature. Use chat with agent instead. xoxo"
    )


@cli.command(
    name="chat",
    help="Start an interactive chat session with the RAG agent",
)
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--model",
    default=None,
    help="Name of the Ollama model to use for chat",
)
def chat(collection_name, model):
    """Start an interactive chat session with the RAG agent."""
    logger.info("Starting chat session with RAG agent")
    try:
        # Import here to avoid circular dependencies and only when needed
        from agent import run_chat_loop

        click.echo("Initializing chat agent...")
        click.echo("Type 'quit' or 'exit' to end the conversation.\n")

        # Run the interactive chat loop (blocks until the user quits).
        run_chat_loop(collection_name=collection_name, llm_model=model)

        logger.info("Chat session ended")
    except Exception as e:
        # logger.exception captures the full traceback for post-mortem debugging.
        logger.exception(f"Error during chat session: {e}")
        click.echo(f"Error: {e}")


@cli.command(
    name="export-supported-paths",
    help="Filter JSON paths by enrichment-supported extensions and export JSON/CSV",
)
@click.argument(
    "input_json", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
def export_supported_paths(input_json: Path):
    """Export supported document paths into yadisk_imported_paths.json and yadisk_imported_paths.csv.

    Reads a JSON array of path strings, keeps the first occurrence of each
    non-empty path whose extension is in ``enrichment.SUPPORTED_EXTENSIONS``
    (case-insensitive), and writes the result to both JSON and CSV files in
    the current working directory.
    """
    logger.info(f"Filtering supported paths from input file: {input_json}")
    try:
        from enrichment import SUPPORTED_EXTENSIONS

        with input_json.open("r", encoding="utf-8") as source_file:
            raw_data = json.load(source_file)

        if not isinstance(raw_data, list):
            raise ValueError("Input JSON must contain an array of file paths")

        filtered_paths = []
        seen_paths = set()  # dedupe while preserving first-seen order
        for item in raw_data:
            path_str = str(item).strip()
            if not path_str:
                continue
            if path_str in seen_paths:
                continue
            # Extension match is case-insensitive (".PDF" == ".pdf").
            extension = Path(path_str).suffix.lower()
            if extension in SUPPORTED_EXTENSIONS:
                filtered_paths.append(path_str)
                seen_paths.add(path_str)

        output_json = Path.cwd() / "yadisk_imported_paths.json"
        output_csv = Path.cwd() / "yadisk_imported_paths.csv"

        with output_json.open("w", encoding="utf-8") as output_json_file:
            json.dump(
                filtered_paths, output_json_file, ensure_ascii=False, indent=2
            )

        # newline="" is required so the csv module controls line endings itself.
        with output_csv.open("w", encoding="utf-8", newline="") as output_csv_file:
            writer = csv.writer(output_csv_file)
            writer.writerow(["path"])
            writer.writerows([path_item] for path_item in filtered_paths)

        click.echo(
            f"Export complete: {len(filtered_paths)} supported paths saved to "
            f"{output_json.name} and {output_csv.name}"
        )
        logger.info(
            f"Exported {len(filtered_paths)} supported paths to "
            f"{output_json} and {output_csv}"
        )
    except Exception as error:
        # Surface the failure to the user; full traceback goes to the log.
        logger.exception(f"Failed to export supported paths: {error}")
        click.echo(f"Error: {error}")


if __name__ == "__main__":
    cli()