2026-02-03 19:51:35 +03:00
|
|
|
import os
|
2026-03-11 22:30:02 +03:00
|
|
|
import csv
|
|
|
|
|
import json
|
2026-02-03 19:51:35 +03:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
2026-02-03 22:55:12 +03:00
|
|
|
import click
|
2026-02-09 21:17:42 +03:00
|
|
|
from dotenv import load_dotenv
|
2026-02-03 22:55:12 +03:00
|
|
|
from loguru import logger
|
|
|
|
|
|
2026-02-05 00:08:59 +03:00
|
|
|
# Load environment variables from a local .env file into os.environ
# (e.g. ENRICHMENT_SOURCE, read later by the `enrich` command).
load_dotenv()
|
|
|
|
|
|
2026-02-03 19:51:35 +03:00
|
|
|
|
|
|
|
|
# Configure logging to output to both file and stdout as specified in requirements
def setup_logging() -> None:
    """Add a rotating file sink to the loguru logger.

    loguru keeps its default console sink, so messages still appear on the
    terminal in addition to the file added here.
    """
    # Create logs directory if it doesn't exist
    logs_dir = Path("logs")
    logs_dir.mkdir(exist_ok=True)

    # Reuse logs_dir instead of re-hardcoding the "logs" prefix, so the
    # directory created above and the sink path cannot drift apart.
    logger.add(logs_dir / "dev.log", rotation="10 MB", retention="10 days")
|
2026-02-03 20:52:08 +03:00
|
|
|
|
2026-02-03 19:51:35 +03:00
|
|
|
|
|
|
|
|
@click.group()
def cli():
    """Root command group for the CLI.

    Runs before every subcommand; its only job is to initialize logging so
    all subcommands write to the configured sinks.
    """
    # The original body ended with a redundant `pass` after this call;
    # removed, since the function already has a statement and a docstring.
    setup_logging()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@cli.command(name="ping", help="Ping command that outputs pong")
def ping():
    """Liveness check: record the invocation in the log and reply with pong."""
    reply = "pong"
    logger.info("Ping command executed")
    click.echo(reply)
|
|
|
|
|
|
|
|
|
|
|
2026-02-03 22:55:12 +03:00
|
|
|
@cli.command(
    name="enrich",
    help="Load documents from data directory and store in vector database",
)
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
def enrich(collection_name):
    """Load documents from the data directory and store them in the vector database.

    Args:
        collection_name: Name of the vector store collection to populate.

    Raises:
        click.ClickException: If any step of the enrichment process fails,
            so the CLI exits with a non-zero status code.
    """
    logger.info(
        f"Starting enrichment process. Enrichment source: {os.getenv('ENRICHMENT_SOURCE')}"
    )

    try:
        # Import here to avoid circular dependencies
        from enrichment import run_enrichment_process
        from vector_storage import initialize_vector_store

        # Initialize vector store
        vector_store = initialize_vector_store(collection_name=collection_name)

        # Run enrichment process
        run_enrichment_process(vector_store)

        logger.info("Enrichment process completed successfully!")
        click.echo("Documents have been successfully loaded into the vector store.")

    except Exception as e:
        logger.error(f"Error during enrichment process: {str(e)}")
        # Fix: previously the error was only echoed and the command exited
        # with status 0, so callers could not detect failure. ClickException
        # prints "Error: <message>" (same text as before) and exits non-zero.
        raise click.ClickException(str(e)) from e
|
|
|
|
|
|
|
|
|
|
|
2026-02-03 23:25:24 +03:00
|
|
|
@cli.command(
    name="retrieve",
    help="Retrieve documents from vector database based on a query",
)
@click.argument("query")
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--top-k",
    default=5,
    help="Number of documents to retrieve",
)
def retrieve(query, collection_name, top_k):
    """Log the requested query and print a notice that retrieval is disabled.

    The collection_name and top_k options are accepted for interface
    compatibility but are currently unused.
    """
    logger.info(f"Starting retrieval process for query: {query}")

    disabled_notice = "WARNING: Retrieval disabled, since it is no longer relevant for the testing of the retrieving feature. Use chat with agent instead. xoxo"
    click.echo(disabled_notice)
|
2026-02-03 23:25:24 +03:00
|
|
|
|
|
|
|
|
|
2026-02-04 00:02:53 +03:00
|
|
|
@cli.command(
    name="chat",
    help="Start an interactive chat session with the RAG agent",
)
@click.option(
    "--collection-name",
    default="documents_langchain",
    help="Name of the vector store collection",
)
@click.option(
    "--model",
    default=None,
    help="Name of the Ollama model to use for chat",
)
def chat(collection_name, model):
    """Start an interactive chat session with the RAG agent.

    Args:
        collection_name: Name of the vector store collection to query.
        model: Ollama model name to use; None lets the agent pick its default.

    Raises:
        click.ClickException: If the chat loop fails, so the CLI exits with a
            non-zero status code.
    """
    logger.info("Starting chat session with RAG agent")

    try:
        # Import here to avoid circular dependencies and only when needed
        from agent import run_chat_loop

        click.echo("Initializing chat agent...")
        click.echo("Type 'quit' or 'exit' to end the conversation.\n")

        # Run the interactive chat loop (blocks until the user quits)
        run_chat_loop(collection_name=collection_name, llm_model=model)

        logger.info("Chat session ended")

    except Exception as e:
        logger.error(f"Error during chat session: {str(e)}")
        # Fix: previously the error was only echoed and the command exited
        # with status 0, so callers could not detect failure. ClickException
        # prints "Error: <message>" (same text as before) and exits non-zero.
        raise click.ClickException(str(e)) from e
|
|
|
|
|
|
|
|
|
|
|
2026-03-11 22:30:02 +03:00
|
|
|
@cli.command(
    name="export-supported-paths",
    help="Filter JSON paths by enrichment-supported extensions and export JSON/CSV",
)
@click.argument("input_json", type=click.Path(exists=True, dir_okay=False, path_type=Path))
def export_supported_paths(input_json: Path):
    """Export supported document paths into yadisk_imported_paths.json and yadisk_imported_paths.csv.

    Reads a JSON array of path strings from INPUT_JSON, keeps only unique
    entries whose extension is in ``enrichment.SUPPORTED_EXTENSIONS``
    (preserving input order), and writes the result to both a JSON array and
    a one-column CSV in the current working directory.

    Raises:
        click.ClickException: If the input is unreadable, not a JSON array,
            or an output file cannot be written — so the CLI exits non-zero.
    """
    logger.info(f"Filtering supported paths from input file: {input_json}")

    try:
        # Imported lazily so the command definition doesn't pull in the
        # enrichment module (and its dependencies) at CLI startup.
        from enrichment import SUPPORTED_EXTENSIONS

        with input_json.open("r", encoding="utf-8") as source_file:
            raw_data = json.load(source_file)

        if not isinstance(raw_data, list):
            raise ValueError("Input JSON must contain an array of file paths")

        filtered_paths = []
        seen_paths = set()
        for item in raw_data:
            path_str = str(item).strip()
            if not path_str:
                continue
            # Only supported paths enter seen_paths, so unsupported
            # duplicates are re-filtered below — output is unaffected.
            if path_str in seen_paths:
                continue

            # Extension comparison is case-insensitive (suffix lowered).
            extension = Path(path_str).suffix.lower()
            if extension in SUPPORTED_EXTENSIONS:
                filtered_paths.append(path_str)
                seen_paths.add(path_str)

        output_json = Path.cwd() / "yadisk_imported_paths.json"
        output_csv = Path.cwd() / "yadisk_imported_paths.csv"

        with output_json.open("w", encoding="utf-8") as output_json_file:
            json.dump(filtered_paths, output_json_file, ensure_ascii=False, indent=2)

        # newline="" per the csv module docs, so the writer controls line endings.
        with output_csv.open("w", encoding="utf-8", newline="") as output_csv_file:
            writer = csv.writer(output_csv_file)
            writer.writerow(["path"])
            for path_item in filtered_paths:
                writer.writerow([path_item])

        click.echo(
            f"Export complete: {len(filtered_paths)} supported paths saved to {output_json.name} and {output_csv.name}"
        )
        logger.info(
            f"Exported {len(filtered_paths)} supported paths to {output_json} and {output_csv}"
        )
    except Exception as error:
        logger.error(f"Failed to export supported paths: {error}")
        # Fix: previously the error was only echoed and the command exited
        # with status 0, so callers could not detect failure. ClickException
        # prints "Error: <message>" (same text as before) and exits non-zero.
        raise click.ClickException(str(error)) from error
|
|
|
|
|
|
|
|
|
|
|
2026-02-03 19:51:35 +03:00
|
|
|
# Script entry point: dispatch to the click command group.
if __name__ == "__main__":
    cli()
|