evaluation for rag systems

2026-03-11 22:30:02 +03:00
parent 5721bad117
commit 6c953a327f
11 changed files with 31897 additions and 1 deletions
--- a/services/rag/langchain/.gitignore
+++ b/services/rag/langchain/.gitignore
@@ -216,3 +216,6 @@ __marimo__/
 .streamlit/secrets.toml
 document_tracking.db
 .env.test
+
+yadisk_imported_paths.csv
+yadisk_imported_paths.json
--- a/services/rag/langchain/PLANNING.md
+++ b/services/rag/langchain/PLANNING.md
@@ -125,3 +125,7 @@ During this Phase we create asynchronous process of enrichment, utilizing async/
 - [x] Make tabbed UI with top level tabs. First tab exists and is selected. Each tab should have copy of demo ui, meaning the chat window with ability to specify the api url
 - [x] At the end of the tabs there should be button with plus sign, which will add new tab. Tabs to be called by numbers.
 - [x] There should be 3 predefined tabs opened. The first one should have the predefined api url "https://rag.langchain.overwatch.su/api/test-query", the second "https://rag.llamaindex.overwatch.su/api/test-query", the third "https://rag.haystack.overwatch.su/api/test-query"
+
+# Phase 17 (creating json with list of documents that are supported for import)
+
+- [x] Make a CLI command that takes a JSON file with a list of paths and filters them down to only those that are imported into the vector storage (the supported file types can be checked in enrichment). The filtered list should then be saved in the current folder both as "yadisk_imported_paths.json" and as "yadisk_imported_paths.csv". The CSV file should, of course, be formatted as proper CSV.
--- a/services/rag/langchain/cli.py
+++ b/services/rag/langchain/cli.py
@@ -1,4 +1,6 @@
 import os
+import csv
+import json
 from pathlib import Path

 import click
@@ -126,5 +128,60 @@ def chat(collection_name, model):
        click.echo(f"Error: {str(e)}")


+@cli.command(
+    name="export-supported-paths",
+    help="Filter JSON paths by enrichment-supported extensions and export JSON/CSV",
+)
+@click.argument("input_json", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+def export_supported_paths(input_json: Path):
+    """Export supported document paths into yadisk_imported_paths.json and yadisk_imported_paths.csv."""
+    logger.info(f"Filtering supported paths from input file: {input_json}")
+
+    try:
+        from enrichment import SUPPORTED_EXTENSIONS
+
+        with input_json.open("r", encoding="utf-8") as source_file:
+            raw_data = json.load(source_file)
+
+        if not isinstance(raw_data, list):
+            raise ValueError("Input JSON must contain an array of file paths")
+
+        filtered_paths = []
+        seen_paths = set()
+        for item in raw_data:
+            path_str = str(item).strip()
+            if not path_str:
+                continue
+            if path_str in seen_paths:
+                continue
+
+            extension = Path(path_str).suffix.lower()
+            if extension in SUPPORTED_EXTENSIONS:
+                filtered_paths.append(path_str)
+                seen_paths.add(path_str)
+
+        output_json = Path.cwd() / "yadisk_imported_paths.json"
+        output_csv = Path.cwd() / "yadisk_imported_paths.csv"
+
+        with output_json.open("w", encoding="utf-8") as output_json_file:
+            json.dump(filtered_paths, output_json_file, ensure_ascii=False, indent=2)
+
+        with output_csv.open("w", encoding="utf-8", newline="") as output_csv_file:
+            writer = csv.writer(output_csv_file)
+            writer.writerow(["path"])
+            for path_item in filtered_paths:
+                writer.writerow([path_item])
+
+        click.echo(
+            f"Export complete: {len(filtered_paths)} supported paths saved to {output_json.name} and {output_csv.name}"
+        )
+        logger.info(
+            f"Exported {len(filtered_paths)} supported paths to {output_json} and {output_csv}"
+        )
+    except Exception as error:
+        logger.error(f"Failed to export supported paths: {error}")
+        click.echo(f"Error: {error}")
+
+
 if __name__ == "__main__":
    cli()