evaluation for rag systems

This commit is contained in:
2026-03-11 22:30:02 +03:00
parent 5721bad117
commit 6c953a327f
11 changed files with 31897 additions and 1 deletions

View File

@@ -216,3 +216,6 @@ __marimo__/
.streamlit/secrets.toml
document_tracking.db
.env.test
yadisk_imported_paths.csv
yadisk_imported_paths.json

View File

@@ -125,3 +125,7 @@ During this Phase we create asynchronous process of enrichment, utilizing async/
- [x] Make a tabbed UI with top-level tabs. The first tab exists and is selected. Each tab should have its own copy of the demo UI, meaning the chat window with the ability to specify the API URL.
- [x] At the end of the tabs there should be a button with a plus sign, which adds a new tab. Tabs are to be named by numbers.
- [x] There should be 3 predefined tabs opened. The first one should have the predefined API URL "https://rag.langchain.overwatch.su/api/test-query", the second "https://rag.llamaindex.overwatch.su/api/test-query", and the third "https://rag.haystack.overwatch.su/api/test-query".
# Phase 17 (creating json with list of documents that are supported for import)
- [x] Make a CLI command that takes a JSON file with a list of paths and filters them to only those that are imported into the vector storage (this can be checked in enrichment). The filtered list should be saved in the current folder as both "yadisk_imported_paths.json" and "yadisk_imported_paths.csv"; the CSV file must be formatted as valid CSV.

View File

@@ -1,4 +1,6 @@
import os
import csv
import json
from pathlib import Path
import click
@@ -126,5 +128,60 @@ def chat(collection_name, model):
click.echo(f"Error: {str(e)}")
@cli.command(
    name="export-supported-paths",
    help="Filter JSON paths by enrichment-supported extensions and export JSON/CSV",
)
@click.argument("input_json", type=click.Path(exists=True, dir_okay=False, path_type=Path))
def export_supported_paths(input_json: Path):
    """Write the supported subset of a JSON path list to JSON and CSV files.

    The input file must hold a JSON array. Each entry is converted to a
    string and stripped of surrounding whitespace; empty entries and repeats
    of an already-kept path are skipped. A path is kept when its lower-cased
    file suffix is a member of ``enrichment.SUPPORTED_EXTENSIONS``. Kept paths
    retain their first-seen order.

    Two files are (over)written in the current working directory:
    ``yadisk_imported_paths.json`` (indented JSON array, non-ASCII preserved)
    and ``yadisk_imported_paths.csv`` (a ``path`` header row followed by one
    path per row).

    Args:
        input_json: Path to an existing JSON file containing an array of
            file paths.

    Any exception raised along the way (including a failed ``enrichment``
    import or a non-array payload) is logged and echoed to the console
    instead of being re-raised.
    """
    logger.info(f"Filtering supported paths from input file: {input_json}")

    try:
        # Imported inside the try block so that an import failure is reported
        # through the same handler as every other error.
        from enrichment import SUPPORTED_EXTENSIONS

        with input_json.open("r", encoding="utf-8") as handle:
            entries = json.load(handle)

        if not isinstance(entries, list):
            raise ValueError("Input JSON must contain an array of file paths")

        # A dict keeps insertion order, so it doubles as an ordered set:
        # membership test for de-duplication, key order for the output.
        kept = {}
        for entry in entries:
            candidate = str(entry).strip()
            if not candidate or candidate in kept:
                continue
            if Path(candidate).suffix.lower() in SUPPORTED_EXTENSIONS:
                kept[candidate] = None
        supported = list(kept)
        total = len(supported)

        target_dir = Path.cwd()
        json_target = target_dir / "yadisk_imported_paths.json"
        csv_target = target_dir / "yadisk_imported_paths.csv"

        with json_target.open("w", encoding="utf-8") as json_handle:
            json.dump(supported, json_handle, ensure_ascii=False, indent=2)

        # newline="" lets the csv module control line endings itself.
        with csv_target.open("w", encoding="utf-8", newline="") as csv_handle:
            rows = csv.writer(csv_handle)
            rows.writerow(["path"])
            rows.writerows([single_path] for single_path in supported)

        click.echo(
            f"Export complete: {total} supported paths saved to {json_target.name} and {csv_target.name}"
        )
        logger.info(
            f"Exported {total} supported paths to {json_target} and {csv_target}"
        )
    except Exception as error:
        logger.error(f"Failed to export supported paths: {error}")
        click.echo(f"Error: {error}")
# Script entry point: invoke `cli` (the object the commands above are
# registered on) only when this file is executed directly, not on import.
if __name__ == "__main__":
cli()