langchain: new way of uploading predefined paths from Yandex Disk

2026-02-26 00:01:47 +03:00
parent 2c7ab06b3f
commit 3e29ea70ed
7 changed files with 365 additions and 17 deletions
--- a/services/rag/langchain/enrichment.py
+++ b/services/rag/langchain/enrichment.py
@@ -8,7 +8,7 @@ from pathlib import Path
 from typing import List, Optional, Tuple

 from dotenv import load_dotenv
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import PyPDFLoader, TextLoader
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from loguru import logger
@@ -75,21 +75,26 @@ ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS = int(
 )

 SUPPORTED_EXTENSIONS = {
-    ".pdf",
-    ".docx",
+    ".csv",
    ".doc",
-    ".pptx",
-    ".xlsx",
-    ".xls",
-    # ".jpg",
-    # ".jpeg",
-    # ".png",
-    # ".gif",
-    # ".bmp",
-    # ".tiff",
-    # ".webp",
+    ".docx",
+    ".epub",
+    ".htm",
+    ".html",
+    ".json",
+    ".jsonl",
+    ".md",
    ".odt",
-    ".txt",  # this one is obvious but was unexpected to see in data lol
+    ".pdf",
+    ".ppt",
+    ".pptx",
+    ".rtf",
+    ".rst",
+    ".tsv",
+    ".txt",
+    ".xls",
+    ".xlsx",
+    ".xml",
 }

 Base = declarative_base()
@@ -261,6 +266,8 @@ class DocumentEnricher:
            return UnstructuredODTLoader(
                file_path, **{"strategy": "hi_res", "languages": ["rus"]}
            )
+        if ext in [".txt", ".md"]:
+            return TextLoader(file_path, encoding="utf-8")
        return None

    def _load_one_adaptive_file(