langchain: new way of uploading files from predefined paths on Yandex Disk

This commit is contained in:
2026-02-26 00:01:47 +03:00
parent 2c7ab06b3f
commit 3e29ea70ed
7 changed files with 365 additions and 17 deletions

View File

@@ -8,7 +8,7 @@ from pathlib import Path
from typing import List, Optional, Tuple
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from loguru import logger
@@ -75,21 +75,26 @@ ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS = int(
)
# Lower-case, dot-prefixed file extensions that are treated as supported.
# Each extension is listed exactly once, in alphabetical order, so that
# additions and removals are easy to spot in review.
#
# Image formats (.jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp) are deliberately
# left out; they were only ever present as disabled (commented-out) entries.
#
# NOTE(review): listing an extension here does not by itself give it a
# loader -- confirm that the loader-selection code handles every entry,
# otherwise such files would be accepted and then yield no loader.
SUPPORTED_EXTENSIONS = {
    ".csv",
    ".doc",
    ".docx",
    ".epub",
    ".htm",
    ".html",
    ".json",
    ".jsonl",
    ".md",
    ".odt",
    ".pdf",
    ".ppt",
    ".pptx",
    ".rst",
    ".rtf",
    ".tsv",
    ".txt",  # plain text files do turn up in the real data
    ".xls",
    ".xlsx",
    ".xml",
}
Base = declarative_base()
@@ -261,6 +266,8 @@ class DocumentEnricher:
return UnstructuredODTLoader(
file_path, **{"strategy": "hi_res", "languages": ["rus"]}
)
if ext in [".txt", ".md"]:
return TextLoader(file_path, encoding="utf-8")
return None
def _load_one_adaptive_file(