langchain: new way of uploading predefined paths from Yandex Disk
This commit is contained in:
@@ -8,7 +8,7 @@ from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from langchain_community.document_loaders import PyPDFLoader
|
||||
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
||||
from langchain_core.documents import Document
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from loguru import logger
|
||||
@@ -75,21 +75,26 @@ ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS = int(
|
||||
)
|
||||
|
||||
SUPPORTED_EXTENSIONS = {
|
||||
".pdf",
|
||||
".docx",
|
||||
".csv",
|
||||
".doc",
|
||||
".pptx",
|
||||
".xlsx",
|
||||
".xls",
|
||||
# ".jpg",
|
||||
# ".jpeg",
|
||||
# ".png",
|
||||
# ".gif",
|
||||
# ".bmp",
|
||||
# ".tiff",
|
||||
# ".webp",
|
||||
".docx",
|
||||
".epub",
|
||||
".htm",
|
||||
".html",
|
||||
".json",
|
||||
".jsonl",
|
||||
".md",
|
||||
".odt",
|
||||
".txt", # this one is obvious but was unexpected to see in data lol
|
||||
".pdf",
|
||||
".ppt",
|
||||
".pptx",
|
||||
".rtf",
|
||||
".rst",
|
||||
".tsv",
|
||||
".txt",
|
||||
".xls",
|
||||
".xlsx",
|
||||
".xml",
|
||||
}
|
||||
|
||||
Base = declarative_base()
|
||||
@@ -261,6 +266,8 @@ class DocumentEnricher:
|
||||
return UnstructuredODTLoader(
|
||||
file_path, **{"strategy": "hi_res", "languages": ["rus"]}
|
||||
)
|
||||
if ext in [".txt", ".md"]:
|
||||
return TextLoader(file_path, encoding="utf-8")
|
||||
return None
|
||||
|
||||
def _load_one_adaptive_file(
|
||||
|
||||
Reference in New Issue
Block a user