Compare commits
5 Commits
06a3155b6b
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 93d538ecc6 | |||
| f5659675ec | |||
| 7b52887558 | |||
| 1e6ab247b9 | |||
| e9dd28ad55 |
BIN
services/rag/langchain/.DS_Store
vendored
BIN
services/rag/langchain/.DS_Store
vendored
Binary file not shown.
@@ -7,3 +7,10 @@ QDRANT_HOST=HOST
|
|||||||
QDRANT_REST_PORT=PORT
|
QDRANT_REST_PORT=PORT
|
||||||
QDRANT_GRPC_PORT=PORT
|
QDRANT_GRPC_PORT=PORT
|
||||||
YADISK_TOKEN=TOKEN
|
YADISK_TOKEN=TOKEN
|
||||||
|
ENRICHMENT_SOURCE=local/yadisk
|
||||||
|
ENRICHMENT_LOCAL_PATH=path
|
||||||
|
ENRICHMENT_YADISK_PATH=path
|
||||||
|
ENRICHMENT_PROCESSING_MODE=async/sync
|
||||||
|
ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT=5
|
||||||
|
ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS=4
|
||||||
|
ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS=4
|
||||||
|
|||||||
@@ -73,3 +73,31 @@ Chosen data folder: relatve ./../../../data - from the current folder
|
|||||||
- [x] Write tests for local filesystem implementation, using test/samples folder filled with files and directories for testing of iteration and recursivess
|
- [x] Write tests for local filesystem implementation, using test/samples folder filled with files and directories for testing of iteration and recursivess
|
||||||
- [x] Create Yandex Disk implementation of the Adaptive Collection. Constructor should have requirement for TOKEN for Yandex Disk.
|
- [x] Create Yandex Disk implementation of the Adaptive Collection. Constructor should have requirement for TOKEN for Yandex Disk.
|
||||||
- [x] Write tests for Yandex Disk implementation, using folder "Общая/Информация". .env.test has YADISK_TOKEN variable for connecting. While testing log output of found files during iterating. If test fails at this step, leave to manual fixing, and this step can be marked as done.
|
- [x] Write tests for Yandex Disk implementation, using folder "Общая/Информация". .env.test has YADISK_TOKEN variable for connecting. While testing log output of found files during iterating. If test fails at this step, leave to manual fixing, and this step can be marked as done.
|
||||||
|
|
||||||
|
# Phase 12 (using local file system or yandex disk)
|
||||||
|
|
||||||
|
During enrichment, we should use adaptive collection from the helpers, for loading documents. We should not use directly local filesystem, but use adaptive collection as a wrapper.
|
||||||
|
|
||||||
|
- [x] Adaptive file in helper now has filename in it, so tests should be adjusted for this
|
||||||
|
- [x] Add conditional usage of adaptive collection in the enrichment stage. .env has now variable ENRICHMENT_SOURCE with 2 possible values: yadisk, local
|
||||||
|
- [x] With local source, use env variable for local filesystem adaptive collection: ENRICHMENT_LOCAL_PATH
|
||||||
|
- [x] With yadisk source, use env variable for YADISK_TOKEN for token for auth within Yandex Disk, ENRICHMENT_YADISK_PATH for path on the Yandex Disk system
|
||||||
|
- [x] We still will need filetypes that we will need to skip, so while iterating over files we need to check their extension and skip them.
|
||||||
|
- [x] Adaptive files have a filename in them, so it should be used when extracting metadata
|
||||||
|
|
||||||
|
|
||||||
|
# Phase 13 (async processing of files)
|
||||||
|
|
||||||
|
During this Phase we create asynchronous process of enrichment, utilizing async/await
|
||||||
|
|
||||||
|
- [x] Prepare enrichment to be an async process, so adjust the needed libraries, etc. that are required for processing.
|
||||||
|
- [x] Create a queue for adaptive files. It will store the adaptive files that need to be processed
|
||||||
|
- [x] Create queue for documents that were taken from the adaptive files.
|
||||||
|
- [x] Create function that iterates through the adaptive collection and adds it to the adaptive files queue ADAPTIVE_FILES_QUEUE. Let's call it insert_adaptive_files_queue
|
||||||
|
- [x] Create function that takes an adaptive file from the adaptive files queue (ADAPTIVE_FILES_QUEUE) and processes it by splitting it into chunks of documents, which are put into the processed documents queue (PROCESSED_DOCUMENTS_QUEUE). Let's call it process_adaptive_files_queue
|
||||||
|
- [x] Create function that takes chunks of documents from the processed documents queue and sends them into the vector storage. It marks the document these chunks belong to as processed in the local database (existing feature, adapted here). Let's call it upload_processed_documents_from_queue
|
||||||
|
- [x] Utilize Python threading machinery, to create threads for several our functions. There will be environment variables: ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT (default 5), ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS (default 4), ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS (default 4)
|
||||||
|
- [x] Function insert_adaptive_files_queue would not be in a thread. It will iterate through the adaptive collection and, before adding the next file, wait until the queue has fewer than ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT items.
|
||||||
|
- [x] Function process_adaptive_files_queue should be started in number of threads (defined in .env ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS)
|
||||||
|
- [x] Function upload_processed_documents_from_queue should be started in number of threads (defined in .env ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS)
|
||||||
|
- [x] Program should control threads. Function insert_adaptive_files_queue, after the adaptive collection ends, should wait until all threads finish. What does finish mean? When insert_adaptive_files_queue realizes that there are no adaptive files left in the collection, it sets a variable shared between threads, marking the collection as finished. When the other functions running in threads see that this variable became true, they deplete their queue and, instead of going to the next loop iteration to wait for new items, just finish. This eventually finishes the program: each thread finishes, and then the main program ends as usual after processing everything.
|
||||||
|
|||||||
@@ -37,15 +37,16 @@ def ping():
|
|||||||
name="enrich",
|
name="enrich",
|
||||||
help="Load documents from data directory and store in vector database",
|
help="Load documents from data directory and store in vector database",
|
||||||
)
|
)
|
||||||
@click.option("--data-dir", default="../../../data", help="Path to the data directory")
|
|
||||||
@click.option(
|
@click.option(
|
||||||
"--collection-name",
|
"--collection-name",
|
||||||
default="documents_langchain",
|
default="documents_langchain",
|
||||||
help="Name of the vector store collection",
|
help="Name of the vector store collection",
|
||||||
)
|
)
|
||||||
def enrich(data_dir, collection_name):
|
def enrich(collection_name):
|
||||||
"""Load documents from data directory and store in vector database"""
|
"""Load documents from data directory and store in vector database"""
|
||||||
logger.info(f"Starting enrichment process for directory: {data_dir}")
|
logger.info(
|
||||||
|
f"Starting enrichment process. Enrichment source: {os.getenv('ENRICHMENT_SOURCE')}"
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Import here to avoid circular dependencies
|
# Import here to avoid circular dependencies
|
||||||
@@ -56,7 +57,7 @@ def enrich(data_dir, collection_name):
|
|||||||
vector_store = initialize_vector_store(collection_name=collection_name)
|
vector_store = initialize_vector_store(collection_name=collection_name)
|
||||||
|
|
||||||
# Run enrichment process
|
# Run enrichment process
|
||||||
run_enrichment_process(vector_store, data_dir=data_dir)
|
run_enrichment_process(vector_store)
|
||||||
|
|
||||||
logger.info("Enrichment process completed successfully!")
|
logger.info("Enrichment process completed successfully!")
|
||||||
click.echo("Documents have been successfully loaded into the vector store.")
|
click.echo("Documents have been successfully loaded into the vector store.")
|
||||||
|
|||||||
@@ -1,13 +1,21 @@
|
|||||||
"""Document enrichment module for loading documents into vector storage."""
|
"""Document enrichment module for loading documents into vector storage."""
|
||||||
|
|
||||||
import os
|
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import os
|
||||||
|
import queue
|
||||||
|
import threading
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Dict, Any
|
from typing import List, Optional, Tuple
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
from langchain_core.documents import Document
|
from langchain_core.documents import Document
|
||||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||||
from langchain_community.document_loaders import PyPDFLoader
|
from loguru import logger
|
||||||
|
from sqlalchemy import Column, Integer, String, create_engine
|
||||||
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
|
||||||
# Dynamically import other loaders to handle optional dependencies
|
# Dynamically import other loaders to handle optional dependencies
|
||||||
try:
|
try:
|
||||||
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
|
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
|
||||||
@@ -33,37 +41,92 @@ try:
|
|||||||
from langchain_community.document_loaders import UnstructuredODTLoader
|
from langchain_community.document_loaders import UnstructuredODTLoader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
UnstructuredODTLoader = None
|
UnstructuredODTLoader = None
|
||||||
from sqlalchemy import create_engine, Column, Integer, String
|
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
from sqlalchemy.orm import sessionmaker
|
|
||||||
from loguru import logger
|
|
||||||
import sqlite3
|
|
||||||
|
|
||||||
from helpers import extract_russian_event_names, extract_years_from_text
|
from helpers import (
|
||||||
|
LocalFilesystemAdaptiveCollection,
|
||||||
|
YandexDiskAdaptiveCollection,
|
||||||
|
YandexDiskAdaptiveFile,
|
||||||
|
_AdaptiveCollection,
|
||||||
|
_AdaptiveFile,
|
||||||
|
extract_russian_event_names,
|
||||||
|
extract_years_from_text,
|
||||||
|
)
|
||||||
|
|
||||||
# Load environment variables
|
# Load environment variables
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
# Define the path to the data directory
|
# Define the path to the data directory
|
||||||
DATA_DIR = Path("../../../data").resolve()
|
DATA_DIR = Path("../../../data").resolve()
|
||||||
DB_PATH = Path("document_tracking.db").resolve()
|
DB_PATH = Path("document_tracking.db").resolve()
|
||||||
|
ENRICHMENT_SOURCE = os.getenv("ENRICHMENT_SOURCE", "local").lower()
|
||||||
|
ENRICHMENT_LOCAL_PATH = os.getenv("ENRICHMENT_LOCAL_PATH")
|
||||||
|
ENRICHMENT_YADISK_PATH = os.getenv("ENRICHMENT_YADISK_PATH")
|
||||||
|
YADISK_TOKEN = os.getenv("YADISK_TOKEN")
|
||||||
|
|
||||||
|
ENRICHMENT_PROCESSING_MODE = os.getenv("ENRICHMENT_PROCESSING_MODE", "async").lower()
|
||||||
|
ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT = int(
|
||||||
|
os.getenv("ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT", "5")
|
||||||
|
)
|
||||||
|
ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS = int(
|
||||||
|
os.getenv("ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS", "4")
|
||||||
|
)
|
||||||
|
ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS = int(
|
||||||
|
os.getenv("ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS", "4")
|
||||||
|
)
|
||||||
|
|
||||||
|
SUPPORTED_EXTENSIONS = {
|
||||||
|
".pdf",
|
||||||
|
".docx",
|
||||||
|
".doc",
|
||||||
|
".pptx",
|
||||||
|
".xlsx",
|
||||||
|
".xls",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".png",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".webp",
|
||||||
|
".odt",
|
||||||
|
".txt", # this one is obvious but was unexpected to see in data lol
|
||||||
|
}
|
||||||
|
|
||||||
Base = declarative_base()
|
Base = declarative_base()
|
||||||
|
|
||||||
|
|
||||||
class ProcessedDocument(Base):
|
class ProcessedDocument(Base):
|
||||||
"""Database model for tracking processed documents."""
|
"""Database model for tracking processed documents."""
|
||||||
|
|
||||||
__tablename__ = "processed_documents"
|
__tablename__ = "processed_documents"
|
||||||
|
|
||||||
id = Column(Integer, primary_key=True)
|
id = Column(Integer, primary_key=True)
|
||||||
file_path = Column(String, unique=True, nullable=False)
|
file_path = Column(String, unique=True, nullable=False)
|
||||||
file_hash = Column(String, nullable=False)
|
file_hash = Column(String, nullable=False)
|
||||||
|
|
||||||
|
|
||||||
|
# to guess the filetype in russian language, for searching it
|
||||||
|
def try_guess_file_type(extension: str) -> str:
    """Return a Russian-language category label for a file extension.

    The label is stored in document metadata so that the file kind can be
    searched for in Russian.

    Args:
        extension: File extension including the leading dot (e.g. ``".xls"``).
            Matching is case-insensitive.

    Returns:
        ``"таблица"`` for spreadsheets, ``"изображение"`` for images,
        ``"презентация"`` for presentations and ``"документ"`` for
        everything else.
    """
    normalized = extension.lower()

    # Both spreadsheet extensions carry the leading dot; a bare "xls" here
    # would never match the dotted extensions used elsewhere in this module.
    if normalized in (".xlsx", ".xls"):
        return "таблица"
    if normalized in (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"):
        return "изображение"
    if normalized == ".pptx":
        return "презентация"
    return "документ"
|
||||||
|
|
||||||
|
|
||||||
|
def identify_adaptive_file_source(adaptive_file: _AdaptiveFile) -> str:
    """Return a Russian-language label naming where an adaptive file came from.

    Yandex Disk files are labelled "Яндекс Диск"; any other adaptive file is
    labelled as a local file.
    """
    from_yandex_disk = isinstance(adaptive_file, YandexDiskAdaptiveFile)
    return "Яндекс Диск" if from_yandex_disk else "Локальный Файл"
|
||||||
|
|
||||||
|
|
||||||
class DocumentEnricher:
|
class DocumentEnricher:
|
||||||
"""Class responsible for enriching documents and loading them to vector storage."""
|
"""Class responsible for enriching documents and loading them to vector storage."""
|
||||||
|
|
||||||
def __init__(self, vector_store):
|
def __init__(self, vector_store):
|
||||||
self.vector_store = vector_store
|
self.vector_store = vector_store
|
||||||
self.text_splitter = RecursiveCharacterTextSplitter(
|
self.text_splitter = RecursiveCharacterTextSplitter(
|
||||||
@@ -71,219 +134,357 @@ class DocumentEnricher:
|
|||||||
chunk_overlap=200,
|
chunk_overlap=200,
|
||||||
length_function=len,
|
length_function=len,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# In sync mode we force minimal concurrency values.
|
||||||
|
if ENRICHMENT_PROCESSING_MODE == "sync":
|
||||||
|
self.adaptive_files_queue_limit = 1
|
||||||
|
self.file_process_threads_count = 1
|
||||||
|
self.document_upload_threads_count = 1
|
||||||
|
else:
|
||||||
|
self.adaptive_files_queue_limit = max(
|
||||||
|
1, ENRICHMENT_ADAPTIVE_FILES_QUEUE_LIMIT
|
||||||
|
)
|
||||||
|
self.file_process_threads_count = max(
|
||||||
|
1, ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS
|
||||||
|
)
|
||||||
|
self.document_upload_threads_count = max(
|
||||||
|
1, ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS
|
||||||
|
)
|
||||||
|
|
||||||
|
# Phase 13 queues
|
||||||
|
self.ADAPTIVE_FILES_QUEUE: queue.Queue = queue.Queue(
|
||||||
|
maxsize=self.adaptive_files_queue_limit
|
||||||
|
)
|
||||||
|
self.PROCESSED_DOCUMENTS_QUEUE: queue.Queue = queue.Queue(
|
||||||
|
maxsize=max(1, self.adaptive_files_queue_limit * 2)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Shared state for thread lifecycle
|
||||||
|
self.collection_finished = threading.Event()
|
||||||
|
self.processing_finished = threading.Event()
|
||||||
|
|
||||||
# Initialize database for tracking processed documents
|
# Initialize database for tracking processed documents
|
||||||
self._init_db()
|
self._init_db()
|
||||||
|
|
||||||
def _init_db(self):
|
def _init_db(self):
|
||||||
"""Initialize the SQLite database for tracking processed documents."""
|
"""Initialize the SQLite database for tracking processed documents."""
|
||||||
self.engine = create_engine(f"sqlite:///{DB_PATH}")
|
self.engine = create_engine(f"sqlite:///{DB_PATH}")
|
||||||
Base.metadata.create_all(self.engine)
|
Base.metadata.create_all(self.engine)
|
||||||
Session = sessionmaker(bind=self.engine)
|
self.SessionLocal = sessionmaker(bind=self.engine)
|
||||||
self.session = Session()
|
|
||||||
|
|
||||||
def _get_file_hash(self, file_path: str) -> str:
|
def _get_file_hash(self, file_path: str) -> str:
|
||||||
"""Calculate SHA256 hash of a file."""
|
"""Calculate SHA256 hash of a file."""
|
||||||
hash_sha256 = hashlib.sha256()
|
hash_sha256 = hashlib.sha256()
|
||||||
with open(file_path, "rb") as f:
|
with open(file_path, "rb") as file_handle:
|
||||||
# Read file in chunks to handle large files
|
for chunk in iter(lambda: file_handle.read(4096), b""):
|
||||||
for chunk in iter(lambda: f.read(4096), b""):
|
|
||||||
hash_sha256.update(chunk)
|
hash_sha256.update(chunk)
|
||||||
return hash_sha256.hexdigest()
|
return hash_sha256.hexdigest()
|
||||||
|
|
||||||
def _is_document_processed(self, file_path: str) -> bool:
|
def _is_document_hash_processed(self, file_hash: str) -> bool:
|
||||||
"""Check if a document has already been processed."""
|
"""Check if a document hash has already been processed."""
|
||||||
file_hash = self._get_file_hash(file_path)
|
session = self.SessionLocal()
|
||||||
existing = self.session.query(ProcessedDocument).filter_by(
|
try:
|
||||||
file_hash=file_hash
|
existing = (
|
||||||
).first()
|
session.query(ProcessedDocument).filter_by(file_hash=file_hash).first()
|
||||||
return existing is not None
|
)
|
||||||
|
return existing is not None
|
||||||
def _mark_document_processed(self, file_path: str):
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
|
def _mark_document_processed(self, file_identifier: str, file_hash: str):
|
||||||
"""Mark a document as processed in the database."""
|
"""Mark a document as processed in the database."""
|
||||||
file_hash = self._get_file_hash(file_path)
|
session = self.SessionLocal()
|
||||||
doc_record = ProcessedDocument(
|
try:
|
||||||
file_path=file_path,
|
existing = (
|
||||||
file_hash=file_hash
|
session.query(ProcessedDocument)
|
||||||
)
|
.filter_by(file_path=file_identifier)
|
||||||
self.session.add(doc_record)
|
.first()
|
||||||
self.session.commit()
|
)
|
||||||
|
if existing is not None:
|
||||||
|
existing.file_hash = file_hash
|
||||||
|
else:
|
||||||
|
session.add(
|
||||||
|
ProcessedDocument(file_path=file_identifier, file_hash=file_hash)
|
||||||
|
)
|
||||||
|
session.commit()
|
||||||
|
finally:
|
||||||
|
session.close()
|
||||||
|
|
||||||
def _get_loader_for_extension(self, file_path: str):
|
def _get_loader_for_extension(self, file_path: str):
|
||||||
"""Get the appropriate loader for a given file extension."""
|
"""Get the appropriate loader for a given file extension."""
|
||||||
ext = Path(file_path).suffix.lower()
|
ext = Path(file_path).suffix.lower()
|
||||||
|
|
||||||
if ext == ".pdf":
|
if ext == ".pdf":
|
||||||
return PyPDFLoader(file_path)
|
return PyPDFLoader(file_path)
|
||||||
elif ext in [".docx", ".doc"]:
|
if ext in [".docx", ".doc"]:
|
||||||
if UnstructuredWordDocumentLoader is None:
|
if UnstructuredWordDocumentLoader is None:
|
||||||
logger.warning(f"UnstructuredWordDocumentLoader not available for {file_path}. Skipping.")
|
logger.warning(
|
||||||
|
f"UnstructuredWordDocumentLoader not available for {file_path}. Skipping."
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
return UnstructuredWordDocumentLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
|
return UnstructuredWordDocumentLoader(
|
||||||
elif ext == ".pptx":
|
file_path, **{"strategy": "hi_res", "languages": ["rus"]}
|
||||||
|
)
|
||||||
|
if ext == ".pptx":
|
||||||
if UnstructuredPowerPointLoader is None:
|
if UnstructuredPowerPointLoader is None:
|
||||||
logger.warning(f"UnstructuredPowerPointLoader not available for {file_path}. Skipping.")
|
logger.warning(
|
||||||
|
f"UnstructuredPowerPointLoader not available for {file_path}. Skipping."
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
return UnstructuredPowerPointLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
|
return UnstructuredPowerPointLoader(
|
||||||
elif ext in [".xlsx", ".xls"]:
|
file_path, **{"strategy": "hi_res", "languages": ["rus"]}
|
||||||
|
)
|
||||||
|
if ext in [".xlsx", ".xls"]:
|
||||||
if UnstructuredExcelLoader is None:
|
if UnstructuredExcelLoader is None:
|
||||||
logger.warning(f"UnstructuredExcelLoader not available for {file_path}. Skipping.")
|
logger.warning(
|
||||||
|
f"UnstructuredExcelLoader not available for {file_path}. Skipping."
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
return UnstructuredExcelLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
|
return UnstructuredExcelLoader(
|
||||||
elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
|
file_path, **{"strategy": "hi_res", "languages": ["rus"]}
|
||||||
|
)
|
||||||
|
if ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
|
||||||
if UnstructuredImageLoader is None:
|
if UnstructuredImageLoader is None:
|
||||||
logger.warning(f"UnstructuredImageLoader not available for {file_path}. Skipping.")
|
logger.warning(
|
||||||
|
f"UnstructuredImageLoader not available for {file_path}. Skipping."
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
# Use OCR strategy for images to extract text
|
return UnstructuredImageLoader(
|
||||||
return UnstructuredImageLoader(file_path, **{"strategy": "ocr_only", "languages": ["rus"]})
|
file_path, **{"strategy": "ocr_only", "languages": ["rus"]}
|
||||||
elif ext == ".odt":
|
)
|
||||||
|
if ext == ".odt":
|
||||||
if UnstructuredODTLoader is None:
|
if UnstructuredODTLoader is None:
|
||||||
logger.warning(f"UnstructuredODTLoader not available for {file_path}. Skipping.")
|
logger.warning(
|
||||||
|
f"UnstructuredODTLoader not available for {file_path}. Skipping."
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
return UnstructuredODTLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
|
return UnstructuredODTLoader(
|
||||||
else:
|
file_path, **{"strategy": "hi_res", "languages": ["rus"]}
|
||||||
# For text files and unsupported formats, try to load as text
|
)
|
||||||
try:
|
return None
|
||||||
with open(file_path, 'r', encoding='utf-8') as f:
|
|
||||||
content = f.read()
|
|
||||||
return None, content # Return content directly for text processing
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
logger.warning(f"Could not decode file as text: {file_path}")
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
def load_and_split_documents(self, file_paths: List[str]) -> List[Document]:
|
|
||||||
"""Load documents from file paths and split them appropriately."""
|
|
||||||
all_docs = []
|
|
||||||
|
|
||||||
for file_path in file_paths:
|
def _load_one_adaptive_file(
|
||||||
if self._is_document_processed(file_path):
|
self, adaptive_file: _AdaptiveFile
|
||||||
logger.info(f"Skipping already processed document: {file_path}")
|
) -> Tuple[List[Document], Optional[Tuple[str, str]]]:
|
||||||
continue
|
"""Load and split one adaptive file by using its local working callback."""
|
||||||
|
loaded_docs: List[Document] = []
|
||||||
|
processed_record: Optional[Tuple[str, str]] = None
|
||||||
|
source_identifier = identify_adaptive_file_source(adaptive_file)
|
||||||
|
extension = adaptive_file.extension.lower()
|
||||||
|
file_type = try_guess_file_type(extension)
|
||||||
|
|
||||||
logger.info(f"Processing document: {file_path}")
|
def process_local_file(local_file_path: str):
|
||||||
|
nonlocal loaded_docs, processed_record
|
||||||
|
|
||||||
# Get the appropriate loader for the file extension
|
file_hash = self._get_file_hash(local_file_path)
|
||||||
loader = self._get_loader_for_extension(file_path)
|
if self._is_document_hash_processed(file_hash):
|
||||||
|
logger.info(
|
||||||
|
f"SKIPPING already processed document hash for: {source_identifier}"
|
||||||
|
)
|
||||||
|
return
|
||||||
|
else:
|
||||||
|
logger.info("Document is not processed! Doing it")
|
||||||
|
|
||||||
|
loader = self._get_loader_for_extension(local_file_path)
|
||||||
if loader is None:
|
if loader is None:
|
||||||
# For unsupported formats that we tried to load as text
|
logger.warning(f"No loader available for file: {source_identifier}")
|
||||||
|
return
|
||||||
|
|
||||||
|
docs = loader.load()
|
||||||
|
for doc in docs:
|
||||||
|
doc.metadata["file_type"] = file_type
|
||||||
|
doc.metadata["source"] = source_identifier
|
||||||
|
doc.metadata["filename"] = adaptive_file.filename
|
||||||
|
doc.metadata["file_path"] = source_identifier
|
||||||
|
doc.metadata["file_size"] = os.path.getsize(local_file_path)
|
||||||
|
doc.metadata["file_extension"] = extension
|
||||||
|
|
||||||
|
if "page" in doc.metadata:
|
||||||
|
doc.metadata["page_number"] = doc.metadata["page"]
|
||||||
|
|
||||||
|
split_docs = self.text_splitter.split_documents(docs)
|
||||||
|
for chunk in split_docs:
|
||||||
|
chunk.metadata["years"] = extract_years_from_text(chunk.page_content)
|
||||||
|
chunk.metadata["events"] = extract_russian_event_names(
|
||||||
|
chunk.page_content
|
||||||
|
)
|
||||||
|
|
||||||
|
loaded_docs = split_docs
|
||||||
|
processed_record = (source_identifier, file_hash)
|
||||||
|
|
||||||
|
adaptive_file.work_with_file_locally(process_local_file)
|
||||||
|
return loaded_docs, processed_record
|
||||||
|
|
||||||
|
# Phase 13 API: inserts adaptive files into ADAPTIVE_FILES_QUEUE
|
||||||
|
def insert_adaptive_files_queue(
    self, adaptive_collection: _AdaptiveCollection, recursive: bool = True
):
    """Feed supported files from the collection into ADAPTIVE_FILES_QUEUE.

    Files whose lower-cased extension is not in SUPPORTED_EXTENSIONS are
    skipped. The queue is bounded, so ``put`` blocks while it is full; a
    consumer must be draining it concurrently.

    Args:
        adaptive_collection: Collection to iterate over.
        recursive: Passed through to ``adaptive_collection.iterate``.

    Raises:
        Whatever ``adaptive_collection.iterate`` raises; the
        ``collection_finished`` event is set even in that case.
    """
    try:
        for adaptive_file in adaptive_collection.iterate(recursive=recursive):
            if adaptive_file.extension.lower() not in SUPPORTED_EXTENSIONS:
                logger.debug(
                    f"Skipping unsupported file extension for {adaptive_file.filename}: {adaptive_file.extension}"
                )
                continue

            self.ADAPTIVE_FILES_QUEUE.put(adaptive_file)

        logger.debug("ADAPTIVE COLLECTION DEPLETED!")
    finally:
        # Always signal the end of input: process_adaptive_files_queue only
        # stops once this event is set, so a failed iteration must not leave
        # the worker threads polling forever.
        self.collection_finished.set()
|
||||||
|
|
||||||
|
# Phase 13 API: reads adaptive files and writes processed docs into PROCESSED_DOCUMENTS_QUEUE
|
||||||
|
def process_adaptive_files_queue(self):
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
adaptive_file = self.ADAPTIVE_FILES_QUEUE.get(timeout=0.2)
|
||||||
|
except queue.Empty:
|
||||||
|
if self.collection_finished.is_set():
|
||||||
|
return
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Load the document(s)
|
split_docs, processed_record = self._load_one_adaptive_file(
|
||||||
docs = loader.load()
|
adaptive_file
|
||||||
|
)
|
||||||
|
if split_docs:
|
||||||
|
self.PROCESSED_DOCUMENTS_QUEUE.put((split_docs, processed_record))
|
||||||
|
except Exception as error:
|
||||||
|
logger.error(f"Error processing {adaptive_file.filename}: {error}")
|
||||||
|
finally:
|
||||||
|
self.ADAPTIVE_FILES_QUEUE.task_done()
|
||||||
|
|
||||||
# Add metadata to each document
|
# Phase 13 API: uploads chunked docs and marks file processed
|
||||||
for doc in docs:
|
def upload_processed_documents_from_queue(self):
|
||||||
# Extract metadata from the original file
|
while True:
|
||||||
doc.metadata["source"] = file_path
|
try:
|
||||||
doc.metadata["filename"] = Path(file_path).name
|
payload = self.PROCESSED_DOCUMENTS_QUEUE.get(timeout=0.2)
|
||||||
doc.metadata["file_path"] = file_path
|
except queue.Empty:
|
||||||
doc.metadata["file_size"] = os.path.getsize(file_path)
|
if self.processing_finished.is_set():
|
||||||
|
return
|
||||||
# Add page number if available in original metadata
|
|
||||||
if "page" in doc.metadata:
|
|
||||||
doc.metadata["page_number"] = doc.metadata["page"]
|
|
||||||
|
|
||||||
# Add file extension as metadata
|
|
||||||
doc.metadata["file_extension"] = Path(file_path).suffix
|
|
||||||
|
|
||||||
# Split documents if they are too large
|
|
||||||
split_docs = self.text_splitter.split_documents(docs)
|
|
||||||
|
|
||||||
# Extract additional metadata from each chunk.
|
|
||||||
for chunk in split_docs:
|
|
||||||
years = extract_years_from_text(chunk.page_content)
|
|
||||||
events = extract_russian_event_names(chunk.page_content)
|
|
||||||
chunk.metadata["years"] = years
|
|
||||||
chunk.metadata["events"] = events
|
|
||||||
|
|
||||||
# Add to the collection
|
|
||||||
all_docs.extend(split_docs)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error processing {file_path}: {str(e)}")
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return all_docs
|
try:
|
||||||
|
documents, processed_record = payload
|
||||||
def enrich_and_store(self, file_paths: List[str]):
|
self.vector_store.add_documents(documents)
|
||||||
|
|
||||||
|
if processed_record is not None:
|
||||||
|
self._mark_document_processed(
|
||||||
|
processed_record[0], processed_record[1]
|
||||||
|
)
|
||||||
|
except Exception as error:
|
||||||
|
logger.error(
|
||||||
|
f"Error uploading processed documents: {error}. But swallowing error. NOT raising."
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
self.PROCESSED_DOCUMENTS_QUEUE.task_done()
|
||||||
|
|
||||||
|
def _run_threaded_pipeline(self, adaptive_collection: _AdaptiveCollection):
    """Run the Phase 13 pipeline: main-thread producer plus worker threads.

    File-processing and upload workers are started as daemon threads, the
    collection is fed into the file queue from the calling thread, and then
    each stage is drained and joined in order.
    """

    def build_workers(target, name_prefix, count):
        # Create (but do not start) `count` daemon threads running `target`.
        workers = []
        for index in range(count):
            workers.append(
                threading.Thread(
                    target=target,
                    name=f"{name_prefix}-{index}",
                    daemon=True,
                )
            )
        return workers

    process_threads = build_workers(
        self.process_adaptive_files_queue,
        "adaptive-file-processor",
        self.file_process_threads_count,
    )
    upload_threads = build_workers(
        self.upload_processed_documents_from_queue,
        "document-uploader",
        self.document_upload_threads_count,
    )

    # Processors are started before uploaders, as in the original order.
    for worker in process_threads + upload_threads:
        worker.start()

    # The producer intentionally runs on the main thread (Phase 13 requirement).
    self.insert_adaptive_files_queue(adaptive_collection, recursive=True)

    # Stage 1: wait until every queued file was handled and processors exited.
    self.ADAPTIVE_FILES_QUEUE.join()
    for worker in process_threads:
        worker.join()

    # No more payloads will be produced; let the uploaders drain and stop.
    self.processing_finished.set()

    # Stage 2: wait until every payload was uploaded and uploaders exited.
    self.PROCESSED_DOCUMENTS_QUEUE.join()
    for worker in upload_threads:
        worker.join()
|
||||||
|
|
||||||
|
def _run_sync_pipeline(self, adaptive_collection: _AdaptiveCollection):
    """Sequential pipeline for sync mode.

    Each supported file is loaded, uploaded to the vector store and marked
    as processed before the next one is touched. The bounded queues are
    deliberately not used here: with no consumer running, filling one from
    the same thread would block forever on ``put`` once it is full.

    Args:
        adaptive_collection: Collection to iterate over (recursively).
    """
    logger.info("Running enrichment in sync mode")

    try:
        for adaptive_file in adaptive_collection.iterate(recursive=True):
            if adaptive_file.extension.lower() not in SUPPORTED_EXTENSIONS:
                logger.debug(
                    f"Skipping unsupported file extension for {adaptive_file.filename}: {adaptive_file.extension}"
                )
                continue

            # Stage 1: load and split the file into chunks.
            try:
                split_docs, processed_record = self._load_one_adaptive_file(
                    adaptive_file
                )
            except Exception as error:
                logger.error(f"Error processing {adaptive_file.filename}: {error}")
                continue

            if not split_docs:
                continue

            # Stage 2: upload chunks, then record the file as processed.
            try:
                self.vector_store.add_documents(split_docs)
                if processed_record is not None:
                    self._mark_document_processed(
                        processed_record[0], processed_record[1]
                    )
            except Exception as error:
                logger.error(
                    f"Error uploading processed documents: {error}. But swallowing error. NOT raising."
                )

        logger.debug("ADAPTIVE COLLECTION DEPLETED!")
    finally:
        # Keep the lifecycle flags in the same end state as the queue-based run.
        self.collection_finished.set()
        self.processing_finished.set()
|
||||||
|
|
||||||
|
def enrich_and_store(self, adaptive_collection: _AdaptiveCollection):
|
||||||
"""Load, enrich, and store documents in the vector store."""
|
"""Load, enrich, and store documents in the vector store."""
|
||||||
logger.info(f"Starting enrichment process for {len(file_paths)} files...")
|
logger.info("Starting enrichment process...")
|
||||||
|
|
||||||
# Load and split documents
|
if ENRICHMENT_PROCESSING_MODE == "sync":
|
||||||
documents = self.load_and_split_documents(file_paths)
|
logger.info("Document enrichment process starting in SYNC mode")
|
||||||
|
self._run_sync_pipeline(adaptive_collection)
|
||||||
if not documents:
|
|
||||||
logger.info("No new documents to process.")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info(f"Loaded and split {len(documents)} document chunks, adding to vector store...")
|
logger.info("Document enrichment process starting in ASYNC/THREAD mode")
|
||||||
|
self._run_threaded_pipeline(adaptive_collection)
|
||||||
# Add documents to vector store
|
|
||||||
try:
|
|
||||||
self.vector_store.add_documents(documents)
|
|
||||||
|
|
||||||
# Only mark documents as processed after successful insertion to vector store
|
|
||||||
processed_file_paths = set()
|
|
||||||
for doc in documents:
|
|
||||||
if 'source' in doc.metadata:
|
|
||||||
processed_file_paths.add(doc.metadata['source'])
|
|
||||||
|
|
||||||
for file_path in processed_file_paths:
|
|
||||||
self._mark_document_processed(file_path)
|
|
||||||
|
|
||||||
logger.info(f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_paths)} files as processed.")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error adding documents to vector store: {str(e)}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_documents_from_data_dir(data_dir: str = str(DATA_DIR)) -> List[str]:
|
def get_enrichment_adaptive_collection(
|
||||||
"""Get all supported document file paths from the data directory."""
|
data_dir: str = str(DATA_DIR),
|
||||||
supported_extensions = {
|
) -> _AdaptiveCollection:
|
||||||
'.pdf', '.docx', '.doc', '.pptx', '.xlsx', '.xls',
|
"""Create adaptive collection based on environment source configuration."""
|
||||||
'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
|
source = ENRICHMENT_SOURCE
|
||||||
'.webp', '.odt'
|
if source == "local":
|
||||||
}
|
local_path = ENRICHMENT_LOCAL_PATH or data_dir
|
||||||
|
logger.info(f"Using local adaptive collection from path: {local_path}")
|
||||||
file_paths = []
|
return LocalFilesystemAdaptiveCollection(local_path)
|
||||||
for root, dirs, files in os.walk(data_dir):
|
|
||||||
for file in files:
|
if source == "yadisk":
|
||||||
if Path(file).suffix.lower() in supported_extensions:
|
if not YADISK_TOKEN:
|
||||||
file_paths.append(os.path.join(root, file))
|
raise ValueError("YADISK_TOKEN must be set when ENRICHMENT_SOURCE=yadisk")
|
||||||
|
if not ENRICHMENT_YADISK_PATH:
|
||||||
return file_paths
|
raise ValueError(
|
||||||
|
"ENRICHMENT_YADISK_PATH must be set when ENRICHMENT_SOURCE=yadisk"
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
f"Using Yandex Disk adaptive collection from path: {ENRICHMENT_YADISK_PATH}"
|
||||||
|
)
|
||||||
|
return YandexDiskAdaptiveCollection(
|
||||||
|
token=YADISK_TOKEN,
|
||||||
|
base_dir=ENRICHMENT_YADISK_PATH,
|
||||||
|
)
|
||||||
|
|
||||||
|
raise ValueError(
|
||||||
|
f"Unsupported ENRICHMENT_SOURCE='{source}'. Allowed values: local, yadisk"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def run_enrichment_process(vector_store, data_dir: str = str(DATA_DIR)):
|
def run_enrichment_process(vector_store, data_dir: str = str(DATA_DIR)):
|
||||||
"""Run the full enrichment process."""
|
"""Run the full enrichment process."""
|
||||||
logger.info(f"Starting document enrichment from directory: {data_dir}")
|
logger.info("Starting document enrichment process")
|
||||||
|
|
||||||
# Get all supported documents from the data directory
|
adaptive_collection = get_enrichment_adaptive_collection(data_dir=data_dir)
|
||||||
file_paths = get_all_documents_from_data_dir(data_dir)
|
|
||||||
|
|
||||||
if not file_paths:
|
|
||||||
logger.warning(f"No supported documents found in {data_dir}")
|
|
||||||
return
|
|
||||||
|
|
||||||
logger.info(f"Found {len(file_paths)} documents to process")
|
|
||||||
|
|
||||||
# Initialize the document enricher
|
# Initialize the document enricher
|
||||||
enricher = DocumentEnricher(vector_store)
|
enricher = DocumentEnricher(vector_store)
|
||||||
|
|
||||||
# Run the enrichment process
|
# Run the enrichment process
|
||||||
enricher.enrich_and_store(file_paths)
|
enricher.enrich_and_store(adaptive_collection)
|
||||||
|
|
||||||
logger.info("Document enrichment process completed!")
|
logger.info("Document enrichment process completed!")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Example usage
|
|
||||||
from vector_storage import initialize_vector_store
|
from vector_storage import initialize_vector_store
|
||||||
|
|
||||||
# Initialize vector store
|
|
||||||
vector_store = initialize_vector_store()
|
vector_store = initialize_vector_store()
|
||||||
|
|
||||||
# Run enrichment process
|
|
||||||
run_enrichment_process(vector_store)
|
run_enrichment_process(vector_store)
|
||||||
|
|||||||
@@ -115,11 +115,11 @@ def extract_russian_event_names(text: str) -> List[str]:
|
|||||||
|
|
||||||
class _AdaptiveFile(ABC):
|
class _AdaptiveFile(ABC):
|
||||||
extension: str # Format: .jpg
|
extension: str # Format: .jpg
|
||||||
local_path: str
|
filename: str
|
||||||
|
|
||||||
def __init__(self, extension: str, local_path: str):
|
def __init__(self, filename: str, extension: str):
|
||||||
|
self.filename = filename
|
||||||
self.extension = extension
|
self.extension = extension
|
||||||
self.local_path = local_path
|
|
||||||
|
|
||||||
# This method allows to work with file locally, and lambda should be provided for this.
|
# This method allows to work with file locally, and lambda should be provided for this.
|
||||||
# Why separate method? For possible cleanup after work is done. And to download file, if needed
|
# Why separate method? For possible cleanup after work is done. And to download file, if needed
|
||||||
@@ -137,6 +137,12 @@ class _AdaptiveCollection(ABC):
|
|||||||
|
|
||||||
|
|
||||||
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
||||||
|
local_path: str
|
||||||
|
|
||||||
|
def __init__(self, filename: str, extension: str, local_path: str):
|
||||||
|
super().__init__(filename, extension)
|
||||||
|
self.local_path = local_path
|
||||||
|
|
||||||
def work_with_file_locally(self, func: Callable[[str], None]):
|
def work_with_file_locally(self, func: Callable[[str], None]):
|
||||||
func(self.local_path)
|
func(self.local_path)
|
||||||
|
|
||||||
@@ -153,7 +159,8 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
|||||||
for root, dirs, files in os.walk(self.base_dir):
|
for root, dirs, files in os.walk(self.base_dir):
|
||||||
for file in files:
|
for file in files:
|
||||||
full_path = os.path.join(root, file)
|
full_path = os.path.join(root, file)
|
||||||
yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)
|
p = Path(full_path)
|
||||||
|
yield LocalFilesystemAdaptiveFile(p.name, p.suffix, full_path)
|
||||||
|
|
||||||
if not recursive:
|
if not recursive:
|
||||||
break
|
break
|
||||||
@@ -162,16 +169,19 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
|||||||
class YandexDiskAdaptiveFile(_AdaptiveFile):
|
class YandexDiskAdaptiveFile(_AdaptiveFile):
|
||||||
"""Adaptive file representation for Yandex Disk resources."""
|
"""Adaptive file representation for Yandex Disk resources."""
|
||||||
|
|
||||||
def __init__(self, extension: str, local_path: str, token: str):
|
remote_path: str
|
||||||
super().__init__(extension, local_path)
|
|
||||||
|
def __init__(self, filename: str, extension: str, remote_path: str, token: str):
|
||||||
|
super().__init__(filename, extension)
|
||||||
self.token = token
|
self.token = token
|
||||||
|
self.remote_path = remote_path
|
||||||
|
|
||||||
def _download_to_temp_file(self) -> str:
|
def _download_to_temp_file(self) -> str:
|
||||||
headers = {"Authorization": f"OAuth {self.token}"}
|
headers = {"Authorization": f"OAuth {self.token}"}
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
"https://cloud-api.yandex.net/v1/disk/resources/download",
|
"https://cloud-api.yandex.net/v1/disk/resources/download",
|
||||||
headers=headers,
|
headers=headers,
|
||||||
params={"path": self.local_path},
|
params={"path": self.remote_path},
|
||||||
timeout=30,
|
timeout=30,
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -180,7 +190,8 @@ class YandexDiskAdaptiveFile(_AdaptiveFile):
|
|||||||
file_response = requests.get(href, timeout=120)
|
file_response = requests.get(href, timeout=120)
|
||||||
file_response.raise_for_status()
|
file_response.raise_for_status()
|
||||||
|
|
||||||
suffix = Path(self.local_path).suffix
|
p = Path(self.remote_path)
|
||||||
|
suffix = p.suffix
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
||||||
temp_file.write(file_response.content)
|
temp_file.write(file_response.content)
|
||||||
return temp_file.name
|
return temp_file.name
|
||||||
@@ -249,7 +260,8 @@ class YandexDiskAdaptiveCollection(_AdaptiveCollection):
|
|||||||
if root_info.get("type") == "file":
|
if root_info.get("type") == "file":
|
||||||
path = root_info["path"]
|
path = root_info["path"]
|
||||||
logger.info(f"Found file on Yandex Disk: {path}")
|
logger.info(f"Found file on Yandex Disk: {path}")
|
||||||
yield YandexDiskAdaptiveFile(Path(path).suffix, path, self.token)
|
p = Path(path)
|
||||||
|
yield YandexDiskAdaptiveFile(p.name, p.suffix, path, self.token)
|
||||||
return
|
return
|
||||||
|
|
||||||
directories = [root_path]
|
directories = [root_path]
|
||||||
@@ -257,11 +269,12 @@ class YandexDiskAdaptiveCollection(_AdaptiveCollection):
|
|||||||
current_dir = directories.pop(0)
|
current_dir = directories.pop(0)
|
||||||
for item in self._iter_children(current_dir):
|
for item in self._iter_children(current_dir):
|
||||||
item_type = item.get("type")
|
item_type = item.get("type")
|
||||||
item_path = item.get("path")
|
item_path = str(item.get("path"))
|
||||||
if item_type == "file":
|
if item_type == "file":
|
||||||
logger.info(f"Found file on Yandex Disk: {item_path}")
|
logger.info(f"Found file on Yandex Disk: {item_path}")
|
||||||
|
p = Path(item_path)
|
||||||
yield YandexDiskAdaptiveFile(
|
yield YandexDiskAdaptiveFile(
|
||||||
Path(item_path).suffix, item_path, self.token
|
p.name, p.suffix, item_path, self.token
|
||||||
)
|
)
|
||||||
elif recursive and item_type == "dir":
|
elif recursive and item_type == "dir":
|
||||||
directories.append(item_path)
|
directories.append(item_path)
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
|
|||||||
collection = LocalFilesystemAdaptiveCollection(str(self.samples_dir))
|
collection = LocalFilesystemAdaptiveCollection(str(self.samples_dir))
|
||||||
|
|
||||||
files = list(collection.iterate(recursive=False))
|
files = list(collection.iterate(recursive=False))
|
||||||
file_names = sorted(Path(file.local_path).name for file in files)
|
file_names = sorted(file.filename for file in files)
|
||||||
|
|
||||||
self.assertEqual(file_names, ["root.txt"])
|
self.assertEqual(file_names, ["root.txt"])
|
||||||
self.assertTrue(all(isinstance(file, LocalFilesystemAdaptiveFile) for file in files))
|
self.assertTrue(all(isinstance(file, LocalFilesystemAdaptiveFile) for file in files))
|
||||||
@@ -33,7 +33,9 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
|
|||||||
|
|
||||||
def test_work_with_file_locally_provides_existing_path(self):
|
def test_work_with_file_locally_provides_existing_path(self):
|
||||||
target_path = self.samples_dir / "root.txt"
|
target_path = self.samples_dir / "root.txt"
|
||||||
adaptive_file = LocalFilesystemAdaptiveFile(target_path.suffix, str(target_path))
|
adaptive_file = LocalFilesystemAdaptiveFile(
|
||||||
|
target_path.name, target_path.suffix, str(target_path)
|
||||||
|
)
|
||||||
|
|
||||||
observed = {}
|
observed = {}
|
||||||
|
|
||||||
@@ -44,6 +46,7 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
|
|||||||
|
|
||||||
adaptive_file.work_with_file_locally(callback)
|
adaptive_file.work_with_file_locally(callback)
|
||||||
|
|
||||||
|
self.assertEqual(adaptive_file.filename, "root.txt")
|
||||||
self.assertEqual(observed["path"], str(target_path))
|
self.assertEqual(observed["path"], str(target_path))
|
||||||
self.assertEqual(observed["content"], "root file")
|
self.assertEqual(observed["content"], "root file")
|
||||||
|
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ class TestYandexDiskAdaptiveCollection(unittest.TestCase):
|
|||||||
self.skipTest(f"Yandex Disk request failed and needs manual verification: {exc}")
|
self.skipTest(f"Yandex Disk request failed and needs manual verification: {exc}")
|
||||||
|
|
||||||
for item in files:
|
for item in files:
|
||||||
|
self.assertTrue(item.filename)
|
||||||
logger.info(f"Yandex file found during test iteration: {item.local_path}")
|
logger.info(f"Yandex file found during test iteration: {item.local_path}")
|
||||||
|
|
||||||
self.assertIsInstance(files, list)
|
self.assertIsInstance(files, list)
|
||||||
|
|||||||
Reference in New Issue
Block a user