Phase 12 done... loading via adaptive collection, yadisk or local
@@ -78,9 +78,9 @@ Chosen data folder: relative ./../../../data - from the current folder
 
 During enrichment, we should use the adaptive collection from the helpers for loading documents. We should not use the local filesystem directly, but go through the adaptive collection as a wrapper (a usage sketch follows this hunk).
 
-- [ ] The adaptive file in the helpers now carries its filename, so tests should be adjusted for this
-- [ ] Add conditional usage of the adaptive collection in the enrichment stage. .env now has the variable ENRICHMENT_SOURCE with two possible values: yadisk, local
-- [ ] With the local source, use the env variable ENRICHMENT_LOCAL_PATH for the local filesystem adaptive collection
-- [ ] With the yadisk source, use the env variable YADISK_TOKEN for auth within Yandex Disk and ENRICHMENT_YADISK_PATH for the path on the Yandex Disk system
-- [ ] We still need to skip certain file types, so while iterating over files we check each extension and skip unsupported ones
-- [ ] Adaptive files carry their filename, so it should be used when extracting metadata
+- [x] The adaptive file in the helpers now carries its filename, so tests should be adjusted for this
+- [x] Add conditional usage of the adaptive collection in the enrichment stage. .env now has the variable ENRICHMENT_SOURCE with two possible values: yadisk, local
+- [x] With the local source, use the env variable ENRICHMENT_LOCAL_PATH for the local filesystem adaptive collection
+- [x] With the yadisk source, use the env variable YADISK_TOKEN for auth within Yandex Disk and ENRICHMENT_YADISK_PATH for the path on the Yandex Disk system
+- [x] We still need to skip certain file types, so while iterating over files we check each extension and skip unsupported ones
+- [x] Adaptive files carry their filename, so it should be used when extracting metadata
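The checklist above describes the wrapper contract. A minimal usage sketch, assuming only the helper API visible in the diffs below (`LocalFilesystemAdaptiveCollection`, `iterate`, `work_with_file_locally`); the data path and the extension filter are placeholders:

```python
# Sketch only: consuming the adaptive-collection wrapper instead of walking the
# local filesystem directly. Path and extension set are illustrative.
from helpers import LocalFilesystemAdaptiveCollection

collection = LocalFilesystemAdaptiveCollection("../../../data")

for adaptive_file in collection.iterate(recursive=True):
    # Skip file types we do not want to enrich.
    if adaptive_file.extension.lower() not in {".pdf", ".docx", ".odt"}:
        continue

    def handle(local_path: str) -> None:
        # The callback receives a real local path; a remote backend may have
        # downloaded the file first and can clean it up afterwards.
        print(adaptive_file.filename, local_path)

    adaptive_file.work_with_file_locally(handle)
```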
@@ -3,7 +3,7 @@
 import os
 import hashlib
 from pathlib import Path
-from typing import List, Dict, Any
+from typing import List, Tuple
 from dotenv import load_dotenv
 from langchain_core.documents import Document
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -37,9 +37,15 @@ from sqlalchemy import create_engine, Column, Integer, String
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from loguru import logger
-import sqlite3
 
-from helpers import extract_russian_event_names, extract_years_from_text
+from helpers import (
+    LocalFilesystemAdaptiveCollection,
+    YandexDiskAdaptiveCollection,
+    _AdaptiveCollection,
+    _AdaptiveFile,
+    extract_russian_event_names,
+    extract_years_from_text,
+)
 
 # Load environment variables
 load_dotenv()
@@ -48,6 +54,27 @@ load_dotenv()
 # Define the path to the data directory
 DATA_DIR = Path("../../../data").resolve()
 DB_PATH = Path("document_tracking.db").resolve()
+ENRICHMENT_SOURCE = os.getenv("ENRICHMENT_SOURCE", "local").lower()
+ENRICHMENT_LOCAL_PATH = os.getenv("ENRICHMENT_LOCAL_PATH")
+ENRICHMENT_YADISK_PATH = os.getenv("ENRICHMENT_YADISK_PATH")
+YADISK_TOKEN = os.getenv("YADISK_TOKEN")
+
+SUPPORTED_EXTENSIONS = {
+    ".pdf",
+    ".docx",
+    ".doc",
+    ".pptx",
+    ".xlsx",
+    ".xls",
+    ".jpg",
+    ".jpeg",
+    ".png",
+    ".gif",
+    ".bmp",
+    ".tiff",
+    ".webp",
+    ".odt",
+}
 
 Base = declarative_base()
 
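For reference, a sketch of how the two sources are selected through the environment. The concrete values below are placeholders; in real runs the variables come from .env via `load_dotenv()`, and they must be in place before this module is imported, because the module reads them at import time:

```python
# Sketch only: choosing the enrichment source via environment variables.
import os

os.environ["ENRICHMENT_SOURCE"] = "local"              # or "yadisk"
os.environ["ENRICHMENT_LOCAL_PATH"] = "/path/to/data"  # used when source is "local"

# For the Yandex Disk source instead:
# os.environ["ENRICHMENT_SOURCE"] = "yadisk"
# os.environ["YADISK_TOKEN"] = "<oauth token>"              # auth for Yandex Disk
# os.environ["ENRICHMENT_YADISK_PATH"] = "<path on disk>"   # folder to iterate
```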
@@ -91,19 +118,17 @@ class DocumentEnricher:
                 hash_sha256.update(chunk)
         return hash_sha256.hexdigest()
 
-    def _is_document_processed(self, file_path: str) -> bool:
-        """Check if a document has already been processed."""
-        file_hash = self._get_file_hash(file_path)
+    def _is_document_hash_processed(self, file_hash: str) -> bool:
+        """Check if a document hash has already been processed."""
         existing = self.session.query(ProcessedDocument).filter_by(
             file_hash=file_hash
         ).first()
         return existing is not None
 
-    def _mark_document_processed(self, file_path: str):
+    def _mark_document_processed(self, file_identifier: str, file_hash: str):
         """Mark a document as processed in the database."""
-        file_hash = self._get_file_hash(file_path)
         doc_record = ProcessedDocument(
-            file_path=file_path,
+            file_path=file_identifier,
             file_hash=file_hash
         )
         self.session.add(doc_record)
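Only the tail of `_get_file_hash` is visible in this hunk. A typical streaming SHA-256 helper consistent with those lines (a sketch, not necessarily the repository's exact code) looks like:

```python
import hashlib


def get_file_hash(path: str, chunk_size: int = 8192) -> str:
    """Hash the file in chunks so large documents never have to fit in memory."""
    hash_sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()
```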
@@ -142,77 +167,88 @@ class DocumentEnricher:
                 return None
             return UnstructuredODTLoader(file_path, **{"strategy": "hi_res", "languages": ["rus"]})
         else:
-            # For text files and unsupported formats, try to load as text
-            try:
-                with open(file_path, 'r', encoding='utf-8') as f:
-                    content = f.read()
-                return None, content # Return content directly for text processing
-            except UnicodeDecodeError:
-                logger.warning(f"Could not decode file as text: {file_path}")
-                return None, None
+            return None
 
-    def load_and_split_documents(self, file_paths: List[str]) -> List[Document]:
-        """Load documents from file paths and split them appropriately."""
-        all_docs = []
+    def _load_one_adaptive_file(
+        self, adaptive_file: _AdaptiveFile
+    ) -> Tuple[List[Document], str | None]:
+        """Load and split one adaptive file by using its local working callback."""
+        loaded_docs: List[Document] = []
+        file_hash: str | None = None
+        source_identifier = adaptive_file.local_path
+        extension = adaptive_file.extension.lower()
 
-        for file_path in file_paths:
-            if self._is_document_processed(file_path):
-                logger.info(f"Skipping already processed document: {file_path}")
-                continue
+        def process_local_file(local_file_path: str):
+            nonlocal loaded_docs, file_hash
 
-            logger.info(f"Processing document: {file_path}")
-
-            # Get the appropriate loader for the file extension
-            loader = self._get_loader_for_extension(file_path)
+            file_hash = self._get_file_hash(local_file_path)
+            if self._is_document_hash_processed(file_hash):
+                logger.info(f"Skipping already processed document hash for: {source_identifier}")
+                return
 
+            loader = self._get_loader_for_extension(local_file_path)
             if loader is None:
-                # For unsupported formats that we tried to load as text
-                continue
+                logger.warning(f"No loader available for file: {source_identifier}")
+                return
 
-            try:
-                # Load the document(s)
-                docs = loader.load()
+            docs = loader.load()
 
-                # Add metadata to each document
-                for doc in docs:
-                    # Extract metadata from the original file
-                    doc.metadata["source"] = file_path
-                    doc.metadata["filename"] = Path(file_path).name
-                    doc.metadata["file_path"] = file_path
-                    doc.metadata["file_size"] = os.path.getsize(file_path)
+            for doc in docs:
+                doc.metadata["source"] = source_identifier
+                doc.metadata["filename"] = adaptive_file.filename
+                doc.metadata["file_path"] = source_identifier
+                doc.metadata["file_size"] = os.path.getsize(local_file_path)
+                doc.metadata["file_extension"] = extension
 
-                    # Add page number if available in original metadata
-                    if "page" in doc.metadata:
-                        doc.metadata["page_number"] = doc.metadata["page"]
+                if "page" in doc.metadata:
+                    doc.metadata["page_number"] = doc.metadata["page"]
 
-                    # Add file extension as metadata
-                    doc.metadata["file_extension"] = Path(file_path).suffix
-
-                # Split documents if they are too large
-                split_docs = self.text_splitter.split_documents(docs)
+            split_docs = self.text_splitter.split_documents(docs)
 
-                # Extract additional metadata from each chunk.
-                for chunk in split_docs:
-                    years = extract_years_from_text(chunk.page_content)
-                    events = extract_russian_event_names(chunk.page_content)
-                    chunk.metadata["years"] = years
-                    chunk.metadata["events"] = events
+            for chunk in split_docs:
+                years = extract_years_from_text(chunk.page_content)
+                events = extract_russian_event_names(chunk.page_content)
+                chunk.metadata["years"] = years
+                chunk.metadata["events"] = events
 
-                # Add to the collection
-                all_docs.extend(split_docs)
+            loaded_docs = split_docs
 
-            except Exception as e:
-                logger.error(f"Error processing {file_path}: {str(e)}")
-                continue
+        adaptive_file.work_with_file_locally(process_local_file)
+        return loaded_docs, file_hash
 
-        return all_docs
+    def load_and_split_documents(
+        self, adaptive_collection: _AdaptiveCollection, recursive: bool = True
+    ) -> Tuple[List[Document], List[Tuple[str, str]]]:
+        """Load documents from adaptive collection and split them appropriately."""
+        all_docs: List[Document] = []
+        processed_file_records: dict[str, str] = {}
+        for adaptive_file in adaptive_collection.iterate(recursive=recursive):
+            if adaptive_file.extension.lower() not in SUPPORTED_EXTENSIONS:
+                logger.debug(
+                    f"Skipping unsupported file extension for {adaptive_file.filename}: {adaptive_file.extension}"
+                )
+                continue
+            logger.info(f"Processing document: {adaptive_file.local_path}")
+            try:
+                split_docs, file_hash = self._load_one_adaptive_file(adaptive_file)
+                if split_docs:
+                    all_docs.extend(split_docs)
+                if file_hash:
+                    processed_file_records[adaptive_file.local_path] = file_hash
+            except Exception as e:
+                logger.error(f"Error processing {adaptive_file.local_path}: {str(e)}")
+                continue
+
+        return all_docs, list(processed_file_records.items())
 
-    def enrich_and_store(self, file_paths: List[str]):
+    def enrich_and_store(self, adaptive_collection: _AdaptiveCollection):
         """Load, enrich, and store documents in the vector store."""
-        logger.info(f"Starting enrichment process for {len(file_paths)} files...")
+        logger.info("Starting enrichment process...")
 
         # Load and split documents
-        documents = self.load_and_split_documents(file_paths)
+        documents, processed_file_records = self.load_and_split_documents(
+            adaptive_collection
+        )
 
         if not documents:
             logger.info("No new documents to process.")
@@ -225,55 +261,58 @@ class DocumentEnricher:
             self.vector_store.add_documents(documents)
 
             # Only mark documents as processed after successful insertion to vector store
-            processed_file_paths = set()
-            for doc in documents:
-                if 'source' in doc.metadata:
-                    processed_file_paths.add(doc.metadata['source'])
-
-            for file_path in processed_file_paths:
-                self._mark_document_processed(file_path)
-
-            logger.info(f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_paths)} files as processed.")
+            for file_identifier, file_hash in processed_file_records:
+                self._mark_document_processed(file_identifier, file_hash)
+
+            logger.info(
+                f"Successfully added {len(documents)} document chunks to vector store and marked {len(processed_file_records)} files as processed."
+            )
 
         except Exception as e:
             logger.error(f"Error adding documents to vector store: {str(e)}")
             raise
 
 
-def get_all_documents_from_data_dir(data_dir: str = str(DATA_DIR)) -> List[str]:
-    """Get all supported document file paths from the data directory."""
-    supported_extensions = {
-        '.pdf', '.docx', '.doc', '.pptx', '.xlsx', '.xls',
-        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff',
-        '.webp', '.odt'
-    }
-
-    file_paths = []
-    for root, dirs, files in os.walk(data_dir):
-        for file in files:
-            if Path(file).suffix.lower() in supported_extensions:
-                file_paths.append(os.path.join(root, file))
-
-    return file_paths
+def get_enrichment_adaptive_collection(
+    data_dir: str = str(DATA_DIR),
+) -> _AdaptiveCollection:
+    """Create adaptive collection based on environment source configuration."""
+    source = ENRICHMENT_SOURCE
+    if source == "local":
+        local_path = ENRICHMENT_LOCAL_PATH or data_dir
+        logger.info(f"Using local adaptive collection from path: {local_path}")
+        return LocalFilesystemAdaptiveCollection(local_path)
+
+    if source == "yadisk":
+        if not YADISK_TOKEN:
+            raise ValueError("YADISK_TOKEN must be set when ENRICHMENT_SOURCE=yadisk")
+        if not ENRICHMENT_YADISK_PATH:
+            raise ValueError(
+                "ENRICHMENT_YADISK_PATH must be set when ENRICHMENT_SOURCE=yadisk"
+            )
+        logger.info(
+            f"Using Yandex Disk adaptive collection from path: {ENRICHMENT_YADISK_PATH}"
+        )
+        return YandexDiskAdaptiveCollection(
+            token=YADISK_TOKEN,
+            base_dir=ENRICHMENT_YADISK_PATH,
+        )
+
+    raise ValueError(
+        f"Unsupported ENRICHMENT_SOURCE='{source}'. Allowed values: local, yadisk"
+    )
 
 
 def run_enrichment_process(vector_store, data_dir: str = str(DATA_DIR)):
     """Run the full enrichment process."""
-    logger.info(f"Starting document enrichment from directory: {data_dir}")
+    logger.info("Starting document enrichment process")
 
-    # Get all supported documents from the data directory
-    file_paths = get_all_documents_from_data_dir(data_dir)
-
-    if not file_paths:
-        logger.warning(f"No supported documents found in {data_dir}")
-        return
-
-    logger.info(f"Found {len(file_paths)} documents to process")
+    adaptive_collection = get_enrichment_adaptive_collection(data_dir=data_dir)
 
     # Initialize the document enricher
    enricher = DocumentEnricher(vector_store)
 
     # Run the enrichment process
-    enricher.enrich_and_store(file_paths)
+    enricher.enrich_and_store(adaptive_collection)
 
     logger.info("Document enrichment process completed!")
@@ -118,9 +118,10 @@ class _AdaptiveFile(ABC):
     local_path: str
     filename: str
 
-    def __init__(self, filename: str, extension: str):
+    def __init__(self, filename: str, extension: str, local_path: str):
         self.filename = filename
         self.extension = extension
+        self.local_path = local_path
 
     # This method allows to work with file locally, and lambda should be provided for this.
     # Why separate method? For possible cleanup after work is done. And to download file, if needed
@@ -138,11 +139,8 @@ class _AdaptiveCollection(ABC):
 
 
 class LocalFilesystemAdaptiveFile(_AdaptiveFile):
-    local_path: str
-
     def __init__(self, filename: str, extension: str, local_path: str):
-        super().__init__(filename, extension)
-        self.local_path = local_path
+        super().__init__(filename, extension, local_path)
 
     def work_with_file_locally(self, func: Callable[[str], None]):
         func(self.local_path)
@@ -173,7 +171,7 @@ class YandexDiskAdaptiveFile(_AdaptiveFile):
     remote_path: str
 
     def __init__(self, filename: str, extension: str, remote_path: str, token: str):
-        super().__init__(filename, extension)
+        super().__init__(filename, extension, remote_path)
         self.token = token
         self.remote_path = remote_path
 
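The comment on `_AdaptiveFile.work_with_file_locally` explains why it takes a callback: a remote backend may need to download the file before the work and clean it up afterwards. A hypothetical remote-backed implementation, purely illustrative and not the commit's `YandexDiskAdaptiveFile` code, could look like this:

```python
import os
import tempfile
from typing import Callable


class RemoteAdaptiveFileSketch:
    """Hypothetical remote-backed adaptive file illustrating the callback pattern."""

    def __init__(self, filename: str, extension: str, remote_path: str):
        self.filename = filename
        self.extension = extension
        self.remote_path = remote_path

    def _download_to(self, local_path: str) -> None:
        raise NotImplementedError  # e.g. a Yandex Disk client download call

    def work_with_file_locally(self, func: Callable[[str], None]) -> None:
        # Download to a temporary file, hand the local path to the callback,
        # then always remove the temporary copy.
        fd, tmp_path = tempfile.mkstemp(suffix=self.extension)
        os.close(fd)
        try:
            self._download_to(tmp_path)
            func(tmp_path)
        finally:
            os.remove(tmp_path)
```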
@@ -13,7 +13,7 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
         collection = LocalFilesystemAdaptiveCollection(str(self.samples_dir))
 
         files = list(collection.iterate(recursive=False))
-        file_names = sorted(Path(file.local_path).name for file in files)
+        file_names = sorted(file.filename for file in files)
 
         self.assertEqual(file_names, ["root.txt"])
         self.assertTrue(all(isinstance(file, LocalFilesystemAdaptiveFile) for file in files))
@@ -33,7 +33,9 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
 
     def test_work_with_file_locally_provides_existing_path(self):
         target_path = self.samples_dir / "root.txt"
-        adaptive_file = LocalFilesystemAdaptiveFile(target_path.suffix, str(target_path))
+        adaptive_file = LocalFilesystemAdaptiveFile(
+            target_path.name, target_path.suffix, str(target_path)
+        )
 
         observed = {}
 
@@ -44,6 +46,7 @@ class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
 
         adaptive_file.work_with_file_locally(callback)
 
+        self.assertEqual(adaptive_file.filename, "root.txt")
         self.assertEqual(observed["path"], str(target_path))
         self.assertEqual(observed["content"], "root file")
 
@@ -31,6 +31,7 @@ class TestYandexDiskAdaptiveCollection(unittest.TestCase):
             self.skipTest(f"Yandex Disk request failed and needs manual verification: {exc}")
 
         for item in files:
+            self.assertTrue(item.filename)
             logger.info(f"Yandex file found during test iteration: {item.local_path}")
 
         self.assertIsInstance(files, list)