Prefect client prep for langchain
This commit is contained in:
BIN
services/rag/langchain/.DS_Store
vendored
BIN
services/rag/langchain/.DS_Store
vendored
Binary file not shown.
@@ -6,7 +6,7 @@ Use if possible logging, using library `loguru`, for steps. Use logrotation in f
|
|||||||
|
|
||||||
Chosen RAG framework: Langchain
|
Chosen RAG framework: Langchain
|
||||||
Chosen Vector Storage: Qdrant
|
Chosen Vector Storage: Qdrant
|
||||||
Chosen data folder: relatve ./../../../data - from the current folder
|
Chosen data folder: relative ./../../../data - from the current folder
|
||||||
|
|
||||||
# Phase 1 (cli entrypoint)
|
# Phase 1 (cli entrypoint)
|
||||||
|
|
||||||
@@ -101,3 +101,13 @@ During this Phase we create asynchronous process of enrichment, utilizing async/
|
|||||||
- [x] Function process_adaptive_files_queue should be started in number of threads (defined in .env ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS)
|
- [x] Function process_adaptive_files_queue should be started in number of threads (defined in .env ENRICHMENT_ADAPTIVE_FILE_PROCESS_THREADS)
|
||||||
- [x] Function upload_processed_documents_from_queue should be started in number of threads (defined in .env ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS)
|
- [x] Function upload_processed_documents_from_queue should be started in number of threads (defined in .env ENRICHMENT_ADAPTIVE_DOCUMENT_UPLOADS_THREADS)
|
||||||
- [x] Program should control threads. Function insert_adaptive_files_queue, after adaptive collection ends, should then wait until all threads finish. What does finish mean? It means that when our insert_adaptive_files_queue function realizes that there are no adaptive files left in the collection, it sets a variable shared between threads to mark the collection as finished. When the other functions running in threads see that this variable has become true, they deplete the queue, do not go to the next loop to wait for new items in the queue, and just finish. This eventually finishes the program: each thread finishes, and the main program finishes as usual after processing everything.
|
- [x] Program should control threads. Function insert_adaptive_files_queue, after adaptive collection ends, should then wait until all threads finish. What does finish mean? It means that when our insert_adaptive_files_queue function realizes that there are no adaptive files left in the collection, it sets a variable shared between threads to mark the collection as finished. When the other functions running in threads see that this variable has become true, they deplete the queue, do not go to the next loop to wait for new items in the queue, and just finish. This eventually finishes the program: each thread finishes, and the main program finishes as usual after processing everything.
|
||||||
|
|
||||||
|
# Phase 14 (integration of Prefect client, for creating flow and tasks on remote Prefect server)
|
||||||
|
|
||||||
|
- [ ] Install Prefect client library.
|
||||||
|
- [ ] Add .env variable PREFECT_API_URL, that will be used for connecting client to the prefect server
|
||||||
|
- [ ] Create prefect client file in `prefect/01_yadisk_analyze.py`. In this file we will work with prefect flows and tasks for this phase.
|
||||||
|
- [ ] Create prefect flow called "analyze_yadisk_file_urls"
|
||||||
|
- [ ] Create prefect task "iterate_yadisk_folder_and_store_file_paths" that will connect to Yandex Disk with the yadisk library, analyze everything inside the folder `Общая` recursively, and store the file paths in ./../../../yadisk_files.json as an array of strings.
|
||||||
|
- [ ] In our prefect file, add a function to serve the flow, as per the Prefect documentation on serving flows
|
||||||
|
- [ ] Tests will be done manually, by executing this script and checking the Prefect dashboard. No automated tests are needed for this phase.
|
||||||
|
|||||||
@@ -81,13 +81,13 @@ SUPPORTED_EXTENSIONS = {
|
|||||||
".pptx",
|
".pptx",
|
||||||
".xlsx",
|
".xlsx",
|
||||||
".xls",
|
".xls",
|
||||||
".jpg",
|
# ".jpg",
|
||||||
".jpeg",
|
# ".jpeg",
|
||||||
".png",
|
# ".png",
|
||||||
".gif",
|
# ".gif",
|
||||||
".bmp",
|
# ".bmp",
|
||||||
".tiff",
|
# ".tiff",
|
||||||
".webp",
|
# ".webp",
|
||||||
".odt",
|
".odt",
|
||||||
".txt", # this one is obvious but was unexpected to see in data lol
|
".txt", # this one is obvious but was unexpected to see in data lol
|
||||||
}
|
}
|
||||||
@@ -273,7 +273,7 @@ class DocumentEnricher:
|
|||||||
extension = adaptive_file.extension.lower()
|
extension = adaptive_file.extension.lower()
|
||||||
file_type = try_guess_file_type(extension)
|
file_type = try_guess_file_type(extension)
|
||||||
|
|
||||||
def process_local_file(local_file_path: str):
|
def process_local_file(original_path: str, local_file_path: str):
|
||||||
nonlocal loaded_docs, processed_record
|
nonlocal loaded_docs, processed_record
|
||||||
|
|
||||||
file_hash = self._get_file_hash(local_file_path)
|
file_hash = self._get_file_hash(local_file_path)
|
||||||
@@ -295,7 +295,7 @@ class DocumentEnricher:
|
|||||||
doc.metadata["file_type"] = file_type
|
doc.metadata["file_type"] = file_type
|
||||||
doc.metadata["source"] = source_identifier
|
doc.metadata["source"] = source_identifier
|
||||||
doc.metadata["filename"] = adaptive_file.filename
|
doc.metadata["filename"] = adaptive_file.filename
|
||||||
doc.metadata["file_path"] = source_identifier
|
doc.metadata["file_path"] = original_path
|
||||||
doc.metadata["file_size"] = os.path.getsize(local_file_path)
|
doc.metadata["file_size"] = os.path.getsize(local_file_path)
|
||||||
doc.metadata["file_extension"] = extension
|
doc.metadata["file_extension"] = extension
|
||||||
|
|
||||||
@@ -310,7 +310,7 @@ class DocumentEnricher:
|
|||||||
)
|
)
|
||||||
|
|
||||||
loaded_docs = split_docs
|
loaded_docs = split_docs
|
||||||
processed_record = (source_identifier, file_hash)
|
processed_record = (original_path, file_hash)
|
||||||
|
|
||||||
adaptive_file.work_with_file_locally(process_local_file)
|
adaptive_file.work_with_file_locally(process_local_file)
|
||||||
return loaded_docs, processed_record
|
return loaded_docs, processed_record
|
||||||
|
|||||||
@@ -123,9 +123,9 @@ class _AdaptiveFile(ABC):
|
|||||||
|
|
||||||
# This method allows to work with file locally, and lambda should be provided for this.
|
# This method allows to work with file locally, and lambda should be provided for this.
|
||||||
# Why separate method? For possible cleanup after work is done. And to download file, if needed
|
# Why separate method? For possible cleanup after work is done. And to download file, if needed
|
||||||
# Lambda: first argument is a local path
|
# Lambda: first argument is an original path, second: local path. In case of just local files, these will be the same
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def work_with_file_locally(self, func: Callable[[str], None]):
|
def work_with_file_locally(self, func: Callable[[str, str], None]):
|
||||||
"""Run callback with a local path to the file."""
|
"""Run callback with a local path to the file."""
|
||||||
|
|
||||||
|
|
||||||
@@ -143,8 +143,8 @@ class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
|||||||
super().__init__(filename, extension)
|
super().__init__(filename, extension)
|
||||||
self.local_path = local_path
|
self.local_path = local_path
|
||||||
|
|
||||||
def work_with_file_locally(self, func: Callable[[str], None]):
|
def work_with_file_locally(self, func: Callable[[str, str], None]):
|
||||||
func(self.local_path)
|
func(self.local_path, self.local_path)
|
||||||
|
|
||||||
|
|
||||||
class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
||||||
@@ -196,10 +196,10 @@ class YandexDiskAdaptiveFile(_AdaptiveFile):
|
|||||||
temp_file.write(file_response.content)
|
temp_file.write(file_response.content)
|
||||||
return temp_file.name
|
return temp_file.name
|
||||||
|
|
||||||
def work_with_file_locally(self, func: Callable[[str], None]):
|
def work_with_file_locally(self, func: Callable[[str, str], None]):
|
||||||
temp_path = self._download_to_temp_file()
|
temp_path = self._download_to_temp_file()
|
||||||
try:
|
try:
|
||||||
func(temp_path)
|
func(self.remote_path, temp_path)
|
||||||
finally:
|
finally:
|
||||||
if os.path.exists(temp_path):
|
if os.path.exists(temp_path):
|
||||||
os.unlink(temp_path)
|
os.unlink(temp_path)
|
||||||
|
|||||||
@@ -47,8 +47,12 @@ Chosen data folder: relatve ./../../../data - from the current folder
|
|||||||
|
|
||||||
- [x] Add log of how many files currently being processed in enrichment. We need to see how many total to process and how many processed each time new document being processed. If it's possible, also add progressbar showing percentage and those numbers on top of logs.
|
- [x] Add log of how many files currently being processed in enrichment. We need to see how many total to process and how many processed each time new document being processed. If it's possible, also add progressbar showing percentage and those numbers on top of logs.
|
||||||
|
|
||||||
# Phase 8 (chat feature, as agent, for usage in the cli)
|
# Phase 8 (comment unsupported formats for now)
|
||||||
|
|
||||||
- [ ] Create file `agent.py`, which will incorporate an agent powered by the chat model. It should use the integration with openai; env variables are configured
|
- [ ] Remove for now formats, extensions for images of any kind, archives of any kind, and add possible text documents, documents formats, like .txt, .xlsx, etc.
|
||||||
- [ ] Integrate this agent with the existing solution for retrieving, with retrieval.py, if it's possible in current chosen RAG framework
|
|
||||||
- [ ] Integrate this agent with the cli, as command to start chatting with the agent. If there is a built-in solution for console communication with the agent, initiate this on cli command.
|
# Phase 9 (integration of Prefect client, for creating flow and tasks on remote Prefect server)
|
||||||
|
|
||||||
|
- [ ] Install Prefect client library.
|
||||||
|
- [ ] Add .env variable PREFECT_API_URL, that will be used for connecting client to the prefect server
|
||||||
|
- [ ] Create
|
||||||
|
|||||||
@@ -6,24 +6,24 @@ processing them with appropriate loaders, splitting them into chunks,
|
|||||||
and storing them in the vector database with proper metadata.
|
and storing them in the vector database with proper metadata.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
|
||||||
import hashlib
|
import hashlib
|
||||||
from pathlib import Path
|
import os
|
||||||
from typing import List, Dict, Any
|
|
||||||
from datetime import datetime
|
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
from llama_index.core import Document, SimpleDirectoryReader
|
||||||
|
from llama_index.core.node_parser import CodeSplitter, SentenceSplitter
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from llama_index.core import SimpleDirectoryReader, Document
|
|
||||||
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter
|
|
||||||
# Removed unused import
|
|
||||||
|
|
||||||
from vector_storage import get_vector_store_and_index
|
|
||||||
|
|
||||||
# Import the new configuration module
|
# Import the new configuration module
|
||||||
from config import get_embedding_model
|
from config import get_embedding_model
|
||||||
|
|
||||||
|
# Removed unused import
|
||||||
|
from vector_storage import get_vector_store_and_index
|
||||||
|
|
||||||
|
|
||||||
class DocumentTracker:
|
class DocumentTracker:
|
||||||
"""Class to handle tracking of processed documents to avoid re-processing."""
|
"""Class to handle tracking of processed documents to avoid re-processing."""
|
||||||
@@ -38,7 +38,7 @@ class DocumentTracker:
|
|||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
# Create table for tracking processed documents
|
# Create table for tracking processed documents
|
||||||
cursor.execute('''
|
cursor.execute("""
|
||||||
CREATE TABLE IF NOT EXISTS processed_documents (
|
CREATE TABLE IF NOT EXISTS processed_documents (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
filename TEXT UNIQUE NOT NULL,
|
filename TEXT UNIQUE NOT NULL,
|
||||||
@@ -47,7 +47,7 @@ class DocumentTracker:
|
|||||||
processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||||
metadata_json TEXT
|
metadata_json TEXT
|
||||||
)
|
)
|
||||||
''')
|
""")
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
conn.close()
|
conn.close()
|
||||||
@@ -63,7 +63,7 @@ class DocumentTracker:
|
|||||||
|
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
"SELECT COUNT(*) FROM processed_documents WHERE filepath = ? AND checksum = ?",
|
"SELECT COUNT(*) FROM processed_documents WHERE filepath = ? AND checksum = ?",
|
||||||
(filepath, checksum)
|
(filepath, checksum),
|
||||||
)
|
)
|
||||||
count = cursor.fetchone()[0]
|
count = cursor.fetchone()[0]
|
||||||
|
|
||||||
@@ -79,11 +79,14 @@ class DocumentTracker:
|
|||||||
filename = Path(filepath).name
|
filename = Path(filepath).name
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cursor.execute('''
|
cursor.execute(
|
||||||
|
"""
|
||||||
INSERT OR REPLACE INTO processed_documents
|
INSERT OR REPLACE INTO processed_documents
|
||||||
(filename, filepath, checksum, processed_at, metadata_json)
|
(filename, filepath, checksum, processed_at, metadata_json)
|
||||||
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?)
|
VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?)
|
||||||
''', (filename, filepath, checksum, str(metadata) if metadata else None))
|
""",
|
||||||
|
(filename, filepath, checksum, str(metadata) if metadata else None),
|
||||||
|
)
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
logger.info(f"Document marked as processed: {filepath}")
|
logger.info(f"Document marked as processed: {filepath}")
|
||||||
@@ -104,62 +107,67 @@ class DocumentTracker:
|
|||||||
|
|
||||||
def get_text_splitter(file_extension: str):
|
def get_text_splitter(file_extension: str):
|
||||||
"""Get appropriate text splitter based on file type."""
|
"""Get appropriate text splitter based on file type."""
|
||||||
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter, TokenTextSplitter
|
from llama_index.core.node_parser import (
|
||||||
from llama_index.core.node_parser import MarkdownElementNodeParser
|
CodeSplitter,
|
||||||
|
MarkdownElementNodeParser,
|
||||||
|
SentenceSplitter,
|
||||||
|
TokenTextSplitter,
|
||||||
|
)
|
||||||
|
|
||||||
# For code files, use CodeSplitter
|
# For code files, use CodeSplitter
|
||||||
if file_extension.lower() in ['.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.go', '.rs', '.php', '.html', '.css', '.md', '.rst']:
|
if file_extension.lower() in [
|
||||||
|
".py",
|
||||||
|
".js",
|
||||||
|
".ts",
|
||||||
|
".java",
|
||||||
|
".cpp",
|
||||||
|
".c",
|
||||||
|
".h",
|
||||||
|
".cs",
|
||||||
|
".go",
|
||||||
|
".rs",
|
||||||
|
".php",
|
||||||
|
".html",
|
||||||
|
".css",
|
||||||
|
".md",
|
||||||
|
".rst",
|
||||||
|
]:
|
||||||
return CodeSplitter(language="python", max_chars=1000)
|
return CodeSplitter(language="python", max_chars=1000)
|
||||||
|
|
||||||
# For PDF files, use a parser that can handle multi-page documents
|
# For PDF files, use a parser that can handle multi-page documents
|
||||||
elif file_extension.lower() == '.pdf':
|
elif file_extension.lower() == ".pdf":
|
||||||
return SentenceSplitter(
|
return SentenceSplitter(
|
||||||
chunk_size=512, # Smaller chunks for dense PDF content
|
chunk_size=512, # Smaller chunks for dense PDF content
|
||||||
chunk_overlap=100
|
chunk_overlap=100,
|
||||||
)
|
)
|
||||||
|
|
||||||
# For presentation files (PowerPoint), use smaller chunks
|
# For presentation files (PowerPoint), use smaller chunks
|
||||||
elif file_extension.lower() == '.pptx':
|
elif file_extension.lower() == ".pptx":
|
||||||
return SentenceSplitter(
|
return SentenceSplitter(
|
||||||
chunk_size=256, # Slides typically have less text
|
chunk_size=256, # Slides typically have less text
|
||||||
chunk_overlap=50
|
chunk_overlap=50,
|
||||||
)
|
)
|
||||||
|
|
||||||
# For spreadsheets, use smaller chunks
|
# For spreadsheets, use smaller chunks
|
||||||
elif file_extension.lower() == '.xlsx':
|
elif file_extension.lower() == ".xlsx":
|
||||||
return SentenceSplitter(
|
return SentenceSplitter(chunk_size=256, chunk_overlap=50)
|
||||||
chunk_size=256,
|
|
||||||
chunk_overlap=50
|
|
||||||
)
|
|
||||||
|
|
||||||
# For text-heavy documents like Word, use medium-sized chunks
|
# For text-heavy documents like Word, use medium-sized chunks
|
||||||
elif file_extension.lower() in ['.docx', '.odt']:
|
elif file_extension.lower() in [".docx", ".odt"]:
|
||||||
return SentenceSplitter(
|
return SentenceSplitter(chunk_size=768, chunk_overlap=150)
|
||||||
chunk_size=768,
|
|
||||||
chunk_overlap=150
|
|
||||||
)
|
|
||||||
|
|
||||||
# For plain text files, use larger chunks
|
# For plain text files, use larger chunks
|
||||||
elif file_extension.lower() == '.txt':
|
elif file_extension.lower() == ".txt":
|
||||||
return SentenceSplitter(
|
return SentenceSplitter(chunk_size=1024, chunk_overlap=200)
|
||||||
chunk_size=1024,
|
|
||||||
chunk_overlap=200
|
|
||||||
)
|
|
||||||
|
|
||||||
# For image files, we'll handle them differently (metadata extraction)
|
# For image files, we'll handle them differently (metadata extraction)
|
||||||
elif file_extension.lower() in ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg']:
|
elif file_extension.lower() in [".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg"]:
|
||||||
# Images will be handled by multimodal models, return a simple splitter
|
# Images will be handled by multimodal models, return a simple splitter
|
||||||
return SentenceSplitter(
|
return SentenceSplitter(chunk_size=512, chunk_overlap=100)
|
||||||
chunk_size=512,
|
|
||||||
chunk_overlap=100
|
|
||||||
)
|
|
||||||
|
|
||||||
# For other files, use a standard SentenceSplitter
|
# For other files, use a standard SentenceSplitter
|
||||||
else:
|
else:
|
||||||
return SentenceSplitter(
|
return SentenceSplitter(chunk_size=768, chunk_overlap=150)
|
||||||
chunk_size=768,
|
|
||||||
chunk_overlap=150
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_proper_encoding(text):
|
def ensure_proper_encoding(text):
|
||||||
@@ -178,35 +186,41 @@ def ensure_proper_encoding(text):
|
|||||||
if isinstance(text, bytes):
|
if isinstance(text, bytes):
|
||||||
# Decode bytes to string with proper encoding
|
# Decode bytes to string with proper encoding
|
||||||
try:
|
try:
|
||||||
return text.decode('utf-8')
|
return text.decode("utf-8")
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
# If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
|
# If UTF-8 fails, try other encodings commonly used for Russian/Cyrillic text
|
||||||
try:
|
try:
|
||||||
return text.decode('cp1251') # Windows Cyrillic encoding
|
return text.decode("cp1251") # Windows Cyrillic encoding
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
try:
|
try:
|
||||||
return text.decode('koi8-r') # Russian encoding
|
return text.decode("koi8-r") # Russian encoding
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
# If all else fails, decode with errors='replace'
|
# If all else fails, decode with errors='replace'
|
||||||
return text.decode('utf-8', errors='replace')
|
return text.decode("utf-8", errors="replace")
|
||||||
elif isinstance(text, str):
|
elif isinstance(text, str):
|
||||||
# Ensure the string is properly encoded
|
# Ensure the string is properly encoded
|
||||||
try:
|
try:
|
||||||
# Try to encode and decode to ensure it's valid UTF-8
|
# Try to encode and decode to ensure it's valid UTF-8
|
||||||
return text.encode('utf-8').decode('utf-8')
|
return text.encode("utf-8").decode("utf-8")
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
# If there are encoding issues, try to fix them
|
# If there are encoding issues, try to fix them
|
||||||
return text.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
|
return text.encode("utf-8", errors="replace").decode(
|
||||||
|
"utf-8", errors="replace"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Convert other types to string and ensure proper encoding
|
# Convert other types to string and ensure proper encoding
|
||||||
text_str = str(text)
|
text_str = str(text)
|
||||||
try:
|
try:
|
||||||
return text_str.encode('utf-8').decode('utf-8')
|
return text_str.encode("utf-8").decode("utf-8")
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
return text_str.encode('utf-8', errors='replace').decode('utf-8', errors='replace')
|
return text_str.encode("utf-8", errors="replace").decode(
|
||||||
|
"utf-8", errors="replace"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def process_documents_from_data_folder(data_path: str = "../../../data", recursive: bool = True):
|
def process_documents_from_data_folder(
|
||||||
|
data_path: str = "../../../data", recursive: bool = True
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
Process all documents from the data folder using appropriate loaders and store in vector DB.
|
Process all documents from the data folder using appropriate loaders and store in vector DB.
|
||||||
|
|
||||||
@@ -238,9 +252,22 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
|||||||
|
|
||||||
# Find all supported files in the data directory
|
# Find all supported files in the data directory
|
||||||
supported_extensions = {
|
supported_extensions = {
|
||||||
'.pdf', '.docx', '.xlsx', '.pptx', '.odt', '.txt',
|
".pdf",
|
||||||
'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg',
|
".docx",
|
||||||
'.zip', '.rar', '.tar', '.gz'
|
".xlsx",
|
||||||
|
".pptx",
|
||||||
|
".odt",
|
||||||
|
".txt",
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".svg",
|
||||||
|
".zip",
|
||||||
|
".rar",
|
||||||
|
".tar",
|
||||||
|
".gz",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Walk through the directory structure
|
# Walk through the directory structure
|
||||||
@@ -267,7 +294,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
|||||||
pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")
|
pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")
|
||||||
|
|
||||||
for file_path in all_files:
|
for file_path in all_files:
|
||||||
logger.info(f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})")
|
logger.info(
|
||||||
|
f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})"
|
||||||
|
)
|
||||||
|
|
||||||
# Check if document has already been processed
|
# Check if document has already been processed
|
||||||
if tracker.is_document_processed(file_path):
|
if tracker.is_document_processed(file_path):
|
||||||
@@ -286,8 +315,7 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
|||||||
return {"filename": filename}
|
return {"filename": filename}
|
||||||
|
|
||||||
reader = SimpleDirectoryReader(
|
reader = SimpleDirectoryReader(
|
||||||
input_files=[file_path],
|
input_files=[file_path], file_metadata=file_metadata_func
|
||||||
file_metadata=file_metadata_func
|
|
||||||
)
|
)
|
||||||
documents = reader.load_data()
|
documents = reader.load_data()
|
||||||
|
|
||||||
@@ -304,24 +332,32 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
|||||||
doc.metadata["processed_at"] = datetime.now().isoformat()
|
doc.metadata["processed_at"] = datetime.now().isoformat()
|
||||||
|
|
||||||
# Handle document-type-specific metadata
|
# Handle document-type-specific metadata
|
||||||
if file_ext.lower() == '.pdf':
|
if file_ext.lower() == ".pdf":
|
||||||
# PDF-specific metadata
|
# PDF-specific metadata
|
||||||
doc.metadata["page_label"] = ensure_proper_encoding(doc.metadata.get("page_label", "unknown"))
|
doc.metadata["page_label"] = ensure_proper_encoding(
|
||||||
|
doc.metadata.get("page_label", "unknown")
|
||||||
|
)
|
||||||
doc.metadata["file_type"] = "pdf"
|
doc.metadata["file_type"] = "pdf"
|
||||||
|
|
||||||
elif file_ext.lower() in ['.docx', '.odt']:
|
elif file_ext.lower() in [".docx", ".odt"]:
|
||||||
# Word document metadata
|
# Word document metadata
|
||||||
doc.metadata["section"] = ensure_proper_encoding(doc.metadata.get("section", "unknown"))
|
doc.metadata["section"] = ensure_proper_encoding(
|
||||||
|
doc.metadata.get("section", "unknown")
|
||||||
|
)
|
||||||
doc.metadata["file_type"] = "document"
|
doc.metadata["file_type"] = "document"
|
||||||
|
|
||||||
elif file_ext.lower() == '.pptx':
|
elif file_ext.lower() == ".pptx":
|
||||||
# PowerPoint metadata
|
# PowerPoint metadata
|
||||||
doc.metadata["slide_id"] = ensure_proper_encoding(doc.metadata.get("slide_id", "unknown"))
|
doc.metadata["slide_id"] = ensure_proper_encoding(
|
||||||
|
doc.metadata.get("slide_id", "unknown")
|
||||||
|
)
|
||||||
doc.metadata["file_type"] = "presentation"
|
doc.metadata["file_type"] = "presentation"
|
||||||
|
|
||||||
elif file_ext.lower() == '.xlsx':
|
elif file_ext.lower() == ".xlsx":
|
||||||
# Excel metadata
|
# Excel metadata
|
||||||
doc.metadata["sheet_name"] = ensure_proper_encoding(doc.metadata.get("sheet_name", "unknown"))
|
doc.metadata["sheet_name"] = ensure_proper_encoding(
|
||||||
|
doc.metadata.get("sheet_name", "unknown")
|
||||||
|
)
|
||||||
doc.metadata["file_type"] = "spreadsheet"
|
doc.metadata["file_type"] = "spreadsheet"
|
||||||
|
|
||||||
# Determine the appropriate text splitter based on file type
|
# Determine the appropriate text splitter based on file type
|
||||||
@@ -334,7 +370,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
|||||||
nodes_with_enhanced_metadata = []
|
nodes_with_enhanced_metadata = []
|
||||||
for i, node in enumerate(nodes):
|
for i, node in enumerate(nodes):
|
||||||
# Enhance node metadata with additional information
|
# Enhance node metadata with additional information
|
||||||
node.metadata["original_doc_id"] = ensure_proper_encoding(doc.doc_id)
|
node.metadata["original_doc_id"] = ensure_proper_encoding(
|
||||||
|
doc.doc_id
|
||||||
|
)
|
||||||
node.metadata["chunk_number"] = i
|
node.metadata["chunk_number"] = i
|
||||||
node.metadata["total_chunks"] = len(nodes)
|
node.metadata["total_chunks"] = len(nodes)
|
||||||
node.metadata["file_path"] = encoded_file_path
|
node.metadata["file_path"] = encoded_file_path
|
||||||
@@ -362,7 +400,9 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
|||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
|
|
||||||
pbar.close()
|
pbar.close()
|
||||||
logger.info(f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}")
|
logger.info(
|
||||||
|
f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def enrich_documents():
|
def enrich_documents():
|
||||||
|
|||||||
Reference in New Issue
Block a user