llamaindex update + unpacking archives in data
This commit is contained in:
@@ -13,6 +13,7 @@ from typing import List, Dict, Any
|
||||
from datetime import datetime
|
||||
import sqlite3
|
||||
from loguru import logger
|
||||
from tqdm import tqdm
|
||||
|
||||
from llama_index.core import SimpleDirectoryReader, Document
|
||||
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter
|
||||
@@ -20,6 +21,9 @@ from llama_index.core.node_parser import SentenceSplitter, CodeSplitter
|
||||
|
||||
from vector_storage import get_vector_store_and_index
|
||||
|
||||
# Import the new configuration module
|
||||
from config import get_embedding_model
|
||||
|
||||
|
||||
class DocumentTracker:
|
||||
"""Class to handle tracking of processed documents to avoid re-processing."""
|
||||
@@ -259,13 +263,18 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
processed_count = 0
|
||||
skipped_count = 0
|
||||
|
||||
# Initialize progress bar
|
||||
pbar = tqdm(total=len(all_files), desc="Processing documents", unit="file")
|
||||
|
||||
for file_path in all_files:
|
||||
logger.info(f"Processing file: {file_path}")
|
||||
logger.info(f"Processing file: {file_path} ({processed_count + skipped_count + 1}/{len(all_files)})")
|
||||
|
||||
# Check if document has already been processed
|
||||
if tracker.is_document_processed(file_path):
|
||||
logger.info(f"Skipping already processed file: {file_path}")
|
||||
skipped_count += 1
|
||||
pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})
|
||||
pbar.update(1)
|
||||
continue
|
||||
|
||||
try:
|
||||
@@ -344,11 +353,15 @@ def process_documents_from_data_folder(data_path: str = "../../../data", recursi
|
||||
# Mark document as processed only after successful insertion
|
||||
tracker.mark_document_processed(file_path, {"nodes_count": len(documents)})
|
||||
processed_count += 1
|
||||
pbar.set_postfix({"Processed": processed_count, "Skipped": skipped_count})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing file {file_path}: {str(e)}")
|
||||
continue
|
||||
|
||||
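# Update progress bar after a successfully processed file (NOTE(review): the `continue` in the except branch above skips this call, so failed files never advance the bar — move the update into a `finally:` block or drop the `continue` if it should really run regardless of success or failure)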
|
||||
pbar.update(1)
|
||||
|
||||
pbar.close()
|
||||
logger.info(f"Document enrichment completed. Processed: {processed_count}, Skipped: {skipped_count}")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user