Prefect client prep for langchain

This commit is contained in:
2026-02-16 15:12:44 +03:00
parent 93d538ecc6
commit 77c578c9e6
6 changed files with 148 additions and 94 deletions

View File

@@ -81,13 +81,13 @@ SUPPORTED_EXTENSIONS = {
".pptx",
".xlsx",
".xls",
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".tiff",
".webp",
# ".jpg",
# ".jpeg",
# ".png",
# ".gif",
# ".bmp",
# ".tiff",
# ".webp",
".odt",
".txt", # an obvious format, though it was unexpected to find it in the data
}
@@ -273,7 +273,7 @@ class DocumentEnricher:
extension = adaptive_file.extension.lower()
file_type = try_guess_file_type(extension)
def process_local_file(local_file_path: str):
def process_local_file(original_path: str, local_file_path: str):
nonlocal loaded_docs, processed_record
file_hash = self._get_file_hash(local_file_path)
@@ -295,7 +295,7 @@ class DocumentEnricher:
doc.metadata["file_type"] = file_type
doc.metadata["source"] = source_identifier
doc.metadata["filename"] = adaptive_file.filename
doc.metadata["file_path"] = source_identifier
doc.metadata["file_path"] = original_path
doc.metadata["file_size"] = os.path.getsize(local_file_path)
doc.metadata["file_extension"] = extension
@@ -310,7 +310,7 @@ class DocumentEnricher:
)
loaded_docs = split_docs
processed_record = (source_identifier, file_hash)
processed_record = (original_path, file_hash)
adaptive_file.work_with_file_locally(process_local_file)
return loaded_docs, processed_record