Prefect client prep for langchain
This commit is contained in:
@@ -81,13 +81,13 @@ SUPPORTED_EXTENSIONS = {
|
||||
".pptx",
|
||||
".xlsx",
|
||||
".xls",
|
||||
".jpg",
|
||||
".jpeg",
|
||||
".png",
|
||||
".gif",
|
||||
".bmp",
|
||||
".tiff",
|
||||
".webp",
|
||||
# ".jpg",
|
||||
# ".jpeg",
|
||||
# ".png",
|
||||
# ".gif",
|
||||
# ".bmp",
|
||||
# ".tiff",
|
||||
# ".webp",
|
||||
".odt",
|
||||
".txt", # this one is obvious but was unexpected to see in data lol
|
||||
}
|
||||
@@ -273,7 +273,7 @@ class DocumentEnricher:
|
||||
extension = adaptive_file.extension.lower()
|
||||
file_type = try_guess_file_type(extension)
|
||||
|
||||
def process_local_file(local_file_path: str):
|
||||
def process_local_file(original_path: str, local_file_path: str):
|
||||
nonlocal loaded_docs, processed_record
|
||||
|
||||
file_hash = self._get_file_hash(local_file_path)
|
||||
@@ -295,7 +295,7 @@ class DocumentEnricher:
|
||||
doc.metadata["file_type"] = file_type
|
||||
doc.metadata["source"] = source_identifier
|
||||
doc.metadata["filename"] = adaptive_file.filename
|
||||
doc.metadata["file_path"] = source_identifier
|
||||
doc.metadata["file_path"] = original_path
|
||||
doc.metadata["file_size"] = os.path.getsize(local_file_path)
|
||||
doc.metadata["file_extension"] = extension
|
||||
|
||||
@@ -310,7 +310,7 @@ class DocumentEnricher:
|
||||
)
|
||||
|
||||
loaded_docs = split_docs
|
||||
processed_record = (source_identifier, file_hash)
|
||||
processed_record = (original_path, file_hash)
|
||||
|
||||
adaptive_file.work_with_file_locally(process_local_file)
|
||||
return loaded_docs, processed_record
|
||||
|
||||
Reference in New Issue
Block a user