Prep for Phase 12 of loading files for enrichment through the adaptive collections
This commit is contained in:
@@ -7,3 +7,6 @@ QDRANT_HOST=HOST
|
|||||||
QDRANT_REST_PORT=PORT
|
QDRANT_REST_PORT=PORT
|
||||||
QDRANT_GRPC_PORT=PORT
|
QDRANT_GRPC_PORT=PORT
|
||||||
YADISK_TOKEN=TOKEN
|
YADISK_TOKEN=TOKEN
|
||||||
|
ENRICHMENT_SOURCE=local/yadisk
|
||||||
|
ENRICHMENT_LOCAL_PATH=path
|
||||||
|
ENRICHMENT_YADISK_PATH=path
|
||||||
|
|||||||
@@ -73,3 +73,14 @@ Chosen data folder: relative ./../../../data - from the current folder
|
|||||||
- [x] Write tests for local filesystem implementation, using test/samples folder filled with files and directories for testing of iteration and recursion
|
- [x] Write tests for local filesystem implementation, using test/samples folder filled with files and directories for testing of iteration and recursion
|
||||||
- [x] Create Yandex Disk implementation of the Adaptive Collection. Constructor should have requirement for TOKEN for Yandex Disk.
|
- [x] Create Yandex Disk implementation of the Adaptive Collection. Constructor should have requirement for TOKEN for Yandex Disk.
|
||||||
- [x] Write tests for Yandex Disk implementation, using folder "Общая/Информация". .env.test has YADISK_TOKEN variable for connecting. While testing log output of found files during iterating. If test fails at this step, leave to manual fixing, and this step can be marked as done.
|
- [x] Write tests for Yandex Disk implementation, using folder "Общая/Информация". .env.test has YADISK_TOKEN variable for connecting. While testing log output of found files during iterating. If test fails at this step, leave to manual fixing, and this step can be marked as done.
|
||||||
|
|
||||||
|
# Phase 12 (using local file system or yandex disk)
|
||||||
|
|
||||||
|
During enrichment, we should use the adaptive collection from the helpers for loading documents. We should not use the local filesystem directly, but go through the adaptive collection as a wrapper.
|
||||||
|
|
||||||
|
- [ ] The adaptive file in the helpers now has a filename in it, so the tests should be adjusted for this
|
||||||
|
- [ ] Add conditional usage of the adaptive collection in the enrichment stage. .env now has a variable ENRICHMENT_SOURCE with 2 possible values: yadisk, local
|
||||||
|
- [ ] With local source, use env variable for local filesystem adaptive collection: ENRICHMENT_LOCAL_PATH
|
||||||
|
- [ ] With yadisk source, use env variable YADISK_TOKEN as the token for auth within Yandex Disk, and ENRICHMENT_YADISK_PATH for the path on the Yandex Disk system
|
||||||
|
- [ ] We will still need a list of filetypes to skip, so while iterating over files we need to check their extension and skip the ones on that list.
|
||||||
|
- [ ] Adaptive files have a filename in them, so it should be used when extracting metadata
|
||||||
|
|||||||
@@ -116,10 +116,11 @@ def extract_russian_event_names(text: str) -> List[str]:
|
|||||||
class _AdaptiveFile(ABC):
|
class _AdaptiveFile(ABC):
|
||||||
extension: str # Format: .jpg
|
extension: str # Format: .jpg
|
||||||
local_path: str
|
local_path: str
|
||||||
|
filename: str
|
||||||
|
|
||||||
def __init__(self, extension: str, local_path: str):
|
def __init__(self, filename: str, extension: str):
|
||||||
|
self.filename = filename
|
||||||
self.extension = extension
|
self.extension = extension
|
||||||
self.local_path = local_path
|
|
||||||
|
|
||||||
# This method allows to work with file locally, and lambda should be provided for this.
|
# This method allows to work with file locally, and lambda should be provided for this.
|
||||||
# Why separate method? For possible cleanup after work is done. And to download file, if needed
|
# Why separate method? For possible cleanup after work is done. And to download file, if needed
|
||||||
@@ -137,6 +138,12 @@ class _AdaptiveCollection(ABC):
|
|||||||
|
|
||||||
|
|
||||||
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
||||||
|
local_path: str
|
||||||
|
|
||||||
|
def __init__(self, filename: str, extension: str, local_path: str):
|
||||||
|
super().__init__(filename, extension)
|
||||||
|
self.local_path = local_path
|
||||||
|
|
||||||
def work_with_file_locally(self, func: Callable[[str], None]):
|
def work_with_file_locally(self, func: Callable[[str], None]):
|
||||||
func(self.local_path)
|
func(self.local_path)
|
||||||
|
|
||||||
@@ -153,7 +160,8 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
|||||||
for root, dirs, files in os.walk(self.base_dir):
|
for root, dirs, files in os.walk(self.base_dir):
|
||||||
for file in files:
|
for file in files:
|
||||||
full_path = os.path.join(root, file)
|
full_path = os.path.join(root, file)
|
||||||
yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)
|
p = Path(full_path)
|
||||||
|
yield LocalFilesystemAdaptiveFile(p.name, p.suffix, full_path)
|
||||||
|
|
||||||
if not recursive:
|
if not recursive:
|
||||||
break
|
break
|
||||||
@@ -162,16 +170,19 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
|||||||
class YandexDiskAdaptiveFile(_AdaptiveFile):
|
class YandexDiskAdaptiveFile(_AdaptiveFile):
|
||||||
"""Adaptive file representation for Yandex Disk resources."""
|
"""Adaptive file representation for Yandex Disk resources."""
|
||||||
|
|
||||||
def __init__(self, extension: str, local_path: str, token: str):
|
remote_path: str
|
||||||
super().__init__(extension, local_path)
|
|
||||||
|
def __init__(self, filename: str, extension: str, remote_path: str, token: str):
|
||||||
|
super().__init__(filename, extension)
|
||||||
self.token = token
|
self.token = token
|
||||||
|
self.remote_path = remote_path
|
||||||
|
|
||||||
def _download_to_temp_file(self) -> str:
|
def _download_to_temp_file(self) -> str:
|
||||||
headers = {"Authorization": f"OAuth {self.token}"}
|
headers = {"Authorization": f"OAuth {self.token}"}
|
||||||
response = requests.get(
|
response = requests.get(
|
||||||
"https://cloud-api.yandex.net/v1/disk/resources/download",
|
"https://cloud-api.yandex.net/v1/disk/resources/download",
|
||||||
headers=headers,
|
headers=headers,
|
||||||
params={"path": self.local_path},
|
params={"path": self.remote_path},
|
||||||
timeout=30,
|
timeout=30,
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
@@ -180,7 +191,8 @@ class YandexDiskAdaptiveFile(_AdaptiveFile):
|
|||||||
file_response = requests.get(href, timeout=120)
|
file_response = requests.get(href, timeout=120)
|
||||||
file_response.raise_for_status()
|
file_response.raise_for_status()
|
||||||
|
|
||||||
suffix = Path(self.local_path).suffix
|
p = Path(self.remote_path)
|
||||||
|
suffix = p.suffix
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
||||||
temp_file.write(file_response.content)
|
temp_file.write(file_response.content)
|
||||||
return temp_file.name
|
return temp_file.name
|
||||||
@@ -249,7 +261,8 @@ class YandexDiskAdaptiveCollection(_AdaptiveCollection):
|
|||||||
if root_info.get("type") == "file":
|
if root_info.get("type") == "file":
|
||||||
path = root_info["path"]
|
path = root_info["path"]
|
||||||
logger.info(f"Found file on Yandex Disk: {path}")
|
logger.info(f"Found file on Yandex Disk: {path}")
|
||||||
yield YandexDiskAdaptiveFile(Path(path).suffix, path, self.token)
|
p = Path(path)
|
||||||
|
yield YandexDiskAdaptiveFile(p.name, p.suffix, path, self.token)
|
||||||
return
|
return
|
||||||
|
|
||||||
directories = [root_path]
|
directories = [root_path]
|
||||||
@@ -257,11 +270,12 @@ class YandexDiskAdaptiveCollection(_AdaptiveCollection):
|
|||||||
current_dir = directories.pop(0)
|
current_dir = directories.pop(0)
|
||||||
for item in self._iter_children(current_dir):
|
for item in self._iter_children(current_dir):
|
||||||
item_type = item.get("type")
|
item_type = item.get("type")
|
||||||
item_path = item.get("path")
|
item_path = str(item.get("path"))
|
||||||
if item_type == "file":
|
if item_type == "file":
|
||||||
logger.info(f"Found file on Yandex Disk: {item_path}")
|
logger.info(f"Found file on Yandex Disk: {item_path}")
|
||||||
|
p = Path(item_path)
|
||||||
yield YandexDiskAdaptiveFile(
|
yield YandexDiskAdaptiveFile(
|
||||||
Path(item_path).suffix, item_path, self.token
|
p.name, p.suffix, item_path, self.token
|
||||||
)
|
)
|
||||||
elif recursive and item_type == "dir":
|
elif recursive and item_type == "dir":
|
||||||
directories.append(item_path)
|
directories.append(item_path)
|
||||||
|
|||||||
Reference in New Issue
Block a user