Prep for Phase 12 of loading files for enrichment through the adaptive collections

This commit is contained in:
2026-02-10 21:42:59 +03:00
parent 06a3155b6b
commit e9dd28ad55
3 changed files with 38 additions and 10 deletions

View File

@@ -116,10 +116,11 @@ def extract_russian_event_names(text: str) -> List[str]:
class _AdaptiveFile(ABC):
extension: str # Format: .jpg
local_path: str
filename: str
def __init__(self, extension: str, local_path: str):
def __init__(self, filename: str, extension: str):
self.filename = filename
self.extension = extension
self.local_path = local_path
# This method lets the caller work with the file locally; the caller provides a callable (e.g. a lambda) that does the work.
# Why a separate method? So that the file can be downloaded first if needed, and so that any cleanup can happen after the work is done.
@@ -137,6 +138,12 @@ class _AdaptiveCollection(ABC):
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
    """Adaptive file backed by a path on the local filesystem."""

    # Location of the file on disk; handed to the worker callable as-is.
    local_path: str

    def __init__(self, filename: str, extension: str, local_path: str):
        """Remember the file's name, extension and on-disk location.

        Args:
            filename: Name of the file; forwarded to the base initializer.
            extension: File extension (e.g. ``.jpg``); forwarded to the
                base initializer.
            local_path: Path of the file on the local filesystem.
        """
        super().__init__(filename, extension)
        self.local_path = local_path

    def work_with_file_locally(self, func: Callable[[str], None]):
        """Call ``func`` with this file's local path.

        Nothing is downloaded or cleaned up here: the callable is invoked
        directly with ``self.local_path``.

        Args:
            func: Callable that receives the local path as its only argument.
        """
        path_on_disk = self.local_path
        func(path_on_disk)
@@ -153,7 +160,8 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
for root, dirs, files in os.walk(self.base_dir):
for file in files:
full_path = os.path.join(root, file)
yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)
p = Path(full_path)
yield LocalFilesystemAdaptiveFile(p.name, p.suffix, full_path)
if not recursive:
break
@@ -162,16 +170,19 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
class YandexDiskAdaptiveFile(_AdaptiveFile):
"""Adaptive file representation for Yandex Disk resources."""
def __init__(self, extension: str, local_path: str, token: str):
super().__init__(extension, local_path)
remote_path: str
def __init__(self, filename: str, extension: str, remote_path: str, token: str):
super().__init__(filename, extension)
self.token = token
self.remote_path = remote_path
def _download_to_temp_file(self) -> str:
headers = {"Authorization": f"OAuth {self.token}"}
response = requests.get(
"https://cloud-api.yandex.net/v1/disk/resources/download",
headers=headers,
params={"path": self.local_path},
params={"path": self.remote_path},
timeout=30,
)
response.raise_for_status()
@@ -180,7 +191,8 @@ class YandexDiskAdaptiveFile(_AdaptiveFile):
file_response = requests.get(href, timeout=120)
file_response.raise_for_status()
suffix = Path(self.local_path).suffix
p = Path(self.remote_path)
suffix = p.suffix
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
temp_file.write(file_response.content)
return temp_file.name
@@ -249,7 +261,8 @@ class YandexDiskAdaptiveCollection(_AdaptiveCollection):
if root_info.get("type") == "file":
path = root_info["path"]
logger.info(f"Found file on Yandex Disk: {path}")
yield YandexDiskAdaptiveFile(Path(path).suffix, path, self.token)
p = Path(path)
yield YandexDiskAdaptiveFile(p.name, p.suffix, path, self.token)
return
directories = [root_path]
@@ -257,11 +270,12 @@ class YandexDiskAdaptiveCollection(_AdaptiveCollection):
current_dir = directories.pop(0)
for item in self._iter_children(current_dir):
item_type = item.get("type")
item_path = item.get("path")
item_path = str(item.get("path"))
if item_type == "file":
logger.info(f"Found file on Yandex Disk: {item_path}")
p = Path(item_path)
yield YandexDiskAdaptiveFile(
Path(item_path).suffix, item_path, self.token
p.name, p.suffix, item_path, self.token
)
elif recursive and item_type == "dir":
directories.append(item_path)