Working Yandex Disk integration for loading files. Tests for local and Yandex

This commit is contained in:
2026-02-10 20:42:07 +03:00
parent 63c3e2c5c7
commit 06a3155b6b
8 changed files with 222 additions and 12 deletions

View File

@@ -2,9 +2,13 @@
import os
import re
from abc import abstractmethod
import tempfile
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Callable, List
from typing import Callable, Iterator, List
import requests
from loguru import logger
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")
@@ -109,7 +113,7 @@ def extract_russian_event_names(text: str) -> List[str]:
return events
class _AdaptiveFile:
class _AdaptiveFile(ABC):
extension: str # Format: .jpg
local_path: str
@@ -122,14 +126,14 @@ class _AdaptiveFile:
# Lambda: first argument is a local path
@abstractmethod
def work_with_file_locally(self, func: Callable[[str], None]):
pass
"""Run callback with a local path to the file."""
class _AdaptiveCollection:
class _AdaptiveCollection(ABC):
# Generator method with yield
@abstractmethod
def iterate(self, recursive: bool):
pass
def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
"""Iterate files in collection."""
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
@@ -145,8 +149,119 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
self.base_dir = base_dir
def iterate(self, recursive: bool):
def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
for root, dirs, files in os.walk(self.base_dir):
for file in files:
full_path = os.path.join(root, file)
yield _AdaptiveFile(Path(full_path).suffix, full_path)
yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)
if not recursive:
break
class YandexDiskAdaptiveFile(_AdaptiveFile):
    """Adaptive file representation for Yandex Disk resources.

    ``local_path`` holds the path of the resource on Yandex Disk; the file is
    only materialised on the local filesystem for the duration of a
    ``work_with_file_locally`` call.
    """

    # Chunk size used when streaming the download to disk.
    _DOWNLOAD_CHUNK_SIZE = 1024 * 1024

    def __init__(self, extension: str, local_path: str, token: str):
        """Store the file description and the OAuth token used for downloads.

        Args:
            extension: File extension, passed through to the base class.
            local_path: Path of the resource on Yandex Disk.
            token: OAuth token sent in the ``Authorization`` header.
        """
        super().__init__(extension, local_path)
        self.token = token

    def _download_to_temp_file(self) -> str:
        """Download the resource into a temporary file and return its path.

        The download link is requested from the Yandex Disk API first, then
        the content is streamed to disk in chunks so that large files are
        never held in memory as a whole. The caller owns the returned file
        and is responsible for deleting it.

        Raises:
            requests.HTTPError: If either HTTP request returns an error status.
            KeyError: If the API response does not contain ``href``.
        """
        headers = {"Authorization": f"OAuth {self.token}"}
        response = requests.get(
            "https://cloud-api.yandex.net/v1/disk/resources/download",
            headers=headers,
            params={"path": self.local_path},
            timeout=30,
        )
        response.raise_for_status()
        href = response.json()["href"]

        suffix = Path(self.local_path).suffix
        with requests.get(href, stream=True, timeout=120) as file_response:
            # Fail before creating the temp file when the download is rejected.
            file_response.raise_for_status()
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
            try:
                with temp_file:
                    for chunk in file_response.iter_content(
                        chunk_size=self._DOWNLOAD_CHUNK_SIZE
                    ):
                        if chunk:
                            temp_file.write(chunk)
            except BaseException:
                # The file was created with delete=False, so a partially
                # written download must be removed explicitly before the
                # error is re-raised.
                if os.path.exists(temp_file.name):
                    os.unlink(temp_file.name)
                raise
        return temp_file.name

    def work_with_file_locally(self, func: Callable[[str], None]):
        """Download the file, run ``func`` with its local path, then delete it.

        The temporary file is removed even when ``func`` raises.
        """
        temp_path = self._download_to_temp_file()
        try:
            func(temp_path)
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
class YandexDiskAdaptiveCollection(_AdaptiveCollection):
    """Adaptive collection implementation for Yandex Disk.

    Lists files below ``base_dir`` through the Yandex Disk REST API and yields
    them as ``YandexDiskAdaptiveFile`` objects.
    """

    _RESOURCES_URL = "https://cloud-api.yandex.net/v1/disk/resources"
    # Number of embedded items requested per API call; also the page step.
    _PAGE_SIZE = 1000
    _REQUEST_TIMEOUT = 30

    def __init__(self, token: str, base_dir: str):
        """Create a collection rooted at ``base_dir``.

        Args:
            token: OAuth token for the Yandex Disk API.
            base_dir: Disk path of the root file or directory, with or
                without the ``disk:/`` prefix.

        Raises:
            ValueError: If ``token`` is empty.
        """
        if not token:
            raise ValueError("Yandex Disk token is required")
        self.token = token
        self.base_dir = base_dir
        self._headers = {"Authorization": f"OAuth {self.token}"}

    @staticmethod
    def _normalize_disk_path(path: str) -> str:
        """Return ``path`` with the ``disk:/`` prefix, adding it if absent."""
        return path if path.startswith("disk:/") else f"disk:/{path.lstrip('/')}"

    def _fetch_resource(self, path: str, offset=None) -> dict:
        """Request resource metadata for ``path`` and return the JSON payload.

        ``offset`` is only sent when given, so the metadata request and the
        paginated listing requests share one implementation.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        params = {"path": path, "limit": self._PAGE_SIZE}
        if offset is not None:
            params["offset"] = offset
        response = requests.get(
            self._RESOURCES_URL,
            headers=self._headers,
            params=params,
            timeout=self._REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def _get_resource_info(self, path: str) -> dict:
        """Return the metadata payload for the resource at ``path``."""
        return self._fetch_resource(path)

    def _iter_children(self, path: str) -> Iterator[dict]:
        """Yield the items embedded in the directory at ``path``, page by page."""
        offset = 0
        while True:
            payload = self._fetch_resource(path, offset)
            items = payload.get("_embedded", {}).get("items", [])
            yield from items
            # A short (or empty) page means there is nothing left to fetch.
            if len(items) < self._PAGE_SIZE:
                break
            offset += self._PAGE_SIZE

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield every file in the collection.

        If the root resource is a file, only that file is yielded. Otherwise
        directories are walked breadth-first; subdirectories are visited only
        when ``recursive`` is true.
        """
        from collections import deque

        root_path = self._normalize_disk_path(self.base_dir)
        root_info = self._get_resource_info(root_path)

        if root_info.get("type") == "file":
            path = root_info["path"]
            logger.info(f"Found file on Yandex Disk: {path}")
            yield YandexDiskAdaptiveFile(Path(path).suffix, path, self.token)
            return

        # FIFO queue keeps the breadth-first order; popleft() is O(1).
        directories = deque([root_path])
        while directories:
            current_dir = directories.popleft()
            for item in self._iter_children(current_dir):
                item_type = item.get("type")
                item_path = item.get("path")
                if not item_path:
                    # Without a path the item can neither be downloaded nor
                    # descended into, so it is skipped.
                    continue
                if item_type == "file":
                    logger.info(f"Found file on Yandex Disk: {item_path}")
                    yield YandexDiskAdaptiveFile(
                        Path(item_path).suffix, item_path, self.token
                    )
                elif recursive and item_type == "dir":
                    directories.append(item_path)