Working Yandex Disk integration for loading files. Tests for local and Yandex
This commit is contained in:
@@ -2,9 +2,13 @@
|
||||
|
||||
import os
|
||||
import re
|
||||
from abc import abstractmethod
|
||||
import tempfile
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Callable, List
|
||||
from typing import Callable, Iterator, List
|
||||
|
||||
import requests
|
||||
from loguru import logger
|
||||
|
||||
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")
|
||||
|
||||
@@ -109,7 +113,7 @@ def extract_russian_event_names(text: str) -> List[str]:
|
||||
return events
|
||||
|
||||
|
||||
class _AdaptiveFile:
|
||||
class _AdaptiveFile(ABC):
|
||||
extension: str # Format: .jpg
|
||||
local_path: str
|
||||
|
||||
@@ -122,14 +126,14 @@ class _AdaptiveFile:
|
||||
# Lambda: first argument is a local path
|
||||
@abstractmethod
|
||||
def work_with_file_locally(self, func: Callable[[str], None]):
|
||||
pass
|
||||
"""Run callback with a local path to the file."""
|
||||
|
||||
|
||||
class _AdaptiveCollection:
|
||||
class _AdaptiveCollection(ABC):
|
||||
# Generator method with yield
|
||||
@abstractmethod
|
||||
def iterate(self, recursive: bool):
|
||||
pass
|
||||
def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
|
||||
"""Iterate files in collection."""
|
||||
|
||||
|
||||
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
||||
@@ -145,8 +149,119 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
||||
|
||||
self.base_dir = base_dir
|
||||
|
||||
def iterate(self, recursive: bool):
|
||||
def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
|
||||
for root, dirs, files in os.walk(self.base_dir):
|
||||
for file in files:
|
||||
full_path = os.path.join(root, file)
|
||||
yield _AdaptiveFile(Path(full_path).suffix, full_path)
|
||||
yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)
|
||||
|
||||
if not recursive:
|
||||
break
|
||||
|
||||
|
||||
class YandexDiskAdaptiveFile(_AdaptiveFile):
    """Adaptive file representation for Yandex Disk resources.

    ``local_path`` holds the path of the resource on Yandex Disk (it is sent
    as the ``path`` parameter of the download request). The content is only
    materialised on the local filesystem, in a temporary file, for the
    duration of a ``work_with_file_locally`` call.
    """

    _DOWNLOAD_URL = "https://cloud-api.yandex.net/v1/disk/resources/download"
    _CHUNK_SIZE = 1024 * 1024  # bytes written to disk per streamed chunk

    def __init__(self, extension: str, local_path: str, token: str):
        """Store the resource path, its extension and the OAuth token."""
        super().__init__(extension, local_path)
        self.token = token

    def _download_to_temp_file(self) -> str:
        """Download the resource into a temporary file and return its path.

        The caller owns the returned file and is responsible for deleting it.

        Raises:
            requests.HTTPError: if either the link request or the download
                itself answers with an error status.
            KeyError: if the link response carries no ``href`` field.
        """
        headers = {"Authorization": f"OAuth {self.token}"}
        response = requests.get(
            self._DOWNLOAD_URL,
            headers=headers,
            params={"path": self.local_path},
            timeout=30,
        )
        response.raise_for_status()
        href = response.json()["href"]

        suffix = Path(self.local_path).suffix
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            # Stream straight to disk so large files are never held in memory
            # as a whole.
            with temp_file, requests.get(
                href, stream=True, timeout=120
            ) as file_response:
                file_response.raise_for_status()
                for chunk in file_response.iter_content(
                    chunk_size=self._CHUNK_SIZE
                ):
                    if chunk:
                        temp_file.write(chunk)
        except BaseException:
            # delete=False means nobody else removes a partially written
            # file; the handle is already closed by the ``with`` above.
            os.unlink(temp_file.name)
            raise
        return temp_file.name

    def work_with_file_locally(self, func: Callable[[str], None]):
        """Download the file, run ``func`` on its local path, then delete it.

        The temporary file is removed even when ``func`` raises.
        """
        temp_path = self._download_to_temp_file()
        try:
            func(temp_path)
        finally:
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                # The callback may have moved or removed the file itself.
                pass
|
||||
|
||||
|
||||
class YandexDiskAdaptiveCollection(_AdaptiveCollection):
    """Adaptive collection implementation for Yandex Disk.

    Lists the files found under ``base_dir`` on Yandex Disk and yields them
    as ``YandexDiskAdaptiveFile`` objects.
    """

    _RESOURCES_URL = "https://cloud-api.yandex.net/v1/disk/resources"
    _PAGE_SIZE = 1000  # children requested per listing call
    _REQUEST_TIMEOUT = 30  # seconds

    def __init__(self, token: str, base_dir: str):
        """Remember the OAuth token and the base path to iterate.

        Raises:
            ValueError: if ``token`` is empty.
        """
        if not token:
            raise ValueError("Yandex Disk token is required")

        self.token = token
        self.base_dir = base_dir
        self._headers = {"Authorization": f"OAuth {self.token}"}

    @staticmethod
    def _normalize_disk_path(path: str) -> str:
        """Return ``path`` prefixed with ``disk:/`` unless it already is.

        Leading slashes of an unprefixed path are dropped first.
        """
        return path if path.startswith("disk:/") else f"disk:/{path.lstrip('/')}"

    def _request_resource(self, path: str, **params) -> dict:
        """GET the resources endpoint for ``path`` and return the JSON body.

        Extra keyword arguments are sent as additional query parameters.

        Raises:
            requests.HTTPError: if the API answers with an error status.
        """
        response = requests.get(
            self._RESOURCES_URL,
            headers=self._headers,
            params={"path": path, **params},
            timeout=self._REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def _get_resource_info(self, path: str) -> dict:
        """Return the metadata of the resource at ``path``.

        Only the top-level fields (such as ``type`` and ``path``) are used by
        this class, so the embedded child listing is kept to a single item
        instead of a full page.
        """
        return self._request_resource(path, limit=1)

    def _iter_children(self, path: str) -> Iterator[dict]:
        """Yield every child item of the directory at ``path``.

        Pages through the listing ``_PAGE_SIZE`` items at a time and stops at
        the first empty or short page.
        """
        offset = 0
        while True:
            payload = self._request_resource(
                path, limit=self._PAGE_SIZE, offset=offset
            )
            items = payload.get("_embedded", {}).get("items", [])
            if not items:
                break

            yield from items

            # A short page means there is nothing left to fetch.
            if len(items) < self._PAGE_SIZE:
                break
            offset += self._PAGE_SIZE

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield the files under ``base_dir``.

        If ``base_dir`` is itself a file, only that file is yielded. Otherwise
        the files directly inside the directory are yielded and, when
        ``recursive`` is true, sub-directories are walked breadth-first.
        """
        # Local import: keeps this block self-contained without touching the
        # module's import section.
        from collections import deque

        root_path = self._normalize_disk_path(self.base_dir)
        root_info = self._get_resource_info(root_path)

        if root_info.get("type") == "file":
            path = root_info["path"]
            logger.info(f"Found file on Yandex Disk: {path}")
            yield YandexDiskAdaptiveFile(Path(path).suffix, path, self.token)
            return

        # deque gives O(1) pops from the left; list.pop(0) is O(n).
        directories = deque([root_path])
        while directories:
            current_dir = directories.popleft()
            for item in self._iter_children(current_dir):
                item_type = item.get("type")
                item_path = item.get("path")
                if not item_path:
                    # Without a path the item can neither be downloaded nor
                    # descended into.
                    continue
                if item_type == "file":
                    logger.info(f"Found file on Yandex Disk: {item_path}")
                    yield YandexDiskAdaptiveFile(
                        Path(item_path).suffix, item_path, self.token
                    )
                elif recursive and item_type == "dir":
                    directories.append(item_path)
|
||||
|
||||
Reference in New Issue
Block a user