2026-02-10 13:20:19 +03:00
|
|
|
|
"""Helper utilities for metadata extraction from Russian text."""
|
|
|
|
|
|
|
2026-02-10 20:12:43 +03:00
|
|
|
|
import os
|
2026-02-10 13:20:19 +03:00
|
|
|
|
import re
|
2026-02-10 20:42:07 +03:00
|
|
|
|
import tempfile
|
|
|
|
|
|
from abc import ABC, abstractmethod
|
2026-02-10 20:12:43 +03:00
|
|
|
|
from pathlib import Path
|
2026-02-10 20:42:07 +03:00
|
|
|
|
from typing import Callable, Iterator, List
|
|
|
|
|
|
|
|
|
|
|
|
import requests
|
|
|
|
|
|
from loguru import logger
|
2026-02-10 13:20:19 +03:00
|
|
|
|
|
|
|
|
|
|
# Matches a standalone 4-digit year in [1000, 2100]; the lookarounds reject
# digits glued to longer numbers (e.g. the "2024" inside "120245").
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")
|
|
|
|
|
|
|
|
|
|
|
|
# Lowercase stems of Russian event-type nouns (conference, forum, exhibition,
# festival, summit, championship, olympiad, cup, contest, webinar, seminar,
# lecture, presentation, hackathon, meetup, meeting, convention, congress).
# Stems rather than full words, so a plain substring test matches most
# inflected forms of the noun.
_EVENT_KEYWORDS = (
    "конференц",
    "форум",
    "выставк",
    "фестивал",
    "саммит",
    "чемпионат",
    "олимпиад",
    "кубок",
    "конкурс",
    "вебинар",
    "семинар",
    "лекци",
    "презентаци",
    "хакатон",
    "митап",
    "встреч",
    "съезд",
    "конгресс",
)
|
|
|
|
|
|
|
|
|
|
|
|
# Heuristic pattern for event phrases: an event-type noun (with a few common
# Russian case endings) followed by up to six short "title" words, e.g.
# "конференция Highload 2024". Group 1 captures only the noun; callers use
# the whole match (group 0) as the candidate phrase.
# NOTE(review): fleeting-vowel forms are not covered (e.g. "кубка", genitive
# of "кубок") — confirm whether that is intentional.
_EVENT_PHRASE_PATTERN = re.compile(
    r"\b("
    r"конференц(?:ия|ии|ию|ией)?|"
    r"форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|"
    r"фестивал(?:ь|я|е|ю|ем)?|"
    r"саммит(?:а|е|у|ом)?|"
    r"чемпионат(?:а|е|у|ом)?|"
    r"олимпиад(?:а|ы|е|у|ой)?|"
    r"кубок(?:а|е|у|ом)?|"
    r"конкурс(?:а|е|у|ом)?|"
    r"вебинар(?:а|е|у|ом)?|"
    r"семинар(?:а|е|у|ом)?|"
    r"лекци(?:я|и|ю|ей)?|"
    r"презентаци(?:я|и|ю|ей)?|"
    r"хакатон(?:а|е|у|ом)?|"
    r"митап(?:а|е|у|ом)?|"
    r"встреч(?:а|и|е|у|ей)?|"
    r"съезд(?:а|е|у|ом)?|"
    r"конгресс(?:а|е|у|ом)?"
    # Up to 6 trailing words of 2-41 chars each (letters/digits, then a mix
    # that also allows -, _, /, .) — the likely event title after the noun.
    r")\b(?:\s+[A-Za-zА-Яа-я0-9][A-Za-zА-Яа-я0-9\-_/.]{1,40}){0,6}",
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
|
|
|
|
|
|
# Matches an event-type noun ("мероприятие", "событие", "конференция", ...)
# followed within 40 characters (no newlines or quotes in between) by a title
# wrapped in «guillemets» or "straight quotes". Group 1 captures the quoted
# title text (3-120 chars).
_QUOTED_EVENT_PATTERN = re.compile(
    r"(?:мероприят(?:ие|ия|ию|ием)|событ(?:ие|ия|ию|ием)|"
    r"конференц(?:ия|ии|ию|ией)?|форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|фестивал(?:ь|я|е|ю|ем)?)"
    r"[^\n\"«»]{0,40}[«\"]([^»\"\n]{3,120})[»\"]",
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _normalize_event(value: str) -> str:
|
|
|
|
|
|
normalized = " ".join(value.strip().split()).strip(".,;:!?()[]{}")
|
|
|
|
|
|
return normalized.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_years_from_text(text: str) -> List[int]:
    """Return the sorted, de-duplicated 4-digit years found in *text*."""
    if not text:
        return []
    # The pattern has a single capture group equal to the whole match,
    # so findall() yields the year strings directly.
    unique_years = {int(token) for token in _YEAR_PATTERN.findall(text)}
    return sorted(unique_years)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_russian_event_names(text: str) -> List[str]:
    """
    Extract likely Russian event names from text using heuristic regex rules.

    Two passes: free-form "noun + title words" phrases first, then quoted
    titles near event nouns. Order of first appearance is preserved.

    Returns normalized event phrases in lowercase.
    """
    if not text:
        return []

    events: List[str] = []
    seen: set = set()

    def _add(candidate: str) -> None:
        # Keep first occurrence only.
        if candidate not in seen:
            seen.add(candidate)
            events.append(candidate)

    for phrase_match in _EVENT_PHRASE_PATTERN.finditer(text):
        phrase = _normalize_event(phrase_match.group(0))
        # Drop tiny fragments and phrases that lost their event keyword
        # during normalization.
        if len(phrase) >= 6 and any(stem in phrase for stem in _EVENT_KEYWORDS):
            _add(phrase)

    for quoted_match in _QUOTED_EVENT_PATTERN.finditer(text):
        title = _normalize_event(quoted_match.group(1))
        if len(title) >= 3:
            _add(title)

    return events
|
2026-02-10 20:12:43 +03:00
|
|
|
|
|
|
|
|
|
|
|
2026-02-10 20:42:07 +03:00
|
|
|
|
class _AdaptiveFile(ABC):
|
2026-02-10 20:12:43 +03:00
|
|
|
|
extension: str # Format: .jpg
|
|
|
|
|
|
local_path: str
|
2026-02-10 21:42:59 +03:00
|
|
|
|
filename: str
|
2026-02-10 20:12:43 +03:00
|
|
|
|
|
2026-02-10 22:19:27 +03:00
|
|
|
|
def __init__(self, filename: str, extension: str, local_path: str):
|
2026-02-10 21:42:59 +03:00
|
|
|
|
self.filename = filename
|
2026-02-10 20:12:43 +03:00
|
|
|
|
self.extension = extension
|
2026-02-10 22:19:27 +03:00
|
|
|
|
self.local_path = local_path
|
2026-02-10 20:12:43 +03:00
|
|
|
|
|
|
|
|
|
|
# This method allows to work with file locally, and lambda should be provided for this.
|
|
|
|
|
|
# Why separate method? For possible cleanup after work is done. And to download file, if needed
|
|
|
|
|
|
# Lambda: first argument is a local path
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
|
def work_with_file_locally(self, func: Callable[[str], None]):
|
2026-02-10 20:42:07 +03:00
|
|
|
|
"""Run callback with a local path to the file."""
|
2026-02-10 20:12:43 +03:00
|
|
|
|
|
|
|
|
|
|
|
2026-02-10 20:42:07 +03:00
|
|
|
|
class _AdaptiveCollection(ABC):
    """Abstract collection of :class:`_AdaptiveFile` objects."""

    @abstractmethod
    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield files in the collection (generator).

        When *recursive* is true, implementations descend into
        subdirectories; otherwise only the top level is listed.
        """
|
2026-02-10 20:12:43 +03:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
    """Adaptive file backed by an existing file on the local filesystem."""

    # The inherited _AdaptiveFile.__init__(filename, extension, local_path)
    # is sufficient; the previous override only forwarded its arguments
    # verbatim, so it has been removed.

    def work_with_file_locally(self, func: Callable[[str], None]):
        """Invoke *func* directly with the file's path — no download or cleanup needed."""
        func(self.local_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
    """Adaptive collection over a directory tree on the local filesystem."""

    base_dir: str  # root directory to enumerate

    def __init__(self, base_dir: str):
        super().__init__()
        self.base_dir = base_dir

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield every file under ``base_dir``; only the top level when *recursive* is false."""
        for current_root, _subdirs, filenames in os.walk(self.base_dir):
            for name in filenames:
                full_path = os.path.join(current_root, name)
                info = Path(full_path)
                yield LocalFilesystemAdaptiveFile(info.name, info.suffix, full_path)
            if not recursive:
                # os.walk yields the top directory first; stopping here
                # skips all subdirectories.
                break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YandexDiskAdaptiveFile(_AdaptiveFile):
    """Adaptive file representation for Yandex Disk resources."""

    # Path of the resource on Yandex Disk, e.g. "disk:/photos/a.jpg".
    remote_path: str

    def __init__(self, filename: str, extension: str, remote_path: str, token: str):
        # The remote path doubles as ``local_path`` on the base class; a real
        # local path only exists while ``work_with_file_locally`` runs.
        super().__init__(filename, extension, remote_path)
        self.token = token
        self.remote_path = remote_path

    def _download_to_temp_file(self) -> str:
        """Download the resource to a named temporary file and return its path.

        Raises:
            requests.HTTPError: if either API call fails.
        """
        headers = {"Authorization": f"OAuth {self.token}"}
        response = requests.get(
            "https://cloud-api.yandex.net/v1/disk/resources/download",
            headers=headers,
            params={"path": self.remote_path},
            timeout=30,
        )
        response.raise_for_status()
        href = response.json()["href"]

        suffix = Path(self.remote_path).suffix
        temp_path = ""
        try:
            # Stream the payload in chunks instead of buffering the entire
            # file in memory — Disk files can be arbitrarily large.
            with requests.get(href, stream=True, timeout=120) as file_response:
                file_response.raise_for_status()
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
                    temp_path = temp_file.name
                    for chunk in file_response.iter_content(chunk_size=1 << 20):
                        temp_file.write(chunk)
            return temp_path
        except Exception:
            # Do not leave a partially written temp file behind on failure.
            if temp_path and os.path.exists(temp_path):
                os.unlink(temp_path)
            raise

    def work_with_file_locally(self, func: Callable[[str], None]):
        """Download the file, run *func* on the local copy, then delete the copy."""
        temp_path = self._download_to_temp_file()
        try:
            func(temp_path)
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class YandexDiskAdaptiveCollection(_AdaptiveCollection):
    """Adaptive collection implementation for Yandex Disk."""

    # Page size for Disk directory listings; also the step for pagination offsets.
    _PAGE_SIZE = 1000

    def __init__(self, token: str, base_dir: str):
        """Create a collection rooted at *base_dir* on the Disk of *token*'s owner.

        Raises:
            ValueError: if *token* is empty.
        """
        if not token:
            raise ValueError("Yandex Disk token is required")

        self.token = token
        self.base_dir = base_dir
        self._headers = {"Authorization": f"OAuth {self.token}"}

    @staticmethod
    def _normalize_disk_path(path: str) -> str:
        """Return *path* with the canonical ``disk:/`` prefix."""
        return path if path.startswith("disk:/") else f"disk:/{path.lstrip('/')}"

    def _get_resource_info(self, path: str) -> dict:
        """Fetch resource metadata for *path* from the Disk REST API."""
        response = requests.get(
            "https://cloud-api.yandex.net/v1/disk/resources",
            headers=self._headers,
            params={"path": path, "limit": self._PAGE_SIZE},
            timeout=30,
        )
        response.raise_for_status()
        return response.json()

    def _iter_children(self, path: str) -> Iterator[dict]:
        """Yield child resource dicts of directory *path*, following pagination."""
        offset = 0
        while True:
            response = requests.get(
                "https://cloud-api.yandex.net/v1/disk/resources",
                headers=self._headers,
                params={"path": path, "limit": self._PAGE_SIZE, "offset": offset},
                timeout=30,
            )
            response.raise_for_status()
            payload = response.json()
            items = payload.get("_embedded", {}).get("items", [])
            if not items:
                break

            yield from items

            # A short page means we have seen the last children.
            if len(items) < self._PAGE_SIZE:
                break
            offset += self._PAGE_SIZE

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield files under ``base_dir`` breadth-first.

        If ``base_dir`` itself is a file, yield just that file. Directories
        are descended into only when *recursive* is true.
        """
        root_path = self._normalize_disk_path(self.base_dir)
        root_info = self._get_resource_info(root_path)

        if root_info.get("type") == "file":
            path = root_info["path"]
            logger.info(f"Found file on Yandex Disk: {path}")
            p = Path(path)
            yield YandexDiskAdaptiveFile(p.name, p.suffix, path, self.token)
            return

        # Breadth-first traversal. An index cursor over a growing list keeps
        # the original FIFO order while avoiding O(n^2) list.pop(0) calls.
        queue = [root_path]
        cursor = 0
        while cursor < len(queue):
            current_dir = queue[cursor]
            cursor += 1
            for item in self._iter_children(current_dir):
                item_type = item.get("type")
                item_path = str(item.get("path"))
                if item_type == "file":
                    logger.info(f"Found file on Yandex Disk: {item_path}")
                    p = Path(item_path)
                    yield YandexDiskAdaptiveFile(
                        p.name, p.suffix, item_path, self.token
                    )
                elif recursive and item_type == "dir":
                    queue.append(item_path)
|