"""Helper utilities for metadata extraction from Russian text.""" import os import re from abc import abstractmethod from pathlib import Path from typing import Callable, List _YEAR_PATTERN = re.compile(r"(? str: normalized = " ".join(value.strip().split()).strip(".,;:!?()[]{}") return normalized.lower() def extract_years_from_text(text: str) -> List[int]: """Extract unique years from text as integers.""" if not text: return [] years = {int(match.group(0)) for match in _YEAR_PATTERN.finditer(text)} return sorted(years) def extract_russian_event_names(text: str) -> List[str]: """ Extract likely Russian event names from text using heuristic regex rules. Returns normalized event phrases in lowercase. """ if not text: return [] events: List[str] = [] seen = set() for match in _EVENT_PHRASE_PATTERN.finditer(text): candidate = _normalize_event(match.group(0)) if len(candidate) < 6: continue if not any(keyword in candidate for keyword in _EVENT_KEYWORDS): continue if candidate not in seen: events.append(candidate) seen.add(candidate) for match in _QUOTED_EVENT_PATTERN.finditer(text): quoted = _normalize_event(match.group(1)) if len(quoted) < 3: continue if quoted not in seen: events.append(quoted) seen.add(quoted) return events class _AdaptiveFile: extension: str # Format: .jpg local_path: str def __init__(self, extension: str, local_path: str): self.extension = extension self.local_path = local_path # This method allows to work with file locally, and lambda should be provided for this. # Why separate method? For possible cleanup after work is done. And to download file, if needed # Lambda: first argument is a local path @abstractmethod def work_with_file_locally(self, func: Callable[[str], None]): pass class _AdaptiveCollection: # Generator method with yield @abstractmethod def iterate(self, recursive: bool): pass class LocalFilesystemAdaptiveFile(_AdaptiveFile): def work_with_file_locally(self, func: Callable[[str], None]): func(self.local_path) class LocalFilesystemAdaptiveCollection(_AdaptiveCollection): base_dir: str def __init__(self, base_dir: str): super().__init__() self.base_dir = base_dir def iterate(self, recursive: bool): for root, dirs, files in os.walk(self.base_dir): for file in files: full_path = os.path.join(root, file) yield _AdaptiveFile(Path(full_path).suffix, full_path)