"""Helper utilities for metadata extraction from Russian text.""" import re from typing import List _YEAR_PATTERN = re.compile(r"(? str: normalized = " ".join(value.strip().split()).strip(".,;:!?()[]{}") return normalized.lower() def extract_years_from_text(text: str) -> List[int]: """Extract unique years from text as integers.""" if not text: return [] years = {int(match.group(0)) for match in _YEAR_PATTERN.finditer(text)} return sorted(years) def extract_russian_event_names(text: str) -> List[str]: """ Extract likely Russian event names from text using heuristic regex rules. Returns normalized event phrases in lowercase. """ if not text: return [] events: List[str] = [] seen = set() for match in _EVENT_PHRASE_PATTERN.finditer(text): candidate = _normalize_event(match.group(0)) if len(candidate) < 6: continue if not any(keyword in candidate for keyword in _EVENT_KEYWORDS): continue if candidate not in seen: events.append(candidate) seen.add(candidate) for match in _QUOTED_EVENT_PATTERN.finditer(text): quoted = _normalize_event(match.group(1)) if len(quoted) < 3: continue if quoted not in seen: events.append(quoted) seen.add(quoted) return events