"""Helper utilities for metadata extraction from Russian text."""
|
|||
|
|
|
|||
|
|
import re
|
|||
|
|
from typing import List
|
|||
|
|
|
|||
|
|
|
|||
|
|
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")
|
|||
|
|
|
|||
|
|
_EVENT_KEYWORDS = (
|
|||
|
|
"конференц",
|
|||
|
|
"форум",
|
|||
|
|
"выставк",
|
|||
|
|
"фестивал",
|
|||
|
|
"саммит",
|
|||
|
|
"чемпионат",
|
|||
|
|
"олимпиад",
|
|||
|
|
"кубок",
|
|||
|
|
"конкурс",
|
|||
|
|
"вебинар",
|
|||
|
|
"семинар",
|
|||
|
|
"лекци",
|
|||
|
|
"презентаци",
|
|||
|
|
"хакатон",
|
|||
|
|
"митап",
|
|||
|
|
"встреч",
|
|||
|
|
"съезд",
|
|||
|
|
"конгресс",
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
_EVENT_PHRASE_PATTERN = re.compile(
|
|||
|
|
r"\b("
|
|||
|
|
r"конференц(?:ия|ии|ию|ией)?|"
|
|||
|
|
r"форум(?:а|е|у|ом)?|"
|
|||
|
|
r"выставк(?:а|и|е|у|ой)?|"
|
|||
|
|
r"фестивал(?:ь|я|е|ю|ем)?|"
|
|||
|
|
r"саммит(?:а|е|у|ом)?|"
|
|||
|
|
r"чемпионат(?:а|е|у|ом)?|"
|
|||
|
|
r"олимпиад(?:а|ы|е|у|ой)?|"
|
|||
|
|
r"кубок(?:а|е|у|ом)?|"
|
|||
|
|
r"конкурс(?:а|е|у|ом)?|"
|
|||
|
|
r"вебинар(?:а|е|у|ом)?|"
|
|||
|
|
r"семинар(?:а|е|у|ом)?|"
|
|||
|
|
r"лекци(?:я|и|ю|ей)?|"
|
|||
|
|
r"презентаци(?:я|и|ю|ей)?|"
|
|||
|
|
r"хакатон(?:а|е|у|ом)?|"
|
|||
|
|
r"митап(?:а|е|у|ом)?|"
|
|||
|
|
r"встреч(?:а|и|е|у|ей)?|"
|
|||
|
|
r"съезд(?:а|е|у|ом)?|"
|
|||
|
|
r"конгресс(?:а|е|у|ом)?"
|
|||
|
|
r")\b(?:\s+[A-Za-zА-Яа-я0-9][A-Za-zА-Яа-я0-9\-_/.]{1,40}){0,6}",
|
|||
|
|
flags=re.IGNORECASE,
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
_QUOTED_EVENT_PATTERN = re.compile(
|
|||
|
|
r"(?:мероприят(?:ие|ия|ию|ием)|событ(?:ие|ия|ию|ием)|"
|
|||
|
|
r"конференц(?:ия|ии|ию|ией)?|форум(?:а|е|у|ом)?|"
|
|||
|
|
r"выставк(?:а|и|е|у|ой)?|фестивал(?:ь|я|е|ю|ем)?)"
|
|||
|
|
r"[^\n\"«»]{0,40}[«\"]([^»\"\n]{3,120})[»\"]",
|
|||
|
|
flags=re.IGNORECASE,
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _normalize_event(value: str) -> str:
|
|||
|
|
normalized = " ".join(value.strip().split()).strip(".,;:!?()[]{}")
|
|||
|
|
return normalized.lower()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_years_from_text(text: str) -> List[int]:
    """Return the distinct years mentioned in *text*, sorted ascending.

    A year is any substring matched by ``_YEAR_PATTERN``. Empty or ``None``
    input yields an empty list.
    """
    # The pattern's single capture group spans the whole match, so findall()
    # returns exactly the matched year strings.
    year_strings = _YEAR_PATTERN.findall(text) if text else []
    return sorted(set(map(int, year_strings)))
|
|||
|
|
|
|||
|
|
|
|||
|
|
def extract_russian_event_names(text: str) -> List[str]:
    """Collect probable Russian event names found in *text*.

    Two regex heuristics run in sequence: phrases that start with an event
    word (``_EVENT_PHRASE_PATTERN``), then quoted titles that follow an event
    word (``_QUOTED_EVENT_PATTERN``). Every hit is passed through
    ``_normalize_event``, so the returned phrases are lowercase.

    Returns the unique phrases in order of first appearance, keyword phrases
    before quoted titles. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []

    keyword_phrases = [
        phrase
        for phrase in (
            _normalize_event(hit.group(0))
            for hit in _EVENT_PHRASE_PATTERN.finditer(text)
        )
        # Keep phrases of at least six characters that contain a known stem.
        if len(phrase) >= 6 and any(stem in phrase for stem in _EVENT_KEYWORDS)
    ]
    quoted_titles = [
        title
        for title in (
            _normalize_event(hit.group(1))
            for hit in _QUOTED_EVENT_PATTERN.finditer(text)
        )
        if len(title) >= 3
    ]

    # dict keys keep insertion order, so this drops duplicates while
    # preserving the order in which phrases were first seen.
    return list(dict.fromkeys(keyword_phrases + quoted_titles))
|