# rag-solution/services/rag/langchain/helpers.py

"""Helper utilities for metadata extraction from Russian text."""

import os
import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Callable, Iterator, List

# Matches a four-digit year: 1000-1999, 2000-2099, or exactly 2100.
# The lookbehind/lookahead reject candidates that are part of a longer digit run.
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")

# Lowercase stems of Russian event nouns, intended for substring checks
# against already-lowercased candidate phrases.
# "кубк" is listed next to "кубок" because the oblique cases of that noun
# (кубка, кубке, кубку, кубком) drop the vowel and do not contain "кубок".
_EVENT_KEYWORDS = (
    "конференц",
    "форум",
    "выставк",
    "фестивал",
    "саммит",
    "чемпионат",
    "олимпиад",
    "кубок",
    "кубк",
    "конкурс",
    "вебинар",
    "семинар",
    "лекци",
    "презентаци",
    "хакатон",
    "митап",
    "встреч",
    "съезд",
    "конгресс",
)

# Matches an event noun (a stem plus a singular case ending) as a whole word,
# followed by up to six whitespace-separated words of 2-41 characters each.
# "Ёё" appears explicitly in the character classes because it lies outside the
# contiguous А-Я / а-я ranges; without it a following word is cut at the "ё".
_EVENT_PHRASE_PATTERN = re.compile(
    r"\b("
    r"конференц(?:ия|ии|ию|ией)?|"
    r"форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|"
    r"фестивал(?:ь|я|е|ю|ем)?|"
    r"саммит(?:а|е|у|ом)?|"
    r"чемпионат(?:а|е|у|ом)?|"
    r"олимпиад(?:а|ы|е|у|ой)?|"
    # Fleeting vowel: кубок -> кубка / кубке / кубку / кубком.
    r"куб(?:ок|ка|ке|ку|ком)|"
    r"конкурс(?:а|е|у|ом)?|"
    r"вебинар(?:а|е|у|ом)?|"
    r"семинар(?:а|е|у|ом)?|"
    r"лекци(?:я|и|ю|ей)?|"
    r"презентаци(?:я|и|ю|ей)?|"
    r"хакатон(?:а|е|у|ом)?|"
    r"митап(?:а|е|у|ом)?|"
    r"встреч(?:а|и|е|у|ей)?|"
    r"съезд(?:а|е|у|ом)?|"
    r"конгресс(?:а|е|у|ом)?"
    r")\b(?:\s+[A-Za-zА-Яа-яЁё0-9][A-Za-zА-Яа-яЁё0-9\-_/.]{1,40}){0,6}",
    flags=re.IGNORECASE,
)

# Captures a quoted title (group 1, 3-120 characters, no newline) that follows
# an event noun: forms of the Russian words for "event" (two variants),
# "conference", "forum", "exhibition" and "festival".
# Up to 40 characters that are neither a newline nor a quote may separate the
# noun from the opening quote. Either « or a straight double quote opens the
# title, and either » or a straight double quote closes it.
# There is no leading \b, so the noun may also match inside a longer word.
_QUOTED_EVENT_PATTERN = re.compile(
    r"(?:мероприят(?:ие|ия|ию|ием)|событ(?:ие|ия|ию|ием)|"
    r"конференц(?:ия|ии|ию|ией)?|форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|фестивал(?:ь|я|е|ю|ем)?)"
    r"[^\n\"«»]{0,40}[«\"]([^»\"\n]{3,120})[»\"]",
    flags=re.IGNORECASE,
)


def _normalize_event(value: str) -> str:
    """Return a canonical lowercase form of an event phrase.

    Runs of whitespace are collapsed to single spaces, then leading and
    trailing punctuation (``.,;:!?()[]{}``) is trimmed *together with*
    whitespace. Trimming both in one pass matters for input such as
    ``"( форум ) ."``: stripping punctuation alone would stop at the inner
    spaces and leave them (and the bracket behind them) in the result.

    Args:
        value: Raw phrase, typically a regex match.

    Returns:
        The trimmed, single-spaced, lowercased phrase (possibly empty).
    """
    collapsed = " ".join(value.split())
    return collapsed.strip(" .,;:!?()[]{}").lower()


def extract_years_from_text(text: str) -> List[int]:
    """Return the distinct years found in *text*, sorted ascending.

    Each match of the module-level ``_YEAR_PATTERN`` is converted to ``int``
    and duplicates are dropped. Falsy input yields an empty list.
    """
    if not text:
        return []

    found = set()
    for hit in _YEAR_PATTERN.finditer(text):
        found.add(int(hit.group(0)))
    return sorted(found)


def extract_russian_event_names(text: str) -> List[str]:
    """
    Collect probable Russian event names from *text* with regex heuristics.

    Two passes run in order: keyword-led phrases first, then quoted titles
    that follow an event noun. Every candidate is normalized (lowercased and
    trimmed) before filtering. The result keeps first-seen order and holds no
    duplicates across the two passes. Falsy input yields an empty list.
    """
    if not text:
        return []

    # A dict serves as an insertion-ordered set: keys are the events.
    ordered: dict = {}

    # Pass 1: keyword-led phrases, at least 6 characters long and containing
    # one of the known event stems.
    for hit in _EVENT_PHRASE_PATTERN.finditer(text):
        phrase = _normalize_event(hit.group(0))
        long_enough = len(phrase) >= 6
        if long_enough and any(stem in phrase for stem in _EVENT_KEYWORDS):
            ordered.setdefault(phrase, None)

    # Pass 2: quoted titles, at least 3 characters long after normalization.
    for hit in _QUOTED_EVENT_PATTERN.finditer(text):
        title = _normalize_event(hit.group(1))
        if len(title) >= 3:
            ordered.setdefault(title, None)

    return list(ordered)


class _AdaptiveFile(ABC):
    """A file that can be processed through a path on the local disk.

    Deriving from ``ABC`` makes ``@abstractmethod`` effective: the base class
    can no longer be instantiated, so a file object whose
    ``work_with_file_locally`` silently does nothing cannot be created.
    """

    extension: str  # File suffix including the leading dot, e.g. ".jpg"
    local_path: str  # Path handed to the callback

    def __init__(self, extension: str, local_path: str):
        self.extension = extension
        self.local_path = local_path

    @abstractmethod
    def work_with_file_locally(self, func: Callable[[str], None]) -> None:
        """Call *func* with the local path of this file as its only argument.

        This is a method rather than a plain attribute read so that an
        implementation can download the file before calling *func* and clean
        up after it returns.
        """


class _AdaptiveCollection(ABC):
    """A source of ``_AdaptiveFile`` objects."""

    @abstractmethod
    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield the files of the collection (generator method).

        Args:
            recursive: Whether nested containers should be visited as well.
        """


class LocalFilesystemAdaptiveFile(_AdaptiveFile):
    """A file that already lives on the local disk; no download or cleanup."""

    def work_with_file_locally(self, func: Callable[[str], None]) -> None:
        """Call *func* directly with ``self.local_path``."""
        func(self.local_path)


class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
    """Collection of the files found under a local directory."""

    base_dir: str  # Directory whose files are yielded by ``iterate``

    def __init__(self, base_dir: str):
        super().__init__()

        self.base_dir = base_dir

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield a ``LocalFilesystemAdaptiveFile`` for each file in ``base_dir``.

        Args:
            recursive: When true, files in all subdirectories are included;
                otherwise only files directly inside ``base_dir`` are yielded.

        Yields:
            Files whose ``extension`` is the path suffix (e.g. ".txt", or ""
            when there is none) and whose ``local_path`` is ``base_dir``
            joined with the relative location of the file.
        """
        for root, dirs, files in os.walk(self.base_dir):
            if not recursive:
                # Emptying the list in place stops os.walk (top-down mode)
                # from descending into subdirectories.
                dirs[:] = []
            for file_name in files:
                full_path = os.path.join(root, file_name)
                # Yield the concrete class: its callback method really runs.
                yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)