Files
rag-solution/services/rag/langchain/helpers.py

153 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Helper utilities for metadata extraction from Russian text."""
import os
import re
from abc import abstractmethod
from pathlib import Path
from typing import Callable, List
# Four-digit years from 1000 to 2100 that are not part of a longer digit run.
_YEAR_PATTERN = re.compile(
    r"(?<!\d)"  # no digit immediately before the year
    r"(1\d{3}|20\d{2}|2100)"  # 1000-1999, 2000-2099, or exactly 2100
    r"(?!\d)"  # no digit immediately after the year
)
# Lowercase stems of Russian event nouns. extract_russian_event_names() keeps a
# phrase candidate only when its lowercased text contains one of these stems.
_EVENT_KEYWORDS = (
    "конференц",
    "форум",
    "выставк",
    "фестивал",
    "саммит",
    "чемпионат",
    "олимпиад",
    "кубок",
    # "кубок" drops its second vowel in oblique cases (кубка, кубке, кубку,
    # кубком), so the nominative stem alone would reject those forms.
    "кубк",
    "конкурс",
    "вебинар",
    "семинар",
    "лекци",
    "презентаци",
    "хакатон",
    "митап",
    "встреч",
    "съезд",
    "конгресс",
)

# An event noun (group 1, with common singular case endings) followed by up to
# six whitespace-separated words, each 2-41 characters long.
_EVENT_PHRASE_PATTERN = re.compile(
    r"\b("
    r"конференц(?:ия|ии|ию|ией)?|"
    r"форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|"
    r"фестивал(?:ь|я|е|ю|ем)?|"
    r"саммит(?:а|е|у|ом)?|"
    r"чемпионат(?:а|е|у|ом)?|"
    r"олимпиад(?:а|ы|е|у|ой)?|"
    # Fleeting vowel: the real forms are кубок / кубка / кубке / кубку / кубком.
    # Appending endings to "кубок" only produced non-words such as "кубока".
    r"куб(?:ок|ка|ке|ку|ком)|"
    r"конкурс(?:а|е|у|ом)?|"
    r"вебинар(?:а|е|у|ом)?|"
    r"семинар(?:а|е|у|ом)?|"
    r"лекци(?:я|и|ю|ей)?|"
    r"презентаци(?:я|и|ю|ей)?|"
    r"хакатон(?:а|е|у|ом)?|"
    r"митап(?:а|е|у|ом)?|"
    r"встреч(?:а|и|е|у|ей)?|"
    r"съезд(?:а|е|у|ом)?|"
    r"конгресс(?:а|е|у|ом)?"
    # Ё/ё sit outside the А-я code point range and must be listed explicitly;
    # without them a following word is cut at its first "ё".
    r")\b(?:\s+[A-Za-zА-Яа-яЁё0-9][A-Za-zА-Яа-яЁё0-9\-_/.]{1,40}){0,6}",
    flags=re.IGNORECASE,
)
# An event-type noun followed, on the same line and within 40 characters, by a
# quoted title. Group 1 captures the 3-120 characters between the quotes.
_QUOTED_EVENT_PATTERN = re.compile(
    # Trigger noun: a generic "event" word or a specific event type.
    r"(?:"
    r"мероприят(?:ие|ия|ию|ием)|"
    r"событ(?:ие|ия|ию|ием)|"
    r"конференц(?:ия|ии|ию|ией)?|"
    r"форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|"
    r"фестивал(?:ь|я|е|ю|ем)?"
    r")"
    # Filler between the noun and the title: no newline and no quote character.
    r"[^\n\"«»]{0,40}"
    # Opening quote, captured title, closing quote.
    r"[«\"]"
    r"([^»\"\n]{3,120})"
    r"[»\"]",
    re.IGNORECASE,
)
def _normalize_event(value: str) -> str:
    """Return ``value`` single-spaced, trimmed of edge punctuation, lowercased.

    Whitespace runs are collapsed to one space first; then any of
    ``.,;:!?()[]{}`` is stripped from both ends of the result.
    """
    # split() with no arguments already discards leading/trailing whitespace.
    words = value.split()
    single_spaced = " ".join(words)
    trimmed = single_spaced.strip(".,;:!?()[]{}")
    return trimmed.lower()
def extract_years_from_text(text: str) -> List[int]:
    """Return the distinct years mentioned in ``text``, in ascending order.

    A year is whatever ``_YEAR_PATTERN`` matches. Empty or ``None`` input
    gives an empty list.
    """
    if not text:
        return []
    # The pattern's single group spans the whole match, so findall() returns
    # the year strings themselves.
    distinct_years = {int(token) for token in _YEAR_PATTERN.findall(text)}
    ordered = list(distinct_years)
    ordered.sort()
    return ordered
def extract_russian_event_names(text: str) -> List[str]:
    """Collect probable Russian event names from ``text`` with regex heuristics.

    Two passes are made: free-standing event phrases first, then quoted titles
    that follow an event noun. Every hit is normalized (single-spaced, edge
    punctuation removed, lowercased).

    Returns:
        Unique normalized names in order of first appearance, phrase matches
        before quoted titles. Empty or ``None`` input gives an empty list.
    """
    if not text:
        return []

    # A dict keeps insertion order, so it serves as an ordered set here.
    found = {}

    for hit in _EVENT_PHRASE_PATTERN.finditer(text):
        phrase = _normalize_event(hit.group(0))
        # Keep phrases of at least 6 characters that still contain an event stem.
        if len(phrase) >= 6 and any(stem in phrase for stem in _EVENT_KEYWORDS):
            found.setdefault(phrase, None)

    for hit in _QUOTED_EVENT_PATTERN.finditer(text):
        title = _normalize_event(hit.group(1))
        # Normalization can shrink a title below the 3-character minimum.
        if len(title) >= 3:
            found.setdefault(title, None)

    return list(found)
class _AdaptiveFile:
    """A file that is processed through a path on the local filesystem.

    NOTE(review): this class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` marker below is not enforced; the class can be
    instantiated directly, and its ``work_with_file_locally`` then does nothing.
    """

    extension: str  # File suffix with the leading dot, e.g. ".jpg".
    local_path: str  # Path that implementations hand to the callback.

    def __init__(self, extension: str, local_path: str):
        self.extension, self.local_path = extension, local_path

    @abstractmethod
    def work_with_file_locally(self, func: Callable[[str], None]):
        """Let ``func`` work with the file through a local path.

        This is a dedicated method so that an implementation can download the
        file beforehand if needed and clean up after the work is done.

        Args:
            func: Callback whose first argument is the local path of the file.
        """
class _AdaptiveCollection:
    """Base interface for a collection whose files can be iterated.

    NOTE(review): the class does not derive from ``abc.ABC``, so
    ``@abstractmethod`` is not enforced at instantiation time.
    """

    @abstractmethod
    def iterate(self, recursive: bool):
        """Enumerate the files of the collection.

        Implementations are meant to be generator methods that ``yield`` the
        files; this base version yields nothing and returns ``None``.
        """
        return None
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
    """Adaptive file backed directly by a path on the local filesystem."""

    def work_with_file_locally(self, func: Callable[[str], None]):
        """Call ``func`` with this file's ``local_path``.

        No download or cleanup step is performed here; the path is passed
        through as-is.
        """
        path = self.local_path
        func(path)
class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
    """Collection of the files stored under a local directory."""

    base_dir: str  # Directory whose files are enumerated.

    def __init__(self, base_dir: str):
        super().__init__()
        self.base_dir = base_dir

    def iterate(self, recursive: bool):
        """Yield a ``LocalFilesystemAdaptiveFile`` for each file under ``base_dir``.

        Args:
            recursive: When true, descend into every subdirectory. When false,
                yield only the files located directly in ``base_dir``.

        Yields:
            One ``LocalFilesystemAdaptiveFile`` per file. Its ``extension`` is
            the suffix of the file name and its ``local_path`` is the path
            joined from the walked directory and the file name.
        """
        for root, _dirs, files in os.walk(self.base_dir):
            for file_name in files:
                full_path = os.path.join(root, file_name)
                # Yield the concrete class: the base _AdaptiveFile only has a
                # placeholder work_with_file_locally that never calls the callback.
                yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)
            if not recursive:
                # os.walk is top-down by default, so the first directory it
                # reports is base_dir itself; stopping here skips subdirectories.
                break