Files
rag-solution/services/rag/langchain/helpers.py

281 lines
8.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Helper utilities for metadata extraction from Russian text."""
import os
import re
import tempfile
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Callable, Iterator, List
import requests
from loguru import logger
# Standalone 4-digit year between 1000 and 2100.  The lookbehind/lookahead
# reject an adjacent digit, so fragments of longer numbers (phone numbers,
# IDs) never match.
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")

# Russian event-word stems (inflectional endings omitted so a plain
# substring test matches any case form): conference, forum, exhibition,
# festival, summit, championship, olympiad, cup, contest, webinar, seminar,
# lecture, presentation, hackathon, meetup, meeting, convention, congress.
_EVENT_KEYWORDS = (
    "конференц",
    "форум",
    "выставк",
    "фестивал",
    "саммит",
    "чемпионат",
    "олимпиад",
    "кубок",
    "конкурс",
    "вебинар",
    "семинар",
    "лекци",
    "презентаци",
    "хакатон",
    "митап",
    "встреч",
    "съезд",
    "конгресс",
)

# One of the event words above in a common case ending, optionally followed
# by up to six short name tokens (Latin/Cyrillic letters, digits, -_/.),
# e.g. a conference name with an edition year.
_EVENT_PHRASE_PATTERN = re.compile(
    r"\b("
    r"конференц(?:ия|ии|ию|ией)?|"
    r"форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|"
    r"фестивал(?:ь|я|е|ю|ем)?|"
    r"саммит(?:а|е|у|ом)?|"
    r"чемпионат(?:а|е|у|ом)?|"
    r"олимпиад(?:а|ы|е|у|ой)?|"
    r"кубок(?:а|е|у|ом)?|"
    r"конкурс(?:а|е|у|ом)?|"
    r"вебинар(?:а|е|у|ом)?|"
    r"семинар(?:а|е|у|ом)?|"
    r"лекци(?:я|и|ю|ей)?|"
    r"презентаци(?:я|и|ю|ей)?|"
    r"хакатон(?:а|е|у|ом)?|"
    r"митап(?:а|е|у|ом)?|"
    r"встреч(?:а|и|е|у|ей)?|"
    r"съезд(?:а|е|у|ом)?|"
    r"конгресс(?:а|е|у|ом)?"
    r")\b(?:\s+[A-Za-zА-Яа-я0-9][A-Za-zА-Яа-я0-9\-_/.]{1,40}){0,6}",
    flags=re.IGNORECASE,
)

# An "event"-type noun ("event", "conference", "forum", "exhibition",
# "festival" in Russian) followed within 40 characters by a name enclosed
# in «guillemets» or "straight quotes"; group 1 captures the quoted name
# (3-120 characters, no newlines or nested quotes).
_QUOTED_EVENT_PATTERN = re.compile(
    r"(?:мероприят(?:ие|ия|ию|ием)|событ(?:ие|ия|ию|ием)|"
    r"конференц(?:ия|ии|ию|ией)?|форум(?:а|е|у|ом)?|"
    r"выставк(?:а|и|е|у|ой)?|фестивал(?:ь|я|е|ю|ем)?)"
    r"[^\n\"«»]{0,40}[«\"]([^»\"\n]{3,120})[»\"]",
    flags=re.IGNORECASE,
)
def _normalize_event(value: str) -> str:
normalized = " ".join(value.strip().split()).strip(".,;:!?()[]{}")
return normalized.lower()
def extract_years_from_text(text: str) -> List[int]:
    """Extract unique years from text as integers, sorted ascending."""
    if not text:
        return []
    # Standalone 4-digit years 1000-2100; lookarounds reject adjacent digits.
    raw_years = re.findall(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)", text)
    return sorted({int(year) for year in raw_years})
def extract_russian_event_names(text: str) -> List[str]:
    """
    Extract likely Russian event names from text using heuristic regex rules.
    Returns normalized event phrases in lowercase, first occurrence order,
    without duplicates.
    """
    if not text:
        return []
    found: List[str] = []
    known = set()

    def _record(candidate: str) -> None:
        # Keep first-seen order while dropping duplicates.
        if candidate not in known:
            known.add(candidate)
            found.append(candidate)

    for phrase_match in _EVENT_PHRASE_PATTERN.finditer(text):
        phrase = _normalize_event(phrase_match.group(0))
        # Require a minimal length and at least one event keyword stem.
        if len(phrase) >= 6 and any(stem in phrase for stem in _EVENT_KEYWORDS):
            _record(phrase)

    for quote_match in _QUOTED_EVENT_PATTERN.finditer(text):
        quoted_name = _normalize_event(quote_match.group(1))
        if len(quoted_name) >= 3:
            _record(quoted_name)

    return found
class _AdaptiveFile(ABC):
extension: str # Format: .jpg
filename: str
def __init__(self, filename: str, extension: str):
self.filename = filename
self.extension = extension
# This method allows to work with file locally, and lambda should be provided for this.
# Why separate method? For possible cleanup after work is done. And to download file, if needed
# Lambda: first argument is a local path
@abstractmethod
def work_with_file_locally(self, func: Callable[[str], None]):
"""Run callback with a local path to the file."""
class _AdaptiveCollection(ABC):
    """Abstract collection that enumerates `_AdaptiveFile` instances."""

    @abstractmethod
    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield the files in this collection.

        When *recursive* is true, descend into subdirectories as well.
        """
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
    """A file that already resides on the local filesystem."""

    local_path: str  # path to the existing local file

    def __init__(self, filename: str, extension: str, local_path: str):
        super().__init__(filename, extension)
        self.local_path = local_path

    def work_with_file_locally(self, func: Callable[[str], None]):
        # Nothing to download or clean up: hand over the existing path as-is.
        func(self.local_path)
class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
    """Adaptive collection backed by a local directory tree."""

    base_dir: str  # root directory to enumerate

    def __init__(self, base_dir: str):
        super().__init__()
        self.base_dir = base_dir

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield files under ``base_dir``; walk subdirectories when *recursive*."""
        for current_root, _subdirs, names in os.walk(self.base_dir):
            for name in names:
                entry = Path(os.path.join(current_root, name))
                yield LocalFilesystemAdaptiveFile(
                    entry.name, entry.suffix, str(entry)
                )
            if not recursive:
                # os.walk yields the top directory first; stop after it.
                break
class YandexDiskAdaptiveFile(_AdaptiveFile):
    """Adaptive file representation for Yandex Disk resources."""

    remote_path: str  # path on Yandex Disk, e.g. "disk:/folder/file.jpg"

    def __init__(self, filename: str, extension: str, remote_path: str, token: str):
        super().__init__(filename, extension)
        self.token = token  # OAuth token for the Yandex Disk REST API
        self.remote_path = remote_path

    def _download_to_temp_file(self) -> str:
        """Download the remote file and return the path of a local temp copy.

        The caller is responsible for deleting the returned file.

        Raises:
            requests.HTTPError: if the API call or the download fails.
        """
        headers = {"Authorization": f"OAuth {self.token}"}
        response = requests.get(
            "https://cloud-api.yandex.net/v1/disk/resources/download",
            headers=headers,
            params={"path": self.remote_path},
            timeout=30,
        )
        response.raise_for_status()
        href = response.json()["href"]
        suffix = Path(self.remote_path).suffix
        # Stream the payload so large files are not buffered entirely
        # in memory (the previous `.content` read did exactly that).
        with requests.get(href, timeout=120, stream=True) as file_response:
            file_response.raise_for_status()
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
            try:
                with temp_file:
                    for chunk in file_response.iter_content(chunk_size=1 << 20):
                        temp_file.write(chunk)
            except Exception:
                # Do not leak a partially written temp file on failure.
                os.unlink(temp_file.name)
                raise
        return temp_file.name

    def work_with_file_locally(self, func: Callable[[str], None]):
        """Download the file, run *func* on the local copy, then delete it."""
        temp_path = self._download_to_temp_file()
        try:
            func(temp_path)
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)
class YandexDiskAdaptiveCollection(_AdaptiveCollection):
    """Adaptive collection implementation for Yandex Disk."""

    # API page size used both as the request `limit` and as the
    # "was this the last page?" test; keep the two in sync via this constant
    # (previously a magic 1000 repeated in three places).
    _PAGE_SIZE = 1000

    def __init__(self, token: str, base_dir: str):
        """
        Args:
            token: OAuth token for the Yandex Disk REST API.
            base_dir: Directory (or single file) path on the disk.

        Raises:
            ValueError: if *token* is empty.
        """
        if not token:
            raise ValueError("Yandex Disk token is required")
        self.token = token
        self.base_dir = base_dir
        self._headers = {"Authorization": f"OAuth {self.token}"}

    @staticmethod
    def _normalize_disk_path(path: str) -> str:
        """Ensure the path carries the canonical ``disk:/`` prefix."""
        return path if path.startswith("disk:/") else f"disk:/{path.lstrip('/')}"

    def _get_resource_info(self, path: str, offset: int = 0) -> dict:
        """Fetch resource metadata (one page of children) for *path*.

        Raises:
            requests.HTTPError: if the API call fails.
        """
        response = requests.get(
            "https://cloud-api.yandex.net/v1/disk/resources",
            headers=self._headers,
            params={"path": path, "limit": self._PAGE_SIZE, "offset": offset},
            timeout=30,
        )
        response.raise_for_status()
        return response.json()

    def _iter_children(self, path: str) -> Iterator[dict]:
        """Yield all direct children of *path*, paging through the API."""
        offset = 0
        while True:
            payload = self._get_resource_info(path, offset=offset)
            items = payload.get("_embedded", {}).get("items", [])
            if not items:
                break
            yield from items
            if len(items) < self._PAGE_SIZE:
                # Short page means this was the last one.
                break
            offset += self._PAGE_SIZE

    def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
        """Yield files under ``base_dir``, breadth-first when *recursive*."""
        root_path = self._normalize_disk_path(self.base_dir)
        root_info = self._get_resource_info(root_path)
        if root_info.get("type") == "file":
            # base_dir points at a single file rather than a directory.
            path = root_info["path"]
            logger.info(f"Found file on Yandex Disk: {path}")
            p = Path(path)
            yield YandexDiskAdaptiveFile(p.name, p.suffix, path, self.token)
            return
        pending_dirs = [root_path]
        while pending_dirs:
            current_dir = pending_dirs.pop(0)
            for item in self._iter_children(current_dir):
                item_type = item.get("type")
                item_path = str(item.get("path"))
                if item_type == "file":
                    logger.info(f"Found file on Yandex Disk: {item_path}")
                    p = Path(item_path)
                    yield YandexDiskAdaptiveFile(
                        p.name, p.suffix, item_path, self.token
                    )
                elif recursive and item_type == "dir":
                    pending_dirs.append(item_path)