Working Yandex Disk integration for loading files. Tests for local and Yandex
This commit is contained in:
BIN
services/rag/langchain/.DS_Store
vendored
Normal file
BIN
services/rag/langchain/.DS_Store
vendored
Normal file
Binary file not shown.
1
services/rag/langchain/.gitignore
vendored
1
services/rag/langchain/.gitignore
vendored
@@ -215,3 +215,4 @@ __marimo__/
|
|||||||
# Streamlit
|
# Streamlit
|
||||||
.streamlit/secrets.toml
|
.streamlit/secrets.toml
|
||||||
document_tracking.db
|
document_tracking.db
|
||||||
|
.env.test
|
||||||
|
|||||||
@@ -70,6 +70,6 @@ Chosen data folder: relatve ./../../../data - from the current folder
|
|||||||
|
|
||||||
- [x] Create adaptive collection class and adaptive file class in the helpers, which will be as abstract classes, that should encompass feature of iterating and working with files locally
|
- [x] Create adaptive collection class and adaptive file class in the helpers, which will be as abstract classes, that should encompass feature of iterating and working with files locally
|
||||||
- [x] Write local filesystem implementation of adaptive collection
|
- [x] Write local filesystem implementation of adaptive collection
|
||||||
- [ ] Write tests for local filesystem implementation, using test/samples folder filled with files and directories for testing of iteration and recursivess
|
- [x] Write tests for local filesystem implementation, using test/samples folder filled with files and directories for testing of iteration and recursivess
|
||||||
- [ ] Create Yandex Disk implementation of the Adaptive Collection. Constructor should have requirement for TOKEN for Yandex Disk.
|
- [x] Create Yandex Disk implementation of the Adaptive Collection. Constructor should have requirement for TOKEN for Yandex Disk.
|
||||||
- [ ] Write tests for Yandex Disk implementation, using folder "Общая/Информация". .env has YADISK_TOKEN variable for connecting. While testing log output of found files during iterating. If test fails at this step, leave to manual fixing, and this step can be marked as done.
|
- [x] Write tests for Yandex Disk implementation, using folder "Общая/Информация". .env.test has YADISK_TOKEN variable for connecting. While testing log output of found files during iterating. If test fails at this step, leave to manual fixing, and this step can be marked as done.
|
||||||
|
|||||||
@@ -2,9 +2,13 @@
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from abc import abstractmethod
|
import tempfile
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Callable, List
|
from typing import Callable, Iterator, List
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from loguru import logger
|
||||||
|
|
||||||
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")
|
_YEAR_PATTERN = re.compile(r"(?<!\d)(1\d{3}|20\d{2}|2100)(?!\d)")
|
||||||
|
|
||||||
@@ -109,7 +113,7 @@ def extract_russian_event_names(text: str) -> List[str]:
|
|||||||
return events
|
return events
|
||||||
|
|
||||||
|
|
||||||
class _AdaptiveFile:
|
class _AdaptiveFile(ABC):
|
||||||
extension: str # Format: .jpg
|
extension: str # Format: .jpg
|
||||||
local_path: str
|
local_path: str
|
||||||
|
|
||||||
@@ -122,14 +126,14 @@ class _AdaptiveFile:
|
|||||||
# Lambda: first argument is a local path
|
# Lambda: first argument is a local path
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def work_with_file_locally(self, func: Callable[[str], None]):
|
def work_with_file_locally(self, func: Callable[[str], None]):
|
||||||
pass
|
"""Run callback with a local path to the file."""
|
||||||
|
|
||||||
|
|
||||||
class _AdaptiveCollection:
|
class _AdaptiveCollection(ABC):
|
||||||
# Generator method with yield
|
# Generator method with yield
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def iterate(self, recursive: bool):
|
def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
|
||||||
pass
|
"""Iterate files in collection."""
|
||||||
|
|
||||||
|
|
||||||
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
class LocalFilesystemAdaptiveFile(_AdaptiveFile):
|
||||||
@@ -145,8 +149,119 @@ class LocalFilesystemAdaptiveCollection(_AdaptiveCollection):
|
|||||||
|
|
||||||
self.base_dir = base_dir
|
self.base_dir = base_dir
|
||||||
|
|
||||||
def iterate(self, recursive: bool):
|
def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
|
||||||
for root, dirs, files in os.walk(self.base_dir):
|
for root, dirs, files in os.walk(self.base_dir):
|
||||||
for file in files:
|
for file in files:
|
||||||
full_path = os.path.join(root, file)
|
full_path = os.path.join(root, file)
|
||||||
yield _AdaptiveFile(Path(full_path).suffix, full_path)
|
yield LocalFilesystemAdaptiveFile(Path(full_path).suffix, full_path)
|
||||||
|
|
||||||
|
if not recursive:
|
||||||
|
break
|
||||||
|
|
||||||
|
|
||||||
|
class YandexDiskAdaptiveFile(_AdaptiveFile):
|
||||||
|
"""Adaptive file representation for Yandex Disk resources."""
|
||||||
|
|
||||||
|
def __init__(self, extension: str, local_path: str, token: str):
|
||||||
|
super().__init__(extension, local_path)
|
||||||
|
self.token = token
|
||||||
|
|
||||||
|
def _download_to_temp_file(self) -> str:
|
||||||
|
headers = {"Authorization": f"OAuth {self.token}"}
|
||||||
|
response = requests.get(
|
||||||
|
"https://cloud-api.yandex.net/v1/disk/resources/download",
|
||||||
|
headers=headers,
|
||||||
|
params={"path": self.local_path},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
href = response.json()["href"]
|
||||||
|
|
||||||
|
file_response = requests.get(href, timeout=120)
|
||||||
|
file_response.raise_for_status()
|
||||||
|
|
||||||
|
suffix = Path(self.local_path).suffix
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
|
||||||
|
temp_file.write(file_response.content)
|
||||||
|
return temp_file.name
|
||||||
|
|
||||||
|
def work_with_file_locally(self, func: Callable[[str], None]):
|
||||||
|
temp_path = self._download_to_temp_file()
|
||||||
|
try:
|
||||||
|
func(temp_path)
|
||||||
|
finally:
|
||||||
|
if os.path.exists(temp_path):
|
||||||
|
os.unlink(temp_path)
|
||||||
|
|
||||||
|
|
||||||
|
class YandexDiskAdaptiveCollection(_AdaptiveCollection):
|
||||||
|
"""Adaptive collection implementation for Yandex Disk."""
|
||||||
|
|
||||||
|
def __init__(self, token: str, base_dir: str):
|
||||||
|
if not token:
|
||||||
|
raise ValueError("Yandex Disk token is required")
|
||||||
|
|
||||||
|
self.token = token
|
||||||
|
self.base_dir = base_dir
|
||||||
|
self._headers = {"Authorization": f"OAuth {self.token}"}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _normalize_disk_path(path: str) -> str:
|
||||||
|
return path if path.startswith("disk:/") else f"disk:/{path.lstrip('/')}"
|
||||||
|
|
||||||
|
def _get_resource_info(self, path: str) -> dict:
|
||||||
|
response = requests.get(
|
||||||
|
"https://cloud-api.yandex.net/v1/disk/resources",
|
||||||
|
headers=self._headers,
|
||||||
|
params={"path": path, "limit": 1000},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
return response.json()
|
||||||
|
|
||||||
|
def _iter_children(self, path: str) -> Iterator[dict]:
|
||||||
|
offset = 0
|
||||||
|
while True:
|
||||||
|
response = requests.get(
|
||||||
|
"https://cloud-api.yandex.net/v1/disk/resources",
|
||||||
|
headers=self._headers,
|
||||||
|
params={"path": path, "limit": 1000, "offset": offset},
|
||||||
|
timeout=30,
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
payload = response.json()
|
||||||
|
embedded = payload.get("_embedded", {})
|
||||||
|
items = embedded.get("items", [])
|
||||||
|
if not items:
|
||||||
|
break
|
||||||
|
|
||||||
|
for item in items:
|
||||||
|
yield item
|
||||||
|
|
||||||
|
if len(items) < 1000:
|
||||||
|
break
|
||||||
|
offset += 1000
|
||||||
|
|
||||||
|
def iterate(self, recursive: bool) -> Iterator[_AdaptiveFile]:
|
||||||
|
root_path = self._normalize_disk_path(self.base_dir)
|
||||||
|
root_info = self._get_resource_info(root_path)
|
||||||
|
|
||||||
|
if root_info.get("type") == "file":
|
||||||
|
path = root_info["path"]
|
||||||
|
logger.info(f"Found file on Yandex Disk: {path}")
|
||||||
|
yield YandexDiskAdaptiveFile(Path(path).suffix, path, self.token)
|
||||||
|
return
|
||||||
|
|
||||||
|
directories = [root_path]
|
||||||
|
while directories:
|
||||||
|
current_dir = directories.pop(0)
|
||||||
|
for item in self._iter_children(current_dir):
|
||||||
|
item_type = item.get("type")
|
||||||
|
item_path = item.get("path")
|
||||||
|
if item_type == "file":
|
||||||
|
logger.info(f"Found file on Yandex Disk: {item_path}")
|
||||||
|
yield YandexDiskAdaptiveFile(
|
||||||
|
Path(item_path).suffix, item_path, self.token
|
||||||
|
)
|
||||||
|
elif recursive and item_type == "dir":
|
||||||
|
directories.append(item_path)
|
||||||
|
|||||||
1
services/rag/langchain/test/samples/level1/first.md
Normal file
1
services/rag/langchain/test/samples/level1/first.md
Normal file
@@ -0,0 +1 @@
|
|||||||
|
first level
|
||||||
1
services/rag/langchain/test/samples/root.txt
Normal file
1
services/rag/langchain/test/samples/root.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
root file
|
||||||
@@ -0,0 +1,52 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from helpers import LocalFilesystemAdaptiveCollection, LocalFilesystemAdaptiveFile
|
||||||
|
|
||||||
|
|
||||||
|
class TestLocalFilesystemAdaptiveCollection(unittest.TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
self.samples_dir = Path(__file__).parent / "samples"
|
||||||
|
|
||||||
|
def test_iterate_non_recursive_returns_only_root_files(self):
|
||||||
|
collection = LocalFilesystemAdaptiveCollection(str(self.samples_dir))
|
||||||
|
|
||||||
|
files = list(collection.iterate(recursive=False))
|
||||||
|
file_names = sorted(Path(file.local_path).name for file in files)
|
||||||
|
|
||||||
|
self.assertEqual(file_names, ["root.txt"])
|
||||||
|
self.assertTrue(all(isinstance(file, LocalFilesystemAdaptiveFile) for file in files))
|
||||||
|
|
||||||
|
def test_iterate_recursive_returns_nested_files(self):
|
||||||
|
collection = LocalFilesystemAdaptiveCollection(str(self.samples_dir))
|
||||||
|
|
||||||
|
files = list(collection.iterate(recursive=True))
|
||||||
|
relative_paths = sorted(
|
||||||
|
str(Path(file.local_path).relative_to(self.samples_dir)) for file in files
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
relative_paths,
|
||||||
|
["level1/first.md", "level1/level2/second.log", "root.txt"],
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_work_with_file_locally_provides_existing_path(self):
|
||||||
|
target_path = self.samples_dir / "root.txt"
|
||||||
|
adaptive_file = LocalFilesystemAdaptiveFile(target_path.suffix, str(target_path))
|
||||||
|
|
||||||
|
observed = {}
|
||||||
|
|
||||||
|
def callback(path: str):
|
||||||
|
observed["path"] = path
|
||||||
|
with open(path, "r", encoding="utf-8") as handle:
|
||||||
|
observed["content"] = handle.read().strip()
|
||||||
|
|
||||||
|
adaptive_file.work_with_file_locally(callback)
|
||||||
|
|
||||||
|
self.assertEqual(observed["path"], str(target_path))
|
||||||
|
self.assertEqual(observed["content"], "root file")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
import os
|
||||||
|
import unittest
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from loguru import logger
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from helpers import YandexDiskAdaptiveCollection
|
||||||
|
|
||||||
|
load_dotenv(dotenv_path=Path(__file__).resolve().parent.parent / ".env.test")
|
||||||
|
|
||||||
|
|
||||||
|
class TestYandexDiskAdaptiveCollection(unittest.TestCase):
|
||||||
|
def test_constructor_requires_token(self):
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
YandexDiskAdaptiveCollection(token="", base_dir="Общая/Информация")
|
||||||
|
|
||||||
|
def test_iterate_logs_found_files_for_shared_folder(self):
|
||||||
|
token = os.getenv("YADISK_TOKEN")
|
||||||
|
if not token:
|
||||||
|
self.skipTest("YADISK_TOKEN is not configured")
|
||||||
|
|
||||||
|
collection = YandexDiskAdaptiveCollection(
|
||||||
|
token=token,
|
||||||
|
base_dir="Общая/Информация",
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
files = list(collection.iterate(recursive=True))
|
||||||
|
except requests.RequestException as exc:
|
||||||
|
self.skipTest(f"Yandex Disk request failed and needs manual verification: {exc}")
|
||||||
|
|
||||||
|
for item in files:
|
||||||
|
logger.info(f"Yandex file found during test iteration: {item.local_path}")
|
||||||
|
|
||||||
|
self.assertIsInstance(files, list)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
Reference in New Issue
Block a user