Initial commit

2026-02-03 19:24:41 +03:00
commit 351fe27cca
11 changed files with 346 additions and 0 deletions
--- a/.DS_Store
+++ b/.DS_Store
--- a/data/.gitignore
+++ b/data/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
--- a/services/control/go.mod
+++ b/services/control/go.mod
@@ -0,0 +1,3 @@
+module control
+
+go 1.25.5
--- a/services/control/rag/providers/langchain_provider/langchain_provider.go
+++ b/services/control/rag/providers/langchain_provider/langchain_provider.go
@@ -0,0 +1 @@
+package langchain_provider
--- a/services/control/rag/rag.go
+++ b/services/control/rag/rag.go
@@ -0,0 +1,10 @@
+package rag
+
+import (
+	"control/rag/types"
+)
+
+type RagProvider interface { // RagProvider groups the two operations every RAG provider implementation must supply.
+	RegisterDocument(doc types.RagDocumentProvision) error                    // Takes a document provision and reports failure through the returned error; presumably ingests the document into the backend - TODO confirm against implementations.
+	QueryInformation(query types.Query) (result types.QueryResult, err error) // Takes a Query and returns a QueryResult plus an error; presumably performs a retrieval lookup - TODO confirm against implementations.
+}
--- a/services/control/rag/types/types.go
+++ b/services/control/rag/types/types.go
@@ -0,0 +1,12 @@
+package types
+
+type Query struct { // Query is a request value carrying a single text field.
+	Text string // Text is the query string; format and language constraints are not defined here.
+}
+
+type QueryResult struct { // QueryResult is a response value carrying a single text field.
+	Text string // Text is the result string; presumably the answer produced for a Query - TODO confirm with provider implementations.
+}
+
+type RagDocumentProvision struct { // RagDocumentProvision currently declares no fields; NOTE(review): looks like a placeholder for document registration input - confirm intended fields.
+}
--- a/services/rag/langchain/.env.dist
+++ b/services/rag/langchain/.env.dist
@@ -0,0 +1,2 @@
+OLLAMA_EMBEDDING_MODEL=MODEL
+OLLAMA_CHAT_MODEL=MODEL
--- a/services/rag/langchain/.gitignore
+++ b/services/rag/langchain/.gitignore
@@ -0,0 +1,216 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# Redis
+*.rdb
+*.aof
+*.pid
+
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+
+# ActiveMQ
+activemq-data/
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer,
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# Streamlit
+.streamlit/secrets.toml
--- a/services/rag/langchain/PLANNING.md
+++ b/services/rag/langchain/PLANNING.md
@@ -0,0 +1,41 @@
+# Requirements
+
+Libraries should be installed into the local virtual environment, which is defined in the `venv` folder.
+If some libraries are not installed, check online which are best and install them.
+Where possible, log each step using the `loguru` library. Write logs to `logs/dev.log` with log rotation, and also log to stdout.
+
+Chosen RAG framework: Langchain
+Chosen Vector Storage: Qdrant
+Chosen data folder: relative ./../../../data - from the current folder
+
+# Phase 1 (cli entrypoint)
+
+- [x] Create virtual env in the `venv` folder in the current directory.
+- [ ] Create cli.py file, with the usage of `click` python library. Make default command "ping" which will write output "pong"
+
+# Phase 2 (installation of base framework for RAG solution and preparation for data loading)
+
+- [ ] Install langchain as base framework for RAG solution
+- [ ] Analyze the upper `data` folder (./../../../data) to learn all the file extensions present there. Then create a file `EXTENSIONS.md` in the current directory listing each extension together with the loader(s) of the chosen framework needed to load data of that extension (this can be learned online - search for the info). Prioritize libraries that work without external services that require API keys or paid subscriptions. Important: skip streaming media file extensions (audio, video). We are not going to load them now.
+- [ ] Install all needed libraries for loaders, mentioned in the `EXTENSIONS.md`. If some libraries require API keys for external services, add them to the `.env` file (create it if it does not exist)
+
+# Phase 3 (preparation for storing data in the vector storage + embeddings)
+- [ ] Install needed library for using Qdrant connection as vector storage. Ensure ports are used (which are needed in the chosen framework): Rest Api: 6333, gRPC Api: 6334. Database available and running on localhost.
+- [ ] Create a file called `vector_storage.py` containing the vector storage initialization, so that the initialized storage can be imported by other modules. If the chosen RAG framework requires it, add the embedding model initialization in the same file. Use Ollama, with the model name defined in the .env file: OLLAMA_EMBEDDING_MODEL. Ollama is available on the default local port: 11434
+- [ ] Just in case add possibility to connect via openai embedding, using openrouter api key. Comment this section, so it can be used in the future.
+
+# Phase 4 (creating module for loading documents from the folder)
+
+- [ ] Create file `enrichment.py` with the function that will load data with configured data loaders from the data folder into the vector storage. Remember to specify default embeddings meta properties, such as filename, paragraph, page, section, wherever this is possible (documents can have pages, sections, paragraphs, etc). Use text splitters of the chosen RAG framework according to the documents being loaded. Which chunking/text-splitting strategies the framework has can be learned online.
+- [ ] Use a built-in strategy for marking which documents are loaded (if there is such a mechanism) and which are not, to avoid re-reading and re-enriching the vector storage with existing data. If there is no built-in mechanism of this type, install the sqlite library and use a local sqlite database file to store this information.
+- [ ] Add activation of this function in the cli entrypoint, as a command.
+
+# Phase 5 (preparation for the retrieval feature)
+
+- [ ] Create file `retrieval.py` with the configuration for chosen RAG framework, that will retrieve data from the vector storage based on the query. Use retrieving library/plugin, that supports chosen vector storage within the chosen RAG framework. Retrieving configuration should search for the provided text in the query as argument in the function and return found information with the stored meta data, like paragraph, section, page etc.
+
+# Phase 6 (chat feature, as agent, for usage in the cli)
+
+- [ ] Create file `agent.py`, which will contain an agent powered by the chat model. It should use the Ollama integration, with the model specified in .env in the property: OLLAMA_CHAT_MODEL
+- [ ] Integrate this agent with the existing solution for retrieving, with retrieval.py
+- [ ] Integrate this agent with the cli, as command to start chatting with the agent. If there is a built-in solution for console communication with the agent, initiate this on cli command.
--- a/services/rag/langchain/app.py
+++ b/services/rag/langchain/app.py
@@ -0,0 +1 @@
+from langchain.agents import create_agent
--- a/services/rag/langchain/requirements.txt
+++ b/services/rag/langchain/requirements.txt
@@ -0,0 +1,58 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.13.2
+aiosignal==1.4.0
+annotated-types==0.7.0
+anyio==4.12.0
+attrs==25.4.0
+beautifulsoup4==4.14.3
+bs4==0.0.2
+certifi==2025.11.12
+charset-normalizer==3.4.4
+click==8.3.1
+dataclasses-json==0.6.7
+frozenlist==1.8.0
+h11==0.16.0
+httpcore==1.0.9
+httpx==0.28.1
+httpx-sse==0.4.3
+idna==3.11
+jsonpatch==1.33
+jsonpointer==3.0.0
+langchain==1.2.0
+langchain-classic==1.0.1
+langchain-community==0.4.1
+langchain-core==1.2.5
+langchain-ollama==1.0.1
+langchain-text-splitters==1.1.0
+langgraph==1.0.5
+langgraph-checkpoint==3.0.1
+langgraph-prebuilt==1.0.5
+langgraph-sdk==0.3.1
+langsmith==0.5.2
+marshmallow==3.26.2
+multidict==6.7.0
+mypy_extensions==1.1.0
+numpy==2.4.0
+ollama==0.6.1
+orjson==3.11.5
+ormsgpack==1.12.1
+packaging==25.0
+propcache==0.4.1
+pydantic==2.12.5
+pydantic-settings==2.12.0
+pydantic_core==2.41.5
+python-dotenv==1.2.1
+PyYAML==6.0.3
+requests==2.32.5
+requests-toolbelt==1.0.0
+soupsieve==2.8.1
+SQLAlchemy==2.0.45
+tenacity==9.1.2
+typing-inspect==0.9.0
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.6.2
+uuid_utils==0.12.0
+xxhash==3.6.0
+yarl==1.22.0
+zstandard==0.25.0