From 92d637fa595a7658ab282fa8e50f669c0fef4297 Mon Sep 17 00:00:00 2001 From: Kamikaza731 Date: Wed, 11 Dec 2024 17:19:43 +0100 Subject: [PATCH 1/3] Introduce prometheus into TNOM --- poetry.lock | 138 +++++++++++++++++++++- pyproject.toml | 3 + tnom/main.py | 10 +- tnom/prometheus_client_endpoint.py | 182 +++++++++++++++++++++++++++++ 4 files changed, 331 insertions(+), 2 deletions(-) create mode 100644 tnom/prometheus_client_endpoint.py diff --git a/poetry.lock b/poetry.lock index b1976ce..feef57b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -398,6 +398,26 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "fastapi" +version = "0.115.6" +description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +optional = false +python-versions = ">=3.8" +files = [ + {file = "fastapi-0.115.6-py3-none-any.whl", hash = "sha256:e9240b29e36fa8f4bb7290316988e90c381e5092e0cbe84e7818cc3713bcf305"}, + {file = "fastapi-0.115.6.tar.gz", hash = "sha256:9ec46f7addc14ea472958a96aae5b5de65f39721a46aaf5705c480d9a8b76654"}, +] + +[package.dependencies] +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0 || >2.0.0,<2.0.1 || >2.0.1,<2.1.0 || >2.1.0,<3.0.0" +starlette = ">=0.40.0,<0.42.0" +typing-extensions = ">=4.8.0" + +[package.extras] +all = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "pydantic-extra-types (>=2.0.0)", "pydantic-settings (>=2.0.0)", "python-multipart (>=0.0.7)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] +standard = ["email-validator (>=2.0.0)", "fastapi-cli[standard] (>=0.0.5)", "httpx (>=0.23.0)", "jinja2 (>=2.11.2)", "python-multipart (>=0.0.7)", "uvicorn[standard] (>=0.12.0)"] + [[package]] name = "frozenlist" version = "1.5.0" @@ -510,6 
+530,32 @@ files = [ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] +[[package]] +name = "h2" +version = "4.1.0" +description = "HTTP/2 State-Machine based protocol implementation" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, + {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, +] + +[package.dependencies] +hpack = ">=4.0,<5" +hyperframe = ">=6.0,<7" + +[[package]] +name = "hpack" +version = "4.0.0" +description = "Pure-Python HPACK header compression" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, + {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, +] + [[package]] name = "httpcore" version = "1.0.7" @@ -556,6 +602,40 @@ http2 = ["h2 (>=3,<5)"] socks = ["socksio (==1.*)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "hypercorn" +version = "0.17.3" +description = "A ASGI Server based on Hyper libraries and inspired by Gunicorn" +optional = false +python-versions = ">=3.8" +files = [ + {file = "hypercorn-0.17.3-py3-none-any.whl", hash = "sha256:059215dec34537f9d40a69258d323f56344805efb462959e727152b0aa504547"}, + {file = "hypercorn-0.17.3.tar.gz", hash = "sha256:1b37802ee3ac52d2d85270700d565787ab16cf19e1462ccfa9f089ca17574165"}, +] + +[package.dependencies] +h11 = "*" +h2 = ">=3.1.0" +priority = "*" +wsproto = ">=0.14.0" + +[package.extras] +docs = ["pydata_sphinx_theme", "sphinxcontrib_mermaid"] +h3 = ["aioquic (>=0.9.0,<1.0)"] +trio = ["trio (>=0.22.0)"] +uvloop = ["uvloop (>=0.18)"] + +[[package]] +name = "hyperframe" +version = "6.0.1" +description = "HTTP/2 framing layer for Python" +optional = false 
+python-versions = ">=3.6.1" +files = [ + {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, + {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, +] + [[package]] name = "idna" version = "3.10" @@ -814,6 +894,31 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "priority" +version = "2.0.0" +description = "A pure-Python implementation of the HTTP/2 priority tree" +optional = false +python-versions = ">=3.6.1" +files = [ + {file = "priority-2.0.0-py3-none-any.whl", hash = "sha256:6f8eefce5f3ad59baf2c080a664037bb4725cd0a790d53d59ab4059288faf6aa"}, + {file = "priority-2.0.0.tar.gz", hash = "sha256:c965d54f1b8d0d0b19479db3924c7c36cf672dbf2aec92d43fbdaf4492ba18c0"}, +] + +[[package]] +name = "prometheus-client" +version = "0.21.1" +description = "Python client for the Prometheus monitoring system." +optional = false +python-versions = ">=3.8" +files = [ + {file = "prometheus_client-0.21.1-py3-none-any.whl", hash = "sha256:594b45c410d6f4f8888940fe80b5cc2521b305a1fafe1c58609ef715a001f301"}, + {file = "prometheus_client-0.21.1.tar.gz", hash = "sha256:252505a722ac04b0456be05c05f75f45d760c2911ffc45f2a06bcaed9f3ae3fb"}, +] + +[package.extras] +twisted = ["twisted"] + [[package]] name = "propcache" version = "0.2.0" @@ -1236,6 +1341,23 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "starlette" +version = "0.41.3" +description = "The little ASGI library that shines." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "starlette-0.41.3-py3-none-any.whl", hash = "sha256:44cedb2b7c77a9de33a8b74b2b90e9f50d11fcf25d8270ea525ad71a25374ff7"}, + {file = "starlette-0.41.3.tar.gz", hash = "sha256:0e4ab3d16522a255be6b28260b938eae2482f98ce5cc934cb08dce8dc3ba5835"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart (>=0.0.7)", "pyyaml"] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1264,6 +1386,20 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "wsproto" +version = "1.2.0" +description = "WebSockets state-machine based protocol implementation" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "wsproto-1.2.0-py3-none-any.whl", hash = "sha256:b9acddd652b585d75b20477888c56642fdade28bdfd3579aa24a4d2c037dd736"}, + {file = "wsproto-1.2.0.tar.gz", hash = "sha256:ad565f26ecb92588a3e43bc3d96164de84cd9902482b130d0ddbaa9664a85065"}, +] + +[package.dependencies] +h11 = ">=0.9.0,<1" + [[package]] name = "yarl" version = "1.17.2" @@ -1475,4 +1611,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.11,<3.14" -content-hash = "e730e810b1611070e581eac7ca544cd7d17c50b61b514b1455041cc4fbeeff89" +content-hash = "1e069e3f65d6892cac06837d82bf01ba23c10976b3b1757da12566bcd390e3e7" diff --git a/pyproject.toml b/pyproject.toml index 6ae35f7..219d0b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,9 @@ aiohttp = "^3.11.6" numpy = "^2.1.3" pydantic = "^2.9.2" argparse = "^1.4.0" +fastapi = "^0.115.6" +hypercorn = "^0.17.3" +prometheus-client = "^0.21.1" [tool.poetry.group.dev.dependencies] diff --git a/tnom/main.py b/tnom/main.py index 7e29927..844548f 100644 --- a/tnom/main.py +++ b/tnom/main.py @@ -8,6 +8,7 @@ import config_load import database_handler import dead_man_switch +import prometheus_client_endpoint 
as prom import query_rand_api from check_apis import check_apis from set_up_db import init_and_check_db @@ -647,7 +648,14 @@ async def health_check_task() -> None: if alert_yml["health_check_enabled"]: tasks.append(health_check_task()) - + if alert_yml["prometheus_client_enabled"] is True: + # acquire latest epoch data + latest_epoch :int = database_handler.read_last_recorded_epoch(database_path) + prometheus = prom.PrometheusMetrics(database_path, latest_epoch) + prometheus_host = alert_yml["prometheus_host"] + prometheus_port = alert_yml["prometheus_port"] + tasks.append( + prom.start_metrics_server(prometheus, prometheus_host, prometheus_port)) try: await asyncio.gather(*tasks, return_exceptions=True) except KeyboardInterrupt: diff --git a/tnom/prometheus_client_endpoint.py b/tnom/prometheus_client_endpoint.py new file mode 100644 index 0000000..88701c4 --- /dev/null +++ b/tnom/prometheus_client_endpoint.py @@ -0,0 +1,182 @@ +"""Prometheus client for exposing TNOM metrics. + +It has one class: + - PrometheusMetrics: Provides Prometheus metrics for the monitoring system. + Which has 2 funcs: + - __init__: Initialize the Prometheus metrics. + - update_metrics: Update the metrics from the database. + +And two functions to set up FastAPI: + - create_prometheus_client: Create and return a FastAPI app with Prometheus metrics. + - start_metrics_server: Start the Prometheus metrics server. +""" + +from pathlib import Path + +import hypercorn.asyncio +import hypercorn.config +from database_handler import read_current_epoch_data +from fastapi import FastAPI +from prometheus_client import Counter, Gauge, make_asgi_app + + +class PrometheusMetrics: + """Provides Prometheus metrics for the monitoring system. + + This class is responsible for updating and exposing Prometheus metrics + for the monitoring system. + + """ + + def __init__(self, db_path: Path, epoch: int) -> None: + """Initialize the Prometheus metrics. + + Args: + db_path (Path): Path to the database file. 
+ epoch (int): The current epoch number. + + """ + self.db_path = db_path + self.epoch = epoch + + namespace = "nibiru_oracle" + + self.slash_epoch = Gauge( + f"{namespace}_slash_epoch", + "Current epoch", + ) + + self.miss_counter_events = Gauge( + f"{namespace}_miss_counter_events", + "Total number of miss counter events", + ) + + self.miss_counter_events_p1_executed = Counter( + f"{namespace}_miss_counter_events_p1_executed", + "P1 alert executed", + ) + + self.miss_counter_events_p2_executed = Counter( + f"{namespace}_miss_counter_events_p2_executed", + "P2 alert executed", + ) + + self.miss_counter_events_p3_executed = Counter( + f"{namespace}_miss_counter_events_p3_executed", + "P3 alert executed", + ) + + self.unsigned_oracle_events = Gauge( + f"{namespace}_unsigned_oracle_events", + "Total number of unsigned oracle events", + ) + + self.price_feed_addr_balance = Gauge( + f"{namespace}_price_feed_balance", + "Price feed wallet unibi balance", + ) + + self.small_balance_alert = Counter( + f"{namespace}_small_balance_alert_executed", + "Small balance alert executed", + ) + + self.very_small_balance_alert = Counter( + f"{namespace}_very_small_balance_alert_executed", + "Very small balance alert executed", + ) + + self.consecutive_misses = Gauge( + f"{namespace}_consecutive_misses", + "Consecutive unsigned events.", + ) + self.api_cons_miss = Gauge( + f"{namespace}_api_cons_miss", + "API detected as not working.", + ) + + def update_metrics(self) -> None: + """Update the metrics from the database. + + Read the current epoch data from the database and update all metrics. + If there is an error reading the data, raise a ValueError. 
+ """ + try: + data = read_current_epoch_data(self.db_path, self.epoch) + + # Gauge data + self.slash_epoch.set(data["slash_epoch"]) + self.miss_counter_events.set(data["miss_counter_events"]) + self.unsigned_oracle_events.set(data["unsigned_oracle_events"]) + self.price_feed_addr_balance.set(data["price_feed_addr_balance"]) + self.consecutive_misses.set(data["consecutive_misses"]) + self.api_cons_miss.set(data["api_cons_miss"]) + + # Counter data + self.miss_counter_events_p1_executed.inc( + data["miss_counter_p1_executed"]) + self.miss_counter_events_p2_executed.inc( + data["miss_counter_p2_executed"]) + self.miss_counter_events_p3_executed.inc( + data["miss_counter_p3_executed"]) + self.small_balance_alert.inc(data["small_balance_alert_executed"]) + self.very_small_balance_alert.inc( + data["very_small_balance_alert_executed"]) + except KeyError as e: + msg = f"Missing data field for metrics update: {e}" + raise ValueError(msg) from e + except Exception as e: + msg = f"Error updating metrics: {e}" + raise ValueError(msg) from e + +def create_prometheus_client(metrics: PrometheusMetrics) -> FastAPI: + """Create and return a FastAPI app with Prometheus metrics. + + This function initializes a FastAPI application and mounts a Prometheus + metrics endpoint at '/metrics'. It updates the provided PrometheusMetrics + instance before creating the ASGI app for Prometheus. + + Args: + metrics (PrometheusMetrics): An instance of PrometheusMetrics to update + and manage the metrics. + + Returns: + FastAPI: A FastAPI application with Prometheus metrics endpoint mounted. + + """ + app = FastAPI() + metrics.update_metrics() + metrics_app = make_asgi_app() + app.mount ("/metrics", metrics_app) + return app + +async def start_metrics_server( + metrics: PrometheusMetrics, + prometheus_host: str, + prometheus_port: int, +) -> None: + """Start the Prometheus metrics server. + + This function starts a metrics server using the Hypercorn ASGI server. 
+ The metrics are collected from the given `metrics` object. + The server will listen on the given `prometheus_port` at the given + `prometheus_host` and on localhost:7130. + + Args: + metrics (PrometheusMetrics): The metrics object to collect metrics from. + prometheus_port (int): The port to listen on. + prometheus_host (str): The hostname to listen on. + + """ + config = hypercorn.config.Config() + config.bind = [] + if prometheus_host and prometheus_port: + config.bind.append(f"{prometheus_host}:{prometheus_port}") + else: + config.bind.append("127.0.0.1:7130") + config.graceful_timeout = 5 + config.workers = 1 + config.shutdown_timeout = 10 + + app = create_prometheus_client(metrics) + await hypercorn.asyncio.serve(app, config, mode="asgi") From 05225a41cc41ae9b461bc3bdbfb5e5845d5b6b2c Mon Sep 17 00:00:00 2001 From: Kamikaza731 Date: Wed, 11 Dec 2024 21:18:40 +0100 Subject: [PATCH 2/3] Small adjustment to Fast API and prometheus client, removed docs from the API, fixed prometheus from blocking TNOM from shutting down, fixed all jobs in the events the tasks were not shutting down, small adjustment to the dockerfile, exposed default prometheus port, --- .gitignore | 3 +- Dockerfile | 4 +- alert_example.yml | 16 +- pyproject.toml | 2 +- tnom/dead_man_switch/health_check.py | 69 +++-- tnom/main.py | 415 ++++++++++++++++----------- tnom/prometheus_client_endpoint.py | 38 ++- 7 files changed, 348 insertions(+), 199 deletions(-) diff --git a/.gitignore b/.gitignore index 20837e7..265224b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ build dist tnom.spec chain_database -tnom-exec \ No newline at end of file +tnom-exec +template \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index f160254..deb9b2f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,4 +26,6 @@ RUN useradd -m pythonuser USER pythonuser # Specify everything just in case -CMD ["python", "/app/tnom/main.py", "--working-dir", "/app", "--config-path", "/app/config/config.yml", 
"--alert-path", "/app/config/alert.yml"] \ No newline at end of file +CMD ["python", "/app/tnom/main.py", "--working-dir", "/app", "--config-path", "/app/config/config.yml", "--alert-path", "/app/config/alert.yml"] + +EXPOSE 7130 \ No newline at end of file diff --git a/alert_example.yml b/alert_example.yml index fd6e985..a514978 100644 --- a/alert_example.yml +++ b/alert_example.yml @@ -1,7 +1,13 @@ # Enable which alerts you wants to use. You can use both telegram and pagerduty # at the same time or only one of them. -telegram_alerts: false # Set to true if you want to enable telegram alerts -pagerduty_alerts: false # Set to true if you want to enable pagerduty alerts +telegram_alerts: true # Set to true if you want to enable telegram alerts + +pagerduty_alerts: true # Set to true if you want to enable pagerduty alerts + +# This is totally optional. You do not need to use it if you do now want to. +health_check_enabled: true # Set to true if you want to enable health check + +prometheus_client_enabled: true # Set to true if you want to enable prometheus # Telegram telegram_bot_token: "your_telegram_bot_token" @@ -11,7 +17,9 @@ telegram_chat_id: "your_telegram_chat_id" pagerduty_routing_key: "your_pagerduty_routing_key" # Health check -# This is totally optional. You do not need to use it if you do now want to. -health_check_enabled: false # Set to true if you want to enable health check health_check_url: "https://hc-ping.com/1234567890abcdef" health_check_interval: 180 + +# Prometheus +prometheus_host: "127.0.0.1" # or "0.0.0.0" if you need open access +prometheus_port: "7125" # you can set to which ever port is open \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 219d0b7..6078a9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "tnom" -version = "0.4.2" +version = "0.5.0" description = "The Nibiru Oracle Monitoring is a tool for monitoring signer wallet for Nibiru Oracle." 
authors = ["Kamikaza731"] readme = "README.md" diff --git a/tnom/dead_man_switch/health_check.py b/tnom/dead_man_switch/health_check.py index a011a37..2b8cd2b 100644 --- a/tnom/dead_man_switch/health_check.py +++ b/tnom/dead_man_switch/health_check.py @@ -12,33 +12,43 @@ """ from __future__ import annotations +import asyncio +import contextlib import logging -import time +from http import HTTPStatus -import requests -import schedule +import aiohttp -def dead_man_switch_trigger(dead_man_switch_url: str) -> None: - """Triggers a dead man switch at the given URL. +async def dead_man_switch_trigger(url: str) -> None: + """Async function to trigger dead man switch. Args: - dead_man_switch_url (str): The URL of the dead man switch to trigger. + url (str): The URL to trigger the dead man switch. Returns: None """ try: - requests.head(dead_man_switch_url, timeout=1) - except requests.RequestException: - logging.exception("Failed to trigger dead man switch.") + async with (aiohttp.ClientSession() as session, + session.get(url, timeout=10) as response): + if response.status == HTTPStatus.OK: + logging.info("Health check ping successful.") + else: + logging.warning( + "Health check ping failed. Status code: %s", response.status) + except Exception as e: + logging.exception("Error in health check ping: %s", e) # noqa: TRY401 + + -# TO DO TEST THIS FUNC OUT -def run_health_check( +async def run_health_check( dead_man_switch_url: str, interval: int, - max_iterations: int | None) -> None: + max_iterations: int | None, + shutdown_event: asyncio.Event, + ) -> None: """Triggers a dead man switch at the given URL at regular intervals. Args: @@ -46,6 +56,8 @@ def run_health_check( interval (int): The interval in seconds between each dead man switch trigger. max_iterations (int | None): The maximum number of iterations. If None, the loop will run indefinitely. + shutdown_event (asyncio.Event): The event to signal when the loop should + stop. 
Returns: None @@ -65,13 +77,28 @@ def run_health_check( msg = "interval must be greater than zero" raise ValueError(msg) - schedule.every(interval).seconds.do(dead_man_switch_trigger, dead_man_switch_url) - iteration = 0 - while max_iterations is None or iteration < max_iterations: - schedule.run_pending() - time.sleep(1.0) - iteration += 1 - - # Clear the scheduled job when done - schedule.clear() + try: + while not shutdown_event.is_set(): + # Check for max iterations if specified + if max_iterations is not None and iteration >= max_iterations: + break + + # Trigger health check + await dead_man_switch_trigger(dead_man_switch_url) + + # Wait for the interval or until shutdown is signaled + with contextlib.suppress(asyncio.TimeoutError): + await asyncio.wait_for( + shutdown_event.wait(), + timeout=interval, + ) + + iteration += 1 + + except asyncio.CancelledError: + logging.info("Health check task was cancelled.") + except Exception as e: + logging.exception("Error in health check: %s", e) # noqa: TRY401 + finally: + logging.info("Health check task shutting down.") diff --git a/tnom/main.py b/tnom/main.py index 844548f..a3437bb 100644 --- a/tnom/main.py +++ b/tnom/main.py @@ -1,6 +1,7 @@ import argparse import asyncio import logging +import signal import sys from pathlib import Path @@ -389,7 +390,7 @@ def setup_argument_parser() -> argparse.ArgumentParser: working_dir = Path.cwd() parser.add_argument( - "--working-dir", # Changed from working_dir to working-dir for consistency + "--working-dir", type=str, help="The working directory for config files and database\n" "Default: current working directory", @@ -418,7 +419,25 @@ def setup_argument_parser() -> argparse.ArgumentParser: parser.add_argument( "--version", action="version", - version="v0.4.2", + version="v0.5.0", + ) + + parser.add_argument( + "--prometheus-host", + type=str, + help="Prometheus host to run on\n" + "Overrides host in alert config file\n" + "Default: 127.0.0.1 if not specified in config", + 
required=False, + ) + + parser.add_argument( + "--prometheus-port", + type=int, + help="Prometheus port to run on\n" + "Overrides port in alert config file\n" + "Default: 7130 if not specified in config", + required=False, ) return parser @@ -471,10 +490,42 @@ async def main() -> None: logging.error("No alerts are enabled! Please enable at least one alert system.") sys.exit(1) + # Determine Prometheus host and port + prometheus_host : str = (args.prometheus_host + or alert_yml.get("prometheus_host")) + prometheus_port : int = (args.prometheus_port + or alert_yml.get("prometheus_port")) + monitoring_system = MonitoringSystem(config_yml, alert_yml, database_path) - # Create event loop - loop = asyncio.get_event_loop() + shutdown_event = asyncio.Event() + + async def graceful_shutdown(tasks: list) -> None: + """Perform a more controlled shutdown of tasks. + + Args: + tasks (list): List of running tasks to shut down + + """ + if not tasks: + return + + # First, cancel all tasks + for task in tasks: + if not task.done(): + task.cancel() + + # Wait for tasks to complete, but with a timeout + try: + await asyncio.wait_for( + asyncio.gather(*tasks, return_exceptions=True), + timeout=10.0, + # Set timeout to 10 secounds, maybe set to 60 in the future? + ) + except asyncio.TimeoutError: + logging.warning("Some tasks did not shut down within the timeout.") + except Exception as e: + logging.exception("Error during shutdown: %s", e) async def monitoring_loop() -> None: """The main loop of the monitoring system. @@ -486,138 +537,145 @@ async def monitoring_loop() -> None: If an exception is raised, the loop will log the exception and sleep for 10 seconds before continuing. 
""" - while True: - try: - # Step three - check APIs - latest_epoch = ( - database_handler.read_last_recorded_epoch(database_path)) - healthy_apis = await check_apis(config_yml) - while not healthy_apis: - logging.error("Failed to check APIs") - await monitoring_system.process_api_not_working( - latest_epoch, no_healthy_apis=True) - # stop the script here and start from while True again until there - # is a healthy api - await asyncio.sleep(config_yml.get("monitoring_interval", 60)) + try: + while True: + try: + # Step three - check APIs + latest_epoch = ( + database_handler.read_last_recorded_epoch(database_path)) healthy_apis = await check_apis(config_yml) - # this is needed to revert the consecutive_misses counter - if healthy_apis: - await monitoring_system.process_api_not_working( - latest_epoch, no_healthy_apis=False) - - # Step four - Make query with random healthy API - query_results = await query_rand_api.collect_data_from_random_healthy_api( # noqa: E501 - healthy_apis, config_yml) - - # Process query data - query_data = { - "miss_counter": query_results["miss_counter"], - "check_for_aggregate_votes": query_results["check_for_aggregate_votes"], # noqa: E501 - "current_epoch": query_results["current_epoch"], - "wallet_balance": query_results["wallet_balance"], - } - # Step five - Write data to database - # Check if the current_epoch exists in the database - if database_handler.check_if_epoch_is_recorded( - database_path, query_data["current_epoch"]): - logging.info("Writing data in the existing epoch") - # Step 5.2a - Update the existing db with data - - # read current epoch data - read_crw_data: dict = database_handler.read_current_epoch_data( - database_path, query_data["current_epoch"]) - db_unsigned_or_ev: int = read_crw_data["unsigned_oracle_events"] - db_small_bal_alert: int = ( - read_crw_data["small_balance_alert_executed"]) - db_very_small_bal_alert: int = ( - read_crw_data["very_small_balance_alert_executed"]) - db_consecutive_misses: int = 
read_crw_data["consecutive_misses"] - db_miss_counter_p1_executed : int = read_crw_data[ - "miss_counter_p1_executed"] - db_miss_counter_p2_executed : int = read_crw_data[ - "miss_counter_p2_executed"] - db_miss_counter_p3_executed : int = read_crw_data[ - "miss_counter_p3_executed"] - db_api_cons_miss : int = read_crw_data["api_cons_miss"] - # if the check failed the return should be false adding +1 to not - # signing events - if query_data["check_for_aggregate_votes"] is False: - db_unsigned_or_ev += 1 - logging.info(f"Incrementing unsigned events to: {db_unsigned_or_ev}") - insert_data: dict[int] = { - "slash_epoch": query_data["current_epoch"], - "miss_counter_events": query_data["miss_counter"], - "miss_counter_p1_executed": db_miss_counter_p1_executed, - "miss_counter_p2_executed": db_miss_counter_p2_executed, - "miss_counter_p3_executed": db_miss_counter_p3_executed, - "unsigned_oracle_events": db_unsigned_or_ev, - "price_feed_addr_balance": query_data["wallet_balance"], - "small_balance_alert_executed": db_small_bal_alert, - "very_small_balance_alert_executed": db_very_small_bal_alert, - "consecutive_misses": db_consecutive_misses, - "api_cons_miss": db_api_cons_miss, + while not healthy_apis: + logging.error("Failed to check APIs") + await monitoring_system.process_api_not_working( + latest_epoch, no_healthy_apis=True) + # stop the script here and start from while True again until there + # is a healthy api + await asyncio.sleep(config_yml.get("monitoring_interval", 60)) + healthy_apis = await check_apis(config_yml) + # this is needed to revert the consecutive_misses counter + if healthy_apis: + await monitoring_system.process_api_not_working( + latest_epoch, no_healthy_apis=False) + + # Step four - Make query with random healthy API + query_results = await query_rand_api.collect_data_from_random_healthy_api( # noqa: E501 + healthy_apis, config_yml) + + # Process query data + query_data = { + "miss_counter": query_results["miss_counter"], + 
"check_for_aggregate_votes": query_results["check_for_aggregate_votes"], # noqa: E501 + "current_epoch": query_results["current_epoch"], + "wallet_balance": query_results["wallet_balance"], } - database_handler.write_epoch_data(database_path, insert_data) - elif database_handler.check_if_epoch_is_recorded( - database_path, query_data["current_epoch"]) is False: - logging.info("Writing data in a new epoch") - # Step 5.2b if no db, no current epoch entered or just starting for - # first time - - # make the new entry in the db - - # check if there is a previous entry + # Step five - Write data to database + # Check if the current_epoch exists in the database if database_handler.check_if_epoch_is_recorded( - database_path, query_data["current_epoch"] - 1): - read_prev_crw_data : dict = database_handler.read_current_epoch_data( # noqa: E501 - database_path, query_data["current_epoch"] - 1) - if (read_prev_crw_data["slash_epoch"] - == query_data["current_epoch"] - 1): - prev_small_bal_alert: int = read_prev_crw_data[ - "small_balance_alert_executed"] - prev_very_small_bal_alert: int = read_prev_crw_data[ - "very_small_balance_alert_executed"] - prev_consecutive_misses: int = read_prev_crw_data[ - "consecutive_misses"] + database_path, query_data["current_epoch"]): + logging.info("Writing data in the existing epoch") + # Step 5.2a - Update the existing db with data + + # read current epoch data + read_crw_data: dict = database_handler.read_current_epoch_data( + database_path, query_data["current_epoch"]) + db_unsigned_or_ev: int = read_crw_data["unsigned_oracle_events"] + db_small_bal_alert: int = ( + read_crw_data["small_balance_alert_executed"]) + db_very_small_bal_alert: int = ( + read_crw_data["very_small_balance_alert_executed"]) + db_consecutive_misses: int = read_crw_data["consecutive_misses"] + db_miss_counter_p1_executed : int = read_crw_data[ + "miss_counter_p1_executed"] + db_miss_counter_p2_executed : int = read_crw_data[ + "miss_counter_p2_executed"] + 
db_miss_counter_p3_executed : int = read_crw_data[ + "miss_counter_p3_executed"] + db_api_cons_miss : int = read_crw_data["api_cons_miss"] + # if the check failed the return should be false adding +1 to not + # signing events + if query_data["check_for_aggregate_votes"] is False: + db_unsigned_or_ev += 1 + logging.info(f"Incrementing unsigned events to: {db_unsigned_or_ev}") + insert_data: dict[int] = { + "slash_epoch": query_data["current_epoch"], + "miss_counter_events": query_data["miss_counter"], + "miss_counter_p1_executed": db_miss_counter_p1_executed, + "miss_counter_p2_executed": db_miss_counter_p2_executed, + "miss_counter_p3_executed": db_miss_counter_p3_executed, + "unsigned_oracle_events": db_unsigned_or_ev, + "price_feed_addr_balance": query_data["wallet_balance"], + "small_balance_alert_executed": db_small_bal_alert, + "very_small_balance_alert_executed": db_very_small_bal_alert, + "consecutive_misses": db_consecutive_misses, + "api_cons_miss": db_api_cons_miss, + } + database_handler.write_epoch_data(database_path, insert_data) elif database_handler.check_if_epoch_is_recorded( - database_path, query_data["current_epoch"] - 1) is False: - prev_small_bal_alert = 0 - prev_very_small_bal_alert = 0 - prev_consecutive_misses = 0 - insert_data: dict[int] = { - "slash_epoch": query_data["current_epoch"], - "miss_counter_events": query_data["miss_counter"], - "miss_counter_p1_executed": 0, - "miss_counter_p2_executed": 0, - "miss_counter_p3_executed": 0, - "unsigned_oracle_events": 0, - "price_feed_addr_balance": query_data["wallet_balance"], - "small_balance_alert_executed": prev_small_bal_alert, - "very_small_balance_alert_executed": prev_very_small_bal_alert, - "consecutive_misses": prev_consecutive_misses, - "api_cons_miss": 0, - } - database_handler.write_epoch_data(database_path, insert_data) - # Process alerts - await monitoring_system.process_balance_alerts(query_data, insert_data) - await monitoring_system.process_signing_alerts( - 
query_data["current_epoch"], - query_data, - insert_data["unsigned_oracle_events"], - ) - await monitoring_system.process_miss_parameter_alerts( - query_data, insert_data, - ) - - # Sleep for interval - await asyncio.sleep(config_yml.get("monitoring_interval", 60)) - except asyncio.CancelledError: # noqa: PERF203 - logging.info("Monitoring loop cancelled.") - raise - except Exception as e: - logging.exception("Error in monitoring loop: %s", e) # noqa: TRY401 - await asyncio.sleep(10) - # To do reaserch RuffPERF203 + database_path, query_data["current_epoch"]) is False: + logging.info("Writing data in a new epoch") + # Step 5.2b if no db, no current epoch entered or just starting for + # first time + + # make the new entry in the db + + # check if there is a previous entry + if database_handler.check_if_epoch_is_recorded( + database_path, query_data["current_epoch"] - 1): + read_prev_crw_data : dict = database_handler.read_current_epoch_data( # noqa: E501 + database_path, query_data["current_epoch"] - 1) + if (read_prev_crw_data["slash_epoch"] + == query_data["current_epoch"] - 1): + prev_small_bal_alert: int = read_prev_crw_data[ + "small_balance_alert_executed"] + prev_very_small_bal_alert: int = read_prev_crw_data[ + "very_small_balance_alert_executed"] + prev_consecutive_misses: int = read_prev_crw_data[ + "consecutive_misses"] + elif database_handler.check_if_epoch_is_recorded( + database_path, query_data["current_epoch"] - 1) is False: + prev_small_bal_alert = 0 + prev_very_small_bal_alert = 0 + prev_consecutive_misses = 0 + insert_data: dict[int] = { + "slash_epoch": query_data["current_epoch"], + "miss_counter_events": query_data["miss_counter"], + "miss_counter_p1_executed": 0, + "miss_counter_p2_executed": 0, + "miss_counter_p3_executed": 0, + "unsigned_oracle_events": 0, + "price_feed_addr_balance": query_data["wallet_balance"], + "small_balance_alert_executed": prev_small_bal_alert, + "very_small_balance_alert_executed": prev_very_small_bal_alert, + 
"consecutive_misses": prev_consecutive_misses, + "api_cons_miss": 0, + } + database_handler.write_epoch_data(database_path, insert_data) + # Process alerts + await monitoring_system.process_balance_alerts(query_data, insert_data) + await monitoring_system.process_signing_alerts( + query_data["current_epoch"], + query_data, + insert_data["unsigned_oracle_events"], + ) + await monitoring_system.process_miss_parameter_alerts( + query_data, insert_data, + ) + # Check for shutdown between major operations + if shutdown_event.is_set(): + break + # Sleep for interval + await asyncio.sleep(config_yml.get("monitoring_interval", 60)) + except Exception as e: + if not shutdown_event.is_set(): + logging.exception("Error in monitoring loop: %s", e) + await asyncio.sleep(10) + else: + break + except asyncio.CancelledError: + logging.info("Monitoring loop cancelled.") + finally: + logging.info("Monitoring loop shutting down gracefully.") + # To do reaserch RuffPERF203 async def health_check_task() -> None: """Runs the health check task asynchronously. 
@@ -631,46 +689,67 @@ async def health_check_task() -> None: """ try: - await asyncio.to_thread( - dead_man_switch.run_health_check, - alert_yml["health_check_url"], - alert_yml["health_check_interval"], - None, - ) if alert_yml["health_check_enabled"] else None + if alert_yml["health_check_enabled"]: + await dead_man_switch.run_health_check( + alert_yml["health_check_url"], + alert_yml["health_check_interval"], + None, + shutdown_event, + ) except asyncio.CancelledError: logging.info("Health check task cancelled.") - raise - - # Run monitoring and health check concurrently - tasks = [ - monitoring_loop(), - ] - - if alert_yml["health_check_enabled"]: - tasks.append(health_check_task()) - if alert_yml["prometheus_client_enabled"] is True: - # acquire latest epoch data - latest_epoch :int = database_handler.read_last_recorded_epoch(database_path) - prometheus = prom.PrometheusMetrics(database_path, latest_epoch) - prometheus_host = alert_yml["prometheus_host"] - prometheus_port = alert_yml["prometheus_port"] - tasks.append( - prom.start_metrics_server(prometheus, prometheus_host, prometheus_port)) + finally: + logging.info("Health check task shutting down gracefully.") + + # Create a single shutdown handler + def handle_shutdown() -> None: + """Shutdown signal handler. + + Set the shutdown event when a shutdown signal is received, + unless the event is already set. + """ + if not shutdown_event.is_set(): + logging.info("Shutdown signal received. 
Stopping tasks...") + shutdown_event.set() + + # Setup signal handlers + loop = asyncio.get_event_loop() + for sig in (signal.SIGINT, signal.SIGTERM): + loop.add_signal_handler(sig, handle_shutdown) + + # Prepare tasks + tasks = [] try: - await asyncio.gather(*tasks, return_exceptions=True) - except KeyboardInterrupt: - logging.info("Interrupt received, shutting down...") - finally: - # Cancel all tasks - for task in asyncio.all_tasks(): - if task is not asyncio.current_task(): - task.cancel() - loop.close() + # Create tasks + monitoring_task = asyncio.create_task(monitoring_loop()) + tasks.append(monitoring_task) + + if alert_yml["health_check_enabled"]: + health_check_task_obj = asyncio.create_task(health_check_task()) + tasks.append(health_check_task_obj) + + if alert_yml["prometheus_client_enabled"]: + latest_epoch = database_handler.read_last_recorded_epoch(database_path) + prometheus = prom.PrometheusMetrics(database_path, latest_epoch) + prometheus_task = asyncio.create_task( + prom.start_metrics_server( + prometheus, + prometheus_host, + prometheus_port, + shutdown_event, + ), + ) + tasks.append(prometheus_task) + + # Wait for tasks or shutdown + await shutdown_event.wait() - # Wait for all tasks to be cancelled - await asyncio.gather(*asyncio.all_tasks(), return_exceptions=True) - logging.info("All tasks cancelled successfully.") - # TO do fix task cancelation + except Exception as e: + logging.exception("Unexpected error: %s", e) + finally: + # Perform graceful shutdown + await graceful_shutdown(tasks) + logging.info("All tasks stopped successfully.") if __name__ == "__main__": try: diff --git a/tnom/prometheus_client_endpoint.py b/tnom/prometheus_client_endpoint.py index 88701c4..59ae06c 100644 --- a/tnom/prometheus_client_endpoint.py +++ b/tnom/prometheus_client_endpoint.py @@ -10,7 +10,8 @@ - create_prometheus_client: Create and return a FastAPI app with Prometheus metrics. - start_metrics_server: Start the Prometheus metrics server. 
""" - +import asyncio +import logging from pathlib import Path import hypercorn.asyncio @@ -144,7 +145,12 @@ def create_prometheus_client(metrics: PrometheusMetrics) -> FastAPI: FastAPI: A FastAPI application with Prometheus metrics endpoint mounted. """ - app = FastAPI() + app = FastAPI( + docs_url=None, + redoc_url=None, + openapi_url=None, # no need for documentation since API + #is only used to communicate with Prometheus + ) metrics.update_metrics() metrics_app = make_asgi_app() app.mount ("/metrics", metrics_app) @@ -154,6 +160,7 @@ async def start_metrics_server( metrics: PrometheusMetrics, prometheus_host: str, prometheus_port: int, + shutdown_event: asyncio.Event, ) -> None: """Start the Prometheus metrics server. @@ -166,6 +173,11 @@ async def start_metrics_server( metrics (PrometheusMetrics): The metrics object to collect metrics from. prometheus_port (int): The port to listen on. prometheus_host (str): The hostname to listen on. + shutdown_event (asyncio.Event): An event to notify when the server + should shutdown. + + Returns: + None """ config = hypercorn.config.Config() @@ -179,4 +191,24 @@ async def start_metrics_server( config.shutdown_timeout = 10 app = create_prometheus_client(metrics) - await hypercorn.asyncio.serve(app, config, mode="asgi") + async def shutdown_trigger() -> None: + """Wait for the shutdown event to be set. + + This function waits asynchronously for the `shutdown_event` to be set, + indicating that the server should shut down. + + """ + await shutdown_event.wait() + + try: + await hypercorn.asyncio.serve( + app, + config, + mode="asgi", + shutdown_trigger=shutdown_trigger) + except asyncio.CancelledError: + logging.info("Prometheus server task cancelled.") + # Hypercorn's serve function doesn't automatically clean up + # Make sure to release resources if necessary here. 
+ finally: + logging.info("Prometheus server shutting down.") From df199fb981f5c24dc8e094cffd1453ddc80afe06 Mon Sep 17 00:00:00 2001 From: Kamikaza731 Date: Thu, 12 Dec 2024 22:00:01 +0100 Subject: [PATCH 3/3] Small adjustment to readme, alert and config example, updated changelog, small adjustments in main.py for variable names etc... look more into prometheus metrics --- CHANGELOG.md | 16 ++++ README.md | 115 +++++++++++++++++++---------- alert_example.yml | 2 +- config_example.yml | 4 +- tnom/main.py | 10 +-- tnom/prometheus_client_endpoint.py | 6 +- 6 files changed, 105 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4abb7f..701ae14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.5.0] - 2024-12-11 + +### Added + +- Added prometheus client +- Added new argument flags for prometheus + +### Fixed + +- Fixed the issue with task not shutting down gracefully + +### Changed + +- Small changes to dead man switch to adjust to work with async tasks + + ## [0.4.2] - 2024-12-10 ### Fixed diff --git a/README.md b/README.md index 8c99a52..0be66fa 100644 --- a/README.md +++ b/README.md @@ -7,27 +7,35 @@ script can be compiled using Nuitka. ## Table of Contents -- [TNOM - The Nibiru Oracle Monitor](#tnom---the-nibiru-oracle-monitor) - [How does the TNOM work?](#how-does-the-tnom-work) - [Installation](#installation) - [Requirements](#requirements) - [Set up config](#set-up-config) -- [Run the script](#run-the-script) -- [Deploy the script](#deploy-the-script) +- [How to run the script](#how-to-run-the-script) +- [Deployment options](#deployment-options) -# How does the TNOM work? -TNOM is a monitoring tool. It collects data from Nibiru's REST APIs. The data is then -stored in the local SQLite database. It then checks if the data is within the expected -range. 
For example, if a certain amount of unsigned events from oracle happens, or if the -wallet balance is below a certain amount. If the data is not within the expected range, -TNOM alerts the user. Currently, it only supports Pagerduty and Telegram as the -means of sending alerts. It also supports health checks if the script or your server halts. +# How does the TNOM work? -TNOM can be run directly using Python, or you can compile the code using Nuitka. -The compiled version is somewhat faster but the con is that it might take a while -to build it. You can also download the compiled version. +TNOM is a monitoring tool that collects data related to Nibiru's Oracle price feeder. +The data is queried from the Nibiru REST API and stored in a SQLite database. +If some parameters are out of the expected range, TNOM alerts the user, by Pagerduty +or Telegram. +As an addition features it has a dead man switch that can be triggered if the +monitoring tool stops working. +At the moment there is also a prometheus endpoint which can be used to send metrics +from the SQLite database to Grafana or maybe you want to make your own alerting +system. + +It checks for the following parameters: +- wallet balance (alerts if it has less then 1 NIBI and there is also critical alerts +if it has less than 0.1 NIBI) +- number of unsigned events (alerts if there were more than 10 events and another +after 20, there is also an alert if there were consecutive 3 misses) +- number of miss events* (this parameter was not tested yet and I think it is related +when price feeder gives to low/high price) +- if all APIs are available ( if not it will send an alert) # Installation @@ -42,6 +50,24 @@ the compiled version. 
## Requirements +Hardware and Operating System Requirements: + +- CPU Arch: AMD64, ARM 64-bit* (ARM should also work but it is not tested) +- OS: Linux (Tested on Rocky Linux 9, Ubuntu 24.01 and Debian 12, but it should run +on any Linux distro, in theory script should also run on MacOS but it is not tested) + +Minimum Requirements: + +- CPU Core: At least 2 cores (recommended to run script directly or precompiled TNOM) +- RAM: At least 1-2 GB (recommended to run script directly or precompiled TNOM) + +Recommended Requirements: + +- CPU Core: At least 4 cores +- RAM: At least 4 GB + +Software Requirements: + - Python 3.11 or higher - All of the requirements in the requirements.txt file - Poetry (optional) @@ -102,10 +128,10 @@ vim config.yml # or nano config.yml ``` The alert config is a yml file that stores the alert details. It requires to have -Pagerduty or Telegram details. You can also use both options if you desire. Another -feature is a dead man switch. This feature will send an alert if the script -does not run for a while. It is not required, but it is recommended. -Check the alert_example.yml for an example. +Pagerduty or Telegram details. You can also use both options if you desire. +Some additional features are dead man switch and prometheus metrics endpoint. +Dead man switch is a feature that can be triggered if the monitoring tool stops +working. It is not a requirement but it is recommended to set it up. ```bash cp alert_example.yml alert.yml @@ -114,9 +140,10 @@ vim alert.yml # or nano alert.yml For alert to work you will need at least a Telegram bot and chat ID, and for Pagerduty you will need a routing key from Events API v2. The alerts have multiple severity -levels so you can set dynamic notifications. 
For the dead man switch you need to get +link to your health check endpoint. Setting up telegram, pagerduty and health +check is out of the scope of this guide. If there are many interested people, this +section could be expanded a bit on that topic.Some links to get you started: - [PagerDuty](https://developer.pagerduty.com/docs/3d063fd4814a6-events-api-v2-overview) - [Telegram](https://core.telegram.org/bots/tutorial) @@ -143,9 +170,11 @@ You can print help if you feel stuck. poetry run python tnom/main.py -h # or python tnom/main.py --help # or build/tnom -h if you built executable binary usage: main.py [-h] [--working-dir WORKING_DIR] [--config-path CONFIG_PATH] - [--alert-path ALERT_PATH] [--version] + [--alert-path ALERT_PATH] [--version] + [--prometheus-host PROMETHEUS_HOST] + [--prometheus-port PROMETHEUS_PORT] -Monitoring system for price feeds and wallet balances +Monitoring tool for tracking Nibiru Oracle options: -h, --help show this help message and exit @@ -154,27 +183,35 @@ options: Default: current working directory --config-path CONFIG_PATH Path to the config YAML file - Default: /home/user/tnom/config.yml + Default always looks to the current dir: /home/nm/Desktop/tnom/config.yml --alert-path ALERT_PATH Path to the alert YAML file - Default: /home/user/tnom/alert.yml + Default always looks to the current dir: /home/nm/Desktop/tnom/alert.yml --version show program's version number and exit + --prometheus-host PROMETHEUS_HOST + Prometheus host to run on + Overrides host in alert config file + Default: 127.0.0.1 if not specified in config + --prometheus-port PROMETHEUS_PORT + Prometheus port to run on + Overrides port in alert config file + Default: 7130 if not specified in config ``` -Flag working-dir is optional. It always takes as a default value the +Flag `--working-dir` is optional. It always takes as a default value the current directory from where the script is run. 
For example, if you run it while in /home/user/ that will be considered a working directory. It is recommended to use for consistency because it will later generate a database in that directory. You can select any directory but it might make sense to place it inside the project root directory. -The config-path is the path to config.yml file. It is optional but as a default, it -takes as a value /working-dir/config.yml. So you can place it inside of your work dir, -or if you want to keep it separate you can create a directory just for the config and -then when running use the argument --config-path /path/to/config.yml. +The `--config-path` is the path to config.yml file. It is optional but as a default, +it takes as a value /working-dir/config.yml. So you can place it inside of your work +dir, or if you want to keep it separate you can create a directory just for the config and then when running use the argument --config-path /path/to/config.yml. -The alert-path is the path to alert.yml file. It is optional and it functions -identically to the --config-path argument. So you either place it in the working dir or use argument --alert-path /path/to/alert.yml while running the script. +The `--alert-path` is the path to alert.yml file. It is optional and it functions +identically to the `--config-path` argument. So you either place it in the working +dir or use argument `--alert-path` /path/to/alert.yml while running the script. ### Option 1 - Running the script directly @@ -251,10 +288,7 @@ or /usr/local/bin/whichever you prefer. ## Deployment options -Here you can find some basic setup to deploy it on the server and run it 24/7. -A word of advice, whatever program you use you need to have a kill signal in place. -At the moment the script has a problem to stop the script gracefully. Until this is -fixed use something similar to KillSignal=SIGINT. +Here you can find some basic setup to deploy it on the server. 
### Systemd @@ -322,7 +356,7 @@ sudo systemctl start tnom.service From the Dockerfile create the image. ```bash -docker build -t tnom:v0.3.0 . +docker build -t tnom:v0.5.0 . # Create config directories mkdir -p ./config mkdir -p ./chain_database @@ -338,5 +372,10 @@ docker run -d \ -v $(pwd)/config:/app/config \ -v $(pwd)/chain_database:/app/chain_database \ --name tnom \ - tnom:v0.3.0 -``` \ No newline at end of file + tnom:v0.5.0 +``` + +# Prometheus metrics + +Prometheus metrics are available at http://localhost:7130/metrics by default. +Should add soon whole metrics page when some parameters are adjusted. diff --git a/alert_example.yml b/alert_example.yml index a514978..04df6be 100644 --- a/alert_example.yml +++ b/alert_example.yml @@ -22,4 +22,4 @@ health_check_interval: 180 # Prometheus prometheus_host: "127.0.0.1" # or "0.0.0.0" if you need open access -prometheus_port: "7125" # you can set to which ever port is open \ No newline at end of file +prometheus_port: "7130" # you can set to which ever port is open \ No newline at end of file diff --git a/config_example.yml b/config_example.yml index d0d7fd8..4930d81 100644 --- a/config_example.yml +++ b/config_example.yml @@ -1,13 +1,13 @@ # enter your own valoper address validator_address: "nibivaloper1s7dm9d34jehmudn8afhchzf9l46wt4nz7fauzs" # enter your price feed address is you use another address to sign oracle transactions, -# otherwise use your validator address +# otherwise use your validator wallet address price_feed_addr: "nibi1s7dm9d34jehmudn8afhchzf9l46wt4nzhjn8kd" APIs: - - http://localhost:1317 # the script should be able to work with any local network port - https://nibiru-api.cogwheel.zone - https://nibiru-api.polkachu.com - https://rest.lavenderfive.com:443/nibiru - https://nibiru.api.m.stavr.tech - https://lcd-nibiru.imperator.co +# - http://localhost:1317 # the script should be able to work with any local network port monitoring_interval: 60 # default is 60s diff --git a/tnom/main.py 
b/tnom/main.py index a3437bb..eaa331f 100644 --- a/tnom/main.py +++ b/tnom/main.py @@ -17,8 +17,8 @@ ONE_NIBI = 1000000 ZERO_PT_ONE = 100000 CONSECUTIVE_MISSES_THRESHOLD = 3 -TOTAL_MISSES_THRESHOLD = 10 -CRITICAL_MISSES_THRESHOLD = 20 +P2_UNSIGNED_EV_THR = 10 +P1_UNSIGNED_EV_THR = 20 API_CONS_MISS_THRESHOLD = 3 class MonitoringSystem: @@ -240,7 +240,7 @@ async def process_signing_alerts( self.alert_sent["consecutive"] = True # Check total misses - if total_misses >= TOTAL_MISSES_THRESHOLD and not self.alert_sent["total"]: + if total_misses >= P2_UNSIGNED_EV_THR and not self.alert_sent["total"]: alerts_to_send.append({ "details": { "total_misses": total_misses, @@ -253,7 +253,7 @@ async def process_signing_alerts( self.alert_sent["total"] = True # Check critical threshold - if (total_misses >= CRITICAL_MISSES_THRESHOLD + if (total_misses >= P1_UNSIGNED_EV_THR and not self.alert_sent["critical"]): alerts_to_send.append({ "details": { @@ -382,7 +382,7 @@ async def process_api_not_working( def setup_argument_parser() -> argparse.ArgumentParser: """Set up and return the argument parser with all arguments configured.""" parser = argparse.ArgumentParser( - description="Monitoring system for price feeds and wallet balances", + description="Monitoring tool for tracking Nibiru oracle price feeder.", formatter_class=argparse.RawTextHelpFormatter, ) diff --git a/tnom/prometheus_client_endpoint.py b/tnom/prometheus_client_endpoint.py index 59ae06c..b561986 100644 --- a/tnom/prometheus_client_endpoint.py +++ b/tnom/prometheus_client_endpoint.py @@ -39,8 +39,10 @@ def __init__(self, db_path: Path, epoch: int) -> None: """ self.db_path = db_path self.epoch = epoch - - namespace = "nibiru_oracle" + # maybe set better namespace? + # TO DO adjust metrics + # Keep or discard script/program python data? + namespace = "tnom" self.slash_epoch = Gauge( f"{namespace}_slash_epoch",