diff --git a/README.md b/README.md index c60b278..185543b 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Observe Pyth on-chain price feeds and run sanity checks on the data. ## Usage -Container images are available at https://gallery.ecr.aws/pyth-network/observer. +Container images are available at https://github.com/pyth-network/pyth-observer/pkgs/container/pyth-observer To run Observer locally, make sure you have a recent version of [Poetry](https://python-poetry.org) installed and run: @@ -38,6 +38,14 @@ Event types are configured via environment variables: - `ZENDUTY_INTEGRATION_KEY` - Integration key for Zenduty service API integration - `OPEN_ALERTS_FILE` - Path to local file used for persisting open alerts +### Zenduty Alert Thresholds +- A Zenduty alert fires if a check fails 5 or more times within 5 minutes. +- The alert is resolved once the check has failed 3 or fewer times within 5 minutes. +- Checks run approximately once per minute. +- These thresholds can be overridden per check type in `config.yaml`: + - `zenduty_alert_threshold`: a failure count in 5 minutes greater than or equal to this value triggers an alert (default: 5) + - `zenduty_resolution_threshold`: a failure count in 5 minutes less than or equal to this value resolves the alert (default: 3) + ## Finding the Telegram Group Chat ID To integrate Telegram events with the Observer, you need the Telegram group chat ID. 
Here's how you can find it: diff --git a/pyproject.toml b/pyproject.toml index 856fa8a..d4637ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ ignore_missing_imports = true [tool.poetry] name = "pyth-observer" -version = "0.2.10" +version = "0.2.11" description = "Alerts and stuff" authors = [] readme = "README.md" diff --git a/pyth_observer/dispatch.py b/pyth_observer/dispatch.py index 9b93933..681e497 100644 --- a/pyth_observer/dispatch.py +++ b/pyth_observer/dispatch.py @@ -84,6 +84,7 @@ async def run(self, states: List[State]): alert = self.open_alerts.get(alert_identifier) if alert is None: self.open_alerts[alert_identifier] = { + "type": check.__class__.__name__, "window_start": current_time.isoformat(), "failures": 1, "last_window_failures": None, @@ -175,21 +176,31 @@ async def process_zenduty_events(self, current_time): for identifier, info in self.open_alerts.items(): self.check_zd_alert_status(identifier, current_time) - # Resolve the alert if raised and failed < 5 times in the last 5m window + check_config = self.config["checks"]["global"][info["type"]] + alert_threshold = check_config.get("zenduty_alert_threshold", 5) + resolution_threshold = check_config.get("zenduty_resolution_threshold", 3) + # Resolve the alert if raised and failed < $threshold times in the last 5m window + resolved = False if ( - info["sent"] - and info["last_window_failures"] is not None - and info["last_window_failures"] < 5 + info["last_window_failures"] is not None + and info["last_window_failures"] <= resolution_threshold ): logger.debug(f"Resolving Zenduty alert {identifier}") - response = await send_zenduty_alert( - identifier, identifier, resolved=True - ) - if response and 200 <= response.status < 300: + resolved = True + if info["sent"]: + response = await send_zenduty_alert( + identifier, identifier, resolved=True + ) + if response and 200 <= response.status < 300: + to_remove.append(identifier) + else: to_remove.append(identifier) - # Raise alert if 
failed > 5 times within the last 5m window - # re-alert every 5 minutes - elif info["failures"] >= 5 and ( + # Raise alert if failed > $threshold times within the last 5m window + # or if already alerted and not yet resolved. + # Re-alert every 5 minutes but not more often. + elif ( + info["failures"] >= alert_threshold or (info["sent"] and not resolved) + ) and ( not info.get("last_alert") or current_time - datetime.fromisoformat(info["last_alert"]) > timedelta(minutes=5) diff --git a/sample.config.yaml b/sample.config.yaml index d960629..941ea8a 100644 --- a/sample.config.yaml +++ b/sample.config.yaml @@ -11,6 +11,7 @@ events: # - DatadogEvent - LogEvent # - TelegramEvent + - ZendutyEvent checks: global: # Price feed checks @@ -18,6 +19,8 @@ checks: enable: true max_slot_distance: 25 abandoned_slot_distance: 100000 + zenduty_alert_threshold: 3 + zenduty_resolution_threshold: 0 PriceFeedCoinGeckoCheck: enable: true max_deviation: 5 @@ -44,11 +47,15 @@ checks: enable: true max_slot_distance: 25 max_aggregate_distance: 6 + zenduty_alert_threshold: 5 + zenduty_resolution_threshold: 2 PublisherStalledCheck: enable: false stall_time_limit: 30 abandoned_time_limit: 600 max_slot_distance: 25 + zenduty_alert_threshold: 1 + zenduty_resolution_threshold: 0 # Per-symbol config Crypto.MNGO/USD: PriceFeedOfflineCheck: