From d57a39380c6a7f59977caaf90658ade87148d281 Mon Sep 17 00:00:00 2001 From: ayazabbas <30928485+ayazabbas@users.noreply.github.com> Date: Wed, 12 Jun 2024 18:29:51 +0100 Subject: [PATCH] telegram event noise reduction (#77) * apply the same notification rules on zd events to tg events * update sample config and version bump * linting --- README.md | 10 +++---- pyproject.toml | 2 +- pyth_observer/dispatch.py | 24 ++++++++++------- sample.config.yaml | 57 ++++++++++++++++++++++++++++----------- 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 185543b..dc01a66 100644 --- a/README.md +++ b/README.md @@ -33,18 +33,18 @@ Event types are configured via environment variables: - `TelegramEvent` - `TELEGRAM_BOT_TOKEN` - API token for the Telegram bot + - `OPEN_ALERTS_FILE` - Path to local file used for persisting open alerts - `ZendutyEvent` - `ZENDUTY_INTEGRATION_KEY` - Integration key for Zenduty service API integration - `OPEN_ALERTS_FILE` - Path to local file used for persisting open alerts -### Zenduty Alert Thresholds -- Zenduty alert will fire if a check fails 5 or more times within 5 minutes. -- The alert will be resolved if the check failed < 4 times within 5 minutes. +### Alert Thresholds +- Alert thresholds apply to ZendutyEvent and TelegramEvent (resolution only applies to zenduty) - Checks run approximately once per minute. 
- These thresholds can be overridden per check type in config.yaml - - `zenduty_alert_threshold`: number of failures in 5 minutes >= to this value trigger an alert (default: 5) - - `zenduty_resolution_threshold`: number of failures in 5 minutes <= this value resolve the alert (default: 3) + - `alert_threshold`: number of failures in 5 minutes >= this value triggers an alert (default: 5) + - `resolution_threshold`: number of failures in 5 minutes <= this value resolves the alert (default: 3) ## Finding the Telegram Group Chat ID diff --git a/pyproject.toml b/pyproject.toml index 878fc92..bffa397 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ ignore_missing_imports = true [tool.poetry] name = "pyth-observer" -version = "0.2.12" +version = "0.2.13" description = "Alerts and stuff" authors = [] readme = "README.md" diff --git a/pyth_observer/dispatch.py b/pyth_observer/dispatch.py index 886e453..6338bd4 100644 --- a/pyth_observer/dispatch.py +++ b/pyth_observer/dispatch.py @@ -47,7 +47,7 @@ def __init__(self, config, publishers): self.open_alerts = self.load_alerts() # below is used to store events to later send if mutilple failures occur # events cannot be stored in open_alerts as they are not JSON serializable. 
- self.zenduty_events = {} + self.delayed_events = {} def load_alerts(self): try: @@ -79,7 +79,7 @@ async def run(self, states: List[State]): for event_type in self.config["events"]: event: Event = globals()[event_type](check, context) - if event_type == "ZendutyEvent": + if event_type in ["ZendutyEvent", "TelegramEvent"]: alert_identifier = self.generate_alert_identifier(check) alert = self.open_alerts.get(alert_identifier) if alert is None: @@ -89,11 +89,12 @@ async def run(self, states: List[State]): "failures": 1, "last_window_failures": None, "sent": False, + "event_type": event_type, } else: alert["failures"] += 1 - self.zenduty_events[alert_identifier] = event - continue # Skip sending immediately for ZendutyEvent + self.delayed_events[alert_identifier] = event + continue # Skip sending immediately for ZendutyEvent or TelegramEvent sent_events.append(event.send()) @@ -177,8 +178,8 @@ async def process_zenduty_events(self, current_time): for identifier, info in self.open_alerts.items(): self.check_zd_alert_status(identifier, current_time) check_config = self.config["checks"]["global"][info["type"]] - alert_threshold = check_config.get("zenduty_alert_threshold", 5) - resolution_threshold = check_config.get("zenduty_resolution_threshold", 3) + alert_threshold = check_config.get("alert_threshold", 5) + resolution_threshold = check_config.get("resolution_threshold", 3) # Resolve the alert if raised and failed < $threshold times in the last 5m window resolved = False if ( @@ -187,7 +188,10 @@ async def process_zenduty_events(self, current_time): ): logger.debug(f"Resolving Zenduty alert {identifier}") resolved = True - if info["sent"]: + if ( + info["sent"] + and info.get("event_type", "ZendutyEvent") == "ZendutyEvent" + ): response = await send_zenduty_alert( identifier, identifier, resolved=True ) @@ -208,7 +212,7 @@ async def process_zenduty_events(self, current_time): logger.debug(f"Raising Zenduty alert {identifier}") self.open_alerts[identifier]["sent"] = 
True self.open_alerts[identifier]["last_alert"] = current_time.isoformat() - event = self.zenduty_events.get(identifier) + event = self.delayed_events.get(identifier) if event: to_alert.append(event.send()) @@ -216,8 +220,8 @@ async def process_zenduty_events(self, current_time): for identifier in to_remove: if self.open_alerts.get(identifier): del self.open_alerts[identifier] - if self.zenduty_events.get(identifier): - del self.zenduty_events[identifier] + if self.delayed_events.get(identifier): + del self.delayed_events[identifier] with open(self.open_alerts_file, "w") as file: json.dump(self.open_alerts, file) diff --git a/sample.config.yaml b/sample.config.yaml index 941ea8a..fba7ecf 100644 --- a/sample.config.yaml +++ b/sample.config.yaml @@ -1,26 +1,29 @@ network: name: "pythnet" - http_endpoint: "https://pythnet.rpcpool.com" - ws_endpoint: "wss://pythnet.rpcpool.com" + http_endpoint: "https://api2.pythnet.pyth.network" + ws_endpoint: "wss://api2.pythnet.pyth.network" first_mapping: "AHtgzX45WTKfkPG53L6WYhGEXwQkN1BVknET3sVsLL8J" crosschain_endpoint: "https://hermes.pyth.network" request_rate_limit: 10 request_rate_period: 1 events: - # NOTE: Uncomment to enable Datadog metrics, see README.md for datadog credential docs. 
- # - DatadogEvent - LogEvent + # - DatadogEvent # - TelegramEvent - ZendutyEvent +# Alert thresholds apply to Zenduty and Telegram events (resolution only applies to Zenduty) +# - Checks run approximately once per minute +# - `alert_threshold`: number of failures within 5 minutes >= this value triggers an alert (default: 5) +# - `resolution_threshold`: number of failures within 5 minutes <= this value resolves the alert (default: 3) checks: global: # Price feed checks PriceFeedOfflineCheck: enable: true - max_slot_distance: 25 + max_slot_distance: 120 abandoned_slot_distance: 100000 - zenduty_alert_threshold: 3 - zenduty_resolution_threshold: 0 + alert_threshold: 3 + resolution_threshold: 0 PriceFeedCoinGeckoCheck: enable: true max_deviation: 5 @@ -46,24 +49,48 @@ checks: PublisherPriceCheck: enable: true max_slot_distance: 25 - max_aggregate_distance: 6 - zenduty_alert_threshold: 5 - zenduty_resolution_threshold: 2 + max_aggregate_distance: 5 + alert_threshold: 2 + resolution_threshold: 1 PublisherStalledCheck: enable: false stall_time_limit: 30 - abandoned_time_limit: 600 + abandoned_time_limit: 300 max_slot_distance: 25 - zenduty_alert_threshold: 1 - zenduty_resolution_threshold: 0 + alert_threshold: 1 + resolution_threshold: 0 # Per-symbol config + Crypto.ANC/USD: + PublisherPriceCheck: + enable: true + max_slot_distance: 25 + max_aggregate_distance: 50 + Crypto.MIR/USD: + PublisherPriceCheck: + enable: true + max_slot_distance: 25 + max_aggregate_distance: 25 Crypto.MNGO/USD: PriceFeedOfflineCheck: - max_slot_distance: 10000 + max_slot_distance: 100000 + Crypto.SLND/USD: + PriceFeedOfflineCheck: + max_slot_distance: 100000 + Crypto.SNY/USD: + PriceFeedOfflineCheck: + max_slot_distance: 100000 + Crypto.PORT/USD: + PriceFeedOfflineCheck: + max_slot_distance: 100000 FX.USD/HKD: PriceFeedOfflineCheck: max_slot_distance: 10000 + Crypto.ZBC/USD: + PublisherPriceCheck: + max_aggregate_distance: 30 Crypto.BTC/USD: PublisherStalledCheck: enable: true - stall_time_limit: 60 + stall_time_limit: 300 # This 
will override the global stall_time_limit for Crypto.BTC/USD + abandoned_time_limit: 600 # This will override the global abandoned_time_limit for Crypto.BTC/USD + max_slot_distance: 25