From 6944c9de2be959b4a373adb72b46ff8aa5465ba1 Mon Sep 17 00:00:00 2001 From: Ayaz Abbas Date: Fri, 17 May 2024 15:32:33 +0100 Subject: [PATCH 1/6] reduce noise of zenduty alerts --- pyth_observer/dispatch.py | 43 ++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/pyth_observer/dispatch.py b/pyth_observer/dispatch.py index 8e7926a..bdbab16 100644 --- a/pyth_observer/dispatch.py +++ b/pyth_observer/dispatch.py @@ -14,8 +14,7 @@ from pyth_observer.event import DatadogEvent # Used dynamically from pyth_observer.event import LogEvent # Used dynamically from pyth_observer.event import TelegramEvent # Used dynamically -from pyth_observer.event import ZendutyEvent # Used dynamically -from pyth_observer.event import Event +from pyth_observer.event import Context, Event, ZendutyEvent from pyth_observer.zenduty import send_zenduty_alert assert DatadogEvent @@ -46,6 +45,9 @@ def __init__(self, config, publishers): if "ZendutyEvent" in self.config["events"]: self.open_alerts_file = os.environ["OPEN_ALERTS_FILE"] self.open_alerts = self.load_alerts() + # below is used to store events to later send if mutilple failures occur + # events cannot be stored in open_alerts as they are not JSON serializable. + self.zenduty_events = {} def load_alerts(self): try: @@ -68,17 +70,14 @@ async def run(self, states: List[State]): # Then, wrap each failed check in events and send them sent_events: List[Awaitable] = [] - context = { - "network": self.config["network"]["name"], - "publishers": self.publishers, - } + context = Context( + network=self.config["network"]["name"], publishers=self.publishers + ) for check in failed_checks: for event_type in self.config["events"]: event: Event = globals()[event_type](check, context) - sent_events.append(event.send()) - if event_type == "ZendutyEvent": # Add failed check to open alerts alert_identifier = ( @@ -87,25 +86,41 @@ async def run(self, states: List[State]): state = check.state() if isinstance(state, PublisherState): alert_identifier += f"-{state.publisher_name}" - self.open_alerts[alert_identifier] = datetime.now().isoformat() + try: + failures = self.open_alerts[alert_identifier]["failures"] + 1 + except KeyError: + failures = 1 + self.open_alerts[alert_identifier] = { + "last_failure": datetime.now().isoformat(), + "failures": failures, + } + # store the event to send it later if it fails multiple times + self.zenduty_events[alert_identifier] = event + continue # do not immediately send a zenduty alert + + sent_events.append(event.send()) await asyncio.gather(*sent_events) - # Check open alerts and resolve those that are older than 2 minutes + # Check open alerts for zenduty if "ZendutyEvent" in self.config["events"]: to_remove = [] current_time = datetime.now() - for identifier, last_failure in self.open_alerts.items(): - if current_time - datetime.fromisoformat(last_failure) >= timedelta( - minutes=2 - ): + for identifier, info in self.open_alerts.items(): + # Resolve the alert if it last failed > 5 minutes ago + if current_time - datetime.fromisoformat( + info["last_failure"] + ) >= timedelta(minutes=5): logger.debug(f"Resolving Zenduty alert {identifier}") response = await send_zenduty_alert( alert_identifier=identifier, message=identifier, resolved=True ) if response and 200 <= response.status < 300: to_remove.append(identifier) + elif info["failures"] > 2: + # Raise alert if the check has failed more than once without self-resolving + await self.zenduty_events[identifier].send() for identifier in to_remove: del self.open_alerts[identifier] From 420ce5d7cd40c0d8ae43a20390476dbcfbce8c20 Mon Sep 17 00:00:00 2001 From: Ayaz Abbas Date: Fri, 17 May 2024 15:36:27 +0100 Subject: [PATCH 2/6] keep resolution threshold at 2 minutes --- pyth_observer/dispatch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyth_observer/dispatch.py b/pyth_observer/dispatch.py index bdbab16..c047cb9 100644 --- a/pyth_observer/dispatch.py +++ b/pyth_observer/dispatch.py @@ -108,10 +108,10 @@ async def run(self, states: List[State]): to_remove = [] current_time = datetime.now() for identifier, info in self.open_alerts.items(): - # Resolve the alert if it last failed > 5 minutes ago + # Resolve the alert if it last failed > 2 minutes ago if current_time - datetime.fromisoformat( info["last_failure"] - ) >= timedelta(minutes=5): + ) >= timedelta(minutes=2): logger.debug(f"Resolving Zenduty alert {identifier}") response = await send_zenduty_alert( alert_identifier=identifier, message=identifier, resolved=True From 65748dccd74fb0deaabb2709d8a2b25a102a45a6 Mon Sep 17 00:00:00 2001 From: Ayaz Abbas Date: Fri, 17 May 2024 15:42:01 +0100 Subject: [PATCH 3/6] fix zenduty ratelimited log --- pyth_observer/zenduty.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyth_observer/zenduty.py b/pyth_observer/zenduty.py index 83e3154..08fc8c3 100644 --- a/pyth_observer/zenduty.py +++ b/pyth_observer/zenduty.py @@ -33,11 +33,12 @@ async def send_zenduty_alert(alert_identifier, message, resolved=False, summary= elif response.status == 429: retries += 1 if retries < max_retries: + sleeptime = min(30, 2**retries) logger.error( - f"Received 429 Too Many Requests for {alert_identifier}. Retrying in 1 second..." + f"Received 429 Too Many Requests for {alert_identifier}. Retrying in {sleeptime} s..." ) await asyncio.sleep( - min(30, 2**retries) + sleeptime ) # Backoff before retrying, wait upto 30s else: logger.error( From 45714cd0de57d50bc0734325fc4c2f31329d2aef Mon Sep 17 00:00:00 2001 From: Ayaz Abbas Date: Fri, 17 May 2024 15:43:45 +0100 Subject: [PATCH 4/6] ensure event is dropped from memory when resolved --- pyth_observer/dispatch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyth_observer/dispatch.py b/pyth_observer/dispatch.py index c047cb9..9594df5 100644 --- a/pyth_observer/dispatch.py +++ b/pyth_observer/dispatch.py @@ -124,6 +124,7 @@ async def run(self, states: List[State]): for identifier in to_remove: del self.open_alerts[identifier] + del self.zenduty_events[identifier] # Write open alerts to file to ensure persistence with open(self.open_alerts_file, "w") as file: From 7ed7a7f583cc0d6c363fa4dc1905897649a75ad2 Mon Sep 17 00:00:00 2001 From: Ayaz Abbas Date: Fri, 17 May 2024 15:49:03 +0100 Subject: [PATCH 5/6] update comment --- pyth_observer/dispatch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyth_observer/dispatch.py b/pyth_observer/dispatch.py index 9594df5..59554b4 100644 --- a/pyth_observer/dispatch.py +++ b/pyth_observer/dispatch.py @@ -119,7 +119,7 @@ async def run(self, states: List[State]): if response and 200 <= response.status < 300: to_remove.append(identifier) elif info["failures"] > 2: - # Raise alert if the check has failed more than once without self-resolving + # Raise alert if the check has failed more than twice before self-resolving await self.zenduty_events[identifier].send() for identifier in to_remove: From 898538aeaedba10a33475d5a06a95db0090f77de Mon Sep 17 00:00:00 2001 From: Ayaz Abbas Date: Fri, 17 May 2024 15:51:08 +0100 Subject: [PATCH 6/6] bump version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c988e1c..adc4381 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ ignore_missing_imports = true [tool.poetry] name = "pyth-observer" -version = "0.2.6" +version = "0.2.7" description = "Alerts and stuff" authors = [] readme = "README.md"