Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reduce noise of zenduty alerts #70

Merged
merged 6 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 30 additions & 14 deletions pyth_observer/dispatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@
from pyth_observer.event import DatadogEvent # Used dynamically
from pyth_observer.event import LogEvent # Used dynamically
from pyth_observer.event import TelegramEvent # Used dynamically
from pyth_observer.event import ZendutyEvent # Used dynamically
from pyth_observer.event import Event
from pyth_observer.event import Context, Event, ZendutyEvent
from pyth_observer.zenduty import send_zenduty_alert

assert DatadogEvent
Expand Down Expand Up @@ -46,6 +45,9 @@ def __init__(self, config, publishers):
if "ZendutyEvent" in self.config["events"]:
self.open_alerts_file = os.environ["OPEN_ALERTS_FILE"]
self.open_alerts = self.load_alerts()
# The dict below stores events so they can be sent later if multiple failures occur.
# Events cannot be stored in open_alerts because they are not JSON serializable.
self.zenduty_events = {}

def load_alerts(self):
try:
Expand All @@ -68,17 +70,14 @@ async def run(self, states: List[State]):

# Then, wrap each failed check in events and send them
sent_events: List[Awaitable] = []
context = {
"network": self.config["network"]["name"],
"publishers": self.publishers,
}
context = Context(
network=self.config["network"]["name"], publishers=self.publishers
)

for check in failed_checks:
for event_type in self.config["events"]:
event: Event = globals()[event_type](check, context)

sent_events.append(event.send())

if event_type == "ZendutyEvent":
# Add failed check to open alerts
alert_identifier = (
Expand All @@ -87,28 +86,45 @@ async def run(self, states: List[State]):
state = check.state()
if isinstance(state, PublisherState):
alert_identifier += f"-{state.publisher_name}"
self.open_alerts[alert_identifier] = datetime.now().isoformat()
try:
failures = self.open_alerts[alert_identifier]["failures"] + 1
except KeyError:
failures = 1
self.open_alerts[alert_identifier] = {
"last_failure": datetime.now().isoformat(),
"failures": failures,
}
# store the event to send it later if it fails multiple times
self.zenduty_events[alert_identifier] = event
continue # do not immediately send a zenduty alert

sent_events.append(event.send())

await asyncio.gather(*sent_events)

# Check open alerts and resolve those that are older than 2 minutes
# Check open alerts for zenduty
if "ZendutyEvent" in self.config["events"]:

to_remove = []
current_time = datetime.now()
for identifier, last_failure in self.open_alerts.items():
if current_time - datetime.fromisoformat(last_failure) >= timedelta(
minutes=2
):
for identifier, info in self.open_alerts.items():
# Resolve the alert if it last failed > 2 minutes ago
if current_time - datetime.fromisoformat(
info["last_failure"]
) >= timedelta(minutes=2):
logger.debug(f"Resolving Zenduty alert {identifier}")
response = await send_zenduty_alert(
alert_identifier=identifier, message=identifier, resolved=True
)
if response and 200 <= response.status < 300:
to_remove.append(identifier)
elif info["failures"] > 2:
# Raise alert if the check has failed more than twice before self-resolving
await self.zenduty_events[identifier].send()

for identifier in to_remove:
del self.open_alerts[identifier]
del self.zenduty_events[identifier]

# Write open alerts to file to ensure persistence
with open(self.open_alerts_file, "w") as file:
Expand Down
5 changes: 3 additions & 2 deletions pyth_observer/zenduty.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,12 @@ async def send_zenduty_alert(alert_identifier, message, resolved=False, summary=
elif response.status == 429:
retries += 1
if retries < max_retries:
sleeptime = min(30, 2**retries)
logger.error(
f"Received 429 Too Many Requests for {alert_identifier}. Retrying in 1 second..."
f"Received 429 Too Many Requests for {alert_identifier}. Retrying in {sleeptime} s..."
)
await asyncio.sleep(
min(30, 2**retries)
sleeptime
) # Backoff before retrying, wait upto 30s
else:
logger.error(
Expand Down
Loading