Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add zenduty integration #67

Merged
merged 9 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,11 @@ cover: ve
lint: lint.python
#lint: lint.yaml - argh, RHEL is too old to do this by default

lint.python: ve
. ve/bin/activate; flake8 observer.py pyth_observer/
lint.python:
poetry run isort pyth_observer/
poetry run black pyth_observer/
poetry run pyright pyth_observer/
poetry run pyflakes pyth_observer/

lint.yaml:
yamllint .
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ ignore_missing_imports = true

[tool.poetry]
name = "pyth-observer"
version = "0.2.5"
version = "0.2.6"
description = "Alerts and stuff"
authors = []
readme = "README.md"
Expand Down
50 changes: 50 additions & 0 deletions pyth_observer/dispatch.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import asyncio
import json
import os
from copy import deepcopy
from datetime import datetime, timedelta
from typing import Any, Awaitable, Dict, List

from loguru import logger
from prometheus_client import Gauge

from pyth_observer.check import Check, State
Expand All @@ -10,11 +14,14 @@
from pyth_observer.event import DatadogEvent # Used dynamically
from pyth_observer.event import LogEvent # Used dynamically
from pyth_observer.event import TelegramEvent # Used dynamically
from pyth_observer.event import ZendutyEvent # Used dynamically
from pyth_observer.event import Event
from pyth_observer.zenduty import send_zenduty_alert

assert DatadogEvent
assert LogEvent
assert TelegramEvent
assert ZendutyEvent


class Dispatch:
Expand All @@ -36,6 +43,16 @@ def __init__(self, config, publishers):
"Publisher check failure status",
["check", "symbol", "publisher"],
)
if "ZendutyEvent" in self.config["events"]:
self.open_alerts_file = os.environ["OPEN_ALERTS_FILE"]
self.open_alerts = self.load_alerts()

def load_alerts(self):
try:
with open(self.open_alerts_file, "r") as file:
return json.load(file)
except FileNotFoundError:
return {} # Return an empty dict if the file doesn't exist

async def run(self, states: List[State]):
# First, run each check and store the ones that failed
Expand All @@ -62,8 +79,41 @@ async def run(self, states: List[State]):

sent_events.append(event.send())

if event_type == "ZendutyEvent":
# Add failed check to open alerts
alert_identifier = (
f"{check.__class__.__name__}-{check.state().symbol}"
)
state = check.state()
if isinstance(state, PublisherState):
alert_identifier += f"-{state.publisher_name}"
self.open_alerts[alert_identifier] = datetime.now().isoformat()

await asyncio.gather(*sent_events)

# Check open alerts and resolve those that are older than 2 minutes
if "ZendutyEvent" in self.config["events"]:

to_remove = []
current_time = datetime.now()
for identifier, last_failure in self.open_alerts.items():
if current_time - datetime.fromisoformat(last_failure) >= timedelta(
minutes=2
):
logger.debug(f"Resolving Zenduty alert {identifier}")
response = await send_zenduty_alert(
alert_identifier=identifier, message=identifier, resolved=True
)
if response and 200 <= response.status < 300:
to_remove.append(identifier)

for identifier in to_remove:
del self.open_alerts[identifier]

# Write open alerts to file to ensure persistence
with open(self.open_alerts_file, "w") as file:
json.dump(self.open_alerts, file)

def check_price_feed(self, state: PriceFeedState) -> List[Check]:
failed_checks: List[Check] = []

Expand Down
27 changes: 26 additions & 1 deletion pyth_observer/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,9 @@
from loguru import logger

from pyth_observer.check import Check
from pyth_observer.check.publisher import PublisherCheck
from pyth_observer.check.publisher import PublisherCheck, PublisherState
from pyth_observer.models import Publisher
from pyth_observer.zenduty import send_zenduty_alert

load_dotenv()

Expand Down Expand Up @@ -151,3 +152,27 @@ async def send(self):
logger.error(
f"Failed to send Telegram message: {response_text}"
)


class ZendutyEvent(Event):
def __init__(self, check: Check, context: Context):
self.check = check
self.context = context

async def send(self):
event_details = self.check.error_message()
summary = ""
for key, value in event_details.items():
summary += f"{key}: {value}\n"

alert_identifier = (
f"{self.check.__class__.__name__}-{self.check.state().symbol}"
)
state = self.check.state()
if isinstance(state, PublisherState):
alert_identifier += f"-{state.publisher_name}"
ali-bahjati marked this conversation as resolved.
Show resolved Hide resolved

logger.debug(f"Sending Zenduty alert for {alert_identifier}")
await send_zenduty_alert(
alert_identifier=alert_identifier, message=alert_identifier, summary=summary
)
52 changes: 52 additions & 0 deletions pyth_observer/zenduty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import asyncio
import hashlib
import os

import aiohttp
from loguru import logger

headers = {"Content-Type": "application/json"}


async def send_zenduty_alert(alert_identifier, message, resolved=False, summary=""):
url = f"https://www.zenduty.com/api/events/{os.environ['ZENDUTY_INTEGRATION_KEY']}/"
# Use a hash of the alert_identifier as a unique id for the alert.
# Take the first 32 characters due to length limit of the api.
entity_id = hashlib.sha256(alert_identifier.encode("utf-8")).hexdigest()[:32]

alert_type = "resolved" if resolved else "critical"

data = {
"alert_type": alert_type,
"message": message,
"summary": summary,
"entity_id": entity_id,
}

async with aiohttp.ClientSession() as session:
max_retries = 30
retries = 0
while retries < max_retries:
async with session.post(url, json=data, headers=headers) as response:
if 200 <= response.status < 300:
return response # Success case, return response
elif response.status == 429:
retries += 1
if retries < max_retries:
logger.error(
f"Received 429 Too Many Requests for {alert_identifier}. Retrying in 1 second..."
)
await asyncio.sleep(
min(30, 2 ^ retries)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this does the xor, should be 2**retries

) # Backoff before retrying, wait upto 30s
else:
logger.error(
f"Failed to send Zenduty event message for {alert_identifier} after {max_retries} retries."
)
return response # Return response after max retries
else:
response_text = await response.text()
logger.error(
f"{response.status} Failed to send Zenduty event message for {alert_identifier}: {response_text}"
)
return response # Non-retryable failure
Loading