diff --git a/docker/Dockerfile.api b/docker/Dockerfile.api index 291210d25..9e000384e 100644 --- a/docker/Dockerfile.api +++ b/docker/Dockerfile.api @@ -35,4 +35,7 @@ RUN chgrp -R 0 /app && chmod -R g=u /app RUN chown -R keep:keep /app RUN chown -R keep:keep /venv USER keep -ENTRYPOINT ["gunicorn", "keep.api.api:get_app", "--bind" , "0.0.0.0:8080" , "--workers", "4" , "-k" , "uvicorn.workers.UvicornWorker", "-c", "/venv/lib/python3.11/site-packages/keep/api/config.py"] + +ENTRYPOINT ["/venv/lib/python3.11/site-packages/keep/entrypoint.sh"] + +CMD ["gunicorn", "keep.api.api:get_app", "--bind" , "0.0.0.0:8080" , "--workers", "4" , "-k" , "uvicorn.workers.UvicornWorker", "-c", "/venv/lib/python3.11/site-packages/keep/api/config.py"] diff --git a/docker/Dockerfile.dev.api b/docker/Dockerfile.dev.api index 5b3f12b70..d4f4e3a7b 100644 --- a/docker/Dockerfile.dev.api +++ b/docker/Dockerfile.dev.api @@ -21,6 +21,8 @@ RUN . /venv/bin/activate && poetry install --no-root ENV PYTHONPATH="/app:${PYTHONPATH}" ENV PATH="/venv/bin:${PATH}" ENV VIRTUAL_ENV="/venv" +ENV POSTHOG_DISABLED="true" +ENTRYPOINT ["/app/keep/entrypoint.sh"] CMD ["gunicorn", "keep.api.api:get_app", "--bind" , "0.0.0.0:8080" , "--workers", "1" , "-k" , "uvicorn.workers.UvicornWorker", "-c", "./keep/api/config.py", "--reload"] diff --git a/docs/deployment/configuration.mdx b/docs/deployment/configuration.mdx index eaa04d439..ddf404912 100644 --- a/docs/deployment/configuration.mdx +++ b/docs/deployment/configuration.mdx @@ -25,6 +25,7 @@ General configuration variables control the core behavior of the Keep server. Th | **KEEP_API_URL** | Specifies the Keep API URL | No | Constructed from HOST and PORT | Valid URL | | **KEEP_STORE_RAW_ALERTS** | Enables storing of raw alerts | No | "false" | "true" or "false" | | **TENANT_CONFIGURATION_RELOAD_TIME** | Time in minutes to reload tenant configurations | No | 5 | Positive integer | +| **KEEP_LIVE_DEMO_MODE** | Keep will simulate incoming alerts and other activity | No | "false" | "true" or "false" | ### Logging and Environment diff --git a/keep-ui/auth.ts b/keep-ui/auth.ts index eab94b875..159de7439 100644 --- a/keep-ui/auth.ts +++ b/keep-ui/auth.ts @@ -197,12 +197,20 @@ const config = { } return true; }, - jwt: async ({ token, user, account }): Promise => { + jwt: async ({ token, user, account, profile }): Promise => { if (account && user) { let accessToken: string | undefined; + let tenantId: string | undefined = user.tenantId; + let role: string | undefined = user.role; // Ensure we always have an accessToken if (authType == AuthType.AUTH0) { accessToken = account.id_token; + if ((profile as any)?.keep_tenant_id) { + tenantId = (profile as any).keep_tenant_id; + } + if ((profile as any)?.keep_role) { + role = (profile as any).keep_role; + } } else { accessToken = user.accessToken || account.access_token || account.id_token; @@ -212,8 +220,8 @@ const config = { } token.accessToken = accessToken; - token.tenantId = user.tenantId; - token.role = user.role; + token.tenantId = tenantId; + token.role = role; if (authType === AuthType.KEYCLOAK) { token.refreshToken = account.refresh_token; diff --git a/keep/api/config.py b/keep/api/config.py index c420860d0..54387cf86 100644 --- a/keep/api/config.py +++ b/keep/api/config.py @@ -4,7 +4,6 @@ import keep.api.logging from keep.api.api import AUTH_TYPE from keep.api.core.db_on_start import migrate_db, try_create_single_tenant -from keep.api.core.report_uptime import launch_uptime_reporting from keep.api.core.dependencies import SINGLE_TENANT_UUID 
from keep.identitymanager.identitymanagerfactory import IdentityManagerTypes @@ -19,7 +18,6 @@ def on_starting(server=None): logger.info("Keep server starting") migrate_db() - launch_uptime_reporting() # Create single tenant if it doesn't exist if AUTH_TYPE in [ @@ -54,4 +52,5 @@ def on_starting(server=None): public_url = ngrok_connection.public_url logger.info(f"ngrok tunnel: {public_url}") os.environ["KEEP_API_URL"] = public_url + logger.info("Keep server started") diff --git a/keep/api/core/db.py b/keep/api/core/db.py index 2032f1cbb..f374117ee 100644 --- a/keep/api/core/db.py +++ b/keep/api/core/db.py @@ -58,6 +58,7 @@ from keep.api.models.db.preset import * # pylint: disable=unused-wildcard-import from keep.api.models.db.provider import * # pylint: disable=unused-wildcard-import from keep.api.models.db.rule import * # pylint: disable=unused-wildcard-import +from keep.api.models.db.system import * # pylint: disable=unused-wildcard-import from keep.api.models.db.tenant import * # pylint: disable=unused-wildcard-import from keep.api.models.db.topology import * # pylint: disable=unused-wildcard-import from keep.api.models.db.workflow import * # pylint: disable=unused-wildcard-import @@ -4482,3 +4483,46 @@ def get_resource_ids_by_resource_type( # Execute the query and return results result = session.exec(query) return result.all() + +def get_or_creat_posthog_instance_id( + session: Optional[Session] = None + ): + POSTHOG_INSTANCE_ID_KEY = "posthog_instance_id" + with Session(engine) as session: + system = session.exec(select(System).where(System.name == POSTHOG_INSTANCE_ID_KEY)).first() + if system: + return system.value + + system = System( + id=str(uuid4()), + name=POSTHOG_INSTANCE_ID_KEY, + value=str(uuid4()), + ) + session.add(system) + session.commit() + session.refresh(system) + return system.value + +def get_activity_report( + session: Optional[Session] = None + ): + from keep.api.models.db.user import User + + last_24_hours = datetime.utcnow() - timedelta(hours=24) + activity_report = {} + with Session(engine) as session: + activity_report['tenants_count'] = session.query(Tenant).count() + activity_report['providers_count'] = session.query(Provider).count() + activity_report['users_count'] = session.query(User).count() + activity_report['last_24_hours_incidents_count'] = session.query(Incident).filter( + Incident.creation_time >= last_24_hours).count() + activity_report['last_24_hours_alerts_count'] = session.query(Alert).filter( + Alert.timestamp >= last_24_hours).count() + activity_report['last_24_hours_rules_created'] = session.query(Rule).filter( + Rule.creation_time >= last_24_hours).count() + activity_report['last_24_hours_workflows_created'] = session.query(Workflow).filter( + Workflow.creation_time >= last_24_hours).count() + activity_report['last_24_hours_workflows_executed'] = session.query(WorkflowExecution).filter( + WorkflowExecution.started >= last_24_hours).count() + + return activity_report diff --git a/keep/api/core/demo_mode.py b/keep/api/core/demo_mode.py new file mode 100644 index 000000000..26ec63add --- /dev/null +++ b/keep/api/core/demo_mode.py @@ -0,0 +1,451 @@ +import datetime +import logging +import os +import random +import threading +import time +from datetime import timezone +from uuid import uuid4 + +import requests +from dateutil import parser +from requests.models import PreparedRequest + +from keep.api.core.db import get_session_sync +from keep.api.core.dependencies import SINGLE_TENANT_UUID +from keep.api.logging import CONFIG +from 
keep.api.models.db.topology import TopologyServiceInDto +from keep.api.tasks.process_topology_task import process_topology +from keep.api.utils.tenant_utils import get_or_create_api_key +from keep.providers.providers_factory import ProvidersFactory + +logging.config.dictConfig(CONFIG) + +logger = logging.getLogger(__name__) + +KEEP_LIVE_DEMO_MODE = os.environ.get("KEEP_LIVE_DEMO_MODE", "false").lower() == "true" + +correlation_rules_to_create = [ + { + "sqlQuery": {"sql": "((name like :name_1))", "params": {"name_1": "%mq%"}}, + "groupDescription": "This rule groups all alerts related to MQ.", + "ruleName": "Message Queue Buckle Up", + "celQuery": '(name.contains("mq"))', + "timeframeInSeconds": 86400, + "timeUnit": "hours", + "groupingCriteria": [], + "requireApprove": False, + "resolveOn": "never", + }, + { + "sqlQuery": { + "sql": "((name like :name_1) or (name = :name_2) or (name like :name_3))", + "params": { + "name_1": "%network_latency_high%", + "name_2": "high_cpu_usage", + "name_3": "%database_connection_failure%", + }, + }, + "groupDescription": "This rule groups alerts from multiple sources.", + "ruleName": "Application issue caused by DB load", + "celQuery": '(name.contains("network_latency_high")) || (name == "high_cpu_usage") || (name.contains("database_connection_failure"))', + "timeframeInSeconds": 86400, + "timeUnit": "hours", + "groupingCriteria": [], + "requireApprove": False, + "resolveOn": "never", + }, +] + +services_to_create = [ + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="api", + display_name="API Service", + environment="prod", + description="The main API service", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.1", + category="Python", + manufacturer="", + dependencies={ + "db": "SQL", + "queue": "AMQP", + }, + application_ids=[], + updated_at="2024-11-18T09:23:46", + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="ui", + display_name="Platform", + environment="prod", + description="The user interface (aka Platform)", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.2", + category="nextjs", + manufacturer="", + dependencies={ + "api": "HTTP/S", + }, + application_ids=[], + updated_at="2024-11-18T09:29:25", + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="db", + display_name="DB", + environment="prod", + description="Production Database", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.3", + category="postgres", + manufacturer="", + dependencies={}, + application_ids=[], + updated_at="2024-11-18T09:30:44", + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="queue", + display_name="Kafka", + environment="prod", + description="Production Queue", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.4", + category="Kafka", + dependencies={ + "processor": "AMQP", + }, + application_ids=[], + updated_at="2024-11-18T09:31:31", + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="processor", + display_name="Processor", + environment="prod", + description="Processing Service", + team="keep", + email="support@keephq.dev", + 
slack="https://slack.keephq.dev", + ip_address="10.0.0.5", + category="go", + dependencies={ + "storage": "HTTP/S", + }, + application_ids=[], + updated_at="2024-11-18T10:02:20", + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="backoffice", + display_name="Backoffice", + environment="prod", + description="Backoffice UI to control configuration", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="172.1.1.0", + category="nextjs", + dependencies={ + "api": "HTTP/S", + }, + application_ids=[], + updated_at="2024-11-18T10:11:31", + ), + TopologyServiceInDto( + source_provider_id="Prod-Datadog", + repository="keephq/keep", + tags=[], + service="storage", + display_name="Storage", + environment="prod", + description="Storage Service", + team="keep", + email="support@keephq.dev", + slack="https://slack.keephq.dev", + ip_address="10.0.0.8", + category="python", + dependencies={}, + application_ids=[], + updated_at="2024-11-18T10:13:56", + ), +] + +application_to_create = { + "name": "Main App", + "description": "It is the most critical business process ever imaginable.", + "services": [ + {"name": "API Service", "service": "api"}, + {"name": "DB", "service": "db"}, + {"name": "Kafka", "service": "queue"}, + {"name": "Processor", "service": "processor"}, + {"name": "Storage", "service": "storage"}, + ], +} + + +def get_or_create_topology(keep_api_key, keep_api_url): + services_existing = requests.get( + f"{keep_api_url}/topology", + headers={"x-api-key": keep_api_key}, + ) + services_existing.raise_for_status() + services_existing = services_existing.json() + + # Creating services + if len(services_existing) == 0: + process_topology( + SINGLE_TENANT_UUID, services_to_create, "Prod-Datadog", "datadog" + ) + + # Create application + applications_existing = requests.get( + f"{keep_api_url}/topology/applications", + headers={"x-api-key": keep_api_key}, + ) + applications_existing.raise_for_status() + applications_existing = applications_existing.json() + + if len(applications_existing) == 0: + # Pull services again to get their ids + services_existing = requests.get( + f"{keep_api_url}/topology", + headers={"x-api-key": keep_api_key}, + ) + services_existing.raise_for_status() + services_existing = services_existing.json() + + # Update application_to_create with existing services ids + for service in application_to_create["services"]: + for existing_service in services_existing: + if service["name"] == existing_service["display_name"]: + service["id"] = existing_service["id"] + + response = requests.post( + f"{keep_api_url}/topology/applications", + headers={"x-api-key": keep_api_key}, + json=application_to_create, + ) + response.raise_for_status() + + +def get_or_create_correlation_rules(keep_api_key, keep_api_url): + correlation_rules_existing = requests.get( + f"{keep_api_url}/rules", + headers={"x-api-key": keep_api_key}, + ) + correlation_rules_existing.raise_for_status() + correlation_rules_existing = correlation_rules_existing.json() + + if len(correlation_rules_existing) == 0: + for correlation_rule in correlation_rules_to_create: + response = requests.post( + f"{keep_api_url}/rules", + headers={"x-api-key": keep_api_key}, + json=correlation_rule, + ) + response.raise_for_status() + + +def remove_old_incidents(keep_api_key, keep_api_url): + consider_old_timedelta = datetime.timedelta(minutes=30) + incidents_existing = requests.get( + f"{keep_api_url}/incidents", + headers={"x-api-key": 
keep_api_key}, + ) + incidents_existing.raise_for_status() + incidents_existing = incidents_existing.json()["items"] + + for incident in incidents_existing: + if parser.parse(incident["creation_time"]).replace(tzinfo=timezone.utc) < ( + datetime.datetime.now() - consider_old_timedelta + ).astimezone(timezone.utc): + incident_id = incident["id"] + response = requests.delete( + f"{keep_api_url}/incidents/{incident_id}", + headers={"x-api-key": keep_api_key}, + ) + response.raise_for_status() + + +def get_installed_providers(keep_api_key, keep_api_url): + response = requests.get( + f"{keep_api_url}/providers", + headers={"x-api-key": keep_api_key}, + ) + response.raise_for_status() + return response.json()['installed_providers'] + + +def simulate_alerts( + keep_api_url=None, + keep_api_key=None, + sleep_interval=5, + demo_correlation_rules=False, + demo_topology=False, + clean_old_incidents=False, +): + logger.info("Simulating alerts...") + + GENERATE_DEDUPLICATIONS = True + + providers = [ + "prometheus", + "grafana", + "cloudwatch", + "datadog", + ] + + providers_to_randomize_fingerprint_for = [ + "cloudwatch", + "datadog", + ] + + provider_classes = { + provider: ProvidersFactory.get_provider_class(provider) + for provider in providers + } + + existing_installed_providers = get_installed_providers(keep_api_key, keep_api_url) + logger.info(f"Existing installed providers: {existing_installed_providers}") + existing_providers_to_their_ids = {} + + for existing_provider in existing_installed_providers: + if existing_provider['type'] in providers: + existing_providers_to_their_ids[existing_provider['type']] = existing_provider['id'] + logger.info(f"Existing installed existing_providers_to_their_ids: {existing_providers_to_their_ids}") + + if demo_correlation_rules: + logger.info("Creating correlation rules...") + get_or_create_correlation_rules(keep_api_key, keep_api_url) + logger.info("Correlation rules created.") + + if demo_topology: + logger.info("Creating topology...") + get_or_create_topology(keep_api_key, keep_api_url) + logger.info("Topology created.") + + while True: + try: + logger.info("Looping to send alerts...") + + if clean_old_incidents: + logger.info("Removing old incidents...") + remove_old_incidents(keep_api_key, keep_api_url) + logger.info("Old incidents removed.") + + send_alert_url_params = {} + + # choose provider + provider_type = random.choice(providers) + send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) + + if provider_type in existing_providers_to_their_ids: + send_alert_url_params["provider_id"] = existing_providers_to_their_ids[provider_type] + logger.info(f"Provider type: {provider_type}, send_alert_url_params now are: {send_alert_url_params}") + + provider = provider_classes[provider_type] + alert = provider.simulate_alert() + + if provider_type in providers_to_randomize_fingerprint_for: + send_alert_url_params["fingerprint"] = str(uuid4()) + + # Determine number of times to send the same alert + num_iterations = 1 + if GENERATE_DEDUPLICATIONS: + num_iterations = random.randint(1, 3) + + for _ in range(num_iterations): + logger.info("Sending alert: {}".format(alert)) + try: + env = random.choice(["production", "staging", "development"]) + + if "provider_id" not in send_alert_url_params: + send_alert_url_params["provider_id"] = f"{provider_type}-{env}" + + prepared_request = PreparedRequest() + prepared_request.prepare_url(send_alert_url, send_alert_url_params) + logger.info(f"Sending alert to {prepared_request.url} with url params 
{send_alert_url_params}") + + response = requests.post( + prepared_request.url, + headers={"x-api-key": keep_api_key}, + json=alert, + ) + response.raise_for_status() + except requests.exceptions.RequestException as e: + logger.error("Failed to send alert: {}".format(e)) + time.sleep(sleep_interval) + continue + + if not response.ok: + logger.error("Failed to send alert: {}".format(response.text)) + else: + logger.info("Alert sent successfully") + except Exception as e: + logger.exception( + "Error in simulate_alerts", extra={"exception_str": str(e)} + ) + + logger.info( + "Sleeping for {} seconds before next iteration".format(sleep_interval) + ) + time.sleep(sleep_interval) + + +def launch_demo_mode_thread(keep_api_url=None) -> threading.Thread | None: + if not KEEP_LIVE_DEMO_MODE: + logger.info("Not launching the demo mode.") + return + + logger.info("Launching demo mode.") + + with get_session_sync() as session: + keep_api_key = get_or_create_api_key( + session=session, + tenant_id=SINGLE_TENANT_UUID, + created_by="system", + unique_api_key_id="simulate_alerts", + system_description="Simulate Alerts API key", + ) + + sleep_interval = 5 + + thread = threading.Thread( + target=simulate_alerts, + kwargs={ + "keep_api_key": keep_api_key, + "keep_api_url": keep_api_url, + "sleep_interval": sleep_interval, + "demo_correlation_rules": True, + "demo_topology": True, + "clean_old_incidents": True, + }, + ) + thread.daemon = True + thread.start() + + logger.info("Demo mode launched.") + return thread diff --git a/keep/api/core/posthog.py b/keep/api/core/posthog.py index b21b24096..143157029 100644 --- a/keep/api/core/posthog.py +++ b/keep/api/core/posthog.py @@ -1,10 +1,11 @@ import os -import uuid import posthog import requests from posthog import Posthog from importlib import metadata +from keep.api.core.db import get_or_creat_posthog_instance_id + try: KEEP_VERSION = metadata.version("keep") except metadata.PackageNotFoundError: @@ -13,8 +14,7 @@ except metadata.PackageNotFoundError: KEEP_VERSION = os.environ.get("KEEP_VERSION", "unknown") -POSTHOG_DISABLED = os.getenv("POSTHOG_DISABLED", "false") == "true" -RANDOM_TENANT_ID_PERSISTENT_WITHIN_LAUNCH = uuid.uuid4() +POSTHOG_DISABLED = os.getenv("POSTHOG_DISABLED", "false").lower() == "true" if POSTHOG_DISABLED: posthog.disabled = True @@ -37,7 +37,7 @@ def is_posthog_reachable(): feature_flags_request_timeout_seconds=3, sync_mode=True # Explicitly to trigger exception if it's not reachable. ).capture( - RANDOM_TENANT_ID_PERSISTENT_WITHIN_LAUNCH, + get_or_creat_posthog_instance_id(), "connectivity_check", ) return True diff --git a/keep/api/core/report_uptime.py b/keep/api/core/report_uptime.py index 257922c5c..99b1c326a 100644 --- a/keep/api/core/report_uptime.py +++ b/keep/api/core/report_uptime.py @@ -1,12 +1,13 @@ +import time import asyncio import logging import threading +from keep.api.core.db import get_activity_report, get_or_creat_posthog_instance_id from keep.api.core.posthog import ( posthog_client, is_posthog_reachable, KEEP_VERSION, POSTHOG_DISABLED, - RANDOM_TENANT_ID_PERSISTENT_WITHIN_LAUNCH ) logger = logging.getLogger(__name__) @@ -18,19 +19,25 @@ async def report_uptime_to_posthog(): Should be lunched in a separate thread. 
""" while True: + start_time = time.time() + properties = { + "status": "up", + "keep_version": KEEP_VERSION, + **get_activity_report(), + } + end_time = time.time() + properties["db_request_duration_ms"] = int((end_time - start_time) * 1000) posthog_client.capture( - RANDOM_TENANT_ID_PERSISTENT_WITHIN_LAUNCH, + get_or_creat_posthog_instance_id(), "backend_status", - properties={ - "status": "up", - "keep_version": KEEP_VERSION, - }, + properties=properties, ) + posthog_client.flush() logger.info("Uptime reported to PostHog.") - # Important to keep it async, otherwise will clog main gunicorn thread and cause timeouts. + await asyncio.sleep(UPTIME_REPORTING_CADENCE) -def launch_uptime_reporting(): +def launch_uptime_reporting_thread() -> threading.Thread | None: """ Running async uptime reporting as a sub-thread. """ @@ -39,6 +46,7 @@ def launch_uptime_reporting(): thread = threading.Thread(target=asyncio.run, args=(report_uptime_to_posthog(), )) thread.start() logger.info("Uptime Reporting to Posthog launched.") + return thread else: logger.info("Reporting to Posthog not launched because it's not reachable.") else: diff --git a/keep/api/models/db/migrations/versions/2024-11-20-15-50_192157fd5788.py b/keep/api/models/db/migrations/versions/2024-11-20-15-50_192157fd5788.py new file mode 100644 index 000000000..0d79398b8 --- /dev/null +++ b/keep/api/models/db/migrations/versions/2024-11-20-15-50_192157fd5788.py @@ -0,0 +1,36 @@ +"""system table + +Revision ID: 192157fd5788 +Revises: 620b6c048091 +Create Date: 2024-11-20 15:50:29.500867 + +""" + +import sqlalchemy as sa +import sqlmodel +from alembic import op + +# revision identifiers, used by Alembic. +revision = "192157fd5788" +down_revision = "620b6c048091" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "system", + sa.Column("id", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("name", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("value", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.PrimaryKeyConstraint("id"), + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_table("system") + # ### end Alembic commands ### diff --git a/keep/api/models/db/system.py b/keep/api/models/db/system.py new file mode 100644 index 000000000..71296e358 --- /dev/null +++ b/keep/api/models/db/system.py @@ -0,0 +1,8 @@ + +from sqlmodel import Field, SQLModel + + +class System(SQLModel, table=True): + id: str = Field(primary_key=True) + name: str + value: str diff --git a/keep/entrypoint.sh b/keep/entrypoint.sh new file mode 100755 index 000000000..902926409 --- /dev/null +++ b/keep/entrypoint.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +# Exit immediately if a command exits with a non-zero status +set -e + +# Print commands and their arguments as they are executed +set -x + +# Get the directory of the current script +SCRIPT_DIR=$(dirname "$0") + +python "$SCRIPT_DIR/server_jobs_bg.py" & + +# Execute the CMD provided in the Dockerfile or as arguments +exec "$@" \ No newline at end of file diff --git a/keep/providers/cloudwatch_provider/alerts_mock.py b/keep/providers/cloudwatch_provider/alerts_mock.py index 5fe2ec41e..1fc2377fe 100644 --- a/keep/providers/cloudwatch_provider/alerts_mock.py +++ b/keep/providers/cloudwatch_provider/alerts_mock.py @@ -11,7 +11,7 @@ } }, "parameters": { - "Message.AlarmName": ["HighCPUUsage-1", "HighCPUUsage-2", "HighCPUUsage-3"], + "Message.AlarmName": ["HighCPUUsage", "HighCPUUsageOnAPod", "PodRecycled"], }, }, } diff --git a/keep/providers/datadog_provider/alerts_mock.py b/keep/providers/datadog_provider/alerts_mock.py index 21205eab0..0ff032a3b 100644 --- a/keep/providers/datadog_provider/alerts_mock.py +++ b/keep/providers/datadog_provider/alerts_mock.py @@ -11,8 +11,8 @@ }, "parameters": { "tags": [ - "environment:production,team:backend,monitor", - "environment:staging,team:backend,monitor", + "environment:production,team:backend,monitor,service:api", + "environment:staging,team:backend,monitor,service:api", ], "priority": ["P2", "P3", "P4"], }, @@ -29,8 +29,8 @@ }, "parameters": { "tags": [ - "environment:production,team:analytics,monitor", - "environment:staging,team:database,monitor", + "environment:production,team:analytics,monitor,service:api", + "environment:staging,team:database,monitor,service:api", ], "priority": ["P1", "P3", "P4"], }, diff --git a/keep/providers/grafana_provider/alerts_mock.py b/keep/providers/grafana_provider/alerts_mock.py index bcb940077..4d9d0357a 100644 --- a/keep/providers/grafana_provider/alerts_mock.py +++ b/keep/providers/grafana_provider/alerts_mock.py @@ -1,6 +1,7 @@ ALERTS = { "database_connection_failure": { "severity": "critical", + "service": "api", "title": "Database Connection Failure", "alerts": [ { @@ -48,6 +49,7 @@ ], }, "high_memory_usage": { + "service": "api", "payload": { "condition": "B", "data": [ @@ -92,6 +94,7 @@ }, }, "network_latency_high": { + "service": "db", "payload": { "condition": "C", "data": [ diff --git a/keep/providers/prometheus_provider/alerts_mock.py b/keep/providers/prometheus_provider/alerts_mock.py index 1287f1a68..d29197074 100644 --- a/keep/providers/prometheus_provider/alerts_mock.py +++ b/keep/providers/prometheus_provider/alerts_mock.py @@ -11,7 +11,7 @@ }, "parameters": { "labels.host": ["host1", "host2", "host3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db"], "labels.instance": ["instance1", "instance2", "instance3"], }, }, @@ -20,11 +20,12 @@ "summary": "Message queue is over 33% capacity", "labels": { "severity": "warning", + "customer_id": 
"acme" }, }, "parameters": { "labels.queue": ["queue1", "queue2", "queue3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "queue"], "labels.mq_manager": ["mq_manager1", "mq_manager2", "mq_manager3"], }, }, @@ -37,20 +38,20 @@ }, "parameters": { "labels.host": ["host1", "host2", "host3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db"], "labels.instance": ["instance1", "instance2", "instance3"], }, }, "network_latency_high": { "payload": { - "summary": "Network latency is higher than normal", + "summary": "Network latency is higher than normal for customer_id:acme", "labels": { "severity": "info", }, }, "parameters": { "labels.host": ["host1", "host2", "host3"], - "labels.service": ["calendar-producer-java-otel-api-dd", "kafka"], + "labels.service": ["calendar-producer-java-otel-api-dd", "kafka", "api", "queue", "db"], "labels.instance": ["instance1", "instance2", "instance3"], }, }, diff --git a/keep/server_jobs_bg.py b/keep/server_jobs_bg.py new file mode 100644 index 000000000..b1cad6390 --- /dev/null +++ b/keep/server_jobs_bg.py @@ -0,0 +1,47 @@ +import os +import time +import logging +import requests + +from keep.api.core.demo_mode import launch_demo_mode_thread +from keep.api.core.report_uptime import launch_uptime_reporting_thread + +logger = logging.getLogger(__name__) + + +def main(): + logger.info("Starting background server jobs.") + + # We intentionally don't use KEEP_API_URL here to avoid going through the internet. + # Script should be launched in the same environment as the server. + keep_api_url = "http://localhost:" + str(os.environ.get("PORT", 8080)) + + while True: + try: + logger.info(f"Checking if server is up at {keep_api_url}...") + response = requests.get(keep_api_url) + response.raise_for_status() + break + except requests.exceptions.RequestException: + logger.info("API is not up yet. Waiting...") + time.sleep(5) + + threads = [] + threads.append(launch_demo_mode_thread(keep_api_url)) + threads.append(launch_uptime_reporting_thread()) + + logger.info("Background server jobs threads launched, joining them.") + + for thread in threads: + if thread is not None: + thread.join() + + logger.info("Background server jobs script executed and exiting.") + + +if __name__ == "__main__": + """ + This script should be executed alongside to the server. + Running it in the same process as the server may (and most probably will) cause issues. + """ + main() diff --git a/pyproject.toml b/pyproject.toml index 713a701e1..5002c864b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "keep" -version = "0.29.2" +version = "0.29.3" description = "Alerting. for developers, by developers." 
authors = ["Keep Alerting LTD"] readme = "README.md" diff --git a/scripts/shoot_alerts_from_dump.py b/scripts/shoot_alerts_from_dump.py index ebb18c327..7239c6fde 100644 --- a/scripts/shoot_alerts_from_dump.py +++ b/scripts/shoot_alerts_from_dump.py @@ -1,16 +1,14 @@ -import logging -import os import sys import json import copy import csv - +import logging +import argparse from keep.api.core.db import get_session_sync from keep.api.models.alert import AlertDto from keep.api.tasks.process_event_task import __handle_formatted_events -import sys -import argparse + # configure logging logging.basicConfig( diff --git a/scripts/simulate_alerts.py b/scripts/simulate_alerts.py index fa1d0cc35..2927a2cf2 100644 --- a/scripts/simulate_alerts.py +++ b/scripts/simulate_alerts.py @@ -1,13 +1,9 @@ -import logging import os -import random -import time - -import requests +import logging +import argparse -from keep.providers.providers_factory import ProvidersFactory +from keep.api.core.demo_mode import simulate_alerts -# configure logging logging.basicConfig( level=logging.DEBUG, format="%(asctime)s %(levelname)s %(name)s %(message)s", @@ -18,52 +14,31 @@ def main(): - GENERATE_DEDUPLICATIONS = True - SLEEP_INTERVAL = float(os.environ.get("SLEEP_INTERVAL", 0.2)) # Configurable sleep interval from env variable + parser = argparse.ArgumentParser(description="Simulate alerts for Keep API.") + parser.add_argument( + "--full-demo", + action="store_true", + help="Run the full demo including correlation rules and topology.", + ) + args = parser.parse_args() + + default_sleep_interval = 0.2 + if args.full_demo: + default_sleep_interval = 5 + + SLEEP_INTERVAL = float( + os.environ.get("SLEEP_INTERVAL", default_sleep_interval) + ) keep_api_key = os.environ.get("KEEP_API_KEY") keep_api_url = os.environ.get("KEEP_API_URL") or "http://localhost:8080" - if keep_api_key is None or keep_api_url is None: - raise Exception("KEEP_API_KEY and KEEP_API_URL must be set") - - providers = ["prometheus", "grafana"] - provider_classes = { - provider: ProvidersFactory.get_provider_class(provider) - for provider in providers - } - while True: - # choose provider - provider_type = random.choice(providers) - send_alert_url = "{}/alerts/event/{}".format(keep_api_url, provider_type) - provider = provider_classes[provider_type] - alert = provider.simulate_alert() - - # Determine number of times to send the same alert - num_iterations = 1 - if GENERATE_DEDUPLICATIONS: - num_iterations = random.randint(1, 3) - - for _ in range(num_iterations): - logger.info("Sending alert: {}".format(alert)) - try: - env = random.choice(["production", "staging", "development"]) - response = requests.post( - send_alert_url + f"?provider_id={provider_type}-{env}", - headers={"x-api-key": keep_api_key}, - json=alert, - ) - response.raise_for_status() # Raise an HTTPError for bad responses - except requests.exceptions.RequestException as e: - logger.error("Failed to send alert: {}".format(e)) - time.sleep(SLEEP_INTERVAL) - continue - - if response.status_code != 202: - logger.error("Failed to send alert: {}".format(response.text)) - else: - logger.info("Alert sent successfully") - - time.sleep(SLEEP_INTERVAL) # Wait for the configured interval before sending the next alert - + simulate_alerts( + keep_api_key=keep_api_key, + keep_api_url=keep_api_url, + sleep_interval=SLEEP_INTERVAL, + demo_correlation_rules=args.full_demo, + demo_topology=args.full_demo, + clean_old_incidents=args.full_demo, + ) if __name__ == "__main__": main()
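Note: with KEEP_LIVE_DEMO_MODE="true", the new keep/entrypoint.sh starts keep/server_jobs_bg.py alongside gunicorn; once the API answers on localhost, that script launches the demo-mode and uptime-reporting threads. The simulator can also be driven directly via scripts/simulate_alerts.py (--full-demo). A minimal sketch of the equivalent direct call, assuming a running Keep API at http://localhost:8080 and a valid key exported as KEEP_API_KEY (both are placeholders, not part of this diff):

    import os

    from keep.api.core.demo_mode import simulate_alerts

    simulate_alerts(
        keep_api_key=os.environ["KEEP_API_KEY"],
        keep_api_url=os.environ.get("KEEP_API_URL", "http://localhost:8080"),
        sleep_interval=5,             # seconds between simulated alerts
        demo_correlation_rules=True,  # what --full-demo enables
        demo_topology=True,
        clean_old_incidents=True,
    )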