From 9bd5bdfe6521cae0fd6148a4db703a98dc3ee59d Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 20 Nov 2024 11:52:46 +0100 Subject: [PATCH 01/59] Add monitoring related files --- config.yaml | 20 +- docker/docker-compose.yml | 21 +- src/base/clickhouse_kafka_sender.py | 21 + src/monitoring/__init__.py | 0 src/monitoring/clickhouse_batch.py | 82 ++ src/monitoring/clickhouse_connector.py | 345 ++++++++ src/monitoring/create_tables/batch_status.sql | 7 + .../create_tables/batch_timestamps.sql | 9 + src/monitoring/create_tables/dns_loglines.sql | 11 + .../create_tables/failed_dns_loglines.sql | 8 + .../create_tables/logline_status.sql | 7 + .../create_tables/logline_timestamps.sql | 8 + .../create_tables/logline_to_batches.sql | 6 + src/monitoring/create_tables/server_logs.sql | 7 + .../create_tables/server_logs_timestamps.sql | 7 + src/monitoring/monitoring_agent.py | 78 ++ tests/test_clickhouse_connector.py | 786 ++++++++++++++++++ tests/test_clickhouse_kafka_sender.py | 43 + tests/test_server.py | 482 +++-------- 19 files changed, 1587 insertions(+), 361 deletions(-) create mode 100644 src/base/clickhouse_kafka_sender.py create mode 100644 src/monitoring/__init__.py create mode 100644 src/monitoring/clickhouse_batch.py create mode 100644 src/monitoring/clickhouse_connector.py create mode 100644 src/monitoring/create_tables/batch_status.sql create mode 100644 src/monitoring/create_tables/batch_timestamps.sql create mode 100644 src/monitoring/create_tables/dns_loglines.sql create mode 100644 src/monitoring/create_tables/failed_dns_loglines.sql create mode 100644 src/monitoring/create_tables/logline_status.sql create mode 100644 src/monitoring/create_tables/logline_timestamps.sql create mode 100644 src/monitoring/create_tables/logline_to_batches.sql create mode 100644 src/monitoring/create_tables/server_logs.sql create mode 100644 src/monitoring/create_tables/server_logs_timestamps.sql create mode 100644 src/monitoring/monitoring_agent.py create mode 100644 tests/test_clickhouse_connector.py create mode 100644 tests/test_clickhouse_kafka_sender.py diff --git a/config.yaml b/config.yaml index 4a5b593..8d596ba 100644 --- a/config.yaml +++ b/config.yaml @@ -18,7 +18,6 @@ logging: pipeline: log_storage: logserver: - input_kafka_topic: "LogServer" input_file: "/opt/file.txt" max_number_of_connections: 1000 @@ -65,6 +64,11 @@ pipeline: base_url: https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/ threshold: 0.5 + monitoring: + clickhouse_connector: + batch_size: 10000 + batch_timeout: 2.0 + environment: timestamp_format: "%Y-%m-%dT%H:%M:%S.%fZ" kafka_brokers: @@ -74,7 +78,13 @@ environment: port: 8098 - hostname: 172.27.0.5 port: 8099 - logserver: - hostname: 172.27.0.8 - port_in: 9998 - port_out: 9999 + kafka_topics: + pipeline: + logserver_in: "pipeline.logserver_in" + logserver_to_collector: "pipeline.logserver_to_collector" + batch_sender_to_prefilter: "pipeline.batch_sender_to_prefilter" + prefilter_to_inspector: "pipeline.prefilter_to_inspector" + inspector_to_detector: "pipeline.inspector_to_detector" + monitoring: + clickhouse_server: + hostname: 172.27.0.11 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 82993c7..213458e 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -47,8 +47,6 @@ services: condition: service_healthy clickhouse-server: condition: service_healthy - ports: - - 9998:9998 networks: heidgaf: ipv4_address: 172.27.0.8 @@ -186,6 +184,25 @@ services: timeout: 5s retries: 3 + monitoring_agent: + build: + context: .. + dockerfile: docker/dockerfiles/Dockerfile.monitoring + network: host + restart: "unless-stopped" + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + clickhouse-server: + condition: service_healthy + networks: + heidgaf: + ipv4_address: 172.27.0.12 + networks: heidgaf: driver: bridge diff --git a/src/base/clickhouse_kafka_sender.py b/src/base/clickhouse_kafka_sender.py new file mode 100644 index 0000000..ef4338c --- /dev/null +++ b/src/base/clickhouse_kafka_sender.py @@ -0,0 +1,21 @@ +import json +import os +import sys + +sys.path.append(os.getcwd()) +from src.base.kafka_handler import SimpleKafkaProduceHandler +from src.base.log_config import get_logger + +logger = get_logger() + + +class ClickHouseKafkaSender: + def __init__(self, table_name: str): + self.table_name = table_name + self.kafka_producer = SimpleKafkaProduceHandler() + + def insert(self, data: dict): + self.kafka_producer.produce( + topic=f"clickhouse_{self.table_name}", + data=json.dumps(data, default=str), + ) diff --git a/src/monitoring/__init__.py b/src/monitoring/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/monitoring/clickhouse_batch.py b/src/monitoring/clickhouse_batch.py new file mode 100644 index 0000000..fb87795 --- /dev/null +++ b/src/monitoring/clickhouse_batch.py @@ -0,0 +1,82 @@ +import os +import sys +from threading import Timer + +import clickhouse_connect + +sys.path.append(os.getcwd()) +from src.base.log_config import get_logger +from src.base.utils import setup_config + +logger = get_logger() + +CONFIG = setup_config() +CLICKHOUSE_HOSTNAME = CONFIG["environment"]["monitoring"]["clickhouse_server"][ + "hostname" +] +BATCH_SIZE = CONFIG["pipeline"]["monitoring"]["clickhouse_connector"]["batch_size"] +BATCH_TIMEOUT = CONFIG["pipeline"]["monitoring"]["clickhouse_connector"][ + "batch_timeout" +] + + +class ClickHouseBatchSender: + def __init__(self, table_name: str, column_names: list[str]): + self.table_name = table_name + self.column_names = column_names + + self.max_batch_size = BATCH_SIZE + self.batch_timeout = BATCH_TIMEOUT + + self.timer = None + self.batch = [] + self._client = clickhouse_connect.get_client( + host=CLICKHOUSE_HOSTNAME, + ) + + def __del__(self): + self.insert_all() + + def add(self, data: list[str] | list[list[str]]): + def _add_element(element): + if len(element) != len(self.column_names): + raise ValueError( + "Number of elements in the insert does not match the number of columns" + ) + + self.batch.append(element) + + if any(isinstance(e, list) for e in data): + for e in data: + _add_element(e) + else: + _add_element(data) + + if len(self.batch) >= self.max_batch_size: + self.insert_all() + elif not self.timer: + self._start_timer() + + def insert_all(self): + if self.batch: + self._client.insert( + self.table_name, + self.batch, + column_names=self.column_names, + ) + logger.debug( + f"Inserted {self.table_name=},{self.batch=},{self.column_names=}" + ) + self.batch = [] + + if self.timer: + self.timer.cancel() + + self.timer = None + + def _start_timer(self): + if self.timer: + self.timer.cancel() + + self.timer = Timer(BATCH_TIMEOUT, self.insert_all) + self.timer.start() diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py new file mode 100644 index 0000000..cb4d2c9 --- /dev/null +++ b/src/monitoring/clickhouse_connector.py @@ -0,0 +1,345 @@ +import datetime +import os +import sys +import uuid +from abc import abstractmethod + +import clickhouse_connect + +sys.path.append(os.getcwd()) +from src.monitoring.clickhouse_batch import ClickHouseBatchSender +from src.base.log_config import get_logger +from src.base.utils import setup_config + +logger = get_logger() + +CONFIG = setup_config() +CLICKHOUSE_HOSTNAME = CONFIG["environment"]["monitoring"]["clickhouse_server"][ + "hostname" +] +CREATE_TABLES_DIRECTORY = "src/monitoring/create_tables" # TODO: Get from config + + +class ClickHouseConnector: + def __init__(self, table_name: str, column_names: list[str]): + self._table_name = table_name + self._column_names = column_names + + self._batch_sender = ClickHouseBatchSender( + table_name=self._table_name, + column_names=self._column_names, + ) + + def prepare_table(self): + def _load_contents(file_name: str) -> str: + with open(file_name, "r") as file: + return file.read() + + filename = self._table_name + ".sql" + file_path = os.path.join(CREATE_TABLES_DIRECTORY, filename) + sql_content = _load_contents(file_path) + + with clickhouse_connect.get_client(host=CLICKHOUSE_HOSTNAME) as client: + try: + client.command(sql_content) + except Exception as e: + logger.critical("Error in CREATE TABLE statement") + raise e + + def _add_to_batch(self, data): + self._batch_sender.add(data) + + @abstractmethod + def insert(self, *args, **kwargs): + pass + + +class ServerLogsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "message_id", + "timestamp_in", + "message_text", + ] + + super().__init__("server_logs", column_names) + + def insert( + self, + message_text: str, + message_id: None | str | uuid.UUID = None, + timestamp_in: str | datetime.datetime | None = None, + ) -> uuid.UUID: + # TODO: Switch to Marshmallow + if not message_id: + message_id = uuid.uuid4() + + if isinstance(message_id, str): + message_id = uuid.UUID(message_id) + + if not timestamp_in: + timestamp_in = datetime.datetime.now() + + if isinstance(timestamp_in, str): + timestamp_in = datetime.datetime.fromisoformat(timestamp_in) + + self._add_to_batch([message_id, timestamp_in, message_text]) + return message_id + + +class ServerLogsTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "message_id", + "event", + "event_timestamp", + ] + + super().__init__("server_logs_timestamps", column_names) + + def insert( + self, + message_id: str | uuid.UUID, + event: str, + event_timestamp: str | datetime.datetime | None = None, + ): + if isinstance(message_id, str): + message_id = uuid.UUID(message_id) + + if not event_timestamp: + event_timestamp = datetime.datetime.now() + + if isinstance(event_timestamp, str): + event_timestamp = datetime.datetime.fromisoformat(event_timestamp) + + self._add_to_batch([message_id, event, event_timestamp]) + + +class FailedDNSLoglinesConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "message_text", + "timestamp_in", + "timestamp_failed", + "reason_for_failure", + ] + + super().__init__("failed_dns_loglines", column_names) + + def insert( + self, + message_text: str, + timestamp_in: str | datetime.datetime, + timestamp_failed: str | datetime.datetime | None = None, + reason_for_failure: str | None = None, + ) -> None: + if not timestamp_failed: + timestamp_failed = datetime.datetime.now() + + if isinstance(timestamp_in, str): + timestamp_in = datetime.datetime.fromisoformat(timestamp_in) + if isinstance(timestamp_failed, str): + timestamp_failed = datetime.datetime.fromisoformat(timestamp_failed) + + self._add_to_batch( + [message_text, timestamp_in, timestamp_failed, reason_for_failure] + ) + + +class LoglineToBatchesConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "logline_id", + "batch_id", + ] + + super().__init__("logline_to_batches", column_names) + + def insert( + self, + logline_id: str | uuid.UUID, + batch_id: str | uuid.UUID, + ): + if isinstance(logline_id, str): + logline_id = uuid.UUID(logline_id) + if isinstance(batch_id, str): + batch_id = uuid.UUID(batch_id) + + self._add_to_batch( + [ + logline_id, + batch_id, + ] + ) + + +class DNSLoglinesConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "logline_id", + "subnet_id", + "timestamp", + "status_code", + "client_ip", + "record_type", + "additional_fields", + ] + + super().__init__("dns_loglines", column_names) + + def insert( + self, + subnet_id: str, + timestamp: str | datetime.datetime, + status_code: str, + client_ip: str, + record_type: str, + additional_fields: str | None = None, + ) -> uuid.UUID: + logline_id = uuid.uuid4() + + self._add_to_batch( + [ + logline_id, + subnet_id, + timestamp, + status_code, + client_ip, + record_type, + additional_fields, + ] + ) + return logline_id + + +class LoglineStatusConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "logline_id", + "status", + "exit_at_stage", + ] + + super().__init__("logline_status", column_names) + + def insert( + self, + logline_id: str | uuid.UUID, + status: str, + exit_at_stage: str | None = None, + ): + if isinstance(logline_id, str): + logline_id = uuid.UUID(logline_id) + + self._add_to_batch( + [ + logline_id, + status, + exit_at_stage, + ] + ) + + +class LoglineTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "logline_id", + "stage", + "status", + "timestamp", + ] + + super().__init__("logline_timestamps", column_names) + + def insert( + self, + logline_id: str | uuid.UUID, + stage: str, + status: str, + timestamp: str | datetime.datetime = None, + ) -> None: + if isinstance(logline_id, str): + logline_id = uuid.UUID(logline_id) + + if not timestamp: + timestamp = datetime.datetime.now() + + if isinstance(timestamp, str): + timestamp = datetime.datetime.fromisoformat(timestamp) + + self._add_to_batch( + [ + logline_id, + stage, + status, + timestamp, + ] + ) + + +class BatchStatusConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "batch_id", + "status", + "exit_at_stage", + ] + + super().__init__("batch_status", column_names) + + def insert( + self, + batch_id: str | uuid.UUID, + status: str, + exit_at_stage: str | None = None, + ): + if isinstance(batch_id, str): + batch_id = uuid.UUID(batch_id) + + self._add_to_batch( + [ + batch_id, + status, + exit_at_stage, + ] + ) + + +class BatchTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "batch_id", + "stage", + "status", + "timestamp", + "message_count", + ] + + super().__init__("batch_timestamps", column_names) + + def insert( + self, + batch_id: str | uuid.UUID, + stage: str, + status: str, + message_count: int, + timestamp: str | datetime.datetime = None, + ) -> None: + if isinstance(batch_id, str): + batch_id = uuid.UUID(batch_id) + + if not timestamp: + timestamp = datetime.datetime.now() + + if isinstance(timestamp, str): + timestamp = datetime.datetime.fromisoformat(timestamp) + + self._add_to_batch( + [ + batch_id, + stage, + status, + timestamp, + message_count, + ] + ) diff --git a/src/monitoring/create_tables/batch_status.sql b/src/monitoring/create_tables/batch_status.sql new file mode 100644 index 0000000..3f515b9 --- /dev/null +++ b/src/monitoring/create_tables/batch_status.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS batch_status ( + batch_id UUID NOT NULL, + status String NOT NULL, + exit_at_stage Nullable(String) +) +ENGINE = MergeTree +PRIMARY KEY (batch_id); diff --git a/src/monitoring/create_tables/batch_timestamps.sql b/src/monitoring/create_tables/batch_timestamps.sql new file mode 100644 index 0000000..c0e7a1c --- /dev/null +++ b/src/monitoring/create_tables/batch_timestamps.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS batch_timestamps ( + batch_id UUID NOT NULL, + stage String NOT NULL, + status String NOT NULL, + timestamp DateTime64(6) NOT NULL, + message_count UInt32 +) +ENGINE = MergeTree +PRIMARY KEY (batch_id); diff --git a/src/monitoring/create_tables/dns_loglines.sql b/src/monitoring/create_tables/dns_loglines.sql new file mode 100644 index 0000000..c3468f7 --- /dev/null +++ b/src/monitoring/create_tables/dns_loglines.sql @@ -0,0 +1,11 @@ +CREATE TABLE IF NOT EXISTS dns_loglines ( + logline_id UUID NOT NULL, + subnet_id String NOT NULL, + timestamp DateTime64(6) NOT NULL, + status_code String NOT NULL, + client_ip String NOT NULL, + record_type String NOT NULL, + additional_fields Nullable(String) +) +ENGINE = MergeTree +PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/failed_dns_loglines.sql b/src/monitoring/create_tables/failed_dns_loglines.sql new file mode 100644 index 0000000..846f6cd --- /dev/null +++ b/src/monitoring/create_tables/failed_dns_loglines.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS failed_dns_loglines ( + message_text String NOT NULL, + timestamp_in DateTime64(6) NOT NULL, + timestamp_failed DateTime64(6) NOT NULL, + reason_for_failure Nullable(String) +) +ENGINE = MergeTree +PRIMARY KEY(message_text, timestamp_in); diff --git a/src/monitoring/create_tables/logline_status.sql b/src/monitoring/create_tables/logline_status.sql new file mode 100644 index 0000000..cdeb6c2 --- /dev/null +++ b/src/monitoring/create_tables/logline_status.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS logline_status ( + logline_id UUID NOT NULL, + status String NOT NULL, + exit_at_stage Nullable(String) +) +ENGINE = MergeTree +PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/logline_timestamps.sql b/src/monitoring/create_tables/logline_timestamps.sql new file mode 100644 index 0000000..4ff9887 --- /dev/null +++ b/src/monitoring/create_tables/logline_timestamps.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS logline_timestamps ( + logline_id UUID NOT NULL, + stage String NOT NULL, + status String NOT NULL, + timestamp DateTime64(6) NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/logline_to_batches.sql b/src/monitoring/create_tables/logline_to_batches.sql new file mode 100644 index 0000000..41d4348 --- /dev/null +++ b/src/monitoring/create_tables/logline_to_batches.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS logline_to_batches ( + logline_id UUID NOT NULL, + batch_id UUID NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/server_logs.sql b/src/monitoring/create_tables/server_logs.sql new file mode 100644 index 0000000..b191d83 --- /dev/null +++ b/src/monitoring/create_tables/server_logs.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS server_logs ( + message_id UUID NOT NULL, + timestamp_in DateTime64(6) NOT NULL, + message_text String NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY(message_id); diff --git a/src/monitoring/create_tables/server_logs_timestamps.sql b/src/monitoring/create_tables/server_logs_timestamps.sql new file mode 100644 index 0000000..7a6c58c --- /dev/null +++ b/src/monitoring/create_tables/server_logs_timestamps.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS server_logs_timestamps ( + message_id UUID NOT NULL, + event String NOT NULL, + event_timestamp DateTime64(6) NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY(message_id); diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py new file mode 100644 index 0000000..8ea6c09 --- /dev/null +++ b/src/monitoring/monitoring_agent.py @@ -0,0 +1,78 @@ +import asyncio +import json +import os +import sys + +sys.path.append(os.getcwd()) +from src.monitoring.clickhouse_connector import * +from src.base.kafka_handler import SimpleKafkaConsumeHandler +from src.base.log_config import get_logger +from src.base.utils import setup_config + +logger = get_logger() + +CONFIG = setup_config() +CREATE_TABLES_DIRECTORY = "src/monitoring/create_tables" # TODO: Get from config + + +def prepare_all_tables(): + def _load_contents(file_name: str) -> str: + with open(file_name, "r") as file: + return file.read() + + for filename in os.listdir(CREATE_TABLES_DIRECTORY): + if filename.endswith(".sql"): + file_path = os.path.join(CREATE_TABLES_DIRECTORY, filename) + sql_content = _load_contents(file_path) + + with clickhouse_connect.get_client(host=CLICKHOUSE_HOSTNAME) as client: + try: + client.command(sql_content) + except Exception as e: + logger.critical("Error in CREATE TABLE statement") + raise e + + +class MonitoringAgent: + def __init__(self): + self.connectors = { + "server_logs": ServerLogsConnector(), + "server_logs_timestamps": ServerLogsTimestampsConnector(), + "failed_dns_loglines": FailedDNSLoglinesConnector(), + "logline_to_batches": LoglineToBatchesConnector(), + "dns_loglines": DNSLoglinesConnector(), + "logline_status": LoglineStatusConnector(), + "logline_timestamps": LoglineTimestampsConnector(), + "batch_status": BatchStatusConnector(), + "batch_timestamps": BatchTimestampsConnector(), + } + + self.topics = [f"clickhouse_{table_name}" for table_name in self.connectors] + self.kafka_consumer = SimpleKafkaConsumeHandler(self.topics) + + async def start(self): + loop = asyncio.get_running_loop() + + try: + while True: + key, value, topic = await loop.run_in_executor( + None, self.kafka_consumer.consume + ) + logger.debug(f"From Kafka: {value}") + + data = json.loads(value) + table_name = topic.replace("clickhouse_", "") + + self.connectors[table_name].insert(**data) + except KeyboardInterrupt: + logger.info("Stopped MonitoringAgent.") + + +def main(): + prepare_all_tables() + clickhouse_consumer = MonitoringAgent() + asyncio.run(clickhouse_consumer.start()) + + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/tests/test_clickhouse_connector.py b/tests/test_clickhouse_connector.py new file mode 100644 index 0000000..0953b6d --- /dev/null +++ b/tests/test_clickhouse_connector.py @@ -0,0 +1,786 @@ +import json +import unittest +from unittest.mock import patch, MagicMock, mock_open + +from src.monitoring.clickhouse_connector import * + + +class TestClickHouseConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + table_name = "test_table" + column_names = ["col_1", "col_2", "col_3"] + + # Act + sut = ClickHouseConnector(table_name, column_names) + + # Assert + self.assertEqual(table_name, sut._table_name) + self.assertEqual(column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=table_name, + column_names=column_names, + ) + + @patch("src.monitoring.clickhouse_connector.os.path.join") + @patch( + "src.monitoring.clickhouse_connector.open", + new_callable=mock_open, + read_data="CREATE TABLE test;", + ) + @patch("src.monitoring.clickhouse_connector.clickhouse_connect.get_client") + def test_prepare_table_success( + self, mock_get_client, mock_open_file, mock_path_join + ): + # Arrange + mock_client = MagicMock() + mock_get_client.return_value.__enter__.return_value = mock_client + mock_path_join.return_value = "/fake/path/test_table.sql" + + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + # Act + sut.prepare_table() + + # Assert + mock_open_file.assert_called_once_with("/fake/path/test_table.sql", "r") + mock_client.command.assert_called_once_with("CREATE TABLE test;") + + @patch("src.monitoring.clickhouse_connector.os.path.join") + @patch( + "src.monitoring.clickhouse_connector.open", + new_callable=mock_open, + read_data="CREATE TABLE test;", + ) + @patch("src.monitoring.clickhouse_connector.clickhouse_connect.get_client") + @patch("src.monitoring.clickhouse_connector.logger") + def test_prepare_table_failure( + self, mock_logger, mock_get_client, mock_open_file, mock_path_join + ): + mock_client = MagicMock() + mock_get_client.return_value.__enter__.return_value = mock_client + mock_path_join.return_value = "/fake/path/test_table.sql" + mock_client.command.side_effect = Exception("Test exception") + + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + with self.assertRaises(Exception): + sut.prepare_table() + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_add_to_batch(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + # Act + sut._add_to_batch("test_data") + + # Assert + mock_clickhouse_batch_sender_instance.add.assert_called_once_with("test_data") + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert(self, mock_clickhouse_batch_sender): + # Arrange + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + # Act + sut.insert("test_data") + + +class TestServerLogsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "server_logs" + expected_column_names = [ + "message_id", + "timestamp_in", + "message_text", + ] + + # Act + sut = ServerLogsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + message_text = "test_message_text" + message_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + timestamp_in = "2034-12-13 12:34:12.132412" + + sut = ServerLogsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + returned_value = sut.insert( + message_text=message_text, + message_id=message_id, + timestamp_in=timestamp_in, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + "test_message_text", + ] + ) + self.assertEqual( + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), returned_value + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + sut = ServerLogsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert("test_message_text") + + # Assert + mock_add_to_batch.assert_called_once() + + +class TestServerLogsTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "server_logs_timestamps" + expected_column_names = [ + "message_id", + "event", + "event_timestamp", + ] + + # Act + sut = ServerLogsTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + message_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + event = "test_event" + event_timestamp = "2034-12-13 12:34:12.132412" + + sut = ServerLogsTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + message_id=message_id, + event=event, + event_timestamp=event_timestamp, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "test_event", + datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + sut = ServerLogsTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "test_event", + ) + + # Assert + mock_add_to_batch.assert_called_once() + + +class TestFailedDNSLoglinesConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "failed_dns_loglines" + expected_column_names = [ + "message_text", + "timestamp_in", + "timestamp_failed", + "reason_for_failure", + ] + + # Act + sut = FailedDNSLoglinesConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + message_text = "test_message_text" + timestamp_in = "2034-12-13 12:34:12.132412" + timestamp_failed = "2034-12-13 12:35:35.542635" + reason_for_failure = "Wrong client_ip field" + + sut = FailedDNSLoglinesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + message_text=message_text, + timestamp_in=timestamp_in, + timestamp_failed=timestamp_failed, + reason_for_failure=reason_for_failure, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + "test_message_text", + datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + "Wrong client_ip field", + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + message_text = "test_message_text" + timestamp_in = "2034-12-13 12:34:12.132412" + + sut = FailedDNSLoglinesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + message_text=message_text, + timestamp_in=datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + ) + + # Assert + mock_add_to_batch.assert_called_once() + + +class TestLoglineToBatchesConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "logline_to_batches" + expected_column_names = [ + "logline_id", + "batch_id", + ] + + # Act + sut = LoglineToBatchesConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given_as_str(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + batch_id = "1f855c43-8a75-4b53-b6cd-4a13b89312d6" + + sut = LoglineToBatchesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + batch_id=batch_id, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6"), + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given_as_uuid(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + batch_id = uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6") + + sut = LoglineToBatchesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + batch_id=batch_id, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6"), + ] + ) + + +class TestDNSLoglinesConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "dns_loglines" + expected_column_names = [ + "logline_id", + "subnet_id", + "timestamp", + "status_code", + "client_ip", + "record_type", + "additional_fields", + ] + + # Act + sut = DNSLoglinesConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + subnet_id = "127.0.0.0_24" + timestamp = "2034-12-13 12:34:12.132412" + status_code = "NXDOMAIN" + client_ip = "127.0.0.1" + record_type = "A" + additional_fields = json.dumps(dict(test="some_field")) + + sut = DNSLoglinesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + returned_value = sut.insert( + subnet_id=subnet_id, + timestamp=timestamp, + status_code=status_code, + client_ip=client_ip, + record_type=record_type, + additional_fields=additional_fields, + ) + + # Assert + mock_add_to_batch.assert_called_once() + self.assertTrue(isinstance(returned_value, uuid.UUID)) + + +class TestLoglineStatusConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "logline_status" + expected_column_names = [ + "logline_id", + "status", + "exit_at_stage", + ] + + # Act + sut = LoglineStatusConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + status = "inactive" + exit_at_stage = "prefilter" + + sut = LoglineStatusConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + status=status, + exit_at_stage=exit_at_stage, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "inactive", + "prefilter", + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + status = "inactive" + + sut = LoglineStatusConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + status=status, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "inactive", + None, + ] + ) + + +class TestLoglineTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "logline_timestamps" + expected_column_names = [ + "logline_id", + "stage", + "status", + "timestamp", + ] + + # Act + sut = LoglineTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + stage = "prefilter" + status = "prefilter_out" + timestamp = "2034-12-13 12:35:35.542635" + + sut = LoglineTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + stage=stage, + status=status, + timestamp=timestamp, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "prefilter", + "prefilter_out", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + stage = "prefilter" + status = "prefilter_out" + + sut = LoglineTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + stage=stage, + status=status, + ) + + # Assert + mock_add_to_batch.assert_called_once() + + +class TestBatchStatusConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "batch_status" + expected_column_names = [ + "batch_id", + "status", + "exit_at_stage", + ] + + # Act + sut = BatchStatusConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + batch_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + status = "inactive" + exit_at_stage = "prefilter" + + sut = BatchStatusConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + batch_id=batch_id, + status=status, + exit_at_stage=exit_at_stage, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "inactive", + "prefilter", + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + status = "inactive" + + sut = BatchStatusConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + batch_id=batch_id, + status=status, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "inactive", + None, + ] + ) + + +class TestBatchTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "batch_timestamps" + expected_column_names = [ + "batch_id", + "stage", + "status", + "timestamp", + "message_count", + ] + + # Act + sut = BatchTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + batch_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + stage = "prefilter" + status = "prefilter_out" + timestamp = "2034-12-13 12:35:35.542635" + message_count = 456 + + sut = BatchTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + batch_id=batch_id, + stage=stage, + status=status, + timestamp=timestamp, + message_count=message_count, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "prefilter", + "prefilter_out", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + 456, + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + stage = "prefilter" + status = "prefilter_out" + message_count = 456 + + sut = BatchTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + batch_id=batch_id, + stage=stage, + status=status, + message_count=message_count, + ) + + # Assert + mock_add_to_batch.assert_called_once() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_clickhouse_kafka_sender.py b/tests/test_clickhouse_kafka_sender.py new file mode 100644 index 0000000..0aacd1b --- /dev/null +++ b/tests/test_clickhouse_kafka_sender.py @@ -0,0 +1,43 @@ +import unittest +from unittest.mock import patch + +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender + + +class TestInit(unittest.TestCase): + @patch("src.base.clickhouse_kafka_sender.SimpleKafkaProduceHandler") + def test_init(self, mock_produce_handler): + # Arrange + table_name = "test_table" + mock_produce_handler_instance = mock_produce_handler + mock_produce_handler.return_value = mock_produce_handler_instance + + # Act + sut = ClickHouseKafkaSender(table_name) + + # Assert + self.assertEqual(table_name, sut.table_name) + self.assertEqual(mock_produce_handler_instance, sut.kafka_producer) + mock_produce_handler.assert_called_once() + + +class TestInsert(unittest.TestCase): + @patch("src.base.clickhouse_kafka_sender.SimpleKafkaProduceHandler") + def test_insert(self, mock_produce_handler): + # Arrange + mock_produce_handler_instance = mock_produce_handler + mock_produce_handler.return_value = mock_produce_handler_instance + sut = ClickHouseKafkaSender("test_table") + + # Act + sut.insert({"test_key": "test_value"}) + + # Assert + mock_produce_handler_instance.produce.assert_called_once_with( + topic="clickhouse_test_table", + data='{"test_key": "test_value"}', + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_server.py b/tests/test_server.py index 4ff3098..a8b19c8 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -2,7 +2,6 @@ import os import tempfile import unittest -from ipaddress import IPv4Address, IPv6Address from unittest.mock import AsyncMock, MagicMock, patch import aiofiles @@ -10,260 +9,161 @@ from src.logserver.server import LogServer, main LOG_SERVER_IP_ADDR = "192.168.0.1" -LOG_SERVER_PORT_IN = 9998 -LOG_SERVER_PORT_OUT = 9999 class TestInit(unittest.TestCase): - @patch("src.logserver.server.HOSTNAME", "127.0.0.1") - @patch("src.logserver.server.PORT_IN", 7777) - @patch("src.logserver.server.PORT_OUT", 8888) - @patch("src.logserver.server.LISTEN_ON_TOPIC", "test_topic") - @patch("src.logserver.server.KafkaConsumeHandler") - def test_valid_init_ipv4(self, mock_kafka_consume_handler): + @patch("src.logserver.server.CONSUME_TOPIC", "test_topic") + @patch("src.logserver.server.ClickHouseKafkaSender") + @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + @patch("src.logserver.server.SimpleKafkaConsumeHandler") + def test_valid_init( + self, mock_kafka_consume_handler, mock_kafka_produce_handler, mock_server_logs + ): mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_produce_handler_instance = MagicMock() + mock_server_logs_instance = MagicMock() - sut = LogServer() - self.assertEqual(IPv4Address("127.0.0.1"), sut.host) - self.assertEqual(7777, sut.port_in) - self.assertEqual(8888, sut.port_out) - self.assertTrue(sut.data_queue.empty()) - self.assertEqual(0, sut.number_of_connections) - self.assertEqual(mock_kafka_consume_handler_instance, sut.kafka_consume_handler) - mock_kafka_consume_handler.assert_called_once_with(topic="test_topic") - - @patch("src.logserver.server.HOSTNAME", "fe80::1") - @patch("src.logserver.server.PORT_IN", 7777) - @patch("src.logserver.server.PORT_OUT", 8888) - @patch("src.logserver.server.LISTEN_ON_TOPIC", "test_topic") - @patch("src.logserver.server.KafkaConsumeHandler") - def test_valid_init_ipv6(self, mock_kafka_consume_handler): - mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_produce_handler.return_value = mock_kafka_produce_handler_instance mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_server_logs.return_value = mock_server_logs_instance sut = LogServer() - self.assertEqual(IPv6Address("fe80::1"), sut.host) - self.assertEqual(7777, sut.port_in) - self.assertEqual(8888, sut.port_out) - self.assertTrue(sut.data_queue.empty()) - self.assertEqual(0, sut.number_of_connections) self.assertEqual(mock_kafka_consume_handler_instance, sut.kafka_consume_handler) - mock_kafka_consume_handler.assert_called_once_with(topic="test_topic") - - @patch("src.logserver.server.HOSTNAME", "256.256.256.256") - @patch("src.logserver.server.PORT_IN", 7777) - @patch("src.logserver.server.PORT_OUT", 8888) - @patch("src.logserver.server.LISTEN_ON_TOPIC", "test_topic") - @patch("src.logserver.server.KafkaConsumeHandler") - def test_invalid_init_with_invalid_host(self, mock_kafka_consume_handler): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + self.assertEqual(mock_kafka_produce_handler_instance, sut.kafka_produce_handler) + self.assertEqual(mock_server_logs_instance, sut.server_logs) + mock_kafka_consume_handler.assert_called_once_with("test_topic") + mock_server_logs.assert_called_once_with("server_logs") - with self.assertRaises(ValueError): - LogServer() - mock_kafka_consume_handler.assert_not_called() - - -class TestOpen(unittest.IsolatedAsyncioTestCase): +class TestStart(unittest.IsolatedAsyncioTestCase): @patch("src.logserver.server.logger") - @patch("src.logserver.server.HOSTNAME", "127.0.0.1") - @patch("src.logserver.server.PORT_IN", 1234) - @patch("src.logserver.server.PORT_OUT", 5678) - @patch("src.logserver.server.LogServer.handle_kafka_inputs") - @patch("src.logserver.server.LogServer.async_follow") - @patch("src.logserver.server.KafkaConsumeHandler") - async def test_open( - self, mock_kafka_consume_handler, mock_follow, mock_handle_kafka, mock_logger + @patch("src.logserver.server.SimpleKafkaConsumeHandler") + @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + def setUp( + self, + mock_kafka_produce_handler, + mock_kafka_consume_handler, + mock_logger, ): - # Arrange - sut = LogServer() + self.sut = LogServer() - with patch("asyncio.start_server", new_callable=AsyncMock) as mock_start_server: - mock_send_server = MagicMock() - mock_receive_server = MagicMock() - - mock_start_server.side_effect = [mock_send_server, mock_receive_server] - - mock_send_server.serve_forever = AsyncMock() - mock_receive_server.serve_forever = AsyncMock() - mock_send_server.wait_closed = AsyncMock() - mock_receive_server.wait_closed = AsyncMock() - - # Act - await sut.open() - - # Assert - mock_start_server.assert_any_call( - sut.handle_send_logline, "127.0.0.1", 5678 - ) - mock_start_server.assert_any_call( - sut.handle_receive_logline, "127.0.0.1", 1234 - ) - mock_send_server.serve_forever.assert_awaited_once() - mock_receive_server.serve_forever.assert_awaited_once() - mock_send_server.close.assert_called_once() - mock_receive_server.close.assert_called_once() - mock_send_server.wait_closed.assert_awaited_once() - mock_receive_server.wait_closed.assert_awaited_once() - mock_handle_kafka.assert_called_once() - mock_follow.assert_called_once() + @patch("src.logserver.server.LogServer.fetch_from_kafka") + @patch("src.logserver.server.LogServer.fetch_from_file") + async def test_start( + self, + mock_fetch_from_file, + mock_fetch_from_kafka, + ): + # Act + await self.sut.start() - @patch("src.logserver.server.logger") - @patch("src.logserver.server.HOSTNAME", "127.0.0.1") - @patch("src.logserver.server.PORT_IN", 1234) - @patch("src.logserver.server.PORT_OUT", 5678) - async def test_open_keyboard_interrupt(self, mock_logger): + # Assert + mock_fetch_from_kafka.assert_called_once() + mock_fetch_from_file.assert_called_once() + + # TODO: Update + # @patch("src.logserver.server.LogServer.fetch_from_kafka") + # @patch("src.logserver.server.LogServer.fetch_from_file") + # async def test_start_handles_keyboard_interrupt( + # self, + # mock_fetch_from_file, + # mock_fetch_from_kafka, + # ): + # # Arrange + # async def mock_gather(*args, **kwargs): + # raise KeyboardInterrupt + # + # with (patch('src.logserver.server.asyncio.gather', side_effect=mock_gather) as mock): + # # Act + # await self.sut.start() + # + # # Assert + # mock.assert_called_once() + # mock_fetch_from_kafka.assert_called_once() + # mock_fetch_from_file.assert_called_once() + + +class TestSend(unittest.TestCase): + @patch("src.logserver.server.PRODUCE_TOPIC", "test_topic") + @patch("src.logserver.server.ClickHouseKafkaSender") + @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + def test_send( + self, + mock_produce_handler, + mock_server_logs, + ): # Arrange - sut = LogServer() - - with patch("asyncio.start_server", new_callable=AsyncMock) as mock_start_server: - mock_send_server = MagicMock() - mock_receive_server = MagicMock() - - mock_start_server.side_effect = [mock_send_server, mock_receive_server] - - mock_send_server.serve_forever.side_effect = KeyboardInterrupt - mock_receive_server.serve_forever = AsyncMock() - mock_send_server.wait_closed = AsyncMock() - mock_receive_server.wait_closed = AsyncMock() - - # Act & Assert - await sut.open() - - # Additional Assertions - mock_send_server.close.assert_called_once() - mock_receive_server.close.assert_called_once() - mock_send_server.wait_closed.assert_awaited_once() - mock_receive_server.wait_closed.assert_awaited_once() - - -class TestHandleConnection(unittest.IsolatedAsyncioTestCase): - async def test_handle_connection_sending(self): - server_instance = LogServer() - server_instance.send_logline = AsyncMock() - server_instance.get_next_logline = MagicMock(return_value="test logline") - - reader = AsyncMock() - writer = AsyncMock() - writer.get_extra_info = MagicMock(return_value="test_address") + mock_server_logs_instance = MagicMock() + mock_server_logs.return_value = mock_server_logs_instance + mock_kafka_produce_handler_instance = MagicMock() + mock_produce_handler.return_value = mock_kafka_produce_handler_instance - await server_instance.handle_connection(reader, writer, sending=True) - - server_instance.send_logline.assert_awaited_once_with(writer, "test logline") - writer.close.assert_called_once() - writer.wait_closed.assert_awaited_once() - self.assertEqual(0, server_instance.number_of_connections) - - async def test_handle_connection_receiving(self): - server_instance = LogServer() - server_instance.receive_logline = AsyncMock() - - reader = AsyncMock() - writer = AsyncMock() - writer.get_extra_info = MagicMock(return_value="test_address") - - await server_instance.handle_connection(reader, writer, sending=False) - - server_instance.receive_logline.assert_awaited_once_with(reader) - writer.close.assert_called_once() - writer.wait_closed.assert_awaited_once() - self.assertEqual(0, server_instance.number_of_connections) - - async def test_handle_connection_rejected(self): - server_instance = LogServer() - server_instance.number_of_connections = 5 - - reader = AsyncMock() - writer = AsyncMock() - writer.get_extra_info = MagicMock(return_value="test_address") - - await server_instance.handle_connection(reader, writer, sending=True) - - writer.close.assert_called_once() - writer.wait_closed.assert_awaited_once() - self.assertEqual(5, server_instance.number_of_connections) - - async def test_handle_connection_increases_and_decreases_connections(self): - server_instance = LogServer() - server_instance.send_logline = AsyncMock() - server_instance.get_next_logline = MagicMock(return_value="test logline") - server_instance.number_of_connections = 3 - - reader = AsyncMock() - writer = AsyncMock() - writer.get_extra_info = MagicMock(return_value="test_address") - - await server_instance.handle_connection(reader, writer, sending=True) - - self.assertEqual(3, server_instance.number_of_connections) - - async def test_handle_connection_cancelled_error(self): - server_instance = LogServer() - server_instance.send_logline = AsyncMock(side_effect=asyncio.CancelledError) - server_instance.get_next_logline = MagicMock(return_value="test logline") + message = "test_message" + sut = LogServer() - reader = AsyncMock() - writer = AsyncMock() - writer.get_extra_info = MagicMock(return_value="test_address") + # Act + sut.send(message) - await server_instance.handle_connection(reader, writer, sending=True) + # Assert + mock_kafka_produce_handler_instance.produce.assert_called_once_with( + topic="test_topic", + data=message, + ) + mock_server_logs_instance.insert.assert_called_once() - server_instance.send_logline.assert_awaited_once_with(writer, "test logline") - writer.close.assert_called_once() - writer.wait_closed.assert_awaited_once() - self.assertEqual(0, server_instance.number_of_connections) +class TestFetchFromKafka(unittest.IsolatedAsyncioTestCase): + @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + @patch("src.logserver.server.SimpleKafkaConsumeHandler") + @patch("src.logserver.server.LogServer.send") @patch("src.logserver.server.logger") - @patch("src.logserver.server.MAX_NUMBER_OF_CONNECTIONS", 7) - async def test_handle_connection_rejects_additional_connections(self, mock_logger): - server_instance = LogServer() - server_instance.number_of_connections = 7 - - reader = AsyncMock() - writer = AsyncMock() - writer.get_extra_info = MagicMock(return_value="test_address") - - await server_instance.handle_connection(reader, writer, sending=True) - - writer.close.assert_called_once() - writer.wait_closed.assert_awaited_once() - self.assertEqual(7, server_instance.number_of_connections) - - -class TestHandleKafkaInputs(unittest.IsolatedAsyncioTestCase): - async def asyncSetUp(self): + @patch("asyncio.get_running_loop") + async def test_handle_kafka_inputs( + self, + mock_get_running_loop, + mock_logger, + mock_send, + mock_kafka_consume, + mock_kafka_produce, + ): self.sut = LogServer() - self.sut.kafka_consume_handler = AsyncMock() - self.sut.data_queue = MagicMock() - @patch("src.logserver.server.logger") - @patch("asyncio.get_running_loop") - async def test_handle_kafka_inputs(self, mock_get_running_loop, mock_logger): + mock_send_instance = AsyncMock() + mock_send.return_value = mock_send_instance mock_loop = AsyncMock() mock_get_running_loop.return_value = mock_loop - self.sut.kafka_consume_handler.consume.return_value = ("key1", "value1") + self.sut.kafka_consume_handler.consume.return_value = ( + "key1", + "value1", + "topic1", + ) mock_loop.run_in_executor.side_effect = [ - ("key1", "value1"), + ("key1", "value1", "topic1"), asyncio.CancelledError(), ] with self.assertRaises(asyncio.CancelledError): - await self.sut.handle_kafka_inputs() + await self.sut.fetch_from_kafka() - self.sut.data_queue.put.assert_called_once_with("value1") + mock_send.assert_called_once_with("value1") -class TestAsyncFollow(unittest.IsolatedAsyncioTestCase): - def setUp(self): - self.sut = LogServer() - self.sut.kafka_consume_handler = AsyncMock() - self.sut.data_queue = MagicMock() +class TestFetchFromFile(unittest.IsolatedAsyncioTestCase): + @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + @patch("src.logserver.server.SimpleKafkaConsumeHandler") + @patch("src.logserver.server.PRODUCE_TOPIC", "test_topic") + @patch("src.logserver.server.LogServer.send") @patch("src.logserver.server.logger") - async def test_async_follow(self, mock_logger): + async def test_fetch_from_file( + self, mock_logger, mock_send, mock_kafka_consume, mock_kafka_produce + ): + self.sut = LogServer() + + mock_send_instance = AsyncMock() + mock_send.return_value = mock_send_instance + with tempfile.NamedTemporaryFile( delete=False, mode="w+", newline="" ) as temp_file: @@ -272,7 +172,7 @@ async def test_async_follow(self, mock_logger): temp_file.flush() try: - task = asyncio.create_task(self.sut.async_follow(temp_file_path)) + task = asyncio.create_task(self.sut.fetch_from_file(temp_file_path)) await asyncio.sleep(0.2) @@ -289,151 +189,25 @@ async def test_async_follow(self, mock_logger): finally: os.remove(temp_file_path) - self.sut.data_queue.put.assert_any_call("Test line 3") - self.sut.data_queue.put.assert_any_call("Test line 4") - - -class TestHandleSendLogline(unittest.IsolatedAsyncioTestCase): - async def test_handle_send_logline(self): - server_instance = LogServer() - server_instance.handle_connection = AsyncMock() - - reader = AsyncMock() - writer = AsyncMock() - - await server_instance.handle_send_logline(reader, writer) - - server_instance.handle_connection.assert_awaited_once_with(reader, writer, True) - - -class TestHandleReceiveLogline(unittest.IsolatedAsyncioTestCase): - async def test_handle_receive_logline(self): - server_instance = LogServer() - server_instance.handle_connection = AsyncMock() - - reader = AsyncMock() - writer = AsyncMock() - - await server_instance.handle_receive_logline(reader, writer) - - server_instance.handle_connection.assert_awaited_once_with( - reader, writer, False - ) - - -class TestSendLogline(unittest.IsolatedAsyncioTestCase): - @patch("src.logserver.server.logger") - async def test_send_logline_with_logline(self, mock_logger): - server_instance = LogServer() - writer = AsyncMock() - logline = "Test logline" - - await server_instance.send_logline(writer, logline) - - writer.write.assert_called_once_with(logline.encode("utf-8")) - writer.drain.assert_called_once() - - async def test_send_logline_no_logline(self): - server_instance = LogServer() - writer = AsyncMock() - logline = "" - - await server_instance.send_logline(writer, logline) - - writer.write.assert_not_called() - writer.drain.assert_not_called() - - -class TestReceiveLogline(unittest.IsolatedAsyncioTestCase): - @patch("src.logserver.server.logger") - async def test_receive_logline(self, mock_logger): - reader = AsyncMock() - data_queue = MagicMock() - server_instance = LogServer() - server_instance.data_queue = data_queue - - reader.readuntil = AsyncMock( - side_effect=[b"Test message 1\n", b"Test message 2\n", b""] - ) - - receive_task = asyncio.create_task(server_instance.receive_logline(reader)) - await receive_task - - data_queue.put.assert_any_call("Test message 1") - data_queue.put.assert_any_call("Test message 2") - - self.assertEqual(data_queue.put.call_count, 2) - - @patch("src.logserver.server.logger") - async def test_receive_without_separator(self, mock_logger): - reader = AsyncMock() - data_queue = MagicMock() - server_instance = LogServer() - server_instance.data_queue = data_queue - - reader.readuntil = AsyncMock( - side_effect=asyncio.exceptions.IncompleteReadError(b"", 100) - ) - - # noinspection PyAsyncCall - asyncio.create_task(server_instance.receive_logline(reader)) - - @patch("src.logserver.server.logger") - async def test_receive_too_long(self, mock_logger): - reader = AsyncMock() - data_queue = MagicMock() - server_instance = LogServer() - server_instance.data_queue = data_queue - - reader.readuntil = AsyncMock(side_effect=asyncio.LimitOverrunError("", 1)) - - # noinspection PyAsyncCall - asyncio.create_task(server_instance.receive_logline(reader)) - - @patch("src.logserver.server.logger") - async def test_receive_raise_other_exception(self, mock_logger): - reader = AsyncMock() - data_queue = MagicMock() - server_instance = LogServer() - server_instance.data_queue = data_queue - - reader.readuntil = AsyncMock(side_effect=ValueError("Something went wrong")) - - with self.assertRaises(ValueError): - task = asyncio.create_task(server_instance.receive_logline(reader)) - await task - - -class TestGetNextLogline(unittest.TestCase): - def test_valid(self): - server_instance = LogServer() - server_instance.data_queue.put("Element 1") - server_instance.data_queue.put("Element 2") - - self.assertEqual("Element 1", server_instance.get_next_logline()) - self.assertEqual("Element 2", server_instance.get_next_logline()) - - def test_valid_from_empty_queue(self): - server_instance = LogServer() - self.assertIsNone(server_instance.get_next_logline()) + mock_send.assert_any_call("Test line 3") + mock_send.assert_any_call("Test line 4") -class TestMainFunction(unittest.TestCase): +class TestMain(unittest.TestCase): @patch("src.logserver.server.logger") - @patch("src.logserver.server.asyncio.run") @patch("src.logserver.server.LogServer") - def test_main(self, mock_log_server_class, mock_asyncio_run, mock_logger): + @patch("asyncio.run") + def test_main(self, mock_asyncio_run, mock_instance, mock_logger): # Arrange - mock_server_instance = MagicMock() - mock_log_server_class.return_value = mock_server_instance + mock_instance_obj = MagicMock() + mock_instance.return_value = mock_instance_obj # Act main() # Assert - mock_log_server_class.assert_called_once() - mock_server_instance.open.assert_called_once() - mock_asyncio_run.assert_called_once_with(mock_server_instance.open()) + mock_instance.assert_called_once() + mock_asyncio_run.assert_called_once_with(mock_instance_obj.start()) if __name__ == "__main__": From 1e25801b90905817ec092412d529c78f19ba4c9c Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 28 Nov 2024 11:45:22 +0100 Subject: [PATCH 02/59] Make LogServer write to ClickHouse --- src/logserver/server.py | 41 ++++++++++++++++++++++++++++++++++++++--- tests/test_server.py | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/src/logserver/server.py b/src/logserver/server.py index 59fc9e7..19d9ce9 100644 --- a/src/logserver/server.py +++ b/src/logserver/server.py @@ -1,6 +1,8 @@ import asyncio +import datetime import os import sys +import uuid import aiofiles @@ -9,6 +11,7 @@ SimpleKafkaConsumeHandler, ExactlyOnceKafkaProduceHandler, ) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.utils import generate_unique_transactional_id from src.base.utils import setup_config from src.base.log_config import get_logger @@ -42,6 +45,10 @@ def __init__(self) -> None: self.kafka_consume_handler = SimpleKafkaConsumeHandler(CONSUME_TOPIC) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) + # databases + self.server_logs = ClickHouseKafkaSender("server_logs") + self.server_logs_timestamps = ClickHouseKafkaSender("server_logs_timestamps") + async def start(self) -> None: """ Starts fetching messages from Kafka and from the input file. @@ -68,16 +75,25 @@ async def start(self) -> None: logger.info("LogServer stopped.") - def send(self, message: str) -> None: + def send(self, message_id: uuid.UUID, message: str) -> None: """ Sends a received message using Kafka. Args: + message_id (uuid.UUID): UUID of the message message (str): Message to be sent """ self.kafka_produce_handler.produce(topic=PRODUCE_TOPIC, data=message) logger.debug(f"Sent: '{message}'") + self.server_logs_timestamps.insert( + dict( + message_id=message_id, + event="timestamp_out", + event_timestamp=datetime.datetime.now(), + ) + ) + async def fetch_from_kafka(self) -> None: """ Starts a loop to continuously listen on the configured Kafka topic. If a message is consumed, it is sent. @@ -90,7 +106,16 @@ async def fetch_from_kafka(self) -> None: ) logger.debug(f"From Kafka: '{value}'") - self.send(value) + message_id = uuid.uuid4() + self.server_logs.insert( + dict( + message_id=message_id, + timestamp_in=datetime.datetime.now(), + message_text=value, + ) + ) + + self.send(message_id, value) async def fetch_from_file(self, file: str = READ_FROM_FILE) -> None: """ @@ -117,7 +142,17 @@ async def fetch_from_file(self, file: str = READ_FROM_FILE) -> None: continue logger.debug(f"From file: '{cleaned_line}'") - self.send(cleaned_line) + + message_id = uuid.uuid4() + self.server_logs.insert( + dict( + message_id=message_id, + timestamp_in=datetime.datetime.now(), + message_text=cleaned_line, + ) + ) + + self.send(message_id, cleaned_line) def main() -> None: diff --git a/tests/test_server.py b/tests/test_server.py index 06ee245..8f29c54 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -2,7 +2,9 @@ import os import tempfile import unittest +import uuid from unittest.mock import AsyncMock, MagicMock, patch +from uuid import UUID import aiofiles @@ -15,7 +17,10 @@ class TestInit(unittest.TestCase): @patch("src.logserver.server.CONSUME_TOPIC", "test_topic") @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") @patch("src.logserver.server.SimpleKafkaConsumeHandler") - def test_valid_init(self, mock_kafka_consume_handler, mock_kafka_produce_handler): + @patch("src.logserver.server.ClickHouseKafkaSender") + def test_valid_init( + self, mock_clickhouse, mock_kafka_consume_handler, mock_kafka_produce_handler + ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_produce_handler_instance = MagicMock() @@ -32,8 +37,10 @@ class TestStart(unittest.IsolatedAsyncioTestCase): @patch("src.logserver.server.logger") @patch("src.logserver.server.SimpleKafkaConsumeHandler") @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + @patch("src.logserver.server.ClickHouseKafkaSender") def setUp( self, + mock_clickhouse, mock_kafka_produce_handler, mock_kafka_consume_handler, mock_logger, @@ -42,8 +49,10 @@ def setUp( @patch("src.logserver.server.LogServer.fetch_from_kafka") @patch("src.logserver.server.LogServer.fetch_from_file") + @patch("src.logserver.server.ClickHouseKafkaSender") async def test_start( self, + mock_clickhouse, mock_fetch_from_file, mock_fetch_from_kafka, ): @@ -56,8 +65,10 @@ async def test_start( @patch("src.logserver.server.LogServer.fetch_from_kafka") @patch("src.logserver.server.LogServer.fetch_from_file") + @patch("src.logserver.server.ClickHouseKafkaSender") async def test_start_handles_keyboard_interrupt( self, + mock_clickhouse, mock_fetch_from_file, mock_fetch_from_kafka, ): @@ -80,8 +91,10 @@ async def mock_gather(*args, **kwargs): class TestSend(unittest.TestCase): @patch("src.logserver.server.PRODUCE_TOPIC", "test_topic") @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + @patch("src.logserver.server.ClickHouseKafkaSender") def test_send( self, + mock_clickhouse, mock_produce_handler, ): # Arrange @@ -92,7 +105,7 @@ def test_send( sut = LogServer() # Act - sut.send(message) + sut.send(uuid.uuid4(), message) # Assert mock_kafka_produce_handler_instance.produce.assert_called_once_with( @@ -107,8 +120,12 @@ class TestFetchFromKafka(unittest.IsolatedAsyncioTestCase): @patch("src.logserver.server.LogServer.send") @patch("src.logserver.server.logger") @patch("asyncio.get_running_loop") + @patch("src.logserver.server.ClickHouseKafkaSender") + @patch("src.logserver.server.uuid") async def test_handle_kafka_inputs( self, + mock_uuid, + mock_clickhouse, mock_get_running_loop, mock_logger, mock_send, @@ -117,6 +134,9 @@ async def test_handle_kafka_inputs( ): self.sut = LogServer() + mock_uuid_instance = MagicMock() + mock_uuid.return_value = mock_uuid_instance + mock_uuid.uuid4.return_value = UUID("bd72ccb4-0ef2-4100-aa22-e787122d6875") mock_send_instance = AsyncMock() mock_send.return_value = mock_send_instance mock_loop = AsyncMock() @@ -135,7 +155,9 @@ async def test_handle_kafka_inputs( with self.assertRaises(asyncio.CancelledError): await self.sut.fetch_from_kafka() - mock_send.assert_called_once_with("value1") + mock_send.assert_called_once_with( + UUID("bd72ccb4-0ef2-4100-aa22-e787122d6875"), "value1" + ) class TestFetchFromFile(unittest.IsolatedAsyncioTestCase): @@ -145,8 +167,14 @@ class TestFetchFromFile(unittest.IsolatedAsyncioTestCase): @patch("src.logserver.server.PRODUCE_TOPIC", "test_topic") @patch("src.logserver.server.LogServer.send") @patch("src.logserver.server.logger") + @patch("src.logserver.server.ClickHouseKafkaSender") async def test_fetch_from_file( - self, mock_logger, mock_send, mock_kafka_consume, mock_kafka_produce + self, + mock_clickhouse, + mock_logger, + mock_send, + mock_kafka_consume, + mock_kafka_produce, ): self.sut = LogServer() @@ -178,8 +206,7 @@ async def test_fetch_from_file( finally: os.remove(temp_file_path) - mock_send.assert_any_call("Test line 3") - mock_send.assert_any_call("Test line 4") + self.assertEqual(2, mock_send.call_count) class TestMain(unittest.TestCase): From 2986ea935c670ff1408e803f1c9e78c3e70072f8 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 28 Nov 2024 11:45:35 +0100 Subject: [PATCH 03/59] Add requirements.monitoring.txt --- requirements/requirements.monitoring.txt | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 requirements/requirements.monitoring.txt diff --git a/requirements/requirements.monitoring.txt b/requirements/requirements.monitoring.txt new file mode 100644 index 0000000..a2e9d98 --- /dev/null +++ b/requirements/requirements.monitoring.txt @@ -0,0 +1,5 @@ +clickhouse_connect~=0.8.3 +confluent-kafka~=2.4.0 +marshmallow_dataclass~=8.7.1 +colorlog~=6.8.2 +PyYAML~=6.0.1 From 04199b1f8eaa49163b010fffff7005caa12d8f8f Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 28 Nov 2024 11:46:28 +0100 Subject: [PATCH 04/59] Make Collector write to ClickHouse --- src/logcollector/collector.py | 22 ++++-- tests/test_collector.py | 132 +++++++++++++++++++++++++++------- 2 files changed, 124 insertions(+), 30 deletions(-) diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 5af98b7..265ce53 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -1,10 +1,12 @@ import asyncio +import datetime import ipaddress import json import os import sys sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.kafka_handler import ExactlyOnceKafkaConsumeHandler from src.base.logline_handler import LoglineHandler from src.base import utils @@ -38,6 +40,9 @@ def __init__(self) -> None: self.logline_handler = LoglineHandler() self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) + # databases + self.failed_dns_loglines = ClickHouseKafkaSender("failed_dns_loglines") + async def start(self) -> None: """ Starts fetching messages from Kafka and sending them to the :class:`Prefilter`. @@ -74,7 +79,7 @@ async def fetch(self) -> None: ) logger.debug(f"From Kafka: '{value}'") - await self.store(value) + await self.store(datetime.datetime.now(), value) async def send(self) -> None: """ @@ -85,12 +90,20 @@ async def send(self) -> None: try: while True: if not self.loglines.empty(): - logline = await self.loglines.get() + timestamp_in, logline = await self.loglines.get() try: fields = self.logline_handler.validate_logline_and_get_fields_as_json( logline ) except ValueError: + self.failed_dns_loglines.insert( + dict( + message_text=logline, + timestamp_in=timestamp_in, + timestamp_failed=datetime.datetime.now(), + ) + ) + continue subnet_id = self.get_subnet_id( @@ -115,14 +128,15 @@ async def send(self) -> None: logger.info("Stopped LogCollector.") - async def store(self, message: str): + async def store(self, timestamp_in: datetime.datetime, message: str): """ Stores the given message temporarily. Args: + timestamp_in (datetime.datetime): Timestamp of entering the pipeline message (str): Message to be stored """ - await self.loglines.put(message) + await self.loglines.put((timestamp_in, message)) @staticmethod def get_subnet_id(address: ipaddress.IPv4Address | ipaddress.IPv6Address) -> str: diff --git a/tests/test_collector.py b/tests/test_collector.py index 033a4ce..5192256 100644 --- a/tests/test_collector.py +++ b/tests/test_collector.py @@ -1,4 +1,5 @@ import asyncio +import datetime import ipaddress import unittest from unittest.mock import MagicMock, patch, AsyncMock @@ -11,8 +12,13 @@ class TestInit(unittest.TestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_valid_init( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): mock_batch_handler_instance = MagicMock() mock_logline_handler_instance = MagicMock() @@ -37,8 +43,10 @@ class TestStart(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def setUp( self, + mock_clickhouse, mock_logline_handler, mock_batch_handler, mock_kafka_consume_handler, @@ -89,8 +97,13 @@ class TestFetch(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.LoglineHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def asyncSetUp( - self, mock_kafka_handler, mock_batch_sender, mock_logline_handler + self, + mock_clickhouse, + mock_kafka_handler, + mock_batch_sender, + mock_logline_handler, ): self.sut = LogCollector() self.sut.kafka_consume_handler = AsyncMock() @@ -98,8 +111,9 @@ async def asyncSetUp( @patch("src.logcollector.collector.LogCollector.store") @patch("src.logcollector.collector.logger") @patch("asyncio.get_running_loop") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_handle_kafka_inputs( - self, mock_get_running_loop, mock_logger, mock_store + self, mock_clickhouse, mock_get_running_loop, mock_logger, mock_store ): mock_store_instance = AsyncMock() mock_store.return_value = mock_store_instance @@ -119,7 +133,7 @@ async def test_handle_kafka_inputs( with self.assertRaises(asyncio.CancelledError): await self.sut.fetch() - mock_store.assert_called_once_with("value1") + mock_store.assert_called_once() class TestSend(unittest.IsolatedAsyncioTestCase): @@ -128,8 +142,14 @@ class TestSend(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_with_one_logline( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler, mock_logger + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, + mock_logger, ): # Arrange mock_batch_handler_instance = MagicMock() @@ -162,7 +182,7 @@ async def test_send_with_one_logline( ) sut = LogCollector() - await sut.store(input_logline) + await sut.store(datetime.datetime.now(), input_logline) # Act await sut.send() @@ -176,8 +196,14 @@ async def test_send_with_one_logline( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_keyboard_interrupt( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler, mock_logger + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, + mock_logger, ): # Arrange mock_batch_handler_instance = MagicMock() @@ -207,10 +233,10 @@ async def test_send_keyboard_interrupt( ) sut = LogCollector() - await sut.store(input_logline) - await sut.store(input_logline) - await sut.store(input_logline) - await sut.store(input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) # Act await sut.send() @@ -224,8 +250,10 @@ async def test_send_keyboard_interrupt( @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @patch("src.logcollector.collector.asyncio.Queue") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_empty( self, + mock_clickhouse, mock_queue, mock_logline_handler, mock_batch_handler, @@ -253,8 +281,14 @@ async def test_send_empty( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_value_error( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler, mock_logger + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, + mock_logger, ): # Arrange mock_batch_handler_instance = MagicMock() @@ -288,9 +322,9 @@ async def test_send_value_error( ) sut = LogCollector() - await sut.store(input_logline) - await sut.store(input_logline) - await sut.store(input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) # Act await sut.send() @@ -304,18 +338,24 @@ class TestStore(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_store( - self, mock_logline_handler, mock_batch_handler, mock_kafka_consume_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_consume_handler, ): # Arrange sut = LogCollector() self.assertTrue(sut.loglines.empty()) # Act - await sut.store("test_message") + await sut.store(datetime.datetime.now(), "test_message") # Assert - self.assertEqual("test_message", await sut.loglines.get()) + stored_timestamp, stored_message = await sut.loglines.get() + self.assertEqual("test_message", stored_message) self.assertTrue(sut.loglines.empty()) @@ -324,8 +364,13 @@ class TestGetSubnetId(unittest.TestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv4( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv4Address("192.168.1.1") @@ -342,8 +387,13 @@ def test_get_subnet_id_ipv4( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv4_zero( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv4Address("0.0.0.0") @@ -360,8 +410,13 @@ def test_get_subnet_id_ipv4_zero( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv4_max( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv4Address("255.255.255.255") @@ -378,8 +433,13 @@ def test_get_subnet_id_ipv4_max( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv6( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv6Address("2001:db8:85a3:1234:5678:8a2e:0370:7334") @@ -396,8 +456,13 @@ def test_get_subnet_id_ipv6( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv6_zero( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv6Address("::") @@ -414,8 +479,13 @@ def test_get_subnet_id_ipv6_zero( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv6_max( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv6Address("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff") @@ -433,8 +503,13 @@ def test_get_subnet_id_ipv6_max( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_unsupported_type( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = "192.168.1.1" # String instead of IPv4Address or IPv6Address @@ -450,8 +525,13 @@ def test_get_subnet_id_unsupported_type( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_none( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = None From 150fe7d76a5abda17fb86cf431cc53909670ec12 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sat, 30 Nov 2024 15:16:08 +0100 Subject: [PATCH 05/59] Add forbidden field names to logline_handler.py --- src/base/logline_handler.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/base/logline_handler.py b/src/base/logline_handler.py index e3d97d5..c0199a5 100644 --- a/src/base/logline_handler.py +++ b/src/base/logline_handler.py @@ -8,6 +8,10 @@ CONFIG = setup_config() LOGLINE_FIELDS = CONFIG["pipeline"]["log_collection"]["collector"]["logline_format"] REQUIRED_FIELDS = ["timestamp", "status_code", "client_ip", "record_type"] +FORBIDDEN_FIELD_NAMES = [ + "logline_id", + "batch_id", +] # field names that are used internally class FieldType: @@ -139,6 +143,12 @@ def __init__(self): for field in LOGLINE_FIELDS: instance = self._create_instance_from_list_entry(field) + if instance.name in FORBIDDEN_FIELD_NAMES: + raise ValueError( + f"Forbidden field name included. These fields are used internally " + f"and cannot be used as names: {FORBIDDEN_FIELD_NAMES}" + ) + if self.instances_by_name.get(instance.name): raise ValueError("Multiple fields with same name") From a501ba231f45aa9d50d1eeb3d2852330bb70bc76 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sat, 30 Nov 2024 15:16:52 +0100 Subject: [PATCH 06/59] Fix dns_loglines insertion --- src/monitoring/clickhouse_connector.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index cb4d2c9..a883d25 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -189,6 +189,7 @@ def __init__(self): def insert( self, + logline_id: str | uuid.UUID, subnet_id: str, timestamp: str | datetime.datetime, status_code: str, @@ -196,7 +197,8 @@ def insert( record_type: str, additional_fields: str | None = None, ) -> uuid.UUID: - logline_id = uuid.uuid4() + if isinstance(logline_id, str): + logline_id = uuid.UUID(logline_id) self._add_to_batch( [ From 63746d788a6862850e2c62b7796517f3117cd5ff Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sat, 30 Nov 2024 15:18:44 +0100 Subject: [PATCH 07/59] Add dns_loglines insertion in collector.py --- src/logcollector/collector.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 265ce53..856fbbc 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -4,6 +4,7 @@ import json import os import sys +import uuid sys.path.append(os.getcwd()) from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender @@ -13,7 +14,8 @@ from src.logcollector.batch_handler import BufferedBatchSender from src.base.log_config import get_logger -logger = get_logger("log_collection.collector") +module_name = "log_collection.collector" +logger = get_logger(module_name) config = utils.setup_config() IPV4_PREFIX_LENGTH = config["pipeline"]["log_collection"]["batch_handler"]["subnet_id"][ @@ -22,6 +24,7 @@ IPV6_PREFIX_LENGTH = config["pipeline"]["log_collection"]["batch_handler"]["subnet_id"][ "ipv6_prefix_length" ] +REQUIRED_FIELDS = ["timestamp", "status_code", "client_ip", "record_type"] BATCH_SIZE = config["pipeline"]["log_collection"]["batch_handler"]["batch_size"] CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ "logserver_to_collector" @@ -42,6 +45,7 @@ def __init__(self) -> None: # databases self.failed_dns_loglines = ClickHouseKafkaSender("failed_dns_loglines") + self.dns_loglines = ClickHouseKafkaSender("dns_loglines") async def start(self) -> None: """ @@ -91,6 +95,7 @@ async def send(self) -> None: while True: if not self.loglines.empty(): timestamp_in, logline = await self.loglines.get() + try: fields = self.logline_handler.validate_logline_and_get_fields_as_json( logline @@ -103,7 +108,6 @@ async def send(self) -> None: timestamp_failed=datetime.datetime.now(), ) ) - continue subnet_id = self.get_subnet_id( @@ -111,6 +115,30 @@ async def send(self) -> None: ) self.batch_handler.add_message(subnet_id, json.dumps(fields)) + additional_fields = fields.copy() + for field in REQUIRED_FIELDS: + additional_fields.pop(field) + + logline_id = uuid.uuid4() + + self.dns_loglines.insert( + dict( + logline_id=logline_id, + subnet_id=subnet_id, + timestamp=fields.get("timestamp"), + status_code=fields.get("status_code"), + client_ip=fields.get("client_ip"), + record_type=fields.get("record_type"), + additional_fields=additional_fields, + ) + ) + + message_fields = fields.copy() + message_fields["logline_id"] = logline_id + + self.batch_handler.add_message( + subnet_id, json.dumps(message_fields) + ) logger.debug(f"Sent: '{logline}'") else: await asyncio.sleep(0.1) From 393cc01e6459b48b3fa7a718b1f1c8a3f380906c Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sat, 30 Nov 2024 15:19:34 +0100 Subject: [PATCH 08/59] Add logline_status insertion in collector.py --- src/logcollector/collector.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 856fbbc..f0bd52b 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -46,6 +46,7 @@ def __init__(self) -> None: # databases self.failed_dns_loglines = ClickHouseKafkaSender("failed_dns_loglines") self.dns_loglines = ClickHouseKafkaSender("dns_loglines") + self.logline_status = ClickHouseKafkaSender("logline_status") async def start(self) -> None: """ @@ -114,7 +115,6 @@ async def send(self) -> None: ipaddress.ip_address(fields.get("client_ip")) ) - self.batch_handler.add_message(subnet_id, json.dumps(fields)) additional_fields = fields.copy() for field in REQUIRED_FIELDS: additional_fields.pop(field) @@ -133,6 +133,13 @@ async def send(self) -> None: ) ) + self.logline_status.insert( + dict( + logline_id=logline_id, + status="active", + ) + ) + message_fields = fields.copy() message_fields["logline_id"] = logline_id From db16775b3238f9c97fefe4faa2e02a5d71a1729e Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sat, 30 Nov 2024 15:19:49 +0100 Subject: [PATCH 09/59] Add logline_timestamps insertion in collector.py --- src/logcollector/collector.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index f0bd52b..f2db474 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -47,6 +47,7 @@ def __init__(self) -> None: self.failed_dns_loglines = ClickHouseKafkaSender("failed_dns_loglines") self.dns_loglines = ClickHouseKafkaSender("dns_loglines") self.logline_status = ClickHouseKafkaSender("logline_status") + self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") async def start(self) -> None: """ @@ -140,6 +141,15 @@ async def send(self) -> None: ) ) + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="in_process", + timestamp=timestamp_in, + ) + ) + message_fields = fields.copy() message_fields["logline_id"] = logline_id From 61776ffd26c39236d716eca2c1252c197b770f8b Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sat, 30 Nov 2024 15:21:13 +0100 Subject: [PATCH 10/59] Add development query script --- docker/dev-query.py | 10 +++++++ docker/docker-compose.dev-query.yml | 40 +++++++++++++++++++++++++ docker/dockerfiles/Dockerfile.dev-query | 14 +++++++++ 3 files changed, 64 insertions(+) create mode 100644 docker/dev-query.py create mode 100644 docker/docker-compose.dev-query.yml create mode 100644 docker/dockerfiles/Dockerfile.dev-query diff --git a/docker/dev-query.py b/docker/dev-query.py new file mode 100644 index 0000000..f816f88 --- /dev/null +++ b/docker/dev-query.py @@ -0,0 +1,10 @@ +import clickhouse_connect + +QUERY_TABLE = "dns_loglines" + +client = clickhouse_connect.get_client(host="172.27.0.11", port=8123) + +result = client.query(f"SELECT * FROM {QUERY_TABLE};") + +for row in result.result_rows: + print(row) diff --git a/docker/docker-compose.dev-query.yml b/docker/docker-compose.dev-query.yml new file mode 100644 index 0000000..7bbbce0 --- /dev/null +++ b/docker/docker-compose.dev-query.yml @@ -0,0 +1,40 @@ +include: + - "docker-compose.kafka.yml" + +services: + sandbox: + build: + context: .. + dockerfile: dockerfiles/Dockerfile.dev-query + network: host + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + networks: + heidgaf: + ipv4_address: 172.27.0.100 + memswap_limit: 768m + deploy: + resources: + limits: + cpus: '2' + memory: 512m + reservations: + cpus: '1' + memory: 256m + volumes: + - "${MOUNT_PATH:?MOUNT_PATH not set}:/opt/file.txt" + + +networks: + heidgaf: + driver: bridge + ipam: + driver: default + config: + - subnet: 172.27.0.0/16 + gateway: 172.27.0.1 diff --git a/docker/dockerfiles/Dockerfile.dev-query b/docker/dockerfiles/Dockerfile.dev-query new file mode 100644 index 0000000..080fd2e --- /dev/null +++ b/docker/dockerfiles/Dockerfile.dev-query @@ -0,0 +1,14 @@ +FROM python:3.11-slim-bookworm + +ENV PYTHONDONTWRITEBYTECODE=1 + +WORKDIR /usr/src/app + +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile clickhouse_connect + +COPY src/base ./src/base +COPY config.yaml . + +RUN rm -rf /root/.cache + +CMD [ "python", "docker/dev-query.py"] From a321ebf89547e3fef90ed426f80c1119e9011490 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sat, 30 Nov 2024 16:11:22 +0100 Subject: [PATCH 11/59] Update logline_timestamps insertion in collector.py --- src/logcollector/collector.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index f2db474..98e95ce 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -156,6 +156,15 @@ async def send(self) -> None: self.batch_handler.add_message( subnet_id, json.dumps(message_fields) ) + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="completed", + timestamp=datetime.datetime.now(), + ) + ) logger.debug(f"Sent: '{logline}'") else: await asyncio.sleep(0.1) From 9203bc2ad5a9b209976cf4dbecb1d6b41ac9a5c5 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 3 Dec 2024 16:58:07 +0100 Subject: [PATCH 12/59] Bug fixing --- config.yaml | 1 - docker/docker-compose.dev-query.yml | 2 +- src/base/kafka_handler.py | 4 ++-- src/inspector/inspector.py | 2 +- src/logcollector/batch_handler.py | 34 ++++++++++++++++++----------- src/logcollector/collector.py | 2 +- 6 files changed, 26 insertions(+), 19 deletions(-) diff --git a/config.yaml b/config.yaml index 8d596ba..531365a 100644 --- a/config.yaml +++ b/config.yaml @@ -19,7 +19,6 @@ pipeline: log_storage: logserver: input_file: "/opt/file.txt" - max_number_of_connections: 1000 log_collection: collector: diff --git a/docker/docker-compose.dev-query.yml b/docker/docker-compose.dev-query.yml index 7bbbce0..b1011df 100644 --- a/docker/docker-compose.dev-query.yml +++ b/docker/docker-compose.dev-query.yml @@ -5,7 +5,7 @@ services: sandbox: build: context: .. - dockerfile: dockerfiles/Dockerfile.dev-query + dockerfile: docker/dockerfiles/Dockerfile.dev-query network: host depends_on: kafka1: diff --git a/src/base/kafka_handler.py b/src/base/kafka_handler.py index 430eaf2..da4db0a 100644 --- a/src/base/kafka_handler.py +++ b/src/base/kafka_handler.py @@ -323,7 +323,6 @@ class ExactlyOnceKafkaConsumeHandler(KafkaConsumeHandler): """ def __init__(self, topics: str | list[str]) -> None: - self.batch_schema = marshmallow_dataclass.class_schema(Batch)() super().__init__(topics) def consume(self) -> tuple[str | None, str | None, str | None]: @@ -396,7 +395,8 @@ def consume_as_object(self) -> tuple[None | str, Batch]: ast.literal_eval(item) for item in eval_data.get("data") ] - eval_data: Batch = self.batch_schema.load(eval_data) + batch_schema = marshmallow_dataclass.class_schema(Batch)() + eval_data: Batch = batch_schema.load(eval_data) if isinstance(eval_data, Batch): return key, eval_data diff --git a/src/inspector/inspector.py b/src/inspector/inspector.py index 26f2ff6..22ca754 100644 --- a/src/inspector/inspector.py +++ b/src/inspector/inspector.py @@ -430,7 +430,7 @@ def send_data(self): "end_timestamp": self.end_timestamp.strftime(TIMESTAMP_FORMAT), "data": value, } - self.kafka_produce_handler.send( + self.kafka_produce_handler.produce( topic="Detector", data=json.dumps(data_to_send), key=key, diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 7509d79..7a2a891 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -1,16 +1,21 @@ +import datetime import json import os import sys -from datetime import datetime +import uuid from threading import Timer -from src.base.kafka_handler import ExactlyOnceKafkaProduceHandler -from src.base.utils import setup_config +import marshmallow_dataclass sys.path.append(os.getcwd()) +from src.base import Batch +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender +from src.base.kafka_handler import ExactlyOnceKafkaProduceHandler +from src.base.utils import setup_config from src.base.log_config import get_logger -logger = get_logger("log_collection.batch_handler") +module_name = "log_collection.batch_handler" +logger = get_logger(module_name) config = setup_config() BATCH_SIZE = config["pipeline"]["log_collection"]["batch_handler"]["batch_size"] @@ -93,7 +98,7 @@ def sort_messages( List of log lines as strings sorted by timestamps (ascending) """ sorted_data = sorted( - data, key=lambda x: datetime.strptime(x[0], timestamp_format) + data, key=lambda x: datetime.datetime.strptime(x[0], timestamp_format) ) loglines = [message for _, message in sorted_data] @@ -217,8 +222,15 @@ def complete_batch(self, key: str) -> dict: begin_timestamp = self.get_first_timestamp_of_buffer(key) data = { - "begin_timestamp": begin_timestamp, - "end_timestamp": self.get_last_timestamp_of_batch(key), + "batch_id": uuid.uuid4(), + "begin_timestamp": datetime.datetime.strptime( + begin_timestamp, + "%Y-%m-%dT%H:%M:%S.%fZ", + ), + "end_timestamp": datetime.datetime.strptime( + self.get_last_timestamp_of_batch(key), + "%Y-%m-%dT%H:%M:%S.%fZ", + ), "data": buffer_data + self.batch[key], } @@ -310,7 +322,6 @@ def add_message(self, key: str, message: str) -> None: f" ⤷ {number_of_messages_for_key} messages sent." ) elif not self.timer: # First time setting the timer - logger.debug("Timer not set yet. Calling _reset_timer()...") self._reset_timer() logger.debug(f"Message '{message}' successfully added to batch for {key=}.") @@ -352,15 +363,13 @@ def _send_all_batches(self, reset_timer: bool = True) -> None: ) def _send_batch_for_key(self, key: str) -> None: - logger.debug(f"Starting to send the batch for {key=}...") - try: - data_packet = self.batch.complete_batch(key) + data = self.batch.complete_batch(key) except ValueError as e: logger.debug(e) return - self._send_data_packet(key, data_packet) + self._send_data_packet(key, data) def _send_data_packet(self, key: str, data: dict) -> None: logger.debug("Sending data to KafkaProduceHandler...") @@ -370,7 +379,6 @@ def _send_data_packet(self, key: str, data: dict) -> None: data=json.dumps(data), key=key, ) - logger.debug(f"{data=}") def _reset_timer(self) -> None: """ diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 98e95ce..0411248 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -151,7 +151,7 @@ async def send(self) -> None: ) message_fields = fields.copy() - message_fields["logline_id"] = logline_id + message_fields["logline_id"] = str(logline_id) self.batch_handler.add_message( subnet_id, json.dumps(message_fields) From 6b0e27c3b438ab047f48a5e95b3093d52b019538 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 3 Dec 2024 16:59:12 +0100 Subject: [PATCH 13/59] Add batch_id to Batch object and make Prefilter use Batch --- src/base/__init__.py | 11 ++++++++--- src/logcollector/batch_handler.py | 6 +++--- src/prefilter/prefilter.py | 24 +++++++++++++++--------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/base/__init__.py b/src/base/__init__.py index d44faf7..3fce56d 100644 --- a/src/base/__init__.py +++ b/src/base/__init__.py @@ -1,11 +1,16 @@ -from typing import List +import datetime +import uuid from dataclasses import dataclass, field +from typing import List + import marshmallow.validate -import datetime @dataclass class Batch: + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) begin_timestamp: datetime.datetime = field( metadata={ "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") @@ -16,4 +21,4 @@ class Batch: "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") } ) - data: List[dict] = field(default_factory=list) + data: List = field(default_factory=list) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 7a2a891..23d706d 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -372,11 +372,11 @@ def _send_batch_for_key(self, key: str) -> None: self._send_data_packet(key, data) def _send_data_packet(self, key: str, data: dict) -> None: - logger.debug("Sending data to KafkaProduceHandler...") - logger.debug(f"{data=}") + batch_schema = marshmallow_dataclass.class_schema(Batch)() + self.kafka_produce_handler.produce( topic=self.topic, - data=json.dumps(data), + data=batch_schema.dumps(data), key=key, ) diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index 785c04d..35f79b1 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -1,9 +1,10 @@ -import ast -import json import os import sys +import marshmallow_dataclass + sys.path.append(os.getcwd()) +from src.base import Batch from src.base.logline_handler import LoglineHandler from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, @@ -39,6 +40,7 @@ class Prefilter: """ def __init__(self): + self.batch_id = None self.begin_timestamp = None self.end_timestamp = None self.subnet_id = None @@ -63,13 +65,14 @@ def get_and_fill_data(self) -> None: logger.debug("Cleared existing data.") logger.debug("Calling KafkaConsumeHandler for consuming JSON data...") - key, data = self.kafka_consume_handler.consume_as_json() + key, data = self.kafka_consume_handler.consume_as_object() self.subnet_id = key if data: - self.begin_timestamp = data.get("begin_timestamp") - self.end_timestamp = data.get("end_timestamp") - self.unfiltered_data = data.get("data") + self.batch_id = data.batch_id + self.begin_timestamp = data.begin_timestamp + self.end_timestamp = data.end_timestamp + self.unfiltered_data = data.data if not self.unfiltered_data: logger.info( @@ -94,8 +97,7 @@ def filter_by_error(self) -> None: logger.debug("Filtering data...") for e in self.unfiltered_data: - e_as_json = ast.literal_eval(e) - if self.logline_handler.check_relevance(e_as_json): + if self.logline_handler.check_relevance(e): self.filtered_data.append(e) logger.debug("Data filtered and now available in filtered_data.") @@ -115,15 +117,19 @@ def send_filtered_data(self): raise ValueError("Failed to send data: No filtered data.") data_to_send = { + "batch_id": self.batch_id, "begin_timestamp": self.begin_timestamp, "end_timestamp": self.end_timestamp, "data": self.filtered_data, } + + batch_schema = marshmallow_dataclass.class_schema(Batch)() + logger.debug("Calling KafkaProduceHandler...") logger.debug(f"{data_to_send=}") self.kafka_produce_handler.produce( topic=PRODUCE_TOPIC, - data=json.dumps(data_to_send), + data=batch_schema.dumps(data_to_send), key=self.subnet_id, ) logger.debug( From 386494a9dc729e73a1e03d0ed76fef067d5bfd5b Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 3 Dec 2024 16:59:42 +0100 Subject: [PATCH 14/59] Add logline_timestamps insertion in batch_handler.py --- src/logcollector/batch_handler.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 23d706d..91755a3 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -283,7 +283,9 @@ def __init__(self): self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler( transactional_id="collector" ) - logger.debug(f"Initialized KafkaBatchSender.") + + # databases + self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") def __del__(self): logger.debug(f"Closing KafkaBatchSender ({self.topic=})...") @@ -307,8 +309,28 @@ def add_message(self, key: str, message: str) -> None: """ logger.debug(f"Adding message '{message}' to batch.") + logline_id = json.loads(message).get("logline_id") + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="in_process", + timestamp=datetime.datetime.now(), + ) + ) + self.batch.add_message(key, message) + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="batched", + timestamp=datetime.datetime.now(), + ) + ) + logger.debug(f"Batch: {self.batch.batch}") number_of_messages_for_key = self.batch.get_number_of_messages(key) From 56787c51321c0fcfdca08bafe1a047ae951020b6 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 4 Dec 2024 11:47:14 +0100 Subject: [PATCH 15/59] Move Batch datatype to its own file --- src/base/__init__.py | 24 ------------------------ src/base/data_classes/batch.py | 24 ++++++++++++++++++++++++ src/base/kafka_handler.py | 2 +- src/logcollector/batch_handler.py | 2 +- 4 files changed, 26 insertions(+), 26 deletions(-) create mode 100644 src/base/data_classes/batch.py diff --git a/src/base/__init__.py b/src/base/__init__.py index 3fce56d..e69de29 100644 --- a/src/base/__init__.py +++ b/src/base/__init__.py @@ -1,24 +0,0 @@ -import datetime -import uuid -from dataclasses import dataclass, field -from typing import List - -import marshmallow.validate - - -@dataclass -class Batch: - batch_id: uuid.UUID = field( - metadata={"marshmallow_field": marshmallow.fields.UUID()} - ) - begin_timestamp: datetime.datetime = field( - metadata={ - "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") - } - ) - end_timestamp: datetime.datetime = field( - metadata={ - "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") - } - ) - data: List = field(default_factory=list) diff --git a/src/base/data_classes/batch.py b/src/base/data_classes/batch.py new file mode 100644 index 0000000..3fce56d --- /dev/null +++ b/src/base/data_classes/batch.py @@ -0,0 +1,24 @@ +import datetime +import uuid +from dataclasses import dataclass, field +from typing import List + +import marshmallow.validate + + +@dataclass +class Batch: + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + begin_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") + } + ) + end_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") + } + ) + data: List = field(default_factory=list) diff --git a/src/base/kafka_handler.py b/src/base/kafka_handler.py index da4db0a..7fde819 100644 --- a/src/base/kafka_handler.py +++ b/src/base/kafka_handler.py @@ -20,7 +20,7 @@ ) sys.path.append(os.getcwd()) -from src.base import Batch +from src.base.data_classes.batch import Batch from src.base.log_config import get_logger from src.base.utils import kafka_delivery_report, setup_config diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 91755a3..0098be7 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -8,7 +8,7 @@ import marshmallow_dataclass sys.path.append(os.getcwd()) -from src.base import Batch +from src.base.data_classes.batch import Batch from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.kafka_handler import ExactlyOnceKafkaProduceHandler from src.base.utils import setup_config From a3335115479721193817ed293130bb77fd466011 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 4 Dec 2024 11:47:38 +0100 Subject: [PATCH 16/59] Rename clickhouse_batch.py to clickhouse_batch_sender.py --- .../{clickhouse_batch.py => clickhouse_batch_sender.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/monitoring/{clickhouse_batch.py => clickhouse_batch_sender.py} (100%) diff --git a/src/monitoring/clickhouse_batch.py b/src/monitoring/clickhouse_batch_sender.py similarity index 100% rename from src/monitoring/clickhouse_batch.py rename to src/monitoring/clickhouse_batch_sender.py From e20f441827bf7cea3aff8c15919311ef93581844 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 4 Dec 2024 11:49:50 +0100 Subject: [PATCH 17/59] Use Marshmallow to transmit data to MonitoringAgent --- src/base/clickhouse_kafka_sender.py | 9 +- .../data_classes/clickhouse_connectors.py | 153 ++++++++++++++++++ src/monitoring/clickhouse_connector.py | 114 +++---------- src/monitoring/monitoring_agent.py | 12 +- 4 files changed, 192 insertions(+), 96 deletions(-) create mode 100644 src/base/data_classes/clickhouse_connectors.py diff --git a/src/base/clickhouse_kafka_sender.py b/src/base/clickhouse_kafka_sender.py index ef4338c..f45a425 100644 --- a/src/base/clickhouse_kafka_sender.py +++ b/src/base/clickhouse_kafka_sender.py @@ -1,8 +1,10 @@ -import json import os import sys +import marshmallow_dataclass + sys.path.append(os.getcwd()) +from src.base.data_classes.clickhouse_connectors import TABLE_NAME_TO_TYPE from src.base.kafka_handler import SimpleKafkaProduceHandler from src.base.log_config import get_logger @@ -13,9 +15,12 @@ class ClickHouseKafkaSender: def __init__(self, table_name: str): self.table_name = table_name self.kafka_producer = SimpleKafkaProduceHandler() + self.data_schema = marshmallow_dataclass.class_schema( + TABLE_NAME_TO_TYPE.get(table_name) + )() def insert(self, data: dict): self.kafka_producer.produce( topic=f"clickhouse_{self.table_name}", - data=json.dumps(data, default=str), + data=self.data_schema.dumps(data), ) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py new file mode 100644 index 0000000..11c6d15 --- /dev/null +++ b/src/base/data_classes/clickhouse_connectors.py @@ -0,0 +1,153 @@ +import datetime +import uuid +from dataclasses import dataclass, field +from typing import Optional + +import marshmallow.validate + + +@dataclass +class ServerLogs: + message_text: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + message_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + timestamp_in: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + + +@dataclass +class ServerLogsTimestamps: + message_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + event: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + event_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + + +@dataclass +class FailedDNSLoglines: + message_text: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + timestamp_in: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + timestamp_failed: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + reason_for_failure: Optional[str] = field( + metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} + ) + + +@dataclass +class LoglineToBatches: + logline_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + + +@dataclass +class DNSLoglines: + logline_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + subnet_id: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + status_code: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + record_type: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + additional_fields: Optional[str] = field( + metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} + ) + + +@dataclass +class LoglineStatus: + logline_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + exit_at_stage: Optional[str] = field( + metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} + ) + + +@dataclass +class LoglineTimestamps: + logline_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + + +@dataclass +class BatchStatus: + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + exit_at_stage: Optional[str] = field( + metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} + ) + + +@dataclass +class BatchTimestamps: + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + message_count: int = field( + metadata={"marshmallow_field": marshmallow.fields.Integer()} + ) + + +TABLE_NAME_TO_TYPE = { + "server_logs": ServerLogs, + "server_logs_timestamps": ServerLogsTimestamps, + "failed_dns_loglines": FailedDNSLoglines, + "logline_to_batches": LoglineToBatches, + "dns_loglines": DNSLoglines, + "logline_status": LoglineStatus, + "logline_timestamps": LoglineTimestamps, + "batch_status": BatchStatus, + "batch_timestamps": BatchTimestamps, +} diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index a883d25..239059b 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -3,11 +3,12 @@ import sys import uuid from abc import abstractmethod +from typing import Optional import clickhouse_connect sys.path.append(os.getcwd()) -from src.monitoring.clickhouse_batch import ClickHouseBatchSender +from src.monitoring.clickhouse_batch_sender import ClickHouseBatchSender from src.base.log_config import get_logger from src.base.utils import setup_config @@ -67,24 +68,10 @@ def __init__(self): def insert( self, message_text: str, - message_id: None | str | uuid.UUID = None, - timestamp_in: str | datetime.datetime | None = None, - ) -> uuid.UUID: - # TODO: Switch to Marshmallow - if not message_id: - message_id = uuid.uuid4() - - if isinstance(message_id, str): - message_id = uuid.UUID(message_id) - - if not timestamp_in: - timestamp_in = datetime.datetime.now() - - if isinstance(timestamp_in, str): - timestamp_in = datetime.datetime.fromisoformat(timestamp_in) - + message_id: uuid.UUID, + timestamp_in: datetime.datetime, + ): self._add_to_batch([message_id, timestamp_in, message_text]) - return message_id class ServerLogsTimestampsConnector(ClickHouseConnector): @@ -99,19 +86,10 @@ def __init__(self): def insert( self, - message_id: str | uuid.UUID, + message_id: uuid.UUID, event: str, - event_timestamp: str | datetime.datetime | None = None, + event_timestamp: datetime.datetime, ): - if isinstance(message_id, str): - message_id = uuid.UUID(message_id) - - if not event_timestamp: - event_timestamp = datetime.datetime.now() - - if isinstance(event_timestamp, str): - event_timestamp = datetime.datetime.fromisoformat(event_timestamp) - self._add_to_batch([message_id, event, event_timestamp]) @@ -129,18 +107,10 @@ def __init__(self): def insert( self, message_text: str, - timestamp_in: str | datetime.datetime, - timestamp_failed: str | datetime.datetime | None = None, - reason_for_failure: str | None = None, + timestamp_in: datetime.datetime, + timestamp_failed: datetime.datetime, + reason_for_failure: Optional[str] = None, ) -> None: - if not timestamp_failed: - timestamp_failed = datetime.datetime.now() - - if isinstance(timestamp_in, str): - timestamp_in = datetime.datetime.fromisoformat(timestamp_in) - if isinstance(timestamp_failed, str): - timestamp_failed = datetime.datetime.fromisoformat(timestamp_failed) - self._add_to_batch( [message_text, timestamp_in, timestamp_failed, reason_for_failure] ) @@ -157,20 +127,10 @@ def __init__(self): def insert( self, - logline_id: str | uuid.UUID, - batch_id: str | uuid.UUID, + logline_id: uuid.UUID, + batch_id: uuid.UUID, ): - if isinstance(logline_id, str): - logline_id = uuid.UUID(logline_id) - if isinstance(batch_id, str): - batch_id = uuid.UUID(batch_id) - - self._add_to_batch( - [ - logline_id, - batch_id, - ] - ) + self._add_to_batch([logline_id, batch_id]) class DNSLoglinesConnector(ClickHouseConnector): @@ -195,11 +155,8 @@ def insert( status_code: str, client_ip: str, record_type: str, - additional_fields: str | None = None, - ) -> uuid.UUID: - if isinstance(logline_id, str): - logline_id = uuid.UUID(logline_id) - + additional_fields: Optional[str] = None, + ): self._add_to_batch( [ logline_id, @@ -211,7 +168,6 @@ def insert( additional_fields, ] ) - return logline_id class LoglineStatusConnector(ClickHouseConnector): @@ -226,13 +182,10 @@ def __init__(self): def insert( self, - logline_id: str | uuid.UUID, + logline_id: uuid.UUID, status: str, - exit_at_stage: str | None = None, + exit_at_stage: Optional[str] = None, ): - if isinstance(logline_id, str): - logline_id = uuid.UUID(logline_id) - self._add_to_batch( [ logline_id, @@ -255,20 +208,11 @@ def __init__(self): def insert( self, - logline_id: str | uuid.UUID, + logline_id: uuid.UUID, stage: str, status: str, - timestamp: str | datetime.datetime = None, + timestamp: datetime.datetime, ) -> None: - if isinstance(logline_id, str): - logline_id = uuid.UUID(logline_id) - - if not timestamp: - timestamp = datetime.datetime.now() - - if isinstance(timestamp, str): - timestamp = datetime.datetime.fromisoformat(timestamp) - self._add_to_batch( [ logline_id, @@ -291,13 +235,10 @@ def __init__(self): def insert( self, - batch_id: str | uuid.UUID, + batch_id: uuid.UUID, status: str, - exit_at_stage: str | None = None, + exit_at_stage: Optional[str] = None, ): - if isinstance(batch_id, str): - batch_id = uuid.UUID(batch_id) - self._add_to_batch( [ batch_id, @@ -321,21 +262,12 @@ def __init__(self): def insert( self, - batch_id: str | uuid.UUID, + batch_id: uuid.UUID, stage: str, status: str, message_count: int, - timestamp: str | datetime.datetime = None, + timestamp: datetime.datetime, ) -> None: - if isinstance(batch_id, str): - batch_id = uuid.UUID(batch_id) - - if not timestamp: - timestamp = datetime.datetime.now() - - if isinstance(timestamp, str): - timestamp = datetime.datetime.fromisoformat(timestamp) - self._add_to_batch( [ batch_id, diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py index 8ea6c09..eee99a2 100644 --- a/src/monitoring/monitoring_agent.py +++ b/src/monitoring/monitoring_agent.py @@ -1,11 +1,14 @@ import asyncio -import json import os import sys +from dataclasses import asdict + +import marshmallow_dataclass sys.path.append(os.getcwd()) from src.monitoring.clickhouse_connector import * from src.base.kafka_handler import SimpleKafkaConsumeHandler +from src.base.data_classes.clickhouse_connectors import TABLE_NAME_TO_TYPE from src.base.log_config import get_logger from src.base.utils import setup_config @@ -60,10 +63,13 @@ async def start(self): ) logger.debug(f"From Kafka: {value}") - data = json.loads(value) table_name = topic.replace("clickhouse_", "") + data_schema = marshmallow_dataclass.class_schema( + TABLE_NAME_TO_TYPE.get(table_name) + )() + data = data_schema.loads(value) - self.connectors[table_name].insert(**data) + self.connectors[table_name].insert(**asdict(data)) except KeyboardInterrupt: logger.info("Stopped MonitoringAgent.") From 360ff509e27322d243f6a2b337c19810fdad037f Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 5 Dec 2024 12:40:46 +0100 Subject: [PATCH 18/59] Make BufferedBatch store Batch IDs --- src/logcollector/batch_handler.py | 32 ++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 0098be7..0dbafd3 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -34,22 +34,40 @@ class BufferedBatch: def __init__(self): self.batch = {} # Batch for the latest messages coming in self.buffer = {} # Former batch with previous messages + self.batch_id = {} # Batch ID per key def add_message(self, key: str, message: str) -> None: + def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: """ Adds a given message to the messages list of the given key. If the key already exists, the message is simply added, otherwise, the key is created. Args: + logline_id (uuid.UUID): Logline ID of the added message key (str): Key to which the message is added message (str): Message to be added """ if key in self.batch: # key already has messages associated self.batch[key].append(message) - logger.debug(f"Message '{message}' added to {key}'s batch.") + + batch_id = self.batch_id.get(key) + self.logline_to_batches.insert( + dict( + logline_id=logline_id, + batch_id=batch_id, + ) + ) + + self.batch_status.insert( + dict( + batch_id=batch_id, + status=1, + exit_at_stage=None, + ) + ) else: # key has no messages associated yet self.batch[key] = [message] - logger.debug(f"Message '{message}' added to newly created {key}'s batch.") + self.batch_id[key] = [uuid.uuid4()] def get_number_of_messages(self, key: str) -> int: """ @@ -203,7 +221,8 @@ def complete_batch(self, key: str) -> dict: key (str): Key for which to complete the current batch and return data packet Returns: - Dictionary of begin_timestamp, end_timestamp and messages (including buffered data) associated with a key + Set of new Logline IDs and dictionary of begin_timestamp, end_timestamp and messages (including buffered + data) associated with a key Raises: ValueError: No data is available for sending. @@ -222,7 +241,7 @@ def complete_batch(self, key: str) -> dict: begin_timestamp = self.get_first_timestamp_of_buffer(key) data = { - "batch_id": uuid.uuid4(), + "batch_id": self.batch_id.get(key), "begin_timestamp": datetime.datetime.strptime( begin_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ", @@ -238,6 +257,9 @@ def complete_batch(self, key: str) -> dict: self.buffer[key] = self.batch[key] del self.batch[key] + # Batch ID is not needed anymore + del self.batch_id[key] + return data if self.buffer: # Variant 3: Only buffer has entries @@ -320,7 +342,7 @@ def add_message(self, key: str, message: str) -> None: ) ) - self.batch.add_message(key, message) + self.batch.add_message(key, logline_id, message) self.logline_timestamps.insert( dict( From 0318a92ebb40de4c065458b04d5975cea5da32d3 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 5 Dec 2024 12:42:08 +0100 Subject: [PATCH 19/59] Add database insertion in BufferedBatch --- src/logcollector/batch_handler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 0dbafd3..dbb452d 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -36,7 +36,10 @@ def __init__(self): self.buffer = {} # Former batch with previous messages self.batch_id = {} # Batch ID per key - def add_message(self, key: str, message: str) -> None: + # databases + self.logline_to_batches = ClickHouseKafkaSender("logline_to_batches") + self.batch_status = ClickHouseKafkaSender("batch_status") + def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: """ Adds a given message to the messages list of the given key. If the key already exists, the message is simply From 63dacf915005500822a45dd0b61e9c50158f03bf Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 5 Dec 2024 12:43:39 +0100 Subject: [PATCH 20/59] Switch status field in connectors by is_active --- src/base/data_classes/clickhouse_connectors.py | 8 ++++++-- src/logcollector/collector.py | 2 +- src/monitoring/clickhouse_connector.py | 12 ++++++------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index 11c6d15..e81f023 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -92,7 +92,9 @@ class LoglineStatus: logline_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) - status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + is_active: bool = field( + metadata={"marshmallow_field": marshmallow.fields.Boolean()} + ) exit_at_stage: Optional[str] = field( metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} ) @@ -117,7 +119,9 @@ class BatchStatus: batch_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) - status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + is_active: bool = field( + metadata={"marshmallow_field": marshmallow.fields.Boolean()} + ) exit_at_stage: Optional[str] = field( metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} ) diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 0411248..070e43c 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -137,7 +137,7 @@ async def send(self) -> None: self.logline_status.insert( dict( logline_id=logline_id, - status="active", + is_active=1, ) ) diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index 239059b..95158c2 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -174,7 +174,7 @@ class LoglineStatusConnector(ClickHouseConnector): def __init__(self): column_names = [ "logline_id", - "status", + "is_active", "exit_at_stage", ] @@ -183,13 +183,13 @@ def __init__(self): def insert( self, logline_id: uuid.UUID, - status: str, + is_active: bool, exit_at_stage: Optional[str] = None, ): self._add_to_batch( [ logline_id, - status, + is_active, exit_at_stage, ] ) @@ -227,7 +227,7 @@ class BatchStatusConnector(ClickHouseConnector): def __init__(self): column_names = [ "batch_id", - "status", + "is_active", "exit_at_stage", ] @@ -236,13 +236,13 @@ def __init__(self): def insert( self, batch_id: uuid.UUID, - status: str, + is_active: bool, exit_at_stage: Optional[str] = None, ): self._add_to_batch( [ batch_id, - status, + is_active, exit_at_stage, ] ) From 13aab6b69a85dc899bd70c33abb59244ff5a1e3c Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 5 Dec 2024 12:47:49 +0100 Subject: [PATCH 21/59] Use correct transactional_id in batch_handler.py --- src/logcollector/batch_handler.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index dbb452d..d76bb37 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -11,7 +11,7 @@ from src.base.data_classes.batch import Batch from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.kafka_handler import ExactlyOnceKafkaProduceHandler -from src.base.utils import setup_config +from src.base.utils import setup_config, generate_unique_transactional_id from src.base.log_config import get_logger module_name = "log_collection.batch_handler" @@ -23,6 +23,12 @@ PRODUCE_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ "batch_sender_to_prefilter" ] +KAFKA_BROKERS = ",".join( + [ + f"{broker['hostname']}:{broker['port']}" + for broker in config["environment"]["kafka_brokers"] + ] +) class BufferedBatch: @@ -304,10 +310,8 @@ def __init__(self): self.batch = BufferedBatch() self.timer = None - logger.debug(f"Calling KafkaProduceHandler(transactional_id='collector')...") - self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler( - transactional_id="collector" - ) + transactional_id = generate_unique_transactional_id(module_name, KAFKA_BROKERS) + self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) # databases self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") From eabde815e4a824e36e765acc2dcc3f2c26e742fa Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 5 Dec 2024 13:07:14 +0100 Subject: [PATCH 22/59] Update tests for batch_handler.py --- tests/test_batch_handler.py | 97 ++++++++++++++++++++++++++++++++----- 1 file changed, 84 insertions(+), 13 deletions(-) diff --git a/tests/test_batch_handler.py b/tests/test_batch_handler.py index bd69c0d..b9fc52c 100644 --- a/tests/test_batch_handler.py +++ b/tests/test_batch_handler.py @@ -1,4 +1,6 @@ +import json import unittest +import uuid from unittest.mock import patch, MagicMock from src.logcollector.batch_handler import BufferedBatchSender @@ -8,7 +10,10 @@ class TestInit(unittest.TestCase): @patch("src.logcollector.batch_handler.PRODUCE_TOPIC", "test_topic") @patch("src.logcollector.batch_handler.BufferedBatch") @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") - def test_init_with_buffer(self, mock_kafka_produce_handler, mock_buffered_batch): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_init_with_buffer( + self, mock_clickhouse, mock_kafka_produce_handler, mock_buffered_batch + ): # Arrange mock_handler_instance = MagicMock() mock_kafka_produce_handler.return_value = mock_handler_instance @@ -25,7 +30,9 @@ def test_init_with_buffer(self, mock_kafka_produce_handler, mock_buffered_batch) self.assertEqual(mock_handler_instance, sut.kafka_produce_handler) mock_buffered_batch.assert_called_once() - mock_kafka_produce_handler.assert_called_once_with(transactional_id="collector") + mock_kafka_produce_handler.assert_called_once_with( + "log_collection.batch_handler" + ) class TestDel(unittest.TestCase): @@ -40,8 +47,10 @@ class TestAddMessage(unittest.TestCase): @patch("src.logcollector.batch_handler.BufferedBatchSender._reset_timer") @patch("src.logcollector.batch_handler.BufferedBatch.get_number_of_messages") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_normal( self, + mock_clickhouse, mock_send_batch, mock_get_nr_messages, mock_reset_timer, @@ -54,7 +63,12 @@ def test_add_message_normal( mock_get_nr_messages.return_value = 1 key = "test_key" - message = "test_message" + message = json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"test_message", + ) + ) sut = BufferedBatchSender() sut.timer = MagicMock() @@ -71,8 +85,9 @@ def test_add_message_normal( @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_full_messages( - self, mock_send_batch, mock_produce_handler, mock_logger + self, mock_clickhouse, mock_send_batch, mock_produce_handler, mock_logger ): # Arrange mock_produce_handler_instance = MagicMock() @@ -85,19 +100,34 @@ def test_add_message_full_messages( # Act for i in range(99): - sut.add_message(key, f"message_{i}") + test_message = json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ) + sut.add_message(key, test_message) # Assert mock_send_batch.assert_not_called() - sut.add_message(key, f"message_100") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data="message_100", + ) + ), + ) mock_send_batch.assert_called_once() @patch("src.logcollector.batch_handler.logger") @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_full_messages_with_different_keys( - self, mock_send_batch, mock_produce_handler, mock_logger + self, mock_clickhouse, mock_send_batch, mock_produce_handler, mock_logger ): # Arrange mock_produce_handler_instance = MagicMock() @@ -111,23 +141,56 @@ def test_add_message_full_messages_with_different_keys( # Act for i in range(79): - sut.add_message(key, f"message_{i}") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ), + ) for i in range(15): - sut.add_message(other_key, f"message_{i}") + sut.add_message( + other_key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ), + ) for i in range(20): - sut.add_message(key, f"message_{i}") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ), + ) # Assert mock_send_batch.assert_not_called() - sut.add_message(key, f"message_100") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data="message_100", + ) + ), + ) mock_send_batch.assert_called_once() @patch("src.logcollector.batch_handler.logger") @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._reset_timer") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_no_timer( - self, mock_reset_timer, mock_produce_handler, mock_logger + self, mock_clickhouse, mock_reset_timer, mock_produce_handler, mock_logger ): # Arrange mock_produce_handler_instance = MagicMock() @@ -137,7 +200,15 @@ def test_add_message_no_timer( sut.timer = None # Act - sut.add_message("test_key", "test_message") + sut.add_message( + "test_key", + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data="test_message", + ) + ), + ) # Assert mock_reset_timer.assert_called_once() From 133320f73bc0a0951edeec597508a61045068e2d Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 14:14:09 +0100 Subject: [PATCH 23/59] Update tests and fix bugs --- src/logcollector/batch_handler.py | 18 ++- src/prefilter/prefilter.py | 2 +- tests/test_batch_handler.py | 15 +- tests/test_buffered_batch.py | 106 +++++++++---- tests/test_clickhouse_connector.py | 143 +++++------------- tests/test_clickhouse_kafka_sender.py | 11 +- tests/test_collector.py | 21 ++- tests/test_detector.py | 14 +- ...test_exactly_once_kafka_consume_handler.py | 2 +- tests/test_inspector.py | 4 +- tests/test_marshmallow.py | 4 +- 11 files changed, 176 insertions(+), 164 deletions(-) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index d76bb37..fae5bc5 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -67,16 +67,26 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: ) ) + else: # key has no messages associated yet + # create new batch + self.batch[key] = [message] + new_batch_id = uuid.uuid4() + self.batch_id[key] = [new_batch_id] + self.batch_status.insert( dict( - batch_id=batch_id, + batch_id=new_batch_id, status=1, exit_at_stage=None, ) ) - else: # key has no messages associated yet - self.batch[key] = [message] - self.batch_id[key] = [uuid.uuid4()] + + self.logline_to_batches.insert( + dict( + logline_id=logline_id, + batch_id=new_batch_id, + ) + ) def get_number_of_messages(self, key: str) -> int: """ diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index 35f79b1..797ac27 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -4,7 +4,7 @@ import marshmallow_dataclass sys.path.append(os.getcwd()) -from src.base import Batch +from src.base.data_classes.batch import Batch from src.base.logline_handler import LoglineHandler from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, diff --git a/tests/test_batch_handler.py b/tests/test_batch_handler.py index b9fc52c..cc3f56d 100644 --- a/tests/test_batch_handler.py +++ b/tests/test_batch_handler.py @@ -1,3 +1,4 @@ +import datetime import json import unittest import uuid @@ -364,7 +365,8 @@ def test_send_batch_for_key_value_error( class TestSendDataPacket(unittest.TestCase): @patch("src.logcollector.batch_handler.PRODUCE_TOPIC", "test_topic") @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") - def test_send_data_packet(self, mock_produce_handler): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_send_data_packet(self, mock_clickhouse, mock_produce_handler): # Arrange mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance @@ -372,9 +374,10 @@ def test_send_data_packet(self, mock_produce_handler): key = "test_key" data = { - "begin_timestamp": "test_begin", - "end_timestamp": "test_end", - "data": "test_data", + "batch_id": uuid.UUID("b4b6f13e-d064-4ab7-94ed-d02b46063308"), + "begin_timestamp": datetime.datetime(2024, 12, 6, 13, 12, 30, 324015), + "end_timestamp": datetime.datetime(2024, 12, 6, 13, 12, 31, 832173), + "data": ["test_data"], } sut = BufferedBatchSender() @@ -385,7 +388,9 @@ def test_send_data_packet(self, mock_produce_handler): # Assert mock_produce_handler_instance.produce.assert_called_once_with( topic="test_topic", - data='{"begin_timestamp": "test_begin", "end_timestamp": "test_end", "data": "test_data"}', + data='{"batch_id": "b4b6f13e-d064-4ab7-94ed-d02b46063308", "begin_timestamp": ' + '"2024-12-06T13:12:30.324015Z", "end_timestamp": "2024-12-06T13:12:31.832173Z", ' + '"data": ["test_data"]}', key=key, ) diff --git a/tests/test_buffered_batch.py b/tests/test_buffered_batch.py index 56d355d..31e35ec 100644 --- a/tests/test_buffered_batch.py +++ b/tests/test_buffered_batch.py @@ -1,4 +1,7 @@ +import datetime import unittest +import uuid +from unittest.mock import patch from src.logcollector.batch_handler import BufferedBatch @@ -14,7 +17,8 @@ def test_init(self): class TestAddMessage(unittest.TestCase): - def test_add_message_empty_batch_and_empty_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_empty_batch_and_empty_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -22,7 +26,7 @@ def test_add_message_empty_batch_and_empty_buffer(self): sut = BufferedBatch() # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -30,7 +34,8 @@ def test_add_message_empty_batch_and_empty_buffer(self): ) self.assertEqual({}, sut.buffer, "Buffer should remain empty") - def test_add_message_empty_batch_and_used_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_empty_batch_and_used_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -40,7 +45,7 @@ def test_add_message_empty_batch_and_used_buffer(self): sut.buffer = {key: [old_message]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -52,7 +57,8 @@ def test_add_message_empty_batch_and_used_buffer(self): "Buffer should still contain key with old message", ) - def test_add_message_used_batch_and_empty_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_used_batch_and_empty_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -62,7 +68,7 @@ def test_add_message_used_batch_and_empty_buffer(self): sut.batch = {key: [old_message]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -72,7 +78,8 @@ def test_add_message_used_batch_and_empty_buffer(self): ) self.assertEqual({}, sut.buffer, "Buffer should remain empty") - def test_add_message_used_batch_and_used_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_used_batch_and_used_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -84,7 +91,7 @@ def test_add_message_used_batch_and_used_buffer(self): sut.buffer = {key: [old_message_1]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -98,7 +105,8 @@ def test_add_message_used_batch_and_used_buffer(self): "Buffer should still contain key with old message", ) - def test_add_message_with_existing_other_key(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_with_existing_other_key(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -111,7 +119,7 @@ def test_add_message_with_existing_other_key(self): sut.buffer = {old_key: [old_message_1]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -755,50 +763,88 @@ def test_sort_unsorted_buffer(self): class TestCompleteBatch(unittest.TestCase): - def test_complete_batch_variant_1(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_complete_batch_variant_1(self, mock_clickhouse): # Arrange key = "test_key" - message_1 = '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' - message_2 = '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + message_1 = ( + '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' + ) + message_2 = ( + '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + ) expected_messages = [message_1, message_2] sut = BufferedBatch() # Act - sut.add_message(key, message_2) - sut.add_message(key, message_1) + sut.add_message(key, uuid.uuid4(), message_2) + sut.add_message(key, uuid.uuid4(), message_1) data = sut.complete_batch(key) # Assert - self.assertEqual("2024-05-21T08:31:28.119Z", data["begin_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.249Z", data["end_timestamp"]) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 119000), data["begin_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 249000), data["end_timestamp"] + ) self.assertEqual(expected_messages, data["data"]) - def test_complete_batch_variant_2(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_complete_batch_variant_2(self, mock_clickhouse): # Arrange key = "test_key" - message_1 = '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' - message_2 = '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' - message_3 = '{"timestamp": "2024-05-21T08:31:28.319Z", "status": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' - message_4 = '{"timestamp": "2024-05-21T08:31:28.749Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + message_1 = ( + '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' + ) + message_2 = ( + '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + ) + message_3 = ( + '{"timestamp": "2024-05-21T08:31:28.319Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' + ) + message_4 = ( + '{"timestamp": "2024-05-21T08:31:28.749Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + ) sut = BufferedBatch() # Act - sut.add_message(key, message_1) - sut.add_message(key, message_2) + sut.add_message(key, uuid.uuid4(), message_1) + sut.add_message(key, uuid.uuid4(), message_2) data_1 = sut.complete_batch(key) - sut.add_message(key, message_3) - sut.add_message(key, message_4) + sut.add_message(key, uuid.uuid4(), message_3) + sut.add_message(key, uuid.uuid4(), message_4) data_2 = sut.complete_batch(key) # Assert - self.assertEqual("2024-05-21T08:31:28.119Z", data_1["begin_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.249Z", data_1["end_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.119Z", data_2["begin_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.749Z", data_2["end_timestamp"]) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 119000), data_1["begin_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 249000), data_1["end_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 119000), data_2["begin_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 749000), data_2["end_timestamp"] + ) self.assertEqual({key: [message_3, message_4]}, sut.buffer) self.assertEqual({}, sut.batch) diff --git a/tests/test_clickhouse_connector.py b/tests/test_clickhouse_connector.py index 0953b6d..fce2bb8 100644 --- a/tests/test_clickhouse_connector.py +++ b/tests/test_clickhouse_connector.py @@ -133,14 +133,14 @@ def test_init(self, mock_clickhouse_batch_sender): def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange message_text = "test_message_text" - message_id = "7299539b-6215-4f6b-b39f-69335aafbeff" - timestamp_in = "2034-12-13 12:34:12.132412" + message_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + timestamp_in = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) sut = ServerLogsConnector() with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: # Act - returned_value = sut.insert( + sut.insert( message_text=message_text, message_id=message_id, timestamp_in=timestamp_in, @@ -154,21 +154,6 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): "test_message_text", ] ) - self.assertEqual( - uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), returned_value - ) - - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_none_given(self, mock_clickhouse_batch_sender): - # Arrange - sut = ServerLogsConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert("test_message_text") - - # Assert - mock_add_to_batch.assert_called_once() class TestServerLogsTimestampsConnector(unittest.TestCase): @@ -203,9 +188,9 @@ def test_init(self, mock_clickhouse_batch_sender): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange - message_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + message_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") event = "test_event" - event_timestamp = "2034-12-13 12:34:12.132412" + event_timestamp = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) sut = ServerLogsTimestampsConnector() @@ -226,21 +211,6 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): ] ) - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_none_given(self, mock_clickhouse_batch_sender): - # Arrange - sut = ServerLogsTimestampsConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - "test_event", - ) - - # Assert - mock_add_to_batch.assert_called_once() - class TestFailedDNSLoglinesConnector(unittest.TestCase): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") @@ -276,8 +246,8 @@ def test_init(self, mock_clickhouse_batch_sender): def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange message_text = "test_message_text" - timestamp_in = "2034-12-13 12:34:12.132412" - timestamp_failed = "2034-12-13 12:35:35.542635" + timestamp_in = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) + timestamp_failed = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) reason_for_failure = "Wrong client_ip field" sut = FailedDNSLoglinesConnector() @@ -305,7 +275,8 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): def test_insert_none_given(self, mock_clickhouse_batch_sender): # Arrange message_text = "test_message_text" - timestamp_in = "2034-12-13 12:34:12.132412" + timestamp_in = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) + timestamp_failed = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) sut = FailedDNSLoglinesConnector() @@ -314,6 +285,8 @@ def test_insert_none_given(self, mock_clickhouse_batch_sender): sut.insert( message_text=message_text, timestamp_in=datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + timestamp_failed=datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + reason_for_failure=None, ) # Assert @@ -351,8 +324,8 @@ def test_init(self, mock_clickhouse_batch_sender): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_insert_all_given_as_str(self, mock_clickhouse_batch_sender): # Arrange - logline_id = "7299539b-6215-4f6b-b39f-69335aafbeff" - batch_id = "1f855c43-8a75-4b53-b6cd-4a13b89312d6" + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + batch_id = uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6") sut = LoglineToBatchesConnector() @@ -431,8 +404,9 @@ def test_init(self, mock_clickhouse_batch_sender): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange + logline_id = uuid.UUID("d7add097-40a5-42f6-89df-1e7b20c4a4b8") subnet_id = "127.0.0.0_24" - timestamp = "2034-12-13 12:34:12.132412" + timestamp = datetime.datetime(2024, 12, 6, 13, 41, 53, 589594) status_code = "NXDOMAIN" client_ip = "127.0.0.1" record_type = "A" @@ -442,7 +416,8 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: # Act - returned_value = sut.insert( + sut.insert( + logline_id=logline_id, subnet_id=subnet_id, timestamp=timestamp, status_code=status_code, @@ -453,7 +428,6 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): # Assert mock_add_to_batch.assert_called_once() - self.assertTrue(isinstance(returned_value, uuid.UUID)) class TestLoglineStatusConnector(unittest.TestCase): @@ -468,7 +442,7 @@ def test_init(self, mock_clickhouse_batch_sender): expected_table_name = "logline_status" expected_column_names = [ "logline_id", - "status", + "is_active", "exit_at_stage", ] @@ -488,8 +462,8 @@ def test_init(self, mock_clickhouse_batch_sender): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange - logline_id = "7299539b-6215-4f6b-b39f-69335aafbeff" - status = "inactive" + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + is_active = False exit_at_stage = "prefilter" sut = LoglineStatusConnector() @@ -498,7 +472,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): # Act sut.insert( logline_id=logline_id, - status=status, + is_active=is_active, exit_at_stage=exit_at_stage, ) @@ -506,7 +480,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): mock_add_to_batch.assert_called_once_with( [ uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - "inactive", + False, "prefilter", ] ) @@ -515,7 +489,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): def test_insert_none_given(self, mock_clickhouse_batch_sender): # Arrange logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - status = "inactive" + is_active = True sut = LoglineStatusConnector() @@ -523,14 +497,14 @@ def test_insert_none_given(self, mock_clickhouse_batch_sender): # Act sut.insert( logline_id=logline_id, - status=status, + is_active=is_active, ) # Assert mock_add_to_batch.assert_called_once_with( [ uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - "inactive", + True, None, ] ) @@ -569,10 +543,10 @@ def test_init(self, mock_clickhouse_batch_sender): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange - logline_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") stage = "prefilter" status = "prefilter_out" - timestamp = "2034-12-13 12:35:35.542635" + timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) sut = LoglineTimestampsConnector() @@ -595,26 +569,6 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): ] ) - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_none_given(self, mock_clickhouse_batch_sender): - # Arrange - logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - stage = "prefilter" - status = "prefilter_out" - - sut = LoglineTimestampsConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - logline_id=logline_id, - stage=stage, - status=status, - ) - - # Assert - mock_add_to_batch.assert_called_once() - class TestBatchStatusConnector(unittest.TestCase): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") @@ -628,7 +582,7 @@ def test_init(self, mock_clickhouse_batch_sender): expected_table_name = "batch_status" expected_column_names = [ "batch_id", - "status", + "is_active", "exit_at_stage", ] @@ -648,8 +602,8 @@ def test_init(self, mock_clickhouse_batch_sender): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange - batch_id = "7299539b-6215-4f6b-b39f-69335aafbeff" - status = "inactive" + batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + is_active = False exit_at_stage = "prefilter" sut = BatchStatusConnector() @@ -658,7 +612,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): # Act sut.insert( batch_id=batch_id, - status=status, + is_active=is_active, exit_at_stage=exit_at_stage, ) @@ -666,7 +620,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): mock_add_to_batch.assert_called_once_with( [ uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - "inactive", + False, "prefilter", ] ) @@ -675,7 +629,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): def test_insert_none_given(self, mock_clickhouse_batch_sender): # Arrange batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - status = "inactive" + is_active = False sut = BatchStatusConnector() @@ -683,14 +637,15 @@ def test_insert_none_given(self, mock_clickhouse_batch_sender): # Act sut.insert( batch_id=batch_id, - status=status, + is_active=is_active, + exit_at_stage=None, ) # Assert mock_add_to_batch.assert_called_once_with( [ uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - "inactive", + False, None, ] ) @@ -730,10 +685,10 @@ def test_init(self, mock_clickhouse_batch_sender): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange - batch_id = "7299539b-6215-4f6b-b39f-69335aafbeff" + batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") stage = "prefilter" status = "prefilter_out" - timestamp = "2034-12-13 12:35:35.542635" + timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) message_count = 456 sut = BatchTimestampsConnector() @@ -759,28 +714,6 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): ] ) - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_none_given(self, mock_clickhouse_batch_sender): - # Arrange - batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - stage = "prefilter" - status = "prefilter_out" - message_count = 456 - - sut = BatchTimestampsConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - batch_id=batch_id, - stage=stage, - status=status, - message_count=message_count, - ) - - # Assert - mock_add_to_batch.assert_called_once() - if __name__ == "__main__": unittest.main() diff --git a/tests/test_clickhouse_kafka_sender.py b/tests/test_clickhouse_kafka_sender.py index 0aacd1b..9bb5dbc 100644 --- a/tests/test_clickhouse_kafka_sender.py +++ b/tests/test_clickhouse_kafka_sender.py @@ -5,8 +5,9 @@ class TestInit(unittest.TestCase): + @patch("src.base.clickhouse_kafka_sender.marshmallow_dataclass") @patch("src.base.clickhouse_kafka_sender.SimpleKafkaProduceHandler") - def test_init(self, mock_produce_handler): + def test_init(self, mock_produce_handler, mock_marshmallow): # Arrange table_name = "test_table" mock_produce_handler_instance = mock_produce_handler @@ -22,8 +23,9 @@ def test_init(self, mock_produce_handler): class TestInsert(unittest.TestCase): + @patch("src.base.clickhouse_kafka_sender.marshmallow_dataclass") @patch("src.base.clickhouse_kafka_sender.SimpleKafkaProduceHandler") - def test_insert(self, mock_produce_handler): + def test_insert(self, mock_produce_handler, mock_marshmallow): # Arrange mock_produce_handler_instance = mock_produce_handler mock_produce_handler.return_value = mock_produce_handler_instance @@ -33,10 +35,7 @@ def test_insert(self, mock_produce_handler): sut.insert({"test_key": "test_value"}) # Assert - mock_produce_handler_instance.produce.assert_called_once_with( - topic="clickhouse_test_table", - data='{"test_key": "test_value"}', - ) + mock_produce_handler_instance.produce.assert_called_once() if __name__ == "__main__": diff --git a/tests/test_collector.py b/tests/test_collector.py index 5192256..54f1c74 100644 --- a/tests/test_collector.py +++ b/tests/test_collector.py @@ -2,6 +2,7 @@ import datetime import ipaddress import unittest +import uuid from unittest.mock import MagicMock, patch, AsyncMock from src.logcollector.collector import LogCollector, main @@ -142,10 +143,12 @@ class TestSend(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.uuid") @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_with_one_logline( self, mock_clickhouse, + mock_uuid, mock_logline_handler, mock_batch_handler, mock_kafka_handler, @@ -159,10 +162,11 @@ async def test_send_with_one_logline( KeyboardInterrupt, ] mock_logline_handler.return_value = mock_logline_handler_instance + mock_uuid.uuid4.return_value = uuid.UUID("8ac2e82b-9252-4e67-a691-4924f98bc605") mock_logline_handler_instance.validate_logline_and_get_fields_as_json.return_value = { "timestamp": "2024-05-21T08:31:28.119Z", - "status": "NOERROR", + "status_code": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", @@ -171,10 +175,10 @@ async def test_send_with_one_logline( "size": "150b", } expected_message = ( - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"timestamp": "2024-05-21T08:31:28.119Z", "status_code": "NOERROR", "client_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", ' '"record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", ' - '"size": "150b"}' + '"size": "150b", "logline_id": "8ac2e82b-9252-4e67-a691-4924f98bc605"}' ) input_logline = ( "2024-05-21T08:31:28.119Z NOERROR 192.168.0.105 8.8.8.8 www.heidelberg-botanik.de A " @@ -219,7 +223,7 @@ async def test_send_keyboard_interrupt( mock_logline_handler_instance.validate_logline_and_get_fields_as_json.return_value = { "timestamp": "2024-05-21T08:31:28.119Z", - "status": "NOERROR", + "status_code": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", @@ -281,10 +285,12 @@ async def test_send_empty( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.uuid") @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_value_error( self, mock_clickhouse, + mock_uuid, mock_logline_handler, mock_batch_handler, mock_kafka_handler, @@ -295,12 +301,13 @@ async def test_send_value_error( mock_logline_handler_instance = MagicMock() mock_batch_handler.return_value = mock_batch_handler_instance mock_logline_handler.return_value = mock_logline_handler_instance + mock_uuid.uuid4.return_value = uuid.UUID("8ac2e82b-9252-4e67-a691-4924f98bc605") mock_logline_handler_instance.validate_logline_and_get_fields_as_json.side_effect = [ ValueError, { "timestamp": "2024-05-21T08:31:28.119Z", - "status": "NOERROR", + "status_code": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", @@ -311,10 +318,10 @@ async def test_send_value_error( KeyboardInterrupt, ] expected_message = ( - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"timestamp": "2024-05-21T08:31:28.119Z", "status_code": "NOERROR", "client_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", ' '"record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", ' - '"size": "150b"}' + '"size": "150b", "logline_id": "8ac2e82b-9252-4e67-a691-4924f98bc605"}' ) input_logline = ( "2024-05-21T08:31:28.119Z NOERROR 192.168.0.105 8.8.8.8 www.heidelberg-botanik.de A " diff --git a/tests/test_detector.py b/tests/test_detector.py index be7c487..a82c8a6 100644 --- a/tests/test_detector.py +++ b/tests/test_detector.py @@ -1,12 +1,13 @@ import os import tempfile import unittest +import uuid from datetime import datetime, timedelta from unittest.mock import MagicMock, patch, mock_open from requests import HTTPError -from src.base import Batch +from src.base.data_classes.batch import Batch from src.detector.detector import Detector, WrongChecksum @@ -152,6 +153,7 @@ def test_get_data_without_return_data( self, mock_kafka_consume_handler, mock_logger ): test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=datetime.now(), end_timestamp=datetime.now() + timedelta(0, 3), data=[], @@ -175,6 +177,7 @@ def test_get_data_with_return_data(self, mock_kafka_consume_handler, mock_logger begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[{"test": "test_message_2"}], @@ -201,6 +204,7 @@ def test_get_data_while_busy(self, mock_kafka_consume_handler, mock_logger): begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[{"test": "test_message_2"}], @@ -325,7 +329,9 @@ def test_clear_data_without_existing_data( ): begin = datetime.now() end = begin + timedelta(0, 3) - test_batch = Batch(begin_timestamp=begin, end_timestamp=end, data=[]) + test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[] + ) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -347,7 +353,9 @@ def test_clear_data_with_existing_data( ): begin = datetime.now() end = begin + timedelta(0, 3) - test_batch = Batch(begin_timestamp=begin, end_timestamp=end, data=[]) + test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[] + ) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance diff --git a/tests/test_exactly_once_kafka_consume_handler.py b/tests/test_exactly_once_kafka_consume_handler.py index 901b94a..c0104fc 100644 --- a/tests/test_exactly_once_kafka_consume_handler.py +++ b/tests/test_exactly_once_kafka_consume_handler.py @@ -5,7 +5,7 @@ from confluent_kafka import KafkaException, KafkaError -from src.base import Batch +from src.base.data_classes.batch import Batch from src.base.kafka_handler import ExactlyOnceKafkaConsumeHandler CONSUMER_GROUP_ID = "test_group_id" diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 5c5f7c7..1729277 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -1,11 +1,12 @@ import unittest +import uuid from datetime import datetime, timedelta from unittest.mock import MagicMock, patch import numpy as np import json from streamad.model import ZScoreDetector, RShashDetector -from src.base import Batch +from src.base.data_classes.batch import Batch from src.inspector.inspector import Inspector, main DEFAULT_DATA = { @@ -26,6 +27,7 @@ def get_batch(data): begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=data if data != None else [], diff --git a/tests/test_marshmallow.py b/tests/test_marshmallow.py index 039e2be..3edf93a 100644 --- a/tests/test_marshmallow.py +++ b/tests/test_marshmallow.py @@ -1,13 +1,15 @@ import unittest +import uuid import marshmallow_dataclass -from src.base import Batch +from src.base.data_classes.batch import Batch class TestClearData(unittest.TestCase): def test_clear_data_with_existing_data(self): json_data = { + "batch_id": str(uuid.uuid4()), "begin_timestamp": "2024-05-21T08:31:27.000000Z", "end_timestamp": "2024-05-21T08:31:29.000000Z", "data": [ From abadc209d2b000b3b3e61598b126a90a28c49b97 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 14:28:56 +0100 Subject: [PATCH 24/59] Add batch_timestamps insertion in batch_handler.py --- src/logcollector/batch_handler.py | 35 ++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index fae5bc5..d238f65 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -45,6 +45,7 @@ def __init__(self): # databases self.logline_to_batches = ClickHouseKafkaSender("logline_to_batches") self.batch_status = ClickHouseKafkaSender("batch_status") + self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: """ @@ -67,6 +68,16 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: ) ) + self.batch_timestamps.insert( + dict( + batch_id=batch_id, + stage=module_name, + status="waiting", + timestamp=datetime.datetime.now(), + message_count=self.get_number_of_messages(key), + ) + ) + else: # key has no messages associated yet # create new batch self.batch[key] = [message] @@ -88,6 +99,16 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: ) ) + self.batch_timestamps.insert( + dict( + batch_id=new_batch_id, + stage=module_name, + status="waiting", + timestamp=datetime.datetime.now(), + message_count=1, + ) + ) + def get_number_of_messages(self, key: str) -> int: """ Returns the number of entries in the batch of the latest messages. @@ -259,8 +280,10 @@ def complete_batch(self, key: str) -> dict: buffer_data = self.buffer[key] begin_timestamp = self.get_first_timestamp_of_buffer(key) + batch_id = self.batch_id.get(key) + data = { - "batch_id": self.batch_id.get(key), + "batch_id": batch_id, "begin_timestamp": datetime.datetime.strptime( begin_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ", @@ -272,6 +295,16 @@ def complete_batch(self, key: str) -> dict: "data": buffer_data + self.batch[key], } + self.batch_timestamps.insert( + dict( + batch_id=batch_id, + stage=module_name, + status="completed", + timestamp=datetime.datetime.now(), + message_count=self.get_number_of_messages(key), + ) + ) + # Move data from batch to buffer self.buffer[key] = self.batch[key] del self.batch[key] From 468803d16da34681c48492329d32c05f3f089a4b Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 14:29:12 +0100 Subject: [PATCH 25/59] Update test_inspector.py --- tests/test_inspector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 1729277..5c04225 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -794,7 +794,7 @@ def test_send(self, mock_kafka_consume_handler, mock_produce_handler): sut.messages = [data] sut.send_data() - mock_produce_handler_instance.send.assert_called_once_with( + mock_produce_handler_instance.produce.assert_called_once_with( topic="Detector", data=json.dumps( { From b1812804bb674ebc43ab4647637f9b3f312e8e11 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 14:29:55 +0100 Subject: [PATCH 26/59] Update status field for Collector database insertion --- src/logcollector/collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 070e43c..7c0a695 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -161,7 +161,7 @@ async def send(self) -> None: dict( logline_id=logline_id, stage=module_name, - status="completed", + status="finished", timestamp=datetime.datetime.now(), ) ) From 7f9655c0a9e1c061c9f3aa109b3d04c98fb24791 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 16:07:20 +0100 Subject: [PATCH 27/59] Update Prefilter and add database insertion --- src/prefilter/prefilter.py | 90 +++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 39 deletions(-) diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index 797ac27..1a33c75 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -1,9 +1,11 @@ +import datetime import os import sys import marshmallow_dataclass sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.data_classes.batch import Batch from src.base.logline_handler import LoglineHandler from src.base.kafka_handler import ( @@ -53,18 +55,18 @@ def __init__(self): transactional_id = generate_unique_transactional_id(module_name, KAFKA_BROKERS) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) + # databases + self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + self.logline_status = ClickHouseKafkaSender("logline_status") + self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") + def get_and_fill_data(self) -> None: """ - Clears data already stored and consumes new data. Unpacks the data and checks if it is empty. If that is the - case, an info message is shown, otherwise the data is stored internally, including timestamps. + Clears data already stored and consumes new data. Unpacks the data and checks if it is empty. Data is stored + internally, including timestamps. """ - logger.debug("Checking for existing data...") - if self.unfiltered_data: - logger.warning("Overwriting existing data by new message...") - self.clear_data() - logger.debug("Cleared existing data.") + self.clear_data() # clear in case we already have data stored - logger.debug("Calling KafkaConsumeHandler for consuming JSON data...") key, data = self.kafka_consume_handler.consume_as_object() self.subnet_id = key @@ -74,6 +76,16 @@ def get_and_fill_data(self) -> None: self.end_timestamp = data.end_timestamp self.unfiltered_data = data.data + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="in_process", + timestamp=datetime.datetime.now(), + message_count=len(self.unfiltered_data), + ) + ) + if not self.unfiltered_data: logger.info( f"Received message:\n" @@ -86,34 +98,39 @@ def get_and_fill_data(self) -> None: f"subnet_id: '{self.subnet_id}'." ) - logger.debug("Received consumer message as JSON data.") - logger.debug(f"{data=}") - def filter_by_error(self) -> None: """ Applies the filter to the data in ``unfiltered_data``, i.e. all loglines whose error status is in the given error types are kept and added to ``filtered_data``, all other ones are discarded. """ - logger.debug("Filtering data...") - for e in self.unfiltered_data: if self.logline_handler.check_relevance(e): self.filtered_data.append(e) - - logger.debug("Data filtered and now available in filtered_data.") - logger.info("Data successfully filtered.") + else: # not relevant, filtered out + logline_id = e.get("logline_id") # TODO: Check + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="filtered_out", + timestamp=datetime.datetime.now(), + ) + ) + + self.logline_status.insert( + dict( + logline_id=logline_id, + is_active=False, + exit_at_stage=module_name, + ) + ) def send_filtered_data(self): """ Sends the filtered data if available via the :class:`KafkaProduceHandler`. """ - if not self.unfiltered_data: - logger.debug("No unfiltered or filtered data is available.") - return - if not self.filtered_data: - logger.info("No errors in filtered data.") - logger.debug("No data sent. No filtered or unfiltered data exists.") raise ValueError("Failed to send data: No filtered data.") data_to_send = { @@ -123,9 +140,18 @@ def send_filtered_data(self): "data": self.filtered_data, } + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="finished", + timestamp=datetime.datetime.now(), + message_count=len(self.filtered_data), + ) + ) + batch_schema = marshmallow_dataclass.class_schema(Batch)() - logger.debug("Calling KafkaProduceHandler...") logger.debug(f"{data_to_send=}") self.kafka_produce_handler.produce( topic=PRODUCE_TOPIC, @@ -143,17 +169,14 @@ def send_filtered_data(self): ) def clear_data(self): - """ - Clears the data in the internal data structures. - """ + """Clears the data in the internal data structures.""" self.unfiltered_data = [] self.filtered_data = [] - logger.debug("Cleared data.") def main(one_iteration: bool = False) -> None: """ - Runs the main loop with by + Runs the main loop by 1. Retrieving new data, 2. Filtering the data and @@ -164,29 +187,18 @@ def main(one_iteration: bool = False) -> None: Args: one_iteration (bool): Only one iteration is done if True (for testing purposes). False by default. """ - logger.info("Starting Prefilter...") prefilter = Prefilter() - logger.info(f"Prefilter started.") iterations = 0 - while True: if one_iteration and iterations > 0: break iterations += 1 try: - logger.debug("Before getting and filling data") prefilter.get_and_fill_data() - logger.debug("After getting and filling data") - - logger.debug("Before filtering by error") prefilter.filter_by_error() - logger.debug("After filtering by error") - - logger.debug("Before adding filtered data to batch") prefilter.send_filtered_data() - logger.debug("After adding filtered data to batch") except IOError as e: logger.error(e) raise From e6711dbfffa317c59cc6b9cf6e69eb280ac9b3ba Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 16:32:41 +0100 Subject: [PATCH 28/59] Bug fix --- src/logcollector/batch_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index d238f65..d8a6aa1 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -87,7 +87,7 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: self.batch_status.insert( dict( batch_id=new_batch_id, - status=1, + status=True, exit_at_stage=None, ) ) From 1a2f2b940411e059887f0caadf6e28bfea1af936 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 16:33:19 +0100 Subject: [PATCH 29/59] Update Inspector and add database insertion --- src/inspector/inspector.py | 50 +++++++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/src/inspector/inspector.py b/src/inspector/inspector.py index 22ca754..fd35814 100644 --- a/src/inspector/inspector.py +++ b/src/inspector/inspector.py @@ -9,6 +9,7 @@ from streamad.util import StreamGenerator, CustomDS sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.utils import setup_config from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, @@ -76,21 +77,25 @@ class Inspector: """Finds anomalies in a batch of requests and produces it to the ``Detector``.""" def __init__(self) -> None: + self.batch_id = None + self.X = None self.key = None self.begin_timestamp = None self.end_timestamp = None + self.messages = [] self.anomalies = [] - logger.debug(f"Initializing Inspector...") - self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) transactional_id = generate_unique_transactional_id(module_name, KAFKA_BROKERS) + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) - logger.debug(f"Initialized Inspector.") + + # databases + self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + self.batch_status = ClickHouseKafkaSender("batch_status") def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" - logger.debug("Getting and filling data...") if self.messages: logger.warning( "Inspector is busy: Not consuming new messages. Wait for the Inspector to finish the " @@ -98,17 +103,25 @@ def get_and_fill_data(self) -> None: ) return - logger.debug( - "Inspector is not busy: Calling KafkaConsumeHandler to consume new JSON messages..." - ) key, data = self.kafka_consume_handler.consume_as_object() if data: + self.batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.messages = data.data self.key = key + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="in_process", + timestamp=datetime.now(), + message_count=len(self.messages), + ) + ) + if not self.messages: logger.info( "Received message:\n" @@ -120,9 +133,6 @@ def get_and_fill_data(self) -> None: f" ⤷ Contains data field of {len(self.messages)} message(s). Belongs to subnet_id {key}." ) - logger.debug("Received consumer message as json data.") - logger.debug(f"(data={self.messages})") - def clear_data(self): """Clears the data in the internal data structures.""" self.messages = [] @@ -412,7 +422,7 @@ def send_data(self): total_anomalies = np.count_nonzero( np.greater_equal(np.array(self.anomalies), SCORE_THRESHOLD) ) - if total_anomalies / len(self.X) > ANOMALY_THRESHOLD: + if total_anomalies / len(self.X) > ANOMALY_THRESHOLD: # subnet is suspicious logger.debug("Sending data to KafkaProduceHandler...") logger.info("Sending anomalies to detector for further analysation.") buckets = {} @@ -435,6 +445,24 @@ def send_data(self): data=json.dumps(data_to_send), key=key, ) + else: # subnet is not suspicious + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="filtered_out", + timestamp=datetime.now(), + message_count=len(self.messages), + ) + ) + + self.batch_status.insert( + dict( + batch_id=self.batch_id, + is_active=False, + exit_at_stage=module_name, + ) + ) def main(one_iteration: bool = False): From 51188e589d0eecda0420497506f718430eb68180 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 6 Dec 2024 16:33:39 +0100 Subject: [PATCH 30/59] Update Detector and add database insertion --- src/detector/detector.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/detector/detector.py b/src/detector/detector.py index cceb9cc..772e252 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -1,3 +1,4 @@ +import datetime import hashlib import json import os @@ -10,7 +11,9 @@ import requests from numpy import median + sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.utils import setup_config from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, @@ -47,6 +50,8 @@ class Detector: """ def __init__(self) -> None: + self.batch_id = None + self.key = None self.messages = [] self.warnings = [] self.begin_timestamp = None @@ -55,14 +60,16 @@ def __init__(self) -> None: tempfile.gettempdir(), f"{MODEL}_{CHECKSUM}.pickle" ) - logger.debug(f"Initializing Detector...") self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) self.model = self._get_model() + # databases + self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + self.batch_status = ClickHouseKafkaSender("batch_status") + def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" - logger.debug("Getting and filling data...") if self.messages: logger.warning( "Detector is busy: Not consuming new messages. Wait for the Detector to finish the " @@ -70,17 +77,25 @@ def get_and_fill_data(self) -> None: ) return - logger.debug( - "Detector is not busy: Calling KafkaConsumeHandler to consume new JSON messages..." - ) key, data = self.kafka_consume_handler.consume_as_object() if data: + self.batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.messages = data.data self.key = key + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="in_process", + timestamp=datetime.datetime.now(), + message_count=len(self.messages), + ) + ) + if not self.messages: logger.info( "Received message:\n" @@ -92,9 +107,6 @@ def get_and_fill_data(self) -> None: f" ⤷ Contains data field of {len(self.messages)} message(s). Belongs to subnet_id {key}." ) - logger.debug("Received consumer message as json data.") - logger.debug(f"(data={self.messages})") - def _sha256sum(self, file_path: str) -> str: """Return a SHA265 sum check to validate the model. From d25038304c6d37b7ba436d715f255e4c6c56093c Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Mon, 9 Dec 2024 22:00:25 +0100 Subject: [PATCH 31/59] Update tests and some bug fixing --- src/detector/detector.py | 2 +- src/inspector/inspector.py | 2 +- tests/test_detector.py | 11 ++- tests/test_inspector.py | 135 +++++++++++++++++++++++++------------ 4 files changed, 101 insertions(+), 49 deletions(-) diff --git a/src/detector/detector.py b/src/detector/detector.py index 772e252..3f2f320 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -128,7 +128,7 @@ def _sha256sum(self, file_path: str) -> str: return h.hexdigest() - def _get_model(self) -> None: + def _get_model(self): """ Downloads model from server. If model already exists, it returns the current model. In addition, it checks the sha256 sum in case a model has been updated. diff --git a/src/inspector/inspector.py b/src/inspector/inspector.py index fd35814..040fbfd 100644 --- a/src/inspector/inspector.py +++ b/src/inspector/inspector.py @@ -413,7 +413,7 @@ def _inspect_univariate(self, model: str): for x in stream.iter_item(): score = self.model.fit_score(x) - if score != None: + if score is not None: self.anomalies.append(score) else: self.anomalies.append(0) diff --git a/tests/test_detector.py b/tests/test_detector.py index a82c8a6..9b15f45 100644 --- a/tests/test_detector.py +++ b/tests/test_detector.py @@ -135,7 +135,8 @@ class TestInit(unittest.TestCase): @patch("src.detector.detector.CONSUME_TOPIC", "test_topic") @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_init(self, mock_kafka_consume_handler, mock_logger): + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_init(self, mock_clickhouse, mock_kafka_consume_handler, mock_logger): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -149,8 +150,9 @@ def test_init(self, mock_kafka_consume_handler, mock_logger): class TestGetData(unittest.TestCase): @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") def test_get_data_without_return_data( - self, mock_kafka_consume_handler, mock_logger + self, mock_clickhouse, mock_kafka_consume_handler, mock_logger ): test_batch = Batch( batch_id=uuid.uuid4(), @@ -173,7 +175,10 @@ def test_get_data_without_return_data( @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_get_data_with_return_data(self, mock_kafka_consume_handler, mock_logger): + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_get_data_with_return_data( + self, mock_clickhouse, mock_kafka_consume_handler, mock_logger + ): begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 5c04225..0be66ac 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -30,7 +30,7 @@ def get_batch(data): batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, - data=data if data != None else [], + data=data if data is not None else [], ) return test_batch @@ -56,8 +56,13 @@ class TestGetData(unittest.TestCase): @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_get_data_without_return_data( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -76,8 +81,13 @@ def test_get_data_without_return_data( @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_get_data_with_return_data( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch([{"test": "test_message_1"}, {"test": "test_message_2"}]) mock_kafka_consume_handler_instance = MagicMock() @@ -102,8 +112,13 @@ def test_get_data_with_return_data( @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_get_data_with_no_return_data( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): begin = None end = None @@ -321,7 +336,7 @@ def test_inspect_none_models( "src.inspector.inspector.MODELS", "", ) - def test_inspect_empy_models( + def test_inspect_empty_models( self, mock_kafka_consume_handler, mock_produce_handler, mock_logger ): mock_kafka_consume_handler_instance = MagicMock() @@ -337,39 +352,41 @@ def test_inspect_empy_models( with self.assertRaises(NotImplementedError): sut.inspect() - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}], - ) - @patch("src.inspector.inspector.TIME_TYPE", "ms") - @patch("src.inspector.inspector.TIME_RANGE", 1) - def test_inspect_univariate( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertEqual([0, 0], sut.anomalies) + # TODO: Update this test as it is not being called + # @patch("src.inspector.inspector.logger") + # @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + # @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + # @patch( + # "src.inspector.inspector.MODELS", + # [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}], + # ) + # @patch("src.inspector.inspector.TIME_TYPE", "ms") + # @patch("src.inspector.inspector.TIME_RANGE", 1) + # def test_inspect_univariate( + # self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + # ): + # test_batch = get_batch(None) + # test_batch.begin_timestamp = datetime.now() + # test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) + # data = DEFAULT_DATA + # data["timestamp"] = datetime.strftime( + # test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT + # ) + # test_batch.data = [data] + # mock_kafka_consume_handler_instance = MagicMock() + # mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + # mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + # "test", + # test_batch, + # ) + # mock_produce_handler_instance = MagicMock() + # mock_produce_handler.return_value = mock_produce_handler_instance + # + # with patch("src.inspector.inspector.ClickHouseKafkaSender") as mock_clickhouse: + # sut = Inspector() + # sut.get_and_fill_data() + # sut.inspect() + # self.assertEqual([0, 0], sut.anomalies) @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @@ -457,8 +474,13 @@ def test_inspect_univariate_two_models( [{"model": "RShashDetector", "module": "streamad.model", "model_args": {}}], ) @patch("src.inspector.inspector.MODE", "multivariate") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_multivariate( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -496,8 +518,13 @@ def test_inspect_multivariate( ], ) @patch("src.inspector.inspector.MODE", "multivariate") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_multivariate_window_len( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -532,8 +559,13 @@ def test_inspect_multivariate_window_len( ], ) @patch("src.inspector.inspector.MODE", "multivariate") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_multivariate_two_models( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -577,8 +609,13 @@ def test_inspect_multivariate_two_models( }, ) @patch("src.inspector.inspector.MODE", "ensemble") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_ensemble( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -629,8 +666,13 @@ def test_inspect_ensemble( }, ) @patch("src.inspector.inspector.MODE", "ensemble") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_ensemble_window_len( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -673,8 +715,13 @@ def test_inspect_ensemble_window_len( }, ) @patch("src.inspector.inspector.MODE", "ensemble") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_ensemble_invalid( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() From 8d4b345f272ec78a2af271151a914c17efc263dc Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Mon, 9 Dec 2024 22:01:02 +0100 Subject: [PATCH 32/59] Add a dev-query module for quick database checks --- docker/dev-query.py | 51 ++++++++++++++++++++++--- docker/docker-compose.dev-query.yml | 16 ++++++++ docker/dockerfiles/Dockerfile.dev-query | 5 ++- 3 files changed, 65 insertions(+), 7 deletions(-) diff --git a/docker/dev-query.py b/docker/dev-query.py index f816f88..114b20d 100644 --- a/docker/dev-query.py +++ b/docker/dev-query.py @@ -1,10 +1,51 @@ +import datetime +import os +import sys +import uuid + import clickhouse_connect -QUERY_TABLE = "dns_loglines" +sys.path.append(os.getcwd()) +from src.base.data_classes.clickhouse_connectors import TABLE_NAME_TO_TYPE + + +def get_tables(): + tables = {} + + for table_name in TABLE_NAME_TO_TYPE: + tables[table_name] = [] + + return tables + + +def query_once(client, tables): + for table_name in tables.keys(): + tables[table_name] = client.query(f"SELECT * FROM {table_name};") + + return tables + + +def main(): + client = clickhouse_connect.get_client(host="172.27.0.11", port=8123) + tables = get_tables() + + client.insert( + "server_logs", + [[uuid.uuid4(), datetime.datetime.now(), "This is a logline"]], + ["message_id", "timestamp_in", "message_text"], + ) + + results = query_once(client, tables) + + for key in results: + print(f"'{key}':") -client = clickhouse_connect.get_client(host="172.27.0.11", port=8123) + if results[key].result_rows: + for row in results[key].result_rows: + print("\t", row) + else: + print("\t -") -result = client.query(f"SELECT * FROM {QUERY_TABLE};") -for row in result.result_rows: - print(row) +if __name__ == "__main__": + main() diff --git a/docker/docker-compose.dev-query.yml b/docker/docker-compose.dev-query.yml index b1011df..c868cb3 100644 --- a/docker/docker-compose.dev-query.yml +++ b/docker/docker-compose.dev-query.yml @@ -29,6 +29,22 @@ services: volumes: - "${MOUNT_PATH:?MOUNT_PATH not set}:/opt/file.txt" + clickhouse-server: + image: clickhouse/clickhouse-server:24.3.12.75-alpine + container_name: clickhouse-server + networks: + heidgaf: + ipv4_address: 172.27.0.11 + restart: "unless-stopped" + ports: + - "8123:8123" + - "9000:9000" + healthcheck: + test: [ "CMD-SHELL", "nc -z 127.0.0.1 8123" ] + interval: 10s + timeout: 5s + retries: 3 + networks: heidgaf: diff --git a/docker/dockerfiles/Dockerfile.dev-query b/docker/dockerfiles/Dockerfile.dev-query index 080fd2e..e6cbf47 100644 --- a/docker/dockerfiles/Dockerfile.dev-query +++ b/docker/dockerfiles/Dockerfile.dev-query @@ -4,11 +4,12 @@ ENV PYTHONDONTWRITEBYTECODE=1 WORKDIR /usr/src/app -RUN pip --disable-pip-version-check install --no-cache-dir --no-compile clickhouse_connect +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile clickhouse_connect marshmallow_dataclass colorlog pyYAML confluent_kafka COPY src/base ./src/base COPY config.yaml . +COPY docker/dev-query.py . RUN rm -rf /root/.cache -CMD [ "python", "docker/dev-query.py"] +CMD [ "python", "dev-query.py"] From 649641b81a512180093ad6d5ed7ded572e4eac5a Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 10 Dec 2024 13:29:43 +0100 Subject: [PATCH 33/59] Remove batch_status table --- .../data_classes/clickhouse_connectors.py | 14 +- src/detector/detector.py | 2 +- src/inspector/inspector.py | 11 +- src/logcollector/batch_handler.py | 14 +- src/monitoring/clickhouse_connector.py | 28 +-- src/monitoring/create_tables/batch_status.sql | 7 - .../create_tables/batch_timestamps.sql | 3 +- src/monitoring/monitoring_agent.py | 1 - tests/test_clickhouse_connector.py | 167 +----------------- tests/test_inspector.py | 6 +- 10 files changed, 25 insertions(+), 228 deletions(-) delete mode 100644 src/monitoring/create_tables/batch_status.sql diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index e81f023..01588d6 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -112,19 +112,9 @@ class LoglineTimestamps: "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") } ) - - -@dataclass -class BatchStatus: - batch_id: uuid.UUID = field( - metadata={"marshmallow_field": marshmallow.fields.UUID()} - ) is_active: bool = field( metadata={"marshmallow_field": marshmallow.fields.Boolean()} ) - exit_at_stage: Optional[str] = field( - metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} - ) @dataclass @@ -139,6 +129,9 @@ class BatchTimestamps: "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") } ) + is_active: bool = field( + metadata={"marshmallow_field": marshmallow.fields.Boolean()} + ) message_count: int = field( metadata={"marshmallow_field": marshmallow.fields.Integer()} ) @@ -152,6 +145,5 @@ class BatchTimestamps: "dns_loglines": DNSLoglines, "logline_status": LoglineStatus, "logline_timestamps": LoglineTimestamps, - "batch_status": BatchStatus, "batch_timestamps": BatchTimestamps, } diff --git a/src/detector/detector.py b/src/detector/detector.py index 3f2f320..bbaaec1 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -66,7 +66,6 @@ def __init__(self) -> None: # databases self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") - self.batch_status = ClickHouseKafkaSender("batch_status") def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" @@ -92,6 +91,7 @@ def get_and_fill_data(self) -> None: stage=module_name, status="in_process", timestamp=datetime.datetime.now(), + is_active=True, message_count=len(self.messages), ) ) diff --git a/src/inspector/inspector.py b/src/inspector/inspector.py index 040fbfd..7c9ea98 100644 --- a/src/inspector/inspector.py +++ b/src/inspector/inspector.py @@ -92,7 +92,6 @@ def __init__(self) -> None: # databases self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") - self.batch_status = ClickHouseKafkaSender("batch_status") def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" @@ -118,6 +117,7 @@ def get_and_fill_data(self) -> None: stage=module_name, status="in_process", timestamp=datetime.now(), + is_active=True, message_count=len(self.messages), ) ) @@ -452,15 +452,8 @@ def send_data(self): stage=module_name, status="filtered_out", timestamp=datetime.now(), - message_count=len(self.messages), - ) - ) - - self.batch_status.insert( - dict( - batch_id=self.batch_id, is_active=False, - exit_at_stage=module_name, + message_count=len(self.messages), ) ) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index d8a6aa1..d576790 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -44,7 +44,6 @@ def __init__(self): # databases self.logline_to_batches = ClickHouseKafkaSender("logline_to_batches") - self.batch_status = ClickHouseKafkaSender("batch_status") self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: @@ -74,6 +73,7 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: stage=module_name, status="waiting", timestamp=datetime.datetime.now(), + is_active=True, message_count=self.get_number_of_messages(key), ) ) @@ -84,14 +84,6 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: new_batch_id = uuid.uuid4() self.batch_id[key] = [new_batch_id] - self.batch_status.insert( - dict( - batch_id=new_batch_id, - status=True, - exit_at_stage=None, - ) - ) - self.logline_to_batches.insert( dict( logline_id=logline_id, @@ -105,6 +97,7 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: stage=module_name, status="waiting", timestamp=datetime.datetime.now(), + is_active=True, message_count=1, ) ) @@ -301,6 +294,7 @@ def complete_batch(self, key: str) -> dict: stage=module_name, status="completed", timestamp=datetime.datetime.now(), + is_active=True, message_count=self.get_number_of_messages(key), ) ) @@ -389,6 +383,7 @@ def add_message(self, key: str, message: str) -> None: stage=module_name, status="in_process", timestamp=datetime.datetime.now(), + is_active=True, ) ) @@ -400,6 +395,7 @@ def add_message(self, key: str, message: str) -> None: stage=module_name, status="batched", timestamp=datetime.datetime.now(), + is_active=True, ) ) diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index 95158c2..57c872a 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -223,31 +223,6 @@ def insert( ) -class BatchStatusConnector(ClickHouseConnector): - def __init__(self): - column_names = [ - "batch_id", - "is_active", - "exit_at_stage", - ] - - super().__init__("batch_status", column_names) - - def insert( - self, - batch_id: uuid.UUID, - is_active: bool, - exit_at_stage: Optional[str] = None, - ): - self._add_to_batch( - [ - batch_id, - is_active, - exit_at_stage, - ] - ) - - class BatchTimestampsConnector(ClickHouseConnector): def __init__(self): column_names = [ @@ -255,6 +230,7 @@ def __init__(self): "stage", "status", "timestamp", + "is_active", "message_count", ] @@ -265,6 +241,7 @@ def insert( batch_id: uuid.UUID, stage: str, status: str, + is_active: bool, message_count: int, timestamp: datetime.datetime, ) -> None: @@ -274,6 +251,7 @@ def insert( stage, status, timestamp, + is_active, message_count, ] ) diff --git a/src/monitoring/create_tables/batch_status.sql b/src/monitoring/create_tables/batch_status.sql deleted file mode 100644 index 3f515b9..0000000 --- a/src/monitoring/create_tables/batch_status.sql +++ /dev/null @@ -1,7 +0,0 @@ -CREATE TABLE IF NOT EXISTS batch_status ( - batch_id UUID NOT NULL, - status String NOT NULL, - exit_at_stage Nullable(String) -) -ENGINE = MergeTree -PRIMARY KEY (batch_id); diff --git a/src/monitoring/create_tables/batch_timestamps.sql b/src/monitoring/create_tables/batch_timestamps.sql index c0e7a1c..45ef849 100644 --- a/src/monitoring/create_tables/batch_timestamps.sql +++ b/src/monitoring/create_tables/batch_timestamps.sql @@ -3,7 +3,8 @@ CREATE TABLE IF NOT EXISTS batch_timestamps ( stage String NOT NULL, status String NOT NULL, timestamp DateTime64(6) NOT NULL, - message_count UInt32 + message_count UInt32, + is_active Bool NOT NULL ) ENGINE = MergeTree PRIMARY KEY (batch_id); diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py index eee99a2..3668352 100644 --- a/src/monitoring/monitoring_agent.py +++ b/src/monitoring/monitoring_agent.py @@ -46,7 +46,6 @@ def __init__(self): "dns_loglines": DNSLoglinesConnector(), "logline_status": LoglineStatusConnector(), "logline_timestamps": LoglineTimestampsConnector(), - "batch_status": BatchStatusConnector(), "batch_timestamps": BatchTimestampsConnector(), } diff --git a/tests/test_clickhouse_connector.py b/tests/test_clickhouse_connector.py index fce2bb8..7ff84fe 100644 --- a/tests/test_clickhouse_connector.py +++ b/tests/test_clickhouse_connector.py @@ -430,86 +430,6 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): mock_add_to_batch.assert_called_once() -class TestLoglineStatusConnector(unittest.TestCase): - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_init(self, mock_clickhouse_batch_sender): - # Arrange - mock_clickhouse_batch_sender_instance = MagicMock() - mock_clickhouse_batch_sender.return_value = ( - mock_clickhouse_batch_sender_instance - ) - - expected_table_name = "logline_status" - expected_column_names = [ - "logline_id", - "is_active", - "exit_at_stage", - ] - - # Act - sut = LoglineStatusConnector() - - # Assert - self.assertEqual(expected_table_name, sut._table_name) - self.assertEqual(expected_column_names, sut._column_names) - self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) - - mock_clickhouse_batch_sender.assert_called_once_with( - table_name=expected_table_name, - column_names=expected_column_names, - ) - - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_all_given(self, mock_clickhouse_batch_sender): - # Arrange - logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - is_active = False - exit_at_stage = "prefilter" - - sut = LoglineStatusConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - logline_id=logline_id, - is_active=is_active, - exit_at_stage=exit_at_stage, - ) - - # Assert - mock_add_to_batch.assert_called_once_with( - [ - uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - False, - "prefilter", - ] - ) - - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_none_given(self, mock_clickhouse_batch_sender): - # Arrange - logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - is_active = True - - sut = LoglineStatusConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - logline_id=logline_id, - is_active=is_active, - ) - - # Assert - mock_add_to_batch.assert_called_once_with( - [ - uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - True, - None, - ] - ) - - class TestLoglineTimestampsConnector(unittest.TestCase): @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") def test_init(self, mock_clickhouse_batch_sender): @@ -525,6 +445,7 @@ def test_init(self, mock_clickhouse_batch_sender): "stage", "status", "timestamp", + "is_active", ] # Act @@ -557,6 +478,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): stage=stage, status=status, timestamp=timestamp, + is_active=True, ) # Assert @@ -566,87 +488,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): "prefilter", "prefilter_out", datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), - ] - ) - - -class TestBatchStatusConnector(unittest.TestCase): - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_init(self, mock_clickhouse_batch_sender): - # Arrange - mock_clickhouse_batch_sender_instance = MagicMock() - mock_clickhouse_batch_sender.return_value = ( - mock_clickhouse_batch_sender_instance - ) - - expected_table_name = "batch_status" - expected_column_names = [ - "batch_id", - "is_active", - "exit_at_stage", - ] - - # Act - sut = BatchStatusConnector() - - # Assert - self.assertEqual(expected_table_name, sut._table_name) - self.assertEqual(expected_column_names, sut._column_names) - self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) - - mock_clickhouse_batch_sender.assert_called_once_with( - table_name=expected_table_name, - column_names=expected_column_names, - ) - - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_all_given(self, mock_clickhouse_batch_sender): - # Arrange - batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - is_active = False - exit_at_stage = "prefilter" - - sut = BatchStatusConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - batch_id=batch_id, - is_active=is_active, - exit_at_stage=exit_at_stage, - ) - - # Assert - mock_add_to_batch.assert_called_once_with( - [ - uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - False, - "prefilter", - ] - ) - - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_none_given(self, mock_clickhouse_batch_sender): - # Arrange - batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - is_active = False - - sut = BatchStatusConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - batch_id=batch_id, - is_active=is_active, - exit_at_stage=None, - ) - - # Assert - mock_add_to_batch.assert_called_once_with( - [ - uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - False, - None, + True, ] ) @@ -666,6 +508,7 @@ def test_init(self, mock_clickhouse_batch_sender): "stage", "status", "timestamp", + "is_active", "message_count", ] @@ -700,6 +543,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): stage=stage, status=status, timestamp=timestamp, + is_active=True, message_count=message_count, ) @@ -710,6 +554,7 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): "prefilter", "prefilter_out", datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + True, 456, ] ) diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 0be66ac..187fdc3 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -382,9 +382,9 @@ def test_inspect_empty_models( # mock_produce_handler_instance = MagicMock() # mock_produce_handler.return_value = mock_produce_handler_instance # - # with patch("src.inspector.inspector.ClickHouseKafkaSender") as mock_clickhouse: - # sut = Inspector() - # sut.get_and_fill_data() + # # with patch("src.inspector.inspector.ClickHouseKafkaSender") as mock_clickhouse: + # sut = Inspector() + # sut.get_and_fill_data() # sut.inspect() # self.assertEqual([0, 0], sut.anomalies) From a5147bd6c8794f6572ac323718507fd82be4e162 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 10 Dec 2024 13:30:47 +0100 Subject: [PATCH 34/59] Remove logline_status table --- .../data_classes/clickhouse_connectors.py | 14 ---------- src/logcollector/collector.py | 10 ++----- src/monitoring/clickhouse_connector.py | 27 ++----------------- .../create_tables/logline_status.sql | 7 ----- .../create_tables/logline_timestamps.sql | 3 ++- src/monitoring/monitoring_agent.py | 1 - src/prefilter/prefilter.py | 10 ++----- 7 files changed, 8 insertions(+), 64 deletions(-) delete mode 100644 src/monitoring/create_tables/logline_status.sql diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index 01588d6..4e40ff0 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -87,19 +87,6 @@ class DNSLoglines: ) -@dataclass -class LoglineStatus: - logline_id: uuid.UUID = field( - metadata={"marshmallow_field": marshmallow.fields.UUID()} - ) - is_active: bool = field( - metadata={"marshmallow_field": marshmallow.fields.Boolean()} - ) - exit_at_stage: Optional[str] = field( - metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} - ) - - @dataclass class LoglineTimestamps: logline_id: uuid.UUID = field( @@ -143,7 +130,6 @@ class BatchTimestamps: "failed_dns_loglines": FailedDNSLoglines, "logline_to_batches": LoglineToBatches, "dns_loglines": DNSLoglines, - "logline_status": LoglineStatus, "logline_timestamps": LoglineTimestamps, "batch_timestamps": BatchTimestamps, } diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 7c0a695..8e15042 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -46,7 +46,6 @@ def __init__(self) -> None: # databases self.failed_dns_loglines = ClickHouseKafkaSender("failed_dns_loglines") self.dns_loglines = ClickHouseKafkaSender("dns_loglines") - self.logline_status = ClickHouseKafkaSender("logline_status") self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") async def start(self) -> None: @@ -134,19 +133,13 @@ async def send(self) -> None: ) ) - self.logline_status.insert( - dict( - logline_id=logline_id, - is_active=1, - ) - ) - self.logline_timestamps.insert( dict( logline_id=logline_id, stage=module_name, status="in_process", timestamp=timestamp_in, + is_active=True, ) ) @@ -163,6 +156,7 @@ async def send(self) -> None: stage=module_name, status="finished", timestamp=datetime.datetime.now(), + is_active=True, ) ) logger.debug(f"Sent: '{logline}'") diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index 57c872a..96416d4 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -170,31 +170,6 @@ def insert( ) -class LoglineStatusConnector(ClickHouseConnector): - def __init__(self): - column_names = [ - "logline_id", - "is_active", - "exit_at_stage", - ] - - super().__init__("logline_status", column_names) - - def insert( - self, - logline_id: uuid.UUID, - is_active: bool, - exit_at_stage: Optional[str] = None, - ): - self._add_to_batch( - [ - logline_id, - is_active, - exit_at_stage, - ] - ) - - class LoglineTimestampsConnector(ClickHouseConnector): def __init__(self): column_names = [ @@ -202,6 +177,7 @@ def __init__(self): "stage", "status", "timestamp", + "is_active", ] super().__init__("logline_timestamps", column_names) @@ -212,6 +188,7 @@ def insert( stage: str, status: str, timestamp: datetime.datetime, + is_active: bool, ) -> None: self._add_to_batch( [ diff --git a/src/monitoring/create_tables/logline_status.sql b/src/monitoring/create_tables/logline_status.sql deleted file mode 100644 index cdeb6c2..0000000 --- a/src/monitoring/create_tables/logline_status.sql +++ /dev/null @@ -1,7 +0,0 @@ -CREATE TABLE IF NOT EXISTS logline_status ( - logline_id UUID NOT NULL, - status String NOT NULL, - exit_at_stage Nullable(String) -) -ENGINE = MergeTree -PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/logline_timestamps.sql b/src/monitoring/create_tables/logline_timestamps.sql index 4ff9887..d7f25ca 100644 --- a/src/monitoring/create_tables/logline_timestamps.sql +++ b/src/monitoring/create_tables/logline_timestamps.sql @@ -2,7 +2,8 @@ CREATE TABLE IF NOT EXISTS logline_timestamps ( logline_id UUID NOT NULL, stage String NOT NULL, status String NOT NULL, - timestamp DateTime64(6) NOT NULL + timestamp DateTime64(6) NOT NULL, + is_active Bool NOT NULL ) ENGINE = MergeTree PRIMARY KEY (logline_id); diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py index 3668352..6daa2ad 100644 --- a/src/monitoring/monitoring_agent.py +++ b/src/monitoring/monitoring_agent.py @@ -44,7 +44,6 @@ def __init__(self): "failed_dns_loglines": FailedDNSLoglinesConnector(), "logline_to_batches": LoglineToBatchesConnector(), "dns_loglines": DNSLoglinesConnector(), - "logline_status": LoglineStatusConnector(), "logline_timestamps": LoglineTimestampsConnector(), "batch_timestamps": BatchTimestampsConnector(), } diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index 1a33c75..248f87f 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -57,7 +57,6 @@ def __init__(self): # databases self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") - self.logline_status = ClickHouseKafkaSender("logline_status") self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") def get_and_fill_data(self) -> None: @@ -82,6 +81,7 @@ def get_and_fill_data(self) -> None: stage=module_name, status="in_process", timestamp=datetime.datetime.now(), + is_active=True, message_count=len(self.unfiltered_data), ) ) @@ -115,14 +115,7 @@ def filter_by_error(self) -> None: stage=module_name, status="filtered_out", timestamp=datetime.datetime.now(), - ) - ) - - self.logline_status.insert( - dict( - logline_id=logline_id, is_active=False, - exit_at_stage=module_name, ) ) @@ -146,6 +139,7 @@ def send_filtered_data(self): stage=module_name, status="finished", timestamp=datetime.datetime.now(), + is_active=True, message_count=len(self.filtered_data), ) ) From 5c07ab29931b397ea4c0c07cab184ae1a7fe27e3 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 10 Dec 2024 13:55:18 +0100 Subject: [PATCH 35/59] Add alerts and suspicious_batch_timestamps tables --- src/monitoring/create_tables/alerts.sql | 6 ++++++ .../create_tables/suspicious_batch_timestamps.sql | 12 ++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 src/monitoring/create_tables/alerts.sql create mode 100644 src/monitoring/create_tables/suspicious_batch_timestamps.sql diff --git a/src/monitoring/create_tables/alerts.sql b/src/monitoring/create_tables/alerts.sql new file mode 100644 index 0000000..6a4c26f --- /dev/null +++ b/src/monitoring/create_tables/alerts.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS server_logs_timestamps ( + client_ip String NOT NULL, + alert_timestamp DateTime64(6) NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY(client_ip, alert_timestamp); diff --git a/src/monitoring/create_tables/suspicious_batch_timestamps.sql b/src/monitoring/create_tables/suspicious_batch_timestamps.sql new file mode 100644 index 0000000..d4ee40a --- /dev/null +++ b/src/monitoring/create_tables/suspicious_batch_timestamps.sql @@ -0,0 +1,12 @@ +CREATE TABLE IF NOT EXISTS batch_timestamps ( + suspicious_batch_id UUID NOT NULL, + batch_id UUID NOT NULL, + client_ip String NOT NULL, + stage String NOT NULL, + status String NOT NULL, + timestamp DateTime64(6) NOT NULL, + message_count UInt32, + is_active Bool NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (suspicious_batch_id); From 4c2b5478bd72987a3d929a679d11c3a6ec8180d5 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 11:36:58 +0100 Subject: [PATCH 36/59] Add alerts connectors --- .../data_classes/clickhouse_connectors.py | 14 +++++++++++ src/monitoring/clickhouse_connector.py | 25 +++++++++++++++++++ src/monitoring/create_tables/alerts.sql | 5 ++-- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index 4e40ff0..9087219 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -124,6 +124,19 @@ class BatchTimestamps: ) +@dataclass +class Alerts: + client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + suspicious_batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + alert_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + + TABLE_NAME_TO_TYPE = { "server_logs": ServerLogs, "server_logs_timestamps": ServerLogsTimestamps, @@ -132,4 +145,5 @@ class BatchTimestamps: "dns_loglines": DNSLoglines, "logline_timestamps": LoglineTimestamps, "batch_timestamps": BatchTimestamps, + "alerts": Alerts, } diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index 96416d4..efadb32 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -232,3 +232,28 @@ def insert( message_count, ] ) + + +class AlertsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "client_ip", + "alert_timestamp", + "suspicious_batch_id", + ] + + super().__init__("alerts", column_names) + + def insert( + self, + client_ip: str, + alert_timestamp: datetime.datetime, + suspicious_batch_id: uuid.UUID, + ) -> None: + self._add_to_batch( + [ + client_ip, + alert_timestamp, + suspicious_batch_id, + ] + ) diff --git a/src/monitoring/create_tables/alerts.sql b/src/monitoring/create_tables/alerts.sql index 6a4c26f..df96561 100644 --- a/src/monitoring/create_tables/alerts.sql +++ b/src/monitoring/create_tables/alerts.sql @@ -1,6 +1,7 @@ -CREATE TABLE IF NOT EXISTS server_logs_timestamps ( +CREATE TABLE IF NOT EXISTS alerts ( client_ip String NOT NULL, - alert_timestamp DateTime64(6) NOT NULL + alert_timestamp DateTime64(6) NOT NULL, + suspicious_batch_id UUID NOT NULL ) ENGINE = MergeTree PRIMARY KEY(client_ip, alert_timestamp); From da9528a634d74d532bff4c6231e6ea9e61c6d9f0 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 11:38:43 +0100 Subject: [PATCH 37/59] Add suspicious_batches_to_batch connectors --- .../data_classes/clickhouse_connectors.py | 11 ++++ src/monitoring/clickhouse_connector.py | 59 +++++++++++++++++++ .../suspicious_batches_to_batch.sql | 6 ++ 3 files changed, 76 insertions(+) create mode 100644 src/monitoring/create_tables/suspicious_batches_to_batch.sql diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index 9087219..ff8c509 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -120,6 +120,16 @@ class BatchTimestamps: metadata={"marshmallow_field": marshmallow.fields.Boolean()} ) message_count: int = field( +@dataclass +class SuspiciousBatchesToBatch: + suspicious_batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + + metadata={"marshmallow_field": marshmallow.fields.Integer()} ) @@ -145,5 +155,6 @@ class Alerts: "dns_loglines": DNSLoglines, "logline_timestamps": LoglineTimestamps, "batch_timestamps": BatchTimestamps, + "suspicious_batches_to_batch": SuspiciousBatchesToBatch, "alerts": Alerts, } diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index efadb32..78fb3ee 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -234,6 +234,65 @@ def insert( ) +class SuspiciousBatchesToBatchConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "suspicious_batch_id", + "batch_id", + ] + + super().__init__("suspicious_batches_to_batch", column_names) + + def insert( + self, + suspicious_batch_id: uuid.UUID, + batch_id: uuid.UUID, + ) -> None: + self._add_to_batch( + [ + suspicious_batch_id, + batch_id, + ] + ) + + +class SuspiciousBatchTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "suspicious_batch_id", + "client_ip", + "stage", + "status", + "timestamp", + "is_active", + "message_count", + ] + + super().__init__("suspicious_batch_timestamps", column_names) + + def insert( + self, + suspicious_batch_id: uuid.UUID, + client_ip: str, + stage: str, + status: str, + is_active: bool, + message_count: int, + timestamp: datetime.datetime, + ) -> None: + self._add_to_batch( + [ + suspicious_batch_id, + client_ip, + stage, + status, + timestamp, + is_active, + message_count, + ] + ) + + class AlertsConnector(ClickHouseConnector): def __init__(self): column_names = [ diff --git a/src/monitoring/create_tables/suspicious_batches_to_batch.sql b/src/monitoring/create_tables/suspicious_batches_to_batch.sql new file mode 100644 index 0000000..d587a22 --- /dev/null +++ b/src/monitoring/create_tables/suspicious_batches_to_batch.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS suspicious_batches_to_batch ( + suspicious_batch_id UUID NOT NULL, + batch_id UUID NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (suspicious_batch_id); From 2d503eb2cf0a55234b0b4a770b1e326044cfd9d5 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 11:39:32 +0100 Subject: [PATCH 38/59] Update suspicious_batch_timestamps connectors --- .../data_classes/clickhouse_connectors.py | 22 +++++++++++++++++++ .../suspicious_batch_timestamps.sql | 3 +-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index ff8c509..be69655 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -120,6 +120,10 @@ class BatchTimestamps: metadata={"marshmallow_field": marshmallow.fields.Boolean()} ) message_count: int = field( + metadata={"marshmallow_field": marshmallow.fields.Integer()} + ) + + @dataclass class SuspiciousBatchesToBatch: suspicious_batch_id: uuid.UUID = field( @@ -130,6 +134,23 @@ class SuspiciousBatchesToBatch: ) +@dataclass +class SuspiciousBatchTimestamps: + suspicious_batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + is_active: bool = field( + metadata={"marshmallow_field": marshmallow.fields.Boolean()} + ) + message_count: int = field( metadata={"marshmallow_field": marshmallow.fields.Integer()} ) @@ -156,5 +177,6 @@ class Alerts: "logline_timestamps": LoglineTimestamps, "batch_timestamps": BatchTimestamps, "suspicious_batches_to_batch": SuspiciousBatchesToBatch, + "suspicious_batch_timestamps": SuspiciousBatchTimestamps, "alerts": Alerts, } diff --git a/src/monitoring/create_tables/suspicious_batch_timestamps.sql b/src/monitoring/create_tables/suspicious_batch_timestamps.sql index d4ee40a..8c02e82 100644 --- a/src/monitoring/create_tables/suspicious_batch_timestamps.sql +++ b/src/monitoring/create_tables/suspicious_batch_timestamps.sql @@ -1,6 +1,5 @@ -CREATE TABLE IF NOT EXISTS batch_timestamps ( +CREATE TABLE IF NOT EXISTS suspicious_batch_timestamps ( suspicious_batch_id UUID NOT NULL, - batch_id UUID NOT NULL, client_ip String NOT NULL, stage String NOT NULL, status String NOT NULL, From 4f560c77c3342022dcc75e0ad752d7e3fa5df2b9 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 12:19:38 +0100 Subject: [PATCH 39/59] Add overall_score and result to Alerts --- src/base/data_classes/clickhouse_connectors.py | 4 ++++ src/monitoring/clickhouse_connector.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index be69655..dd5977a 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -166,6 +166,10 @@ class Alerts: "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") } ) + overall_score: float = field( + metadata={"marshmallow_field": marshmallow.fields.Float()} + ) + result: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) TABLE_NAME_TO_TYPE = { diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index 78fb3ee..9f466a6 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -299,6 +299,8 @@ def __init__(self): "client_ip", "alert_timestamp", "suspicious_batch_id", + "overall_score", + "result", ] super().__init__("alerts", column_names) @@ -308,11 +310,15 @@ def insert( client_ip: str, alert_timestamp: datetime.datetime, suspicious_batch_id: uuid.UUID, + overall_score: float, + result: str, ) -> None: self._add_to_batch( [ client_ip, alert_timestamp, suspicious_batch_id, + overall_score, + result, ] ) From 4b2f5087173b8d8165004e2e61a64c409765bf3f Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 12:19:48 +0100 Subject: [PATCH 40/59] Add overall_score and result to Alerts --- src/monitoring/create_tables/alerts.sql | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/monitoring/create_tables/alerts.sql b/src/monitoring/create_tables/alerts.sql index df96561..92e4ad1 100644 --- a/src/monitoring/create_tables/alerts.sql +++ b/src/monitoring/create_tables/alerts.sql @@ -1,7 +1,9 @@ CREATE TABLE IF NOT EXISTS alerts ( client_ip String NOT NULL, alert_timestamp DateTime64(6) NOT NULL, - suspicious_batch_id UUID NOT NULL + suspicious_batch_id UUID NOT NULL, + overall_score Float32 NOT NULL, + result String, ) ENGINE = MergeTree PRIMARY KEY(client_ip, alert_timestamp); From a311d27e7782a7be032b7e3f69194e1f8372b874 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 12:20:36 +0100 Subject: [PATCH 41/59] Add detector database insertion --- src/detector/detector.py | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/src/detector/detector.py b/src/detector/detector.py index bbaaec1..935d3ff 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -11,7 +11,6 @@ import requests from numpy import median - sys.path.append(os.getcwd()) from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.utils import setup_config @@ -50,7 +49,7 @@ class Detector: """ def __init__(self) -> None: - self.batch_id = None + self.suspicious_batch_id = None self.key = None self.messages = [] self.warnings = [] @@ -65,7 +64,10 @@ def __init__(self) -> None: self.model = self._get_model() # databases - self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + self.suspicious_batch_timestamps = ClickHouseKafkaSender( + "suspicious_batch_timestamps" + ) + self.alerts = ClickHouseKafkaSender("alerts") def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" @@ -79,15 +81,16 @@ def get_and_fill_data(self) -> None: key, data = self.kafka_consume_handler.consume_as_object() if data: - self.batch_id = data.batch_id + self.suspicious_batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.messages = data.data self.key = key - self.batch_timestamps.insert( + self.suspicious_batch_timestamps.insert( dict( - batch_id=self.batch_id, + suspicious_batch_id=self.suspicious_batch_id, + client_ip=key, stage=module_name, status="in_process", timestamp=datetime.datetime.now(), @@ -294,19 +297,42 @@ def detect(self) -> None: # pragma: no cover self.warnings.append(warning) def send_warning(self) -> None: - logger.info("Store alert to file.") + logger.info("Store alert.") if len(self.warnings) > 0: overall_score = median( [warning["probability"] for warning in self.warnings] ) alert = {"overall_score": overall_score, "result": self.warnings} + logger.info(f"Add alert: {alert}") with open(os.path.join(tempfile.gettempdir(), "warnings.json"), "a+") as f: json.dump(alert, f) f.write("\n") + + self.alerts.insert( + dict( + client_ip=self.key, + alert_timestamp=datetime.datetime.now(), + suspicious_batch_id=self.suspicious_batch_id, + overall_score=overall_score, + result=json.dumps(self.warnings), + ) + ) else: logger.info("No warning produced.") + self.suspicious_batch_timestamps.insert( + dict( + suspicious_batch_id=self.suspicious_batch_id, + client_ip=self.key, + stage=module_name, + status="finished", + timestamp=datetime.datetime.now(), + is_active=False, + message_count=len(self.messages), + ) + ) + def main(one_iteration: bool = False): # pragma: no cover """ From 27a6e326686fcd66a8b4cd4573134063867d8d84 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 12:21:12 +0100 Subject: [PATCH 42/59] Update prefilter debug messages --- src/prefilter/prefilter.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index 248f87f..ad9b60a 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -146,16 +146,11 @@ def send_filtered_data(self): batch_schema = marshmallow_dataclass.class_schema(Batch)() - logger.debug(f"{data_to_send=}") self.kafka_produce_handler.produce( topic=PRODUCE_TOPIC, data=batch_schema.dumps(data_to_send), key=self.subnet_id, ) - logger.debug( - f"Sent filtered data with time frame from {self.begin_timestamp} to {self.end_timestamp} and data" - f" ({len(self.filtered_data)} message(s))." - ) logger.info( f"Filtered data was successfully sent:\n" f" ⤷ Contains data field of {len(self.filtered_data)} message(s). Originally: " From 964e8fe4590c78fa99db2bd79b9f3474e380f150 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Wed, 11 Dec 2024 12:21:36 +0100 Subject: [PATCH 43/59] Update inspector database insertion --- src/inspector/inspector.py | 49 ++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/src/inspector/inspector.py b/src/inspector/inspector.py index 7c9ea98..334718f 100644 --- a/src/inspector/inspector.py +++ b/src/inspector/inspector.py @@ -1,15 +1,17 @@ import importlib -import json import os import sys +import uuid from datetime import datetime from enum import Enum, unique +import marshmallow_dataclass import numpy as np from streamad.util import StreamGenerator, CustomDS sys.path.append(os.getcwd()) from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender +from src.base.data_classes.batch import Batch from src.base.utils import setup_config from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, @@ -92,6 +94,12 @@ def __init__(self) -> None: # databases self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + self.suspicious_batch_timestamps = ClickHouseKafkaSender( + "suspicious_batch_timestamps" + ) + self.suspicious_batches_to_batch = ClickHouseKafkaSender( + "suspicious_batches_to_batch" + ) def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" @@ -423,28 +431,55 @@ def send_data(self): np.greater_equal(np.array(self.anomalies), SCORE_THRESHOLD) ) if total_anomalies / len(self.X) > ANOMALY_THRESHOLD: # subnet is suspicious - logger.debug("Sending data to KafkaProduceHandler...") - logger.info("Sending anomalies to detector for further analysation.") + logger.info("Sending anomalies to detector for further analysis.") buckets = {} + for message in self.messages: if message["client_ip"] in buckets.keys(): buckets[message["client_ip"]].append(message) else: buckets[message["client_ip"]] = [] buckets.get(message["client_ip"]).append(message) + for key, value in buckets.items(): logger.info(f"Sending anomalies to detector for {key}.") logger.info(f"Sending anomalies to detector for {value}.") + + suspicious_batch_id = uuid.uuid4() # generate new suspicious_batch_id + + self.suspicious_batches_to_batch.insert( + dict( + suspicious_batch_id=suspicious_batch_id, + batch_id=self.batch_id, + ) + ) + data_to_send = { - "begin_timestamp": self.begin_timestamp.strftime(TIMESTAMP_FORMAT), - "end_timestamp": self.end_timestamp.strftime(TIMESTAMP_FORMAT), + "batch_id": suspicious_batch_id, + "begin_timestamp": self.begin_timestamp, + "end_timestamp": self.end_timestamp, "data": value, } + + batch_schema = marshmallow_dataclass.class_schema(Batch)() + self.kafka_produce_handler.produce( - topic="Detector", - data=json.dumps(data_to_send), + topic=PRODUCE_TOPIC, + data=batch_schema.dumps(data_to_send), key=key, ) + + self.suspicious_batch_timestamps.insert( + dict( + suspicious_batch_id=suspicious_batch_id, + client_ip=key, + stage=module_name, + status="finished", + timestamp=datetime.now(), + is_active=True, + message_count=len(value), + ) + ) else: # subnet is not suspicious self.batch_timestamps.insert( dict( From 1ea7ece9af43fc6bad31d57ebbb0ee36bbb32811 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 12 Dec 2024 10:07:50 +0100 Subject: [PATCH 44/59] Bug fixing and update all tests --- src/monitoring/clickhouse_connector.py | 1 + src/prefilter/prefilter.py | 5 +- tests/test_detector.py | 6 +- ...test_exactly_once_kafka_consume_handler.py | 44 +++++----- tests/test_inspector.py | 44 +++++++--- tests/test_prefilter.py | 86 +++++++++++++------ 6 files changed, 126 insertions(+), 60 deletions(-) diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index 9f466a6..bb14f81 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -196,6 +196,7 @@ def insert( stage, status, timestamp, + is_active, ] ) diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index ad9b60a..88eb3ab 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -1,6 +1,8 @@ import datetime +import json import os import sys +import uuid import marshmallow_dataclass @@ -107,7 +109,7 @@ def filter_by_error(self) -> None: if self.logline_handler.check_relevance(e): self.filtered_data.append(e) else: # not relevant, filtered out - logline_id = e.get("logline_id") # TODO: Check + logline_id = uuid.UUID(json.loads(e).get("logline_id")) self.logline_timestamps.insert( dict( @@ -145,7 +147,6 @@ def send_filtered_data(self): ) batch_schema = marshmallow_dataclass.class_schema(Batch)() - self.kafka_produce_handler.produce( topic=PRODUCE_TOPIC, data=batch_schema.dumps(data_to_send), diff --git a/tests/test_detector.py b/tests/test_detector.py index 9b15f45..6bc878d 100644 --- a/tests/test_detector.py +++ b/tests/test_detector.py @@ -245,7 +245,8 @@ def setUp(self): "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_save_warning(self, mock_kafka_consume_handler): + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_save_warning(self, mock_clickhouse, mock_kafka_consume_handler): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -282,7 +283,8 @@ def test_save_warning(self, mock_kafka_consume_handler): "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_save_empty_warning(self, mock_kafka_consume_handler): + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_save_empty_warning(self, mock_clickhouse, mock_kafka_consume_handler): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance diff --git a/tests/test_exactly_once_kafka_consume_handler.py b/tests/test_exactly_once_kafka_consume_handler.py index c0104fc..381a53a 100644 --- a/tests/test_exactly_once_kafka_consume_handler.py +++ b/tests/test_exactly_once_kafka_consume_handler.py @@ -1,8 +1,10 @@ import datetime import json import unittest +import uuid from unittest.mock import patch, Mock +import marshmallow_dataclass from confluent_kafka import KafkaException, KafkaError from src.base.data_classes.batch import Batch @@ -305,10 +307,6 @@ class TestConsumeAsObject(unittest.TestCase): @patch("src.base.kafka_handler.Consumer") def setUp(self, mock_consumer): self.sut = ExactlyOnceKafkaConsumeHandler(topics="test_topic") - self.sut.batch_schema = Mock() - self.sut.batch_schema.load = Mock( - return_value=Batch(datetime.datetime.now(), datetime.datetime.now()) - ) def test_consume_as_object_no_key_no_value(self): with patch( @@ -322,14 +320,16 @@ def test_consume_as_object_no_key_no_value(self): def test_consume_as_object_valid_data(self): key = "valid_key" - value = json.dumps({"data": [{"field1": "value1", "field2": "value2"}]}) - topic = "test_topic" - batch_data = [{"field1": "value1", "field2": "value2"}] - self.sut.batch_schema.load.return_value = Batch( - datetime.datetime.now(), - datetime.datetime.now(), - batch_data, + batch_schema = marshmallow_dataclass.class_schema(Batch)() + value = batch_schema.dumps( + { + "batch_id": uuid.uuid4(), + "begin_timestamp": datetime.datetime.now(), + "end_timestamp": datetime.datetime.now(), + "data": [{"field1": "value1", "field2": "value2"}], + } ) + topic = "test_topic" with patch( "src.base.kafka_handler.ExactlyOnceKafkaConsumeHandler.consume" @@ -343,21 +343,19 @@ def test_consume_as_object_valid_data(self): def test_consume_as_object_valid_data_with_inner_strings(self): key = "valid_key" - value = json.dumps( + batch_schema = marshmallow_dataclass.class_schema(Batch)() + value = batch_schema.dumps( { + "batch_id": uuid.uuid4(), + "begin_timestamp": datetime.datetime.now(), + "end_timestamp": datetime.datetime.now(), "data": [ '{"field1": "value1", "field2": "value2"}', '{"field3": "value3", "field4": "value4"}', - ] + ], } ) topic = "test_topic" - batch_data = [{"field1": "value1", "field2": "value2"}] - self.sut.batch_schema.load.return_value = Batch( - datetime.datetime.now(), - datetime.datetime.now(), - batch_data, - ) with patch( "src.base.kafka_handler.ExactlyOnceKafkaConsumeHandler.consume" @@ -384,12 +382,16 @@ def test_consume_as_object_invalid_data(self): with self.assertRaises(ValueError): self.sut.consume_as_object() - def test_consume_as_object_invalid_batch(self): + @patch("src.base.kafka_handler.marshmallow_dataclass.class_schema") + def test_consume_as_object_invalid_batch(self, mock_schema): key = "valid_key" value = json.dumps({"data": [{"field1": "value1", "field2": "value2"}]}) topic = "test_topic" - self.sut.batch_schema.load.return_value = None + mock_schema_instance = Mock() + mock_schema.return_value = mock_schema_instance + + mock_schema_instance.load.return_value = None with patch( "src.base.kafka_handler.ExactlyOnceKafkaConsumeHandler.consume" diff --git a/tests/test_inspector.py b/tests/test_inspector.py index 187fdc3..c9e7b95 100644 --- a/tests/test_inspector.py +++ b/tests/test_inspector.py @@ -2,10 +2,11 @@ import uuid from datetime import datetime, timedelta from unittest.mock import MagicMock, patch -import numpy as np -import json +import marshmallow_dataclass +import numpy as np from streamad.model import ZScoreDetector, RShashDetector + from src.base.data_classes.batch import Batch from src.inspector.inspector import Inspector, main @@ -403,8 +404,13 @@ def test_inspect_empty_models( ) @patch("src.inspector.inspector.TIME_TYPE", "ms") @patch("src.inspector.inspector.TIME_RANGE", 1) + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_univariate( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -440,8 +446,13 @@ def test_inspect_univariate( ) @patch("src.inspector.inspector.TIME_TYPE", "ms") @patch("src.inspector.inspector.TIME_RANGE", 1) + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_univariate_two_models( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -819,15 +830,24 @@ def test_invalid_mode(self, mock_kafka_consume_handler, mock_produce_handler): class TestSend(unittest.TestCase): + @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") @patch("src.inspector.inspector.SCORE_THRESHOLD", 0.1) @patch("src.inspector.inspector.ANOMALY_THRESHOLD", 0.01) - def test_send(self, mock_kafka_consume_handler, mock_produce_handler): + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_send( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance + batch_schema = marshmallow_dataclass.class_schema(Batch)() sut = Inspector() sut.anomalies = [0.9, 0.9] @@ -839,14 +859,18 @@ def test_send(self, mock_kafka_consume_handler, mock_produce_handler): sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT ) sut.messages = [data] - sut.send_data() + mock_batch_id = uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") + with patch("src.inspector.inspector.uuid") as mock_uuid: + mock_uuid.uuid4.return_value = mock_batch_id + sut.send_data() mock_produce_handler_instance.produce.assert_called_once_with( - topic="Detector", - data=json.dumps( + topic="pipeline.inspector_to_detector", + data=batch_schema.dumps( { - "begin_timestamp": sut.begin_timestamp.strftime(TIMESTAMP_FORMAT), - "end_timestamp": sut.end_timestamp.strftime(TIMESTAMP_FORMAT), + "batch_id": mock_batch_id, + "begin_timestamp": sut.begin_timestamp, + "end_timestamp": sut.end_timestamp, "data": [data], } ), diff --git a/tests/test_prefilter.py b/tests/test_prefilter.py index 04fa6cc..cf01d85 100644 --- a/tests/test_prefilter.py +++ b/tests/test_prefilter.py @@ -1,7 +1,10 @@ +import datetime import json import unittest +import uuid from unittest.mock import MagicMock, patch +from src.base.data_classes.batch import Batch from src.base.kafka_handler import KafkaMessageFetchException from src.prefilter.prefilter import Prefilter, main @@ -36,8 +39,10 @@ class TestGetAndFillData(unittest.TestCase): @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_get_data_without_new_data( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -47,9 +52,14 @@ def test_get_data_without_new_data( mock_produce_handler.return_value = mock_produce_handler_instance mock_consume_handler_instance = MagicMock() mock_consume_handler.return_value = mock_consume_handler_instance - mock_consume_handler_instance.consume_as_json.return_value = ( + mock_consume_handler_instance.consume_as_object.return_value = ( None, - {}, + Batch( + batch_id=uuid.uuid4(), + begin_timestamp=datetime.datetime.now(), + end_timestamp=datetime.datetime.now(), + data=[], + ), ) sut = Prefilter() @@ -59,14 +69,16 @@ def test_get_data_without_new_data( self.assertEqual([], sut.filtered_data) self.assertEqual(None, sut.subnet_id) - mock_consume_handler_instance.consume_as_json.assert_called_once() + mock_consume_handler_instance.consume_as_object.assert_called_once() @patch("src.prefilter.prefilter.logger") @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_get_data_with_new_data( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -76,13 +88,14 @@ def test_get_data_with_new_data( mock_produce_handler.return_value = mock_produce_handler_instance mock_consume_handler_instance = MagicMock() mock_consume_handler.return_value = mock_consume_handler_instance - mock_consume_handler_instance.consume_as_json.return_value = ( - "127.0.0.0/24", - { - "begin_timestamp": "2024-05-21T08:31:28.119Z", - "end_timestamp": "2024-05-21T08:31:29.432Z", - "data": ["test_data_1", "test_data_2"], - }, + mock_consume_handler_instance.consume_as_object.return_value = ( + "127.0.0.0_24", + Batch( + batch_id=uuid.uuid4(), + begin_timestamp=datetime.datetime.now(), + end_timestamp=datetime.datetime.now(), + data=["test_data_1", "test_data_2"], + ), ) sut = Prefilter() @@ -90,16 +103,18 @@ def test_get_data_with_new_data( self.assertEqual(["test_data_1", "test_data_2"], sut.unfiltered_data) self.assertEqual([], sut.filtered_data) - self.assertEqual("127.0.0.0/24", sut.subnet_id) + self.assertEqual("127.0.0.0_24", sut.subnet_id) - mock_consume_handler_instance.consume_as_json.assert_called_once() + mock_consume_handler_instance.consume_as_object.assert_called_once() @patch("src.prefilter.prefilter.logger") @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_get_data_with_existing_data( self, + mock_clickhouse, mock_batch_handler, mock_consume_handler, mock_logline_handler, @@ -109,13 +124,14 @@ def test_get_data_with_existing_data( mock_batch_handler.return_value = mock_batch_handler_instance mock_consume_handler_instance = MagicMock() mock_consume_handler.return_value = mock_consume_handler_instance - mock_consume_handler_instance.consume_as_json.return_value = ( - "127.0.0.0/24", - { - "begin_timestamp": "2024-05-21T08:31:28.119Z", - "end_timestamp": "2024-05-21T08:31:29.432Z", - "data": ["test_data_1", "test_data_2"], - }, + mock_consume_handler_instance.consume_as_object.return_value = ( + "127.0.0.0_24", + Batch( + batch_id=uuid.uuid4(), + begin_timestamp=datetime.datetime.now(), + end_timestamp=datetime.datetime.now(), + data=["test_data_1", "test_data_2"], + ), ) sut = Prefilter() @@ -124,9 +140,9 @@ def test_get_data_with_existing_data( self.assertEqual(["test_data_1", "test_data_2"], sut.unfiltered_data) self.assertEqual([], sut.filtered_data) - self.assertEqual("127.0.0.0/24", sut.subnet_id) + self.assertEqual("127.0.0.0_24", sut.subnet_id) - mock_consume_handler_instance.consume_as_json.assert_called_once() + mock_consume_handler_instance.consume_as_object.assert_called_once() class TestFilterByError(unittest.TestCase): @@ -152,8 +168,10 @@ def test_filter_by_error_empty_data( @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_filter_by_error_with_data_no_error_types( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -161,6 +179,7 @@ def test_filter_by_error_with_data_no_error_types( ): first_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-05-21T08:31:28.119Z", "status_code": "NOERROR", "client_ip": "192.168.0.105", @@ -173,6 +192,7 @@ def test_filter_by_error_with_data_no_error_types( ) second_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-06-01T02:31:07.943Z", "status_code": "NXDOMAIN", "client_ip": "192.168.1.206", @@ -185,6 +205,7 @@ def test_filter_by_error_with_data_no_error_types( ) third_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-06-01T01:37:41.796Z", "status_code": "NXDOMAIN", "client_ip": "192.168.1.206", @@ -208,8 +229,10 @@ def test_filter_by_error_with_data_no_error_types( @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_filter_by_error_with_data_one_error_type( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -217,6 +240,7 @@ def test_filter_by_error_with_data_one_error_type( ): first_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-05-21T08:31:28.119Z", "status_code": "NOERROR", "client_ip": "192.168.0.105", @@ -229,6 +253,7 @@ def test_filter_by_error_with_data_one_error_type( ) second_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-06-01T02:31:07.943Z", "status_code": "NXDOMAIN", "client_ip": "192.168.1.206", @@ -241,6 +266,7 @@ def test_filter_by_error_with_data_one_error_type( ) third_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-06-01T01:37:41.796Z", "status_code": "NXDOMAIN", "client_ip": "192.168.1.206", @@ -264,8 +290,10 @@ def test_filter_by_error_with_data_one_error_type( @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_filter_by_error_with_data_two_error_types( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -273,6 +301,7 @@ def test_filter_by_error_with_data_two_error_types( ): first_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-05-21T08:31:28.119Z", "status_code": "NOERROR", "client_ip": "192.168.0.105", @@ -285,6 +314,7 @@ def test_filter_by_error_with_data_two_error_types( ) second_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-06-01T02:31:07.943Z", "status_code": "NXDOMAIN", "client_ip": "192.168.1.206", @@ -297,6 +327,7 @@ def test_filter_by_error_with_data_two_error_types( ) third_entry = json.dumps( { + "logline_id": str(uuid.uuid4()), "timestamp": "2024-06-01T01:37:41.796Z", "status_code": "OTHER_TYPE", "client_ip": "192.168.1.206", @@ -323,8 +354,10 @@ class TestSendFilteredData(unittest.TestCase): @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_send_with_data( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -358,10 +391,12 @@ def test_send_with_data( sut.unfiltered_data = [first_entry, second_entry] sut.filtered_data = [first_entry, second_entry] sut.subnet_id = "192.168.1.0_24" - sut.begin_timestamp = "2024-05-21T08:31:27.000Z" - sut.end_timestamp = "2024-05-21T08:31:29.000Z" + sut.batch_id = uuid.UUID("5236b147-5b0d-44a8-981f-bd7da8c54733") + sut.begin_timestamp = datetime.datetime(2024, 5, 21, 8, 31, 27, 000000) + sut.end_timestamp = datetime.datetime(2024, 5, 21, 8, 31, 29, 000000) expected_message = ( - '{"begin_timestamp": "2024-05-21T08:31:27.000Z", "end_timestamp": "2024-05-21T08:31:29.000Z", "data": [{' + '{"batch_id": "5236b147-5b0d-44a8-981f-bd7da8c54733", "begin_timestamp": "2024-05-21T08:31:27.000000Z", ' + '"end_timestamp": "2024-05-21T08:31:29.000000Z", "data": [{' '"timestamp": "2024-05-21T08:31:28.119Z", "status": "NXDOMAIN", "client_ip": "192.168.1.105", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}, {"timestamp": "2024-06-01T02:31:07.943Z", ' @@ -413,7 +448,8 @@ def test_send_without_data( sut.unfiltered_data = [] sut.filtered_data = [] - self.assertIsNone(sut.send_filtered_data()) + with self.assertRaises(ValueError): + sut.send_filtered_data() mock_produce_handler.add_message.assert_not_called() From bbaa72270ad9f9abbb453e17a8c5232daf21b67a Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Thu, 12 Dec 2024 10:12:26 +0100 Subject: [PATCH 45/59] Update requirements --- requirements/requirements.detector.txt | 1 + requirements/requirements.inspector.txt | 1 + requirements/requirements.logcollector.txt | 1 + requirements/requirements.logserver.txt | 1 + requirements/requirements.prefilter.txt | 1 + 5 files changed, 5 insertions(+) diff --git a/requirements/requirements.detector.txt b/requirements/requirements.detector.txt index eeec3d7..d8bd3ea 100644 --- a/requirements/requirements.detector.txt +++ b/requirements/requirements.detector.txt @@ -5,3 +5,4 @@ colorlog~=6.8.2 PyYAML~=6.0.1 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.inspector.txt b/requirements/requirements.inspector.txt index 23137fd..b1c97b2 100644 --- a/requirements/requirements.inspector.txt +++ b/requirements/requirements.inspector.txt @@ -4,3 +4,4 @@ colorlog~=6.8.2 streamad~=0.3.1 numpy~=1.26.4 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.logcollector.txt b/requirements/requirements.logcollector.txt index 4e02cce..8e8937f 100644 --- a/requirements/requirements.logcollector.txt +++ b/requirements/requirements.logcollector.txt @@ -2,3 +2,4 @@ PyYAML~=6.0.1 colorlog~=6.8.2 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.logserver.txt b/requirements/requirements.logserver.txt index 3215e11..a241460 100644 --- a/requirements/requirements.logserver.txt +++ b/requirements/requirements.logserver.txt @@ -3,3 +3,4 @@ colorlog~=6.8.2 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 aiofiles~=24.1.0 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.prefilter.txt b/requirements/requirements.prefilter.txt index 4e02cce..8e8937f 100644 --- a/requirements/requirements.prefilter.txt +++ b/requirements/requirements.prefilter.txt @@ -2,3 +2,4 @@ PyYAML~=6.0.1 colorlog~=6.8.2 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 From 270df8b99c8d6b9d58606d0f163d448459e29fca Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 09:01:36 +0100 Subject: [PATCH 46/59] Move message_text field in ServerLogs table --- src/base/data_classes/clickhouse_connectors.py | 6 +++--- src/monitoring/clickhouse_connector.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index dd5977a..957fd3a 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -8,9 +8,6 @@ @dataclass class ServerLogs: - message_text: str = field( - metadata={"marshmallow_field": marshmallow.fields.String()} - ) message_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) @@ -19,6 +16,9 @@ class ServerLogs: "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") } ) + message_text: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) @dataclass diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py index bb14f81..efe652a 100644 --- a/src/monitoring/clickhouse_connector.py +++ b/src/monitoring/clickhouse_connector.py @@ -67,9 +67,9 @@ def __init__(self): def insert( self, - message_text: str, message_id: uuid.UUID, timestamp_in: datetime.datetime, + message_text: str, ): self._add_to_batch([message_id, timestamp_in, message_text]) From fe5f79c6b558d9527386d1885611b307948bf5f4 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 09:02:22 +0100 Subject: [PATCH 47/59] Bug fixes --- src/detector/detector.py | 2 +- src/monitoring/monitoring_agent.py | 3 +++ src/prefilter/prefilter.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/detector/detector.py b/src/detector/detector.py index 935d3ff..412817d 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -80,7 +80,7 @@ def get_and_fill_data(self) -> None: key, data = self.kafka_consume_handler.consume_as_object() - if data: + if data.data: self.suspicious_batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py index 6daa2ad..e1ae219 100644 --- a/src/monitoring/monitoring_agent.py +++ b/src/monitoring/monitoring_agent.py @@ -46,6 +46,9 @@ def __init__(self): "dns_loglines": DNSLoglinesConnector(), "logline_timestamps": LoglineTimestampsConnector(), "batch_timestamps": BatchTimestampsConnector(), + "suspicious_batches_to_batch": SuspiciousBatchesToBatchConnector(), + "suspicious_batch_timestamps": SuspiciousBatchTimestampsConnector(), + "alerts": AlertsConnector(), } self.topics = [f"clickhouse_{table_name}" for table_name in self.connectors] diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index 88eb3ab..f43ef03 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -71,7 +71,7 @@ def get_and_fill_data(self) -> None: key, data = self.kafka_consume_handler.consume_as_object() self.subnet_id = key - if data: + if data.data: self.batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp From 20de50825f41a25e5d7bbab407003819a86b3763 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 09:03:13 +0100 Subject: [PATCH 48/59] Restructure and update tests --- tests/{ => clickhouse}/__init__.py | 0 .../test_clickhouse_batch_sender.py | 275 ++++++++++++++++++ .../test_clickhouse_connector.py | 212 ++++++++++++-- .../test_clickhouse_kafka_sender.py | 0 tests/detector/__init__.py | 0 tests/{ => detector}/test_detector.py | 0 tests/inspector/__init__.py | 0 tests/{ => inspector}/test_inspector.py | 75 ++--- tests/kafka/__init__.py | 0 ...test_exactly_once_kafka_consume_handler.py | 0 ...test_exactly_once_kafka_produce_handler.py | 0 .../{ => kafka}/test_kafka_consume_handler.py | 0 tests/{ => kafka}/test_kafka_handler.py | 0 .../{ => kafka}/test_kafka_produce_handler.py | 0 .../test_simple_kafka_consume_handler.py | 0 .../test_simple_kafka_produce_handler.py | 0 tests/logcollector/__init__.py | 0 .../{ => logcollector}/test_batch_handler.py | 0 .../{ => logcollector}/test_buffered_batch.py | 0 tests/{ => logcollector}/test_collector.py | 0 tests/logserver/__init__.py | 0 tests/{ => logserver}/test_server.py | 0 tests/miscellaneous/__init__.py | 0 tests/{ => miscellaneous}/test_field_type.py | 0 tests/{ => miscellaneous}/test_log_config.py | 0 .../test_logline_handler.py | 24 +- tests/{ => miscellaneous}/test_marshmallow.py | 0 tests/miscellaneous/test_monitoring_agent.py | 190 ++++++++++++ tests/{ => miscellaneous}/test_utils.py | 0 tests/prefilter/__init__.py | 0 tests/{ => prefilter}/test_prefilter.py | 0 31 files changed, 715 insertions(+), 61 deletions(-) rename tests/{ => clickhouse}/__init__.py (100%) create mode 100644 tests/clickhouse/test_clickhouse_batch_sender.py rename tests/{ => clickhouse}/test_clickhouse_connector.py (76%) rename tests/{ => clickhouse}/test_clickhouse_kafka_sender.py (100%) create mode 100644 tests/detector/__init__.py rename tests/{ => detector}/test_detector.py (100%) create mode 100644 tests/inspector/__init__.py rename tests/{ => inspector}/test_inspector.py (95%) create mode 100644 tests/kafka/__init__.py rename tests/{ => kafka}/test_exactly_once_kafka_consume_handler.py (100%) rename tests/{ => kafka}/test_exactly_once_kafka_produce_handler.py (100%) rename tests/{ => kafka}/test_kafka_consume_handler.py (100%) rename tests/{ => kafka}/test_kafka_handler.py (100%) rename tests/{ => kafka}/test_kafka_produce_handler.py (100%) rename tests/{ => kafka}/test_simple_kafka_consume_handler.py (100%) rename tests/{ => kafka}/test_simple_kafka_produce_handler.py (100%) create mode 100644 tests/logcollector/__init__.py rename tests/{ => logcollector}/test_batch_handler.py (100%) rename tests/{ => logcollector}/test_buffered_batch.py (100%) rename tests/{ => logcollector}/test_collector.py (100%) create mode 100644 tests/logserver/__init__.py rename tests/{ => logserver}/test_server.py (100%) create mode 100644 tests/miscellaneous/__init__.py rename tests/{ => miscellaneous}/test_field_type.py (100%) rename tests/{ => miscellaneous}/test_log_config.py (100%) rename tests/{ => miscellaneous}/test_logline_handler.py (94%) rename tests/{ => miscellaneous}/test_marshmallow.py (100%) create mode 100644 tests/miscellaneous/test_monitoring_agent.py rename tests/{ => miscellaneous}/test_utils.py (100%) create mode 100644 tests/prefilter/__init__.py rename tests/{ => prefilter}/test_prefilter.py (100%) diff --git a/tests/__init__.py b/tests/clickhouse/__init__.py similarity index 100% rename from tests/__init__.py rename to tests/clickhouse/__init__.py diff --git a/tests/clickhouse/test_clickhouse_batch_sender.py b/tests/clickhouse/test_clickhouse_batch_sender.py new file mode 100644 index 0000000..f152cef --- /dev/null +++ b/tests/clickhouse/test_clickhouse_batch_sender.py @@ -0,0 +1,275 @@ +import unittest +from unittest.mock import patch, Mock + +from src.monitoring.clickhouse_batch_sender import ClickHouseBatchSender + + +class TestInit(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.BATCH_SIZE", 50) + @patch("src.monitoring.clickhouse_batch_sender.BATCH_TIMEOUT", 0.5) + @patch("src.monitoring.clickhouse_batch_sender.CLICKHOUSE_HOSTNAME", "test_name") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_init(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + + # Act + sut = ClickHouseBatchSender(table_name, column_names) + + # Assert + self.assertEqual(table_name, sut.table_name) + self.assertEqual(column_names, sut.column_names) + self.assertEqual(50, sut.max_batch_size) + self.assertEqual(0.5, sut.batch_timeout) + self.assertIsNone(sut.timer) + self.assertEqual([], sut.batch) + + mock_clickhouse_connect.get_client.assert_called_once_with(host="test_name") + + +class TestDel(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_del(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + # Act + with patch( + "src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all" + ) as mock_insert_all: + del sut + + # Assert + mock_insert_all.assert_called_once() + + +class TestAdd(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_str_successful( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"]], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_called_once() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_timer_already_started( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + sut.timer = Mock() + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"]], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_not_called() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_max_size_reached_and_timer_already_started( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + sut.timer = Mock() + sut.max_batch_size = 1 + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"]], sut.batch) + + mock_insert_all.assert_called_once() + mock_start_timer.assert_not_called() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_str_wrong_field_number( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + + # Act + with self.assertRaises(ValueError): + sut.add(data) + + # Assert + self.assertEqual([], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_not_called() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_lists_successful( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = [["entry_1", "entry_2"], ["entry_3", "entry_4"]] + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"], ["entry_3", "entry_4"]], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_called_once() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_lists_wrong_field_number( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = [["entry_1", "entry_2"], ["entry_3"]] + + # Act + with self.assertRaises(ValueError): + sut.add(data) + + # Assert + self.assertEqual([], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_not_called() + + +class TestInsertAll(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_insert_all(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + sut._client = Mock() + sut.batch = [["entry_1", "entry_2"], ["entry_3", "entry_4"]] + + # Act + sut.insert_all() + + # Assert + self.assertEqual([], sut.batch) + self.assertIsNone(sut.timer) + + sut._client.insert.assert_called_once_with( + table_name, + [["entry_1", "entry_2"], ["entry_3", "entry_4"]], + column_names=column_names, + ) + + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_insert_all_with_timer(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + sut._client = Mock() + sut.timer = Mock() + sut.batch = [["entry_1", "entry_2"]] + + # Act + sut.insert_all() + + # Assert + self.assertEqual([], sut.batch) + self.assertIsNone(sut.timer) + + sut._client.insert.assert_called_once_with( + table_name, + [["entry_1", "entry_2"]], + column_names=column_names, + ) + + +class TestStartTimer(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.BATCH_TIMEOUT", 0.5) + @patch("src.monitoring.clickhouse_batch_sender.Timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_start_timer(self, mock_clickhouse_connect, mock_timer): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + # Act + sut._start_timer() + + # Assert + mock_timer.assert_called_once_with( + 0.5, + sut.insert_all, + ) + mock_timer.cancel.assert_not_called() + sut.timer.start.assert_called_once() + + @patch("src.monitoring.clickhouse_batch_sender.BATCH_TIMEOUT", 0.5) + @patch("src.monitoring.clickhouse_batch_sender.Timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_start_timer_with_running_timer(self, mock_clickhouse_connect, mock_timer): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + sut.timer = mock_timer + + # Act + sut._start_timer() + + # Assert + mock_timer.assert_called_once_with( + 0.5, + sut.insert_all, + ) + mock_timer.cancel.assert_called_once() + sut.timer.start.assert_called_once() diff --git a/tests/test_clickhouse_connector.py b/tests/clickhouse/test_clickhouse_connector.py similarity index 76% rename from tests/test_clickhouse_connector.py rename to tests/clickhouse/test_clickhouse_connector.py index 7ff84fe..bba9a89 100644 --- a/tests/test_clickhouse_connector.py +++ b/tests/clickhouse/test_clickhouse_connector.py @@ -322,30 +322,7 @@ def test_init(self, mock_clickhouse_batch_sender): ) @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_all_given_as_str(self, mock_clickhouse_batch_sender): - # Arrange - logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") - batch_id = uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6") - - sut = LoglineToBatchesConnector() - - with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: - # Act - sut.insert( - logline_id=logline_id, - batch_id=batch_id, - ) - - # Assert - mock_add_to_batch.assert_called_once_with( - [ - uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), - uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6"), - ] - ) - - @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") - def test_insert_all_given_as_uuid(self, mock_clickhouse_batch_sender): + def test_insert_all_given(self, mock_clickhouse_batch_sender): # Arrange logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") batch_id = uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6") @@ -560,5 +537,192 @@ def test_insert_all_given(self, mock_clickhouse_batch_sender): ) +class TestSuspiciousBatchesToBatchConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "suspicious_batches_to_batch" + expected_column_names = [ + "suspicious_batch_id", + "batch_id", + ] + + # Act + sut = SuspiciousBatchesToBatchConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + suspicious_batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + batch_id = uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6") + + sut = SuspiciousBatchesToBatchConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + suspicious_batch_id=suspicious_batch_id, + batch_id=batch_id, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6"), + ] + ) + + +class TestSuspiciousBatchTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "suspicious_batch_timestamps" + expected_column_names = [ + "suspicious_batch_id", + "client_ip", + "stage", + "status", + "timestamp", + "is_active", + "message_count", + ] + + # Act + sut = SuspiciousBatchTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + suspicious_batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + client_ip = "127.0.0.1" + stage = "prefilter" + status = "prefilter_out" + timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + message_count = 456 + + sut = SuspiciousBatchTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + suspicious_batch_id=suspicious_batch_id, + client_ip=client_ip, + stage=stage, + status=status, + timestamp=timestamp, + is_active=True, + message_count=message_count, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "127.0.0.1", + "prefilter", + "prefilter_out", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + True, + 456, + ] + ) + + +class TestAlertsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "alerts" + expected_column_names = [ + "client_ip", + "alert_timestamp", + "suspicious_batch_id", + "overall_score", + "result", + ] + + # Act + sut = AlertsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + client_ip = "127.0.0.1" + alert_timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + suspicious_batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + overall_score = 15.4 + result = "test" + + sut = AlertsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + client_ip=client_ip, + alert_timestamp=alert_timestamp, + suspicious_batch_id=suspicious_batch_id, + overall_score=overall_score, + result=result, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + "127.0.0.1", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + 15.4, + "test", + ] + ) + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_clickhouse_kafka_sender.py b/tests/clickhouse/test_clickhouse_kafka_sender.py similarity index 100% rename from tests/test_clickhouse_kafka_sender.py rename to tests/clickhouse/test_clickhouse_kafka_sender.py diff --git a/tests/detector/__init__.py b/tests/detector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_detector.py b/tests/detector/test_detector.py similarity index 100% rename from tests/test_detector.py rename to tests/detector/test_detector.py diff --git a/tests/inspector/__init__.py b/tests/inspector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_inspector.py b/tests/inspector/test_inspector.py similarity index 95% rename from tests/test_inspector.py rename to tests/inspector/test_inspector.py index c9e7b95..a3fa1f4 100644 --- a/tests/test_inspector.py +++ b/tests/inspector/test_inspector.py @@ -353,41 +353,44 @@ def test_inspect_empty_models( with self.assertRaises(NotImplementedError): sut.inspect() - # TODO: Update this test as it is not being called - # @patch("src.inspector.inspector.logger") - # @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - # @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - # @patch( - # "src.inspector.inspector.MODELS", - # [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}], - # ) - # @patch("src.inspector.inspector.TIME_TYPE", "ms") - # @patch("src.inspector.inspector.TIME_RANGE", 1) - # def test_inspect_univariate( - # self, mock_kafka_consume_handler, mock_produce_handler, mock_logger - # ): - # test_batch = get_batch(None) - # test_batch.begin_timestamp = datetime.now() - # test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - # data = DEFAULT_DATA - # data["timestamp"] = datetime.strftime( - # test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - # ) - # test_batch.data = [data] - # mock_kafka_consume_handler_instance = MagicMock() - # mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - # mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - # "test", - # test_batch, - # ) - # mock_produce_handler_instance = MagicMock() - # mock_produce_handler.return_value = mock_produce_handler_instance - # - # # with patch("src.inspector.inspector.ClickHouseKafkaSender") as mock_clickhouse: - # sut = Inspector() - # sut.get_and_fill_data() - # sut.inspect() - # self.assertEqual([0, 0], sut.anomalies) + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch( + "src.inspector.inspector.MODELS", + [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}], + ) + @patch("src.inspector.inspector.TIME_TYPE", "ms") + @patch("src.inspector.inspector.TIME_RANGE", 1) + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_univariate( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + test_batch = get_batch(None) + test_batch.begin_timestamp = datetime.now() + test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["timestamp"] = datetime.strftime( + test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT + ) + test_batch.data = [data] + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + mock_produce_handler_instance = MagicMock() + mock_produce_handler.return_value = mock_produce_handler_instance + + sut = Inspector() + sut.get_and_fill_data() + sut.inspect() + self.assertEqual([0, 0], sut.anomalies) @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @@ -405,7 +408,7 @@ def test_inspect_empty_models( @patch("src.inspector.inspector.TIME_TYPE", "ms") @patch("src.inspector.inspector.TIME_RANGE", 1) @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_univariate( + def test_inspect_univariate_2( self, mock_clickhouse, mock_kafka_consume_handler, diff --git a/tests/kafka/__init__.py b/tests/kafka/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_exactly_once_kafka_consume_handler.py b/tests/kafka/test_exactly_once_kafka_consume_handler.py similarity index 100% rename from tests/test_exactly_once_kafka_consume_handler.py rename to tests/kafka/test_exactly_once_kafka_consume_handler.py diff --git a/tests/test_exactly_once_kafka_produce_handler.py b/tests/kafka/test_exactly_once_kafka_produce_handler.py similarity index 100% rename from tests/test_exactly_once_kafka_produce_handler.py rename to tests/kafka/test_exactly_once_kafka_produce_handler.py diff --git a/tests/test_kafka_consume_handler.py b/tests/kafka/test_kafka_consume_handler.py similarity index 100% rename from tests/test_kafka_consume_handler.py rename to tests/kafka/test_kafka_consume_handler.py diff --git a/tests/test_kafka_handler.py b/tests/kafka/test_kafka_handler.py similarity index 100% rename from tests/test_kafka_handler.py rename to tests/kafka/test_kafka_handler.py diff --git a/tests/test_kafka_produce_handler.py b/tests/kafka/test_kafka_produce_handler.py similarity index 100% rename from tests/test_kafka_produce_handler.py rename to tests/kafka/test_kafka_produce_handler.py diff --git a/tests/test_simple_kafka_consume_handler.py b/tests/kafka/test_simple_kafka_consume_handler.py similarity index 100% rename from tests/test_simple_kafka_consume_handler.py rename to tests/kafka/test_simple_kafka_consume_handler.py diff --git a/tests/test_simple_kafka_produce_handler.py b/tests/kafka/test_simple_kafka_produce_handler.py similarity index 100% rename from tests/test_simple_kafka_produce_handler.py rename to tests/kafka/test_simple_kafka_produce_handler.py diff --git a/tests/logcollector/__init__.py b/tests/logcollector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_batch_handler.py b/tests/logcollector/test_batch_handler.py similarity index 100% rename from tests/test_batch_handler.py rename to tests/logcollector/test_batch_handler.py diff --git a/tests/test_buffered_batch.py b/tests/logcollector/test_buffered_batch.py similarity index 100% rename from tests/test_buffered_batch.py rename to tests/logcollector/test_buffered_batch.py diff --git a/tests/test_collector.py b/tests/logcollector/test_collector.py similarity index 100% rename from tests/test_collector.py rename to tests/logcollector/test_collector.py diff --git a/tests/logserver/__init__.py b/tests/logserver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_server.py b/tests/logserver/test_server.py similarity index 100% rename from tests/test_server.py rename to tests/logserver/test_server.py diff --git a/tests/miscellaneous/__init__.py b/tests/miscellaneous/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_field_type.py b/tests/miscellaneous/test_field_type.py similarity index 100% rename from tests/test_field_type.py rename to tests/miscellaneous/test_field_type.py diff --git a/tests/test_log_config.py b/tests/miscellaneous/test_log_config.py similarity index 100% rename from tests/test_log_config.py rename to tests/miscellaneous/test_log_config.py diff --git a/tests/test_logline_handler.py b/tests/miscellaneous/test_logline_handler.py similarity index 94% rename from tests/test_logline_handler.py rename to tests/miscellaneous/test_logline_handler.py index a88d5c2..dee6f16 100644 --- a/tests/test_logline_handler.py +++ b/tests/miscellaneous/test_logline_handler.py @@ -113,10 +113,32 @@ def test_init_missing_fields(self, mock_create): str(context.exception), "Not all needed fields are set in the configuration" ) + @patch("src.base.logline_handler.REQUIRED_FIELDS", ["field_1"]) + @patch("src.base.logline_handler.LOGLINE_FIELDS", ["field_1", "field_2"]) + @patch("src.base.logline_handler.FORBIDDEN_FIELD_NAMES", ["field_2"]) + @patch("src.base.logline_handler.LoglineHandler._create_instance_from_list_entry") + def test_init_no_fields(self, mock_create): + # Arrange + ip_address_instance = MagicMock() + ip_address_instance.name = "field_2" + mock_create.side_effect = [ip_address_instance] + + # Act and Assert + with self.assertRaises(ValueError) as context: + LoglineHandler() + + self.assertEqual( + str(context.exception), + "Forbidden field name included. " + "These fields are used internally and cannot be used as names: " + "['field_2']", + ) + @patch("src.base.logline_handler.REQUIRED_FIELDS", []) @patch("src.base.logline_handler.LOGLINE_FIELDS", []) + @patch("src.base.logline_handler.LOGLINE_FIELDS", []) @patch("src.base.logline_handler.LoglineHandler._create_instance_from_list_entry") - def test_init_no_fields(self, mock_create): + def test_init_forbidden_fields(self, mock_create): # Arrange mock_create.side_effect = [] diff --git a/tests/test_marshmallow.py b/tests/miscellaneous/test_marshmallow.py similarity index 100% rename from tests/test_marshmallow.py rename to tests/miscellaneous/test_marshmallow.py diff --git a/tests/miscellaneous/test_monitoring_agent.py b/tests/miscellaneous/test_monitoring_agent.py new file mode 100644 index 0000000..ead50b0 --- /dev/null +++ b/tests/miscellaneous/test_monitoring_agent.py @@ -0,0 +1,190 @@ +import datetime +import unittest +import uuid +from unittest.mock import patch, AsyncMock, Mock, mock_open + +import marshmallow_dataclass + +from src.base.data_classes.clickhouse_connectors import ServerLogs +from src.monitoring.monitoring_agent import CREATE_TABLES_DIRECTORY, main +from src.monitoring.monitoring_agent import MonitoringAgent, prepare_all_tables + + +class TestPrepareAllTables(unittest.TestCase): + @patch("os.listdir", return_value=["table1.sql", "table2.sql", "not_sql.txt"]) + @patch("builtins.open", new_callable=mock_open, read_data="CREATE TABLE test;") + @patch("clickhouse_connect.get_client") + def test_prepare_all_tables(self, mock_get_client, mock_open_file, mock_listdir): + # Arrange + mock_client = Mock() + mock_get_client.return_value.__enter__.return_value = mock_client + + # Act + prepare_all_tables() + + # Assert + mock_listdir.assert_called_once_with(CREATE_TABLES_DIRECTORY) + self.assertEqual(mock_open_file.call_count, 2) + mock_client.command.assert_called_with("CREATE TABLE test;") + self.assertEqual(mock_client.command.call_count, 2) + + @patch("os.listdir", return_value=["table1.sql"]) + @patch("builtins.open", new_callable=mock_open, read_data="CREATE TABLE test;") + @patch("clickhouse_connect.get_client") + def test_prepare_all_tables_with_exception( + self, mock_get_client, mock_open_file, mock_listdir + ): + # Arrange + mock_client = Mock() + mock_get_client.return_value.__enter__.return_value = mock_client + + mock_client.command.side_effect = Exception("Simulated Error") + + # Act + with self.assertRaises(Exception) as context: + prepare_all_tables() + + # Assert + self.assertEqual(str(context.exception), "Simulated Error") + + +class TestInit(unittest.TestCase): + @patch("src.monitoring.monitoring_agent.ServerLogsConnector") + @patch("src.monitoring.monitoring_agent.ServerLogsTimestampsConnector") + @patch("src.monitoring.monitoring_agent.FailedDNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineToBatchesConnector") + @patch("src.monitoring.monitoring_agent.DNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineTimestampsConnector") + @patch("src.monitoring.monitoring_agent.BatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchesToBatchConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.AlertsConnector") + @patch("src.monitoring.monitoring_agent.SimpleKafkaConsumeHandler") + def test_init( + self, + mock_kafka_consumer, + mock_alerts, + mock_suspicious_batch_timestamps, + mock_suspicious_batches_to_batch, + mock_batch_timestamps, + mock_logline_timestamps, + mock_dns_loglines, + mock_logline_to_batches, + mock_failed_dns_loglines, + mock_server_logs_timestamps, + mock_server_logs, + ): + # Arrange + expected_topics = [ + "clickhouse_server_logs", + "clickhouse_server_logs_timestamps", + "clickhouse_failed_dns_loglines", + "clickhouse_logline_to_batches", + "clickhouse_dns_loglines", + "clickhouse_logline_timestamps", + "clickhouse_batch_timestamps", + "clickhouse_suspicious_batches_to_batch", + "clickhouse_suspicious_batch_timestamps", + "clickhouse_alerts", + ] + + # Act + sut = MonitoringAgent() + + # Assert + self.assertEqual( + expected_topics, + sut.topics, + ) + mock_kafka_consumer.assert_called_once_with(expected_topics) + + +class TestStart(unittest.IsolatedAsyncioTestCase): + @patch("src.monitoring.monitoring_agent.ServerLogsConnector") + @patch("src.monitoring.monitoring_agent.ServerLogsTimestampsConnector") + @patch("src.monitoring.monitoring_agent.FailedDNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineToBatchesConnector") + @patch("src.monitoring.monitoring_agent.DNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineTimestampsConnector") + @patch("src.monitoring.monitoring_agent.BatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchesToBatchConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.AlertsConnector") + @patch("src.monitoring.monitoring_agent.logger") + @patch("src.monitoring.monitoring_agent.SimpleKafkaConsumeHandler") + @patch("asyncio.get_running_loop") + async def test_handle_kafka_inputs( + self, + mock_get_running_loop, + mock_kafka_consume, + mock_logger, + mock_alerts, + mock_suspicious_batch_timestamps, + mock_suspicious_batches_to_batch, + mock_batch_timestamps, + mock_logline_timestamps, + mock_dns_loglines, + mock_logline_to_batches, + mock_failed_dns_loglines, + mock_server_logs_timestamps, + mock_server_logs, + ): + # Arrange + sut = MonitoringAgent() + sut.connectors["server_logs"] = Mock() + + data_schema = marshmallow_dataclass.class_schema(ServerLogs)() + fixed_id = uuid.uuid4() + timestamp_in = datetime.datetime.now() + value = data_schema.dumps( + { + "message_id": fixed_id, + "timestamp_in": timestamp_in, + "message_text": "test_text", + } + ) + + mock_loop = AsyncMock() + mock_get_running_loop.return_value = mock_loop + sut.kafka_consumer.consume.return_value = ( + "key1", + value, + "clickhouse_server_logs", + ) + mock_loop.run_in_executor.side_effect = [ + ("key1", value, "clickhouse_server_logs"), + KeyboardInterrupt(), + ] + + # Act and Assert + await sut.start() + + sut.connectors["server_logs"].insert.assert_called_once_with( + message_id=fixed_id, + timestamp_in=timestamp_in, + message_text="test_text", + ) + + +class TestMain(unittest.TestCase): + @patch("src.monitoring.monitoring_agent.prepare_all_tables") + @patch("src.monitoring.monitoring_agent.MonitoringAgent") + @patch("asyncio.run") + def test_main( + self, mock_asyncio_run, mock_monitoring_agent, mock_prepare_all_tables + ): + # Arrange + mock_agent_instance = Mock() + mock_monitoring_agent.return_value = mock_agent_instance + + # Act + main() + + # Assert + mock_prepare_all_tables.assert_called_once() + mock_monitoring_agent.assert_called_once() + mock_asyncio_run.assert_called_once_with(mock_agent_instance.start()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_utils.py b/tests/miscellaneous/test_utils.py similarity index 100% rename from tests/test_utils.py rename to tests/miscellaneous/test_utils.py diff --git a/tests/prefilter/__init__.py b/tests/prefilter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_prefilter.py b/tests/prefilter/test_prefilter.py similarity index 100% rename from tests/test_prefilter.py rename to tests/prefilter/test_prefilter.py From ad612eb000b5f292ab40bb7418f52e0263d30029 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 09:20:18 +0100 Subject: [PATCH 49/59] Fix test_coverage.yml workflow to discover all tests --- .github/workflows/test_coverage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_coverage.yml b/.github/workflows/test_coverage.yml index c59bc2f..4c75df7 100644 --- a/.github/workflows/test_coverage.yml +++ b/.github/workflows/test_coverage.yml @@ -56,7 +56,7 @@ jobs: - name: Test run: | - python -m coverage run -m unittest + python -m coverage run -m unittest discover tests python -m coverage xml - name: Get Coverage From 2c3f602293249dc25387f1a6a7d5993d36bf9b08 Mon Sep 17 00:00:00 2001 From: "Stefan M." Date: Fri, 13 Dec 2024 09:34:44 +0100 Subject: [PATCH 50/59] Updating MINGW64 install --- .github/workflows/build_test_windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_test_windows.yml b/.github/workflows/build_test_windows.yml index b409d31..869bb69 100644 --- a/.github/workflows/build_test_windows.yml +++ b/.github/workflows/build_test_windows.yml @@ -32,7 +32,7 @@ jobs: requirements/requirements.*.txt - name: Setup msys2 - uses: msys2/setup-msys2@v2 + uses: msys2/setup-msys2@v2.26.0 with: msystem: MINGW64 update: true From c74ed7007aad824a077c9faeec591b44652836ae Mon Sep 17 00:00:00 2001 From: "Stefan M." Date: Fri, 13 Dec 2024 09:40:45 +0100 Subject: [PATCH 51/59] Fix packages of mingw --- .github/workflows/build_test_windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_test_windows.yml b/.github/workflows/build_test_windows.yml index 869bb69..801774c 100644 --- a/.github/workflows/build_test_windows.yml +++ b/.github/workflows/build_test_windows.yml @@ -36,7 +36,7 @@ jobs: with: msystem: MINGW64 update: true - install: git unzip mingw-w64-x86_64-libjpeg-turbo mingw-w64-x86_64-zlib mingw-w64-x86_64-libtiff mingw-w64-x86_64-freetype mingw-w64-x86_64-lcms2 mingw-w64-x86_64-libwebp mingw-w64-x86_64-openjpeg2 mingw-w64-x86_64-libimagequant mingw-w64-x86_64-libraqm mingw-w64-x86_64-gcc mingw-w64-x86_64-python3 mingw-w64-x86_64-python3-pip mingw-w64-x86_64-python3-setuptools + install: git unzip mingw-w64-x86_64-libjpeg-turbo mingw-w64-x86_64-zlib mingw-w64-x86_64-libtiff mingw-w64-x86_64-freetype mingw-w64-x86_64-lcms2 mingw-w64-x86_64-libwebp mingw-w64-x86_64-openjpeg2 mingw-w64-x86_64-libimagequant mingw-w64-x86_64-libraqm mingw-w64-x86_64-gcc mingw-w64-x86_64-python mingw-w64-x86_64-python-pip mingw-w64-x86_64-python-setuptools - name: Install requirements run: | From 9510e5c9578054350845204ec4589254fd46b6f7 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 14:16:48 +0100 Subject: [PATCH 52/59] Add Dockerfile.monitoring --- docker/dockerfiles/Dockerfile.monitoring | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docker/dockerfiles/Dockerfile.monitoring diff --git a/docker/dockerfiles/Dockerfile.monitoring b/docker/dockerfiles/Dockerfile.monitoring new file mode 100644 index 0000000..cf181a7 --- /dev/null +++ b/docker/dockerfiles/Dockerfile.monitoring @@ -0,0 +1,16 @@ +FROM python:3.11-slim-bookworm + +ENV PYTHONDONTWRITEBYTECODE=1 + +WORKDIR /usr/src/app + +COPY requirements/requirements.monitoring.txt ./ +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.monitoring.txt + +COPY src/base ./src/base +COPY src/monitoring ./src/monitoring +COPY config.yaml . + +RUN rm -rf /root/.cache + +CMD [ "python", "src/monitoring/monitoring_agent.py"] From 68a7a74710cd431dce4d3e3b018c87e9ecf1513a Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 14:17:51 +0100 Subject: [PATCH 53/59] Rename dev-query.py to query.dev.py --- docker/dockerfiles/Dockerfile.dev-query | 2 +- docker/dockerfiles/Dockerfile.logcollector | 2 +- docker/dockerfiles/Dockerfile.prefilter | 2 +- docker/{dev-query.py => query.dev.py} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename docker/{dev-query.py => query.dev.py} (100%) diff --git a/docker/dockerfiles/Dockerfile.dev-query b/docker/dockerfiles/Dockerfile.dev-query index e6cbf47..6f44e11 100644 --- a/docker/dockerfiles/Dockerfile.dev-query +++ b/docker/dockerfiles/Dockerfile.dev-query @@ -12,4 +12,4 @@ COPY docker/dev-query.py . RUN rm -rf /root/.cache -CMD [ "python", "dev-query.py"] +CMD [ "python", "query.dev.py"] diff --git a/docker/dockerfiles/Dockerfile.logcollector b/docker/dockerfiles/Dockerfile.logcollector index aaac14e..35d70bd 100644 --- a/docker/dockerfiles/Dockerfile.logcollector +++ b/docker/dockerfiles/Dockerfile.logcollector @@ -5,7 +5,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 WORKDIR /usr/src/app COPY requirements/requirements.logcollector.txt ./ -RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.logcollector.txt +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.logcollector.txt COPY src/base ./src/base COPY src/logcollector ./src/logcollector diff --git a/docker/dockerfiles/Dockerfile.prefilter b/docker/dockerfiles/Dockerfile.prefilter index bb4646c..c2b3ca4 100644 --- a/docker/dockerfiles/Dockerfile.prefilter +++ b/docker/dockerfiles/Dockerfile.prefilter @@ -5,7 +5,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 WORKDIR /usr/src/app COPY requirements/requirements.prefilter.txt ./ -RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.prefilter.txt +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.prefilter.txt COPY src/base ./src/base COPY src/prefilter ./src/prefilter diff --git a/docker/dev-query.py b/docker/query.dev.py similarity index 100% rename from docker/dev-query.py rename to docker/query.dev.py From 07cb41f119de59d03f36c7fa78419cfcbb037534 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 14:19:56 +0100 Subject: [PATCH 54/59] Rename dev-query.py to query.dev.py --- docker/dockerfiles/Dockerfile.dev-query | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/dockerfiles/Dockerfile.dev-query b/docker/dockerfiles/Dockerfile.dev-query index 6f44e11..f730d1a 100644 --- a/docker/dockerfiles/Dockerfile.dev-query +++ b/docker/dockerfiles/Dockerfile.dev-query @@ -8,7 +8,7 @@ RUN pip --disable-pip-version-check install --no-cache-dir --no-compile clickhou COPY src/base ./src/base COPY config.yaml . -COPY docker/dev-query.py . +COPY docker/query.dev.py . RUN rm -rf /root/.cache From d0ee5f9cec25b2369573cc804cdf3efe683ce416 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 16:07:18 +0100 Subject: [PATCH 55/59] Fix bugs --- docker/docker-compose.dev-query.yml | 2 +- docker/docker-compose.yml | 26 -------------------------- docker/query.dev.py | 13 +++++-------- src/logcollector/batch_handler.py | 2 +- src/logcollector/collector.py | 8 ++++++-- src/monitoring/monitoring_agent.py | 2 ++ src/prefilter/prefilter.py | 2 +- 7 files changed, 16 insertions(+), 39 deletions(-) diff --git a/docker/docker-compose.dev-query.yml b/docker/docker-compose.dev-query.yml index c868cb3..ade6ae4 100644 --- a/docker/docker-compose.dev-query.yml +++ b/docker/docker-compose.dev-query.yml @@ -2,7 +2,7 @@ include: - "docker-compose.kafka.yml" services: - sandbox: + dev-query: build: context: .. dockerfile: docker/dockerfiles/Dockerfile.dev-query diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 6ded56e..227a78c 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,10 +15,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logserver: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.7 @@ -45,8 +41,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.8 @@ -76,14 +70,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logserver: - condition: service_started - prefilter: - condition: service_started - logcollector: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.6 @@ -111,12 +97,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logcollector: - condition: service_started - logserver: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.9 @@ -144,12 +124,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logcollector: - condition: service_started - logserver: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.10 diff --git a/docker/query.dev.py b/docker/query.dev.py index 114b20d..8c274c3 100644 --- a/docker/query.dev.py +++ b/docker/query.dev.py @@ -1,7 +1,5 @@ -import datetime import os import sys -import uuid import clickhouse_connect @@ -25,16 +23,15 @@ def query_once(client, tables): return tables +def reset_tables(client, tables): + for table_name in tables.keys(): + tables[table_name] = client.command(f"DROP TABLE {table_name};") + + def main(): client = clickhouse_connect.get_client(host="172.27.0.11", port=8123) tables = get_tables() - client.insert( - "server_logs", - [[uuid.uuid4(), datetime.datetime.now(), "This is a logline"]], - ["message_id", "timestamp_in", "message_text"], - ) - results = query_once(client, tables) for key in results: diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index d576790..34d9e5f 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -82,7 +82,7 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: # create new batch self.batch[key] = [message] new_batch_id = uuid.uuid4() - self.batch_id[key] = [new_batch_id] + self.batch_id[key] = new_batch_id self.logline_to_batches.insert( dict( diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 8e15042..3ceebd5 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -24,6 +24,7 @@ IPV6_PREFIX_LENGTH = config["pipeline"]["log_collection"]["batch_handler"]["subnet_id"][ "ipv6_prefix_length" ] +TIMESTAMP_FORMAT = config["environment"]["timestamp_format"] REQUIRED_FIELDS = ["timestamp", "status_code", "client_ip", "record_type"] BATCH_SIZE = config["pipeline"]["log_collection"]["batch_handler"]["batch_size"] CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ @@ -107,6 +108,7 @@ async def send(self) -> None: message_text=logline, timestamp_in=timestamp_in, timestamp_failed=datetime.datetime.now(), + reason_for_failure=None, # TODO: Add actual reason ) ) continue @@ -125,11 +127,13 @@ async def send(self) -> None: dict( logline_id=logline_id, subnet_id=subnet_id, - timestamp=fields.get("timestamp"), + timestamp=datetime.datetime.strptime( + fields.get("timestamp"), TIMESTAMP_FORMAT + ), status_code=fields.get("status_code"), client_ip=fields.get("client_ip"), record_type=fields.get("record_type"), - additional_fields=additional_fields, + additional_fields=json.dumps(additional_fields), ) ) diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py index e1ae219..030dca2 100644 --- a/src/monitoring/monitoring_agent.py +++ b/src/monitoring/monitoring_agent.py @@ -73,6 +73,8 @@ async def start(self): self.connectors[table_name].insert(**asdict(data)) except KeyboardInterrupt: logger.info("Stopped MonitoringAgent.") + except Exception as e: + logger.warning(e) def main(): diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index f43ef03..c766d3a 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -109,7 +109,7 @@ def filter_by_error(self) -> None: if self.logline_handler.check_relevance(e): self.filtered_data.append(e) else: # not relevant, filtered out - logline_id = uuid.UUID(json.loads(e).get("logline_id")) + logline_id = uuid.UUID(e.get("logline_id")) self.logline_timestamps.insert( dict( From 1d66567048923d12903a6971895a07e3570000ba Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Fri, 13 Dec 2024 16:16:31 +0100 Subject: [PATCH 56/59] Fix test for Prefilter --- tests/prefilter/test_prefilter.py | 223 ++++++++++++++---------------- 1 file changed, 105 insertions(+), 118 deletions(-) diff --git a/tests/prefilter/test_prefilter.py b/tests/prefilter/test_prefilter.py index cf01d85..26caf39 100644 --- a/tests/prefilter/test_prefilter.py +++ b/tests/prefilter/test_prefilter.py @@ -1,5 +1,4 @@ import datetime -import json import unittest import uuid from unittest.mock import MagicMock, patch @@ -177,45 +176,41 @@ def test_filter_by_error_with_data_no_error_types( mock_logline_handler, mock_logger, ): - first_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-05-21T08:31:28.119Z", - "status_code": "NOERROR", - "client_ip": "192.168.0.105", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-botanik.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) - second_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-06-01T02:31:07.943Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.biotech-hei.com", - "record_type": "AAAA", - "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", - "size": "117b", - } - ) - third_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-06-01T01:37:41.796Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-stadtbibliothek.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) + first_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-05-21T08:31:28.119Z", + "status_code": "NOERROR", + "client_ip": "192.168.0.105", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-botanik.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } + + second_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T02:31:07.943Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.biotech-hei.com", + "record_type": "AAAA", + "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", + "size": "117b", + } + + third_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T01:37:41.796Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-stadtbibliothek.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } sut = Prefilter() sut.unfiltered_data = [first_entry, second_entry, third_entry] @@ -238,45 +233,41 @@ def test_filter_by_error_with_data_one_error_type( mock_logline_handler, mock_logger, ): - first_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-05-21T08:31:28.119Z", - "status_code": "NOERROR", - "client_ip": "192.168.0.105", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-botanik.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) - second_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-06-01T02:31:07.943Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.biotech-hei.com", - "record_type": "AAAA", - "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", - "size": "117b", - } - ) - third_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-06-01T01:37:41.796Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-stadtbibliothek.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) + first_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-05-21T08:31:28.119Z", + "status_code": "NOERROR", + "client_ip": "192.168.0.105", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-botanik.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } + + second_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T02:31:07.943Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.biotech-hei.com", + "record_type": "AAAA", + "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", + "size": "117b", + } + + third_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T01:37:41.796Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-stadtbibliothek.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } sut = Prefilter() sut.unfiltered_data = [first_entry, second_entry, third_entry] @@ -299,45 +290,41 @@ def test_filter_by_error_with_data_two_error_types( mock_logline_handler, mock_logger, ): - first_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-05-21T08:31:28.119Z", - "status_code": "NOERROR", - "client_ip": "192.168.0.105", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-botanik.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) - second_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-06-01T02:31:07.943Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.biotech-hei.com", - "record_type": "AAAA", - "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", - "size": "117b", - } - ) - third_entry = json.dumps( - { - "logline_id": str(uuid.uuid4()), - "timestamp": "2024-06-01T01:37:41.796Z", - "status_code": "OTHER_TYPE", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-stadtbibliothek.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) + first_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-05-21T08:31:28.119Z", + "status_code": "NOERROR", + "client_ip": "192.168.0.105", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-botanik.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } + + second_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T02:31:07.943Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.biotech-hei.com", + "record_type": "AAAA", + "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", + "size": "117b", + } + + third_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T01:37:41.796Z", + "status_code": "OTHER_TYPE", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-stadtbibliothek.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } sut = Prefilter() sut.unfiltered_data = [first_entry, second_entry, third_entry] From 27c061ffc13e3897cd18d1273efc06cec599e493 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Sun, 15 Dec 2024 19:10:33 +0100 Subject: [PATCH 57/59] Update test for Inspector --- tests/inspector/test_inspector.py | 36 +++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/inspector/test_inspector.py b/tests/inspector/test_inspector.py index a3fa1f4..418124d 100644 --- a/tests/inspector/test_inspector.py +++ b/tests/inspector/test_inspector.py @@ -880,6 +880,42 @@ def test_send( key="192.168.0.167", ) + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.SCORE_THRESHOLD", 0.1) + @patch("src.inspector.inspector.ANOMALY_THRESHOLD", 0.01) + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_send_not_suspicious( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_produce_handler_instance = MagicMock() + mock_produce_handler.return_value = mock_produce_handler_instance + batch_schema = marshmallow_dataclass.class_schema(Batch)() + + sut = Inspector() + sut.anomalies = [0.0, 0.0] + sut.X = np.array([[0.0], [0.0]]) + sut.begin_timestamp = datetime.now() + sut.end_timestamp = datetime.now() + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["timestamp"] = datetime.strftime( + sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT + ) + sut.messages = [data] + mock_batch_id = uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") + with patch("src.inspector.inspector.uuid") as mock_uuid: + mock_uuid.uuid4.return_value = mock_batch_id + sut.send_data() + + mock_produce_handler_instance.produce.assert_not_called() + class TestMainFunction(unittest.TestCase): @patch("src.inspector.inspector.logger") From 18b8e30345f659efc17adb806c278ce0dff580e0 Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 17 Dec 2024 12:37:35 +0100 Subject: [PATCH 58/59] Add docstrings for clickhouse_kafka_sender.py --- src/base/clickhouse_kafka_sender.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/base/clickhouse_kafka_sender.py b/src/base/clickhouse_kafka_sender.py index f45a425..722ad29 100644 --- a/src/base/clickhouse_kafka_sender.py +++ b/src/base/clickhouse_kafka_sender.py @@ -1,3 +1,8 @@ +""" +The ClickHouseKafkaSender serves as the sender for all inserts into ClickHouse. Whenever a class wants to insert +into a ClickHouse table, the ClickHouseKafkaSender is used to send the respective insert via Kafka. +""" + import os import sys @@ -12,6 +17,8 @@ class ClickHouseKafkaSender: + """Sends insert operations for the specified table via Kafka to the MonitoringAgent.""" + def __init__(self, table_name: str): self.table_name = table_name self.kafka_producer = SimpleKafkaProduceHandler() @@ -20,6 +27,7 @@ def __init__(self, table_name: str): )() def insert(self, data: dict): + """Produces the insert operation to Kafka.""" self.kafka_producer.produce( topic=f"clickhouse_{self.table_name}", data=self.data_schema.dumps(data), From 35aa1eac453497005c03d881f89b240da359579a Mon Sep 17 00:00:00 2001 From: Manuel Fuchs Date: Tue, 17 Dec 2024 14:54:16 +0100 Subject: [PATCH 59/59] Move monitoring related docker compose entries to own file --- docker/docker-compose.monitoring.yml | 47 ++++++++++++++++++++++++++++ docker/docker-compose.yml | 36 +-------------------- 2 files changed, 48 insertions(+), 35 deletions(-) create mode 100644 docker/docker-compose.monitoring.yml diff --git a/docker/docker-compose.monitoring.yml b/docker/docker-compose.monitoring.yml new file mode 100644 index 0000000..3ec9d85 --- /dev/null +++ b/docker/docker-compose.monitoring.yml @@ -0,0 +1,47 @@ +include: + - "docker-compose.kafka.yml" + +services: + clickhouse-server: + image: clickhouse/clickhouse-server:24.3.12.75-alpine + container_name: clickhouse-server + networks: + heidgaf: + ipv4_address: 172.27.0.11 + restart: "unless-stopped" + ports: + - "8123:8123" + - "9000:9000" + healthcheck: + test: [ "CMD-SHELL", "nc -z 127.0.0.1 8123" ] + interval: 10s + timeout: 5s + retries: 3 + + monitoring_agent: + build: + context: .. + dockerfile: docker/dockerfiles/Dockerfile.monitoring + network: host + restart: "unless-stopped" + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + clickhouse-server: + condition: service_healthy + networks: + heidgaf: + ipv4_address: 172.27.0.12 + +networks: + heidgaf: + driver: bridge + ipam: + driver: default + config: + - subnet: 172.27.0.0/16 + gateway: 172.27.0.1 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 227a78c..436d3a4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,5 +1,6 @@ include: - "docker-compose.kafka.yml" + - "docker-compose.monitoring.yml" services: logcollector: @@ -142,41 +143,6 @@ services: count: 1 # alternatively, use `count: all` for all GPUs capabilities: [ gpu ] - clickhouse-server: - image: clickhouse/clickhouse-server:24.3.12.75-alpine - container_name: clickhouse-server - networks: - heidgaf: - ipv4_address: 172.27.0.11 - restart: "unless-stopped" - ports: - - "8123:8123" - - "9000:9000" - healthcheck: - test: [ "CMD-SHELL", "nc -z 127.0.0.1 8123" ] - interval: 10s - timeout: 5s - retries: 3 - - monitoring_agent: - build: - context: .. - dockerfile: docker/dockerfiles/Dockerfile.monitoring - network: host - restart: "unless-stopped" - depends_on: - kafka1: - condition: service_healthy - kafka2: - condition: service_healthy - kafka3: - condition: service_healthy - clickhouse-server: - condition: service_healthy - networks: - heidgaf: - ipv4_address: 172.27.0.12 - networks: heidgaf: driver: bridge