diff --git a/.github/workflows/build_test_windows.yml b/.github/workflows/build_test_windows.yml index b409d31..801774c 100644 --- a/.github/workflows/build_test_windows.yml +++ b/.github/workflows/build_test_windows.yml @@ -32,11 +32,11 @@ jobs: requirements/requirements.*.txt - name: Setup msys2 - uses: msys2/setup-msys2@v2 + uses: msys2/setup-msys2@v2.26.0 with: msystem: MINGW64 update: true - install: git unzip mingw-w64-x86_64-libjpeg-turbo mingw-w64-x86_64-zlib mingw-w64-x86_64-libtiff mingw-w64-x86_64-freetype mingw-w64-x86_64-lcms2 mingw-w64-x86_64-libwebp mingw-w64-x86_64-openjpeg2 mingw-w64-x86_64-libimagequant mingw-w64-x86_64-libraqm mingw-w64-x86_64-gcc mingw-w64-x86_64-python3 mingw-w64-x86_64-python3-pip mingw-w64-x86_64-python3-setuptools + install: git unzip mingw-w64-x86_64-libjpeg-turbo mingw-w64-x86_64-zlib mingw-w64-x86_64-libtiff mingw-w64-x86_64-freetype mingw-w64-x86_64-lcms2 mingw-w64-x86_64-libwebp mingw-w64-x86_64-openjpeg2 mingw-w64-x86_64-libimagequant mingw-w64-x86_64-libraqm mingw-w64-x86_64-gcc mingw-w64-x86_64-python mingw-w64-x86_64-python-pip mingw-w64-x86_64-python-setuptools - name: Install requirements run: | diff --git a/.github/workflows/test_coverage.yml b/.github/workflows/test_coverage.yml index c59bc2f..4c75df7 100644 --- a/.github/workflows/test_coverage.yml +++ b/.github/workflows/test_coverage.yml @@ -56,7 +56,7 @@ jobs: - name: Test run: | - python -m coverage run -m unittest + python -m coverage run -m unittest discover tests python -m coverage xml - name: Get Coverage diff --git a/config.yaml b/config.yaml index 66abb3f..531365a 100644 --- a/config.yaml +++ b/config.yaml @@ -19,7 +19,6 @@ pipeline: log_storage: logserver: input_file: "/opt/file.txt" - max_number_of_connections: 1000 log_collection: collector: @@ -64,6 +63,11 @@ pipeline: base_url: https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/ threshold: 0.5 + monitoring: + clickhouse_connector: + batch_size: 10000 + batch_timeout: 2.0 + environment: timestamp_format: "%Y-%m-%dT%H:%M:%S.%fZ" kafka_brokers: @@ -80,3 +84,6 @@ environment: batch_sender_to_prefilter: "pipeline.batch_sender_to_prefilter" prefilter_to_inspector: "pipeline.prefilter_to_inspector" inspector_to_detector: "pipeline.inspector_to_detector" + monitoring: + clickhouse_server: + hostname: 172.27.0.11 diff --git a/docker/docker-compose.dev-query.yml b/docker/docker-compose.dev-query.yml new file mode 100644 index 0000000..ade6ae4 --- /dev/null +++ b/docker/docker-compose.dev-query.yml @@ -0,0 +1,56 @@ +include: + - "docker-compose.kafka.yml" + +services: + dev-query: + build: + context: .. 
+ dockerfile: docker/dockerfiles/Dockerfile.dev-query + network: host + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + networks: + heidgaf: + ipv4_address: 172.27.0.100 + memswap_limit: 768m + deploy: + resources: + limits: + cpus: '2' + memory: 512m + reservations: + cpus: '1' + memory: 256m + volumes: + - "${MOUNT_PATH:?MOUNT_PATH not set}:/opt/file.txt" + + clickhouse-server: + image: clickhouse/clickhouse-server:24.3.12.75-alpine + container_name: clickhouse-server + networks: + heidgaf: + ipv4_address: 172.27.0.11 + restart: "unless-stopped" + ports: + - "8123:8123" + - "9000:9000" + healthcheck: + test: [ "CMD-SHELL", "nc -z 127.0.0.1 8123" ] + interval: 10s + timeout: 5s + retries: 3 + + +networks: + heidgaf: + driver: bridge + ipam: + driver: default + config: + - subnet: 172.27.0.0/16 + gateway: 172.27.0.1 diff --git a/docker/docker-compose.monitoring.yml b/docker/docker-compose.monitoring.yml new file mode 100644 index 0000000..3ec9d85 --- /dev/null +++ b/docker/docker-compose.monitoring.yml @@ -0,0 +1,47 @@ +include: + - "docker-compose.kafka.yml" + +services: + clickhouse-server: + image: clickhouse/clickhouse-server:24.3.12.75-alpine + container_name: clickhouse-server + networks: + heidgaf: + ipv4_address: 172.27.0.11 + restart: "unless-stopped" + ports: + - "8123:8123" + - "9000:9000" + healthcheck: + test: [ "CMD-SHELL", "nc -z 127.0.0.1 8123" ] + interval: 10s + timeout: 5s + retries: 3 + + monitoring_agent: + build: + context: .. + dockerfile: docker/dockerfiles/Dockerfile.monitoring + network: host + restart: "unless-stopped" + depends_on: + kafka1: + condition: service_healthy + kafka2: + condition: service_healthy + kafka3: + condition: service_healthy + clickhouse-server: + condition: service_healthy + networks: + heidgaf: + ipv4_address: 172.27.0.12 + +networks: + heidgaf: + driver: bridge + ipam: + driver: default + config: + - subnet: 172.27.0.0/16 + gateway: 172.27.0.1 diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index f428797..436d3a4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -1,5 +1,6 @@ include: - "docker-compose.kafka.yml" + - "docker-compose.monitoring.yml" services: logcollector: @@ -15,10 +16,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logserver: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.7 @@ -45,8 +42,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.8 @@ -76,14 +71,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logserver: - condition: service_started - prefilter: - condition: service_started - logcollector: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.6 @@ -111,12 +98,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logcollector: - condition: service_started - logserver: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 172.27.0.9 @@ -144,12 +125,6 @@ services: condition: service_healthy kafka3: condition: service_healthy - logcollector: - condition: service_started - logserver: - condition: service_started - clickhouse-server: - condition: service_healthy networks: heidgaf: ipv4_address: 
172.27.0.10 @@ -168,22 +143,6 @@ services: count: 1 # alternatively, use `count: all` for all GPUs capabilities: [ gpu ] - clickhouse-server: - image: clickhouse/clickhouse-server:24.3.12.75-alpine - container_name: clickhouse-server - networks: - heidgaf: - ipv4_address: 172.27.0.11 - restart: "unless-stopped" - ports: - - "8123:8123" - - "9000:9000" - healthcheck: - test: [ "CMD-SHELL", "nc -z 127.0.0.1 8123" ] - interval: 10s - timeout: 5s - retries: 3 - networks: heidgaf: driver: bridge diff --git a/docker/dockerfiles/Dockerfile.dev-query b/docker/dockerfiles/Dockerfile.dev-query new file mode 100644 index 0000000..f730d1a --- /dev/null +++ b/docker/dockerfiles/Dockerfile.dev-query @@ -0,0 +1,15 @@ +FROM python:3.11-slim-bookworm + +ENV PYTHONDONTWRITEBYTECODE=1 + +WORKDIR /usr/src/app + +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile clickhouse_connect marshmallow_dataclass colorlog pyYAML confluent_kafka + +COPY src/base ./src/base +COPY config.yaml . +COPY docker/query.dev.py . + +RUN rm -rf /root/.cache + +CMD [ "python", "query.dev.py"] diff --git a/docker/dockerfiles/Dockerfile.logcollector b/docker/dockerfiles/Dockerfile.logcollector index aaac14e..35d70bd 100644 --- a/docker/dockerfiles/Dockerfile.logcollector +++ b/docker/dockerfiles/Dockerfile.logcollector @@ -5,7 +5,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 WORKDIR /usr/src/app COPY requirements/requirements.logcollector.txt ./ -RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.logcollector.txt +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.logcollector.txt COPY src/base ./src/base COPY src/logcollector ./src/logcollector diff --git a/docker/dockerfiles/Dockerfile.monitoring b/docker/dockerfiles/Dockerfile.monitoring new file mode 100644 index 0000000..cf181a7 --- /dev/null +++ b/docker/dockerfiles/Dockerfile.monitoring @@ -0,0 +1,16 @@ +FROM python:3.11-slim-bookworm + +ENV PYTHONDONTWRITEBYTECODE=1 + +WORKDIR /usr/src/app + +COPY requirements/requirements.monitoring.txt ./ +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.monitoring.txt + +COPY src/base ./src/base +COPY src/monitoring ./src/monitoring +COPY config.yaml . 
+ +RUN rm -rf /root/.cache + +CMD [ "python", "src/monitoring/monitoring_agent.py"] diff --git a/docker/dockerfiles/Dockerfile.prefilter b/docker/dockerfiles/Dockerfile.prefilter index bb4646c..c2b3ca4 100644 --- a/docker/dockerfiles/Dockerfile.prefilter +++ b/docker/dockerfiles/Dockerfile.prefilter @@ -5,7 +5,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 WORKDIR /usr/src/app COPY requirements/requirements.prefilter.txt ./ -RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.prefilter.txt +RUN pip --disable-pip-version-check install --no-cache-dir --no-compile -r requirements.prefilter.txt COPY src/base ./src/base COPY src/prefilter ./src/prefilter diff --git a/docker/query.dev.py b/docker/query.dev.py new file mode 100644 index 0000000..8c274c3 --- /dev/null +++ b/docker/query.dev.py @@ -0,0 +1,48 @@ +import os +import sys + +import clickhouse_connect + +sys.path.append(os.getcwd()) +from src.base.data_classes.clickhouse_connectors import TABLE_NAME_TO_TYPE + + +def get_tables(): + tables = {} + + for table_name in TABLE_NAME_TO_TYPE: + tables[table_name] = [] + + return tables + + +def query_once(client, tables): + for table_name in tables.keys(): + tables[table_name] = client.query(f"SELECT * FROM {table_name};") + + return tables + + +def reset_tables(client, tables): + for table_name in tables.keys(): + tables[table_name] = client.command(f"DROP TABLE {table_name};") + + +def main(): + client = clickhouse_connect.get_client(host="172.27.0.11", port=8123) + tables = get_tables() + + results = query_once(client, tables) + + for key in results: + print(f"'{key}':") + + if results[key].result_rows: + for row in results[key].result_rows: + print("\t", row) + else: + print("\t -") + + +if __name__ == "__main__": + main() diff --git a/requirements/requirements.detector.txt b/requirements/requirements.detector.txt index eeec3d7..d8bd3ea 100644 --- a/requirements/requirements.detector.txt +++ b/requirements/requirements.detector.txt @@ -5,3 +5,4 @@ colorlog~=6.8.2 PyYAML~=6.0.1 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.inspector.txt b/requirements/requirements.inspector.txt index 23137fd..b1c97b2 100644 --- a/requirements/requirements.inspector.txt +++ b/requirements/requirements.inspector.txt @@ -4,3 +4,4 @@ colorlog~=6.8.2 streamad~=0.3.1 numpy~=1.26.4 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.logcollector.txt b/requirements/requirements.logcollector.txt index 4e02cce..8e8937f 100644 --- a/requirements/requirements.logcollector.txt +++ b/requirements/requirements.logcollector.txt @@ -2,3 +2,4 @@ PyYAML~=6.0.1 colorlog~=6.8.2 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.logserver.txt b/requirements/requirements.logserver.txt index 3215e11..a241460 100644 --- a/requirements/requirements.logserver.txt +++ b/requirements/requirements.logserver.txt @@ -3,3 +3,4 @@ colorlog~=6.8.2 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 aiofiles~=24.1.0 +clickhouse_connect~=0.8.3 diff --git a/requirements/requirements.monitoring.txt b/requirements/requirements.monitoring.txt new file mode 100644 index 0000000..a2e9d98 --- /dev/null +++ b/requirements/requirements.monitoring.txt @@ -0,0 +1,5 @@ +clickhouse_connect~=0.8.3 +confluent-kafka~=2.4.0 +marshmallow_dataclass~=8.7.1 +colorlog~=6.8.2 +PyYAML~=6.0.1 diff --git a/requirements/requirements.prefilter.txt 
b/requirements/requirements.prefilter.txt index 4e02cce..8e8937f 100644 --- a/requirements/requirements.prefilter.txt +++ b/requirements/requirements.prefilter.txt @@ -2,3 +2,4 @@ PyYAML~=6.0.1 colorlog~=6.8.2 confluent-kafka~=2.4.0 marshmallow_dataclass~=8.7.1 +clickhouse_connect~=0.8.3 diff --git a/src/base/__init__.py b/src/base/__init__.py index d44faf7..e69de29 100644 --- a/src/base/__init__.py +++ b/src/base/__init__.py @@ -1,19 +0,0 @@ -from typing import List -from dataclasses import dataclass, field -import marshmallow.validate -import datetime - - -@dataclass -class Batch: - begin_timestamp: datetime.datetime = field( - metadata={ - "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") - } - ) - end_timestamp: datetime.datetime = field( - metadata={ - "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") - } - ) - data: List[dict] = field(default_factory=list) diff --git a/src/base/clickhouse_kafka_sender.py b/src/base/clickhouse_kafka_sender.py new file mode 100644 index 0000000..722ad29 --- /dev/null +++ b/src/base/clickhouse_kafka_sender.py @@ -0,0 +1,34 @@ +""" +The ClickHouseKafkaSender serves as the sender for all inserts into ClickHouse. Whenever a class wants to insert +into a ClickHouse table, the ClickHouseKafkaSender is used to send the respective insert via Kafka. +""" + +import os +import sys + +import marshmallow_dataclass + +sys.path.append(os.getcwd()) +from src.base.data_classes.clickhouse_connectors import TABLE_NAME_TO_TYPE +from src.base.kafka_handler import SimpleKafkaProduceHandler +from src.base.log_config import get_logger + +logger = get_logger() + + +class ClickHouseKafkaSender: + """Sends insert operations for the specified table via Kafka to the MonitoringAgent.""" + + def __init__(self, table_name: str): + self.table_name = table_name + self.kafka_producer = SimpleKafkaProduceHandler() + self.data_schema = marshmallow_dataclass.class_schema( + TABLE_NAME_TO_TYPE.get(table_name) + )() + + def insert(self, data: dict): + """Produces the insert operation to Kafka.""" + self.kafka_producer.produce( + topic=f"clickhouse_{self.table_name}", + data=self.data_schema.dumps(data), + ) diff --git a/src/base/data_classes/batch.py b/src/base/data_classes/batch.py new file mode 100644 index 0000000..3fce56d --- /dev/null +++ b/src/base/data_classes/batch.py @@ -0,0 +1,24 @@ +import datetime +import uuid +from dataclasses import dataclass, field +from typing import List + +import marshmallow.validate + + +@dataclass +class Batch: + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + begin_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") + } + ) + end_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%dT%H:%M:%S.%fZ") + } + ) + data: List = field(default_factory=list) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py new file mode 100644 index 0000000..957fd3a --- /dev/null +++ b/src/base/data_classes/clickhouse_connectors.py @@ -0,0 +1,186 @@ +import datetime +import uuid +from dataclasses import dataclass, field +from typing import Optional + +import marshmallow.validate + + +@dataclass +class ServerLogs: + message_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + timestamp_in: datetime.datetime = field( + metadata={ + "marshmallow_field": 
marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + message_text: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + + +@dataclass +class ServerLogsTimestamps: + message_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + event: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + event_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + + +@dataclass +class FailedDNSLoglines: + message_text: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + timestamp_in: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + timestamp_failed: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + reason_for_failure: Optional[str] = field( + metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} + ) + + +@dataclass +class LoglineToBatches: + logline_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + + +@dataclass +class DNSLoglines: + logline_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + subnet_id: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + status_code: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + record_type: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + additional_fields: Optional[str] = field( + metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} + ) + + +@dataclass +class LoglineTimestamps: + logline_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + is_active: bool = field( + metadata={"marshmallow_field": marshmallow.fields.Boolean()} + ) + + +@dataclass +class BatchTimestamps: + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + is_active: bool = field( + metadata={"marshmallow_field": marshmallow.fields.Boolean()} + ) + message_count: int = field( + metadata={"marshmallow_field": marshmallow.fields.Integer()} + ) + + +@dataclass +class SuspiciousBatchesToBatch: + suspicious_batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + + +@dataclass +class SuspiciousBatchTimestamps: + suspicious_batch_id: uuid.UUID = 
field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + is_active: bool = field( + metadata={"marshmallow_field": marshmallow.fields.Boolean()} + ) + message_count: int = field( + metadata={"marshmallow_field": marshmallow.fields.Integer()} + ) + + +@dataclass +class Alerts: + client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + suspicious_batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + alert_timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + overall_score: float = field( + metadata={"marshmallow_field": marshmallow.fields.Float()} + ) + result: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + + +TABLE_NAME_TO_TYPE = { + "server_logs": ServerLogs, + "server_logs_timestamps": ServerLogsTimestamps, + "failed_dns_loglines": FailedDNSLoglines, + "logline_to_batches": LoglineToBatches, + "dns_loglines": DNSLoglines, + "logline_timestamps": LoglineTimestamps, + "batch_timestamps": BatchTimestamps, + "suspicious_batches_to_batch": SuspiciousBatchesToBatch, + "suspicious_batch_timestamps": SuspiciousBatchTimestamps, + "alerts": Alerts, +} diff --git a/src/base/kafka_handler.py b/src/base/kafka_handler.py index 430eaf2..7fde819 100644 --- a/src/base/kafka_handler.py +++ b/src/base/kafka_handler.py @@ -20,7 +20,7 @@ ) sys.path.append(os.getcwd()) -from src.base import Batch +from src.base.data_classes.batch import Batch from src.base.log_config import get_logger from src.base.utils import kafka_delivery_report, setup_config @@ -323,7 +323,6 @@ class ExactlyOnceKafkaConsumeHandler(KafkaConsumeHandler): """ def __init__(self, topics: str | list[str]) -> None: - self.batch_schema = marshmallow_dataclass.class_schema(Batch)() super().__init__(topics) def consume(self) -> tuple[str | None, str | None, str | None]: @@ -396,7 +395,8 @@ def consume_as_object(self) -> tuple[None | str, Batch]: ast.literal_eval(item) for item in eval_data.get("data") ] - eval_data: Batch = self.batch_schema.load(eval_data) + batch_schema = marshmallow_dataclass.class_schema(Batch)() + eval_data: Batch = batch_schema.load(eval_data) if isinstance(eval_data, Batch): return key, eval_data diff --git a/src/base/logline_handler.py b/src/base/logline_handler.py index e3d97d5..c0199a5 100644 --- a/src/base/logline_handler.py +++ b/src/base/logline_handler.py @@ -8,6 +8,10 @@ CONFIG = setup_config() LOGLINE_FIELDS = CONFIG["pipeline"]["log_collection"]["collector"]["logline_format"] REQUIRED_FIELDS = ["timestamp", "status_code", "client_ip", "record_type"] +FORBIDDEN_FIELD_NAMES = [ + "logline_id", + "batch_id", +] # field names that are used internally class FieldType: @@ -139,6 +143,12 @@ def __init__(self): for field in LOGLINE_FIELDS: instance = self._create_instance_from_list_entry(field) + if instance.name in FORBIDDEN_FIELD_NAMES: + raise ValueError( + f"Forbidden field name included. 
These fields are used internally " + f"and cannot be used as names: {FORBIDDEN_FIELD_NAMES}" + ) + if self.instances_by_name.get(instance.name): raise ValueError("Multiple fields with same name") diff --git a/src/detector/detector.py b/src/detector/detector.py index cceb9cc..412817d 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -1,3 +1,4 @@ +import datetime import hashlib import json import os @@ -11,6 +12,7 @@ from numpy import median sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.utils import setup_config from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, @@ -47,6 +49,8 @@ class Detector: """ def __init__(self) -> None: + self.suspicious_batch_id = None + self.key = None self.messages = [] self.warnings = [] self.begin_timestamp = None @@ -55,14 +59,18 @@ def __init__(self) -> None: tempfile.gettempdir(), f"{MODEL}_{CHECKSUM}.pickle" ) - logger.debug(f"Initializing Detector...") self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) self.model = self._get_model() + # databases + self.suspicious_batch_timestamps = ClickHouseKafkaSender( + "suspicious_batch_timestamps" + ) + self.alerts = ClickHouseKafkaSender("alerts") + def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" - logger.debug("Getting and filling data...") if self.messages: logger.warning( "Detector is busy: Not consuming new messages. Wait for the Detector to finish the " @@ -70,17 +78,27 @@ def get_and_fill_data(self) -> None: ) return - logger.debug( - "Detector is not busy: Calling KafkaConsumeHandler to consume new JSON messages..." - ) key, data = self.kafka_consume_handler.consume_as_object() - if data: + if data.data: + self.suspicious_batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.messages = data.data self.key = key + self.suspicious_batch_timestamps.insert( + dict( + suspicious_batch_id=self.suspicious_batch_id, + client_ip=key, + stage=module_name, + status="in_process", + timestamp=datetime.datetime.now(), + is_active=True, + message_count=len(self.messages), + ) + ) + if not self.messages: logger.info( "Received message:\n" @@ -92,9 +110,6 @@ def get_and_fill_data(self) -> None: f" ⤷ Contains data field of {len(self.messages)} message(s). Belongs to subnet_id {key}." ) - logger.debug("Received consumer message as json data.") - logger.debug(f"(data={self.messages})") - def _sha256sum(self, file_path: str) -> str: """Return a SHA265 sum check to validate the model. @@ -116,7 +131,7 @@ def _sha256sum(self, file_path: str) -> str: return h.hexdigest() - def _get_model(self) -> None: + def _get_model(self): """ Downloads model from server. If model already exists, it returns the current model. In addition, it checks the sha256 sum in case a model has been updated. 
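For context on the monitoring inserts added above and below: the Detector never writes to ClickHouse directly. It goes through the ClickHouseKafkaSender introduced in src/base/clickhouse_kafka_sender.py, which serializes the dict with the table's marshmallow schema and produces it to a per-table Kafka topic. A minimal, illustrative sketch; the values are placeholders, not taken from this patch:

import datetime
import uuid

from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender

sender = ClickHouseKafkaSender("suspicious_batch_timestamps")
sender.insert(
    dict(
        suspicious_batch_id=uuid.uuid4(),
        client_ip="192.0.2.1",            # placeholder value
        stage="detector",                 # placeholder stage name
        status="in_process",
        timestamp=datetime.datetime.now(),
        is_active=True,
        message_count=0,
    )
)
# Internally this dumps the dict with the SuspiciousBatchTimestamps schema and
# produces it to the topic "clickhouse_suspicious_batch_timestamps", where the
# MonitoringAgent consumes it and batches the actual ClickHouse insert.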
@@ -282,19 +297,42 @@ def detect(self) -> None: # pragma: no cover self.warnings.append(warning) def send_warning(self) -> None: - logger.info("Store alert to file.") + logger.info("Store alert.") if len(self.warnings) > 0: overall_score = median( [warning["probability"] for warning in self.warnings] ) alert = {"overall_score": overall_score, "result": self.warnings} + logger.info(f"Add alert: {alert}") with open(os.path.join(tempfile.gettempdir(), "warnings.json"), "a+") as f: json.dump(alert, f) f.write("\n") + + self.alerts.insert( + dict( + client_ip=self.key, + alert_timestamp=datetime.datetime.now(), + suspicious_batch_id=self.suspicious_batch_id, + overall_score=overall_score, + result=json.dumps(self.warnings), + ) + ) else: logger.info("No warning produced.") + self.suspicious_batch_timestamps.insert( + dict( + suspicious_batch_id=self.suspicious_batch_id, + client_ip=self.key, + stage=module_name, + status="finished", + timestamp=datetime.datetime.now(), + is_active=False, + message_count=len(self.messages), + ) + ) + def main(one_iteration: bool = False): # pragma: no cover """ diff --git a/src/inspector/inspector.py b/src/inspector/inspector.py index 26f2ff6..334718f 100644 --- a/src/inspector/inspector.py +++ b/src/inspector/inspector.py @@ -1,14 +1,17 @@ import importlib -import json import os import sys +import uuid from datetime import datetime from enum import Enum, unique +import marshmallow_dataclass import numpy as np from streamad.util import StreamGenerator, CustomDS sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender +from src.base.data_classes.batch import Batch from src.base.utils import setup_config from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, @@ -76,21 +79,30 @@ class Inspector: """Finds anomalies in a batch of requests and produces it to the ``Detector``.""" def __init__(self) -> None: + self.batch_id = None + self.X = None self.key = None self.begin_timestamp = None self.end_timestamp = None + self.messages = [] self.anomalies = [] - logger.debug(f"Initializing Inspector...") - self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) transactional_id = generate_unique_transactional_id(module_name, KAFKA_BROKERS) + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) - logger.debug(f"Initialized Inspector.") + + # databases + self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + self.suspicious_batch_timestamps = ClickHouseKafkaSender( + "suspicious_batch_timestamps" + ) + self.suspicious_batches_to_batch = ClickHouseKafkaSender( + "suspicious_batches_to_batch" + ) def get_and_fill_data(self) -> None: """Consumes data from KafkaConsumeHandler and stores it for processing.""" - logger.debug("Getting and filling data...") if self.messages: logger.warning( "Inspector is busy: Not consuming new messages. Wait for the Inspector to finish the " @@ -98,17 +110,26 @@ def get_and_fill_data(self) -> None: ) return - logger.debug( - "Inspector is not busy: Calling KafkaConsumeHandler to consume new JSON messages..." 
- ) key, data = self.kafka_consume_handler.consume_as_object() if data: + self.batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.messages = data.data self.key = key + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="in_process", + timestamp=datetime.now(), + is_active=True, + message_count=len(self.messages), + ) + ) + if not self.messages: logger.info( "Received message:\n" @@ -120,9 +141,6 @@ def get_and_fill_data(self) -> None: f" ⤷ Contains data field of {len(self.messages)} message(s). Belongs to subnet_id {key}." ) - logger.debug("Received consumer message as json data.") - logger.debug(f"(data={self.messages})") - def clear_data(self): """Clears the data in the internal data structures.""" self.messages = [] @@ -403,7 +421,7 @@ def _inspect_univariate(self, model: str): for x in stream.iter_item(): score = self.model.fit_score(x) - if score != None: + if score is not None: self.anomalies.append(score) else: self.anomalies.append(0) @@ -412,30 +430,68 @@ def send_data(self): total_anomalies = np.count_nonzero( np.greater_equal(np.array(self.anomalies), SCORE_THRESHOLD) ) - if total_anomalies / len(self.X) > ANOMALY_THRESHOLD: - logger.debug("Sending data to KafkaProduceHandler...") - logger.info("Sending anomalies to detector for further analysation.") + if total_anomalies / len(self.X) > ANOMALY_THRESHOLD: # subnet is suspicious + logger.info("Sending anomalies to detector for further analysis.") buckets = {} + for message in self.messages: if message["client_ip"] in buckets.keys(): buckets[message["client_ip"]].append(message) else: buckets[message["client_ip"]] = [] buckets.get(message["client_ip"]).append(message) + for key, value in buckets.items(): logger.info(f"Sending anomalies to detector for {key}.") logger.info(f"Sending anomalies to detector for {value}.") + + suspicious_batch_id = uuid.uuid4() # generate new suspicious_batch_id + + self.suspicious_batches_to_batch.insert( + dict( + suspicious_batch_id=suspicious_batch_id, + batch_id=self.batch_id, + ) + ) + data_to_send = { - "begin_timestamp": self.begin_timestamp.strftime(TIMESTAMP_FORMAT), - "end_timestamp": self.end_timestamp.strftime(TIMESTAMP_FORMAT), + "batch_id": suspicious_batch_id, + "begin_timestamp": self.begin_timestamp, + "end_timestamp": self.end_timestamp, "data": value, } - self.kafka_produce_handler.send( - topic="Detector", - data=json.dumps(data_to_send), + + batch_schema = marshmallow_dataclass.class_schema(Batch)() + + self.kafka_produce_handler.produce( + topic=PRODUCE_TOPIC, + data=batch_schema.dumps(data_to_send), key=key, ) + self.suspicious_batch_timestamps.insert( + dict( + suspicious_batch_id=suspicious_batch_id, + client_ip=key, + stage=module_name, + status="finished", + timestamp=datetime.now(), + is_active=True, + message_count=len(value), + ) + ) + else: # subnet is not suspicious + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="filtered_out", + timestamp=datetime.now(), + is_active=False, + message_count=len(self.messages), + ) + ) + def main(one_iteration: bool = False): """ diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 7509d79..34d9e5f 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -1,16 +1,21 @@ +import datetime import json import os import sys -from datetime import datetime +import uuid from threading import Timer -from src.base.kafka_handler 
import ExactlyOnceKafkaProduceHandler -from src.base.utils import setup_config +import marshmallow_dataclass sys.path.append(os.getcwd()) +from src.base.data_classes.batch import Batch +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender +from src.base.kafka_handler import ExactlyOnceKafkaProduceHandler +from src.base.utils import setup_config, generate_unique_transactional_id from src.base.log_config import get_logger -logger = get_logger("log_collection.batch_handler") +module_name = "log_collection.batch_handler" +logger = get_logger(module_name) config = setup_config() BATCH_SIZE = config["pipeline"]["log_collection"]["batch_handler"]["batch_size"] @@ -18,6 +23,12 @@ PRODUCE_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ "batch_sender_to_prefilter" ] +KAFKA_BROKERS = ",".join( + [ + f"{broker['hostname']}:{broker['port']}" + for broker in config["environment"]["kafka_brokers"] + ] +) class BufferedBatch: @@ -29,22 +40,67 @@ class BufferedBatch: def __init__(self): self.batch = {} # Batch for the latest messages coming in self.buffer = {} # Former batch with previous messages + self.batch_id = {} # Batch ID per key - def add_message(self, key: str, message: str) -> None: + # databases + self.logline_to_batches = ClickHouseKafkaSender("logline_to_batches") + self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + + def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: """ Adds a given message to the messages list of the given key. If the key already exists, the message is simply added, otherwise, the key is created. Args: + logline_id (uuid.UUID): Logline ID of the added message key (str): Key to which the message is added message (str): Message to be added """ if key in self.batch: # key already has messages associated self.batch[key].append(message) - logger.debug(f"Message '{message}' added to {key}'s batch.") + + batch_id = self.batch_id.get(key) + self.logline_to_batches.insert( + dict( + logline_id=logline_id, + batch_id=batch_id, + ) + ) + + self.batch_timestamps.insert( + dict( + batch_id=batch_id, + stage=module_name, + status="waiting", + timestamp=datetime.datetime.now(), + is_active=True, + message_count=self.get_number_of_messages(key), + ) + ) + else: # key has no messages associated yet + # create new batch self.batch[key] = [message] - logger.debug(f"Message '{message}' added to newly created {key}'s batch.") + new_batch_id = uuid.uuid4() + self.batch_id[key] = new_batch_id + + self.logline_to_batches.insert( + dict( + logline_id=logline_id, + batch_id=new_batch_id, + ) + ) + + self.batch_timestamps.insert( + dict( + batch_id=new_batch_id, + stage=module_name, + status="waiting", + timestamp=datetime.datetime.now(), + is_active=True, + message_count=1, + ) + ) def get_number_of_messages(self, key: str) -> int: """ @@ -93,7 +149,7 @@ def sort_messages( List of log lines as strings sorted by timestamps (ascending) """ sorted_data = sorted( - data, key=lambda x: datetime.strptime(x[0], timestamp_format) + data, key=lambda x: datetime.datetime.strptime(x[0], timestamp_format) ) loglines = [message for _, message in sorted_data] @@ -198,7 +254,8 @@ def complete_batch(self, key: str) -> dict: key (str): Key for which to complete the current batch and return data packet Returns: - Dictionary of begin_timestamp, end_timestamp and messages (including buffered data) associated with a key + Set of new Logline IDs and dictionary of begin_timestamp, end_timestamp and messages (including buffered + data) associated with a 
key Raises: ValueError: No data is available for sending. @@ -216,16 +273,39 @@ def complete_batch(self, key: str) -> dict: buffer_data = self.buffer[key] begin_timestamp = self.get_first_timestamp_of_buffer(key) + batch_id = self.batch_id.get(key) + data = { - "begin_timestamp": begin_timestamp, - "end_timestamp": self.get_last_timestamp_of_batch(key), + "batch_id": batch_id, + "begin_timestamp": datetime.datetime.strptime( + begin_timestamp, + "%Y-%m-%dT%H:%M:%S.%fZ", + ), + "end_timestamp": datetime.datetime.strptime( + self.get_last_timestamp_of_batch(key), + "%Y-%m-%dT%H:%M:%S.%fZ", + ), "data": buffer_data + self.batch[key], } + self.batch_timestamps.insert( + dict( + batch_id=batch_id, + stage=module_name, + status="completed", + timestamp=datetime.datetime.now(), + is_active=True, + message_count=self.get_number_of_messages(key), + ) + ) + # Move data from batch to buffer self.buffer[key] = self.batch[key] del self.batch[key] + # Batch ID is not needed anymore + del self.batch_id[key] + return data if self.buffer: # Variant 3: Only buffer has entries @@ -267,11 +347,11 @@ def __init__(self): self.batch = BufferedBatch() self.timer = None - logger.debug(f"Calling KafkaProduceHandler(transactional_id='collector')...") - self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler( - transactional_id="collector" - ) - logger.debug(f"Initialized KafkaBatchSender.") + transactional_id = generate_unique_transactional_id(module_name, KAFKA_BROKERS) + self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) + + # databases + self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") def __del__(self): logger.debug(f"Closing KafkaBatchSender ({self.topic=})...") @@ -295,7 +375,29 @@ def add_message(self, key: str, message: str) -> None: """ logger.debug(f"Adding message '{message}' to batch.") - self.batch.add_message(key, message) + logline_id = json.loads(message).get("logline_id") + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="in_process", + timestamp=datetime.datetime.now(), + is_active=True, + ) + ) + + self.batch.add_message(key, logline_id, message) + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="batched", + timestamp=datetime.datetime.now(), + is_active=True, + ) + ) logger.debug(f"Batch: {self.batch.batch}") number_of_messages_for_key = self.batch.get_number_of_messages(key) @@ -310,7 +412,6 @@ def add_message(self, key: str, message: str) -> None: f" ⤷ {number_of_messages_for_key} messages sent." ) elif not self.timer: # First time setting the timer - logger.debug("Timer not set yet. 
Calling _reset_timer()...") self._reset_timer() logger.debug(f"Message '{message}' successfully added to batch for {key=}.") @@ -352,25 +453,22 @@ def _send_all_batches(self, reset_timer: bool = True) -> None: ) def _send_batch_for_key(self, key: str) -> None: - logger.debug(f"Starting to send the batch for {key=}...") - try: - data_packet = self.batch.complete_batch(key) + data = self.batch.complete_batch(key) except ValueError as e: logger.debug(e) return - self._send_data_packet(key, data_packet) + self._send_data_packet(key, data) def _send_data_packet(self, key: str, data: dict) -> None: - logger.debug("Sending data to KafkaProduceHandler...") - logger.debug(f"{data=}") + batch_schema = marshmallow_dataclass.class_schema(Batch)() + self.kafka_produce_handler.produce( topic=self.topic, - data=json.dumps(data), + data=batch_schema.dumps(data), key=key, ) - logger.debug(f"{data=}") def _reset_timer(self) -> None: """ diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index 5af98b7..3ceebd5 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -1,17 +1,21 @@ import asyncio +import datetime import ipaddress import json import os import sys +import uuid sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.kafka_handler import ExactlyOnceKafkaConsumeHandler from src.base.logline_handler import LoglineHandler from src.base import utils from src.logcollector.batch_handler import BufferedBatchSender from src.base.log_config import get_logger -logger = get_logger("log_collection.collector") +module_name = "log_collection.collector" +logger = get_logger(module_name) config = utils.setup_config() IPV4_PREFIX_LENGTH = config["pipeline"]["log_collection"]["batch_handler"]["subnet_id"][ @@ -20,6 +24,8 @@ IPV6_PREFIX_LENGTH = config["pipeline"]["log_collection"]["batch_handler"]["subnet_id"][ "ipv6_prefix_length" ] +TIMESTAMP_FORMAT = config["environment"]["timestamp_format"] +REQUIRED_FIELDS = ["timestamp", "status_code", "client_ip", "record_type"] BATCH_SIZE = config["pipeline"]["log_collection"]["batch_handler"]["batch_size"] CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ "logserver_to_collector" @@ -38,6 +44,11 @@ def __init__(self) -> None: self.logline_handler = LoglineHandler() self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) + # databases + self.failed_dns_loglines = ClickHouseKafkaSender("failed_dns_loglines") + self.dns_loglines = ClickHouseKafkaSender("dns_loglines") + self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") + async def start(self) -> None: """ Starts fetching messages from Kafka and sending them to the :class:`Prefilter`. 
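The send() changes in the next hunk split each validated logline into the four required DNS columns and a JSON blob of everything else before inserting into dns_loglines. A small illustrative sketch of that split; the "domain_name" field is a hypothetical extra field, not defined by this patch:

import json

REQUIRED_FIELDS = ["timestamp", "status_code", "client_ip", "record_type"]

fields = {
    "timestamp": "2024-01-01T00:00:00.000000Z",
    "status_code": "NOERROR",
    "client_ip": "192.0.2.1",
    "record_type": "A",
    "domain_name": "example.com",  # hypothetical extra field
}

# everything that is not a required field ends up in additional_fields
additional_fields = fields.copy()
for field in REQUIRED_FIELDS:
    additional_fields.pop(field)

print(json.dumps(additional_fields))  # {"domain_name": "example.com"}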
@@ -74,7 +85,7 @@ async def fetch(self) -> None: ) logger.debug(f"From Kafka: '{value}'") - await self.store(value) + await self.store(datetime.datetime.now(), value) async def send(self) -> None: """ @@ -85,19 +96,73 @@ async def send(self) -> None: try: while True: if not self.loglines.empty(): - logline = await self.loglines.get() + timestamp_in, logline = await self.loglines.get() + try: fields = self.logline_handler.validate_logline_and_get_fields_as_json( logline ) except ValueError: + self.failed_dns_loglines.insert( + dict( + message_text=logline, + timestamp_in=timestamp_in, + timestamp_failed=datetime.datetime.now(), + reason_for_failure=None, # TODO: Add actual reason + ) + ) continue subnet_id = self.get_subnet_id( ipaddress.ip_address(fields.get("client_ip")) ) - self.batch_handler.add_message(subnet_id, json.dumps(fields)) + additional_fields = fields.copy() + for field in REQUIRED_FIELDS: + additional_fields.pop(field) + + logline_id = uuid.uuid4() + + self.dns_loglines.insert( + dict( + logline_id=logline_id, + subnet_id=subnet_id, + timestamp=datetime.datetime.strptime( + fields.get("timestamp"), TIMESTAMP_FORMAT + ), + status_code=fields.get("status_code"), + client_ip=fields.get("client_ip"), + record_type=fields.get("record_type"), + additional_fields=json.dumps(additional_fields), + ) + ) + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="in_process", + timestamp=timestamp_in, + is_active=True, + ) + ) + + message_fields = fields.copy() + message_fields["logline_id"] = str(logline_id) + + self.batch_handler.add_message( + subnet_id, json.dumps(message_fields) + ) + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="finished", + timestamp=datetime.datetime.now(), + is_active=True, + ) + ) logger.debug(f"Sent: '{logline}'") else: await asyncio.sleep(0.1) @@ -115,14 +180,15 @@ async def send(self) -> None: logger.info("Stopped LogCollector.") - async def store(self, message: str): + async def store(self, timestamp_in: datetime.datetime, message: str): """ Stores the given message temporarily. Args: + timestamp_in (datetime.datetime): Timestamp of entering the pipeline message (str): Message to be stored """ - await self.loglines.put(message) + await self.loglines.put((timestamp_in, message)) @staticmethod def get_subnet_id(address: ipaddress.IPv4Address | ipaddress.IPv6Address) -> str: diff --git a/src/logserver/server.py b/src/logserver/server.py index 59fc9e7..19d9ce9 100644 --- a/src/logserver/server.py +++ b/src/logserver/server.py @@ -1,6 +1,8 @@ import asyncio +import datetime import os import sys +import uuid import aiofiles @@ -9,6 +11,7 @@ SimpleKafkaConsumeHandler, ExactlyOnceKafkaProduceHandler, ) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.utils import generate_unique_transactional_id from src.base.utils import setup_config from src.base.log_config import get_logger @@ -42,6 +45,10 @@ def __init__(self) -> None: self.kafka_consume_handler = SimpleKafkaConsumeHandler(CONSUME_TOPIC) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) + # databases + self.server_logs = ClickHouseKafkaSender("server_logs") + self.server_logs_timestamps = ClickHouseKafkaSender("server_logs_timestamps") + async def start(self) -> None: """ Starts fetching messages from Kafka and from the input file. 
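The following hunks give every received log line a message_id that is written to server_logs on arrival and to server_logs_timestamps (event "timestamp_out") when the line is produced to Kafka. Once both tables are populated, the per-message dwell time can be read back with clickhouse_connect; an illustrative sketch, assuming the ClickHouse host from config.yaml (172.27.0.11):

import clickhouse_connect

client = clickhouse_connect.get_client(host="172.27.0.11", port=8123)
result = client.query(
    "SELECT l.message_id, l.timestamp_in, t.event_timestamp "
    "FROM server_logs AS l "
    "JOIN server_logs_timestamps AS t ON l.message_id = t.message_id "
    "WHERE t.event = 'timestamp_out'"
)
for message_id, timestamp_in, timestamp_out in result.result_rows:
    # DateTime64 columns come back as Python datetimes, so plain subtraction works
    print(message_id, (timestamp_out - timestamp_in).total_seconds())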
@@ -68,16 +75,25 @@ async def start(self) -> None: logger.info("LogServer stopped.") - def send(self, message: str) -> None: + def send(self, message_id: uuid.UUID, message: str) -> None: """ Sends a received message using Kafka. Args: + message_id (uuid.UUID): UUID of the message message (str): Message to be sent """ self.kafka_produce_handler.produce(topic=PRODUCE_TOPIC, data=message) logger.debug(f"Sent: '{message}'") + self.server_logs_timestamps.insert( + dict( + message_id=message_id, + event="timestamp_out", + event_timestamp=datetime.datetime.now(), + ) + ) + async def fetch_from_kafka(self) -> None: """ Starts a loop to continuously listen on the configured Kafka topic. If a message is consumed, it is sent. @@ -90,7 +106,16 @@ async def fetch_from_kafka(self) -> None: ) logger.debug(f"From Kafka: '{value}'") - self.send(value) + message_id = uuid.uuid4() + self.server_logs.insert( + dict( + message_id=message_id, + timestamp_in=datetime.datetime.now(), + message_text=value, + ) + ) + + self.send(message_id, value) async def fetch_from_file(self, file: str = READ_FROM_FILE) -> None: """ @@ -117,7 +142,17 @@ async def fetch_from_file(self, file: str = READ_FROM_FILE) -> None: continue logger.debug(f"From file: '{cleaned_line}'") - self.send(cleaned_line) + + message_id = uuid.uuid4() + self.server_logs.insert( + dict( + message_id=message_id, + timestamp_in=datetime.datetime.now(), + message_text=cleaned_line, + ) + ) + + self.send(message_id, cleaned_line) def main() -> None: diff --git a/tests/__init__.py b/src/monitoring/__init__.py similarity index 100% rename from tests/__init__.py rename to src/monitoring/__init__.py diff --git a/src/monitoring/clickhouse_batch_sender.py b/src/monitoring/clickhouse_batch_sender.py new file mode 100644 index 0000000..fb87795 --- /dev/null +++ b/src/monitoring/clickhouse_batch_sender.py @@ -0,0 +1,82 @@ +import os +import sys +from threading import Timer + +import clickhouse_connect + +sys.path.append(os.getcwd()) +from src.base.log_config import get_logger +from src.base.utils import setup_config + +logger = get_logger() + +CONFIG = setup_config() +CLICKHOUSE_HOSTNAME = CONFIG["environment"]["monitoring"]["clickhouse_server"][ + "hostname" +] +BATCH_SIZE = CONFIG["pipeline"]["monitoring"]["clickhouse_connector"]["batch_size"] +BATCH_TIMEOUT = CONFIG["pipeline"]["monitoring"]["clickhouse_connector"][ + "batch_timeout" +] + + +class ClickHouseBatchSender: + def __init__(self, table_name: str, column_names: list[str]): + self.table_name = table_name + self.column_names = column_names + + self.max_batch_size = BATCH_SIZE + self.batch_timeout = BATCH_TIMEOUT + + self.timer = None + self.batch = [] + self._client = clickhouse_connect.get_client( + host=CLICKHOUSE_HOSTNAME, + ) + + def __del__(self): + self.insert_all() + + def add(self, data: list[str] | list[list[str]]): + def _add_element(element): + if len(element) != len(self.column_names): + raise ValueError( + "Number of elements in the insert does not match the number of columns" + ) + + self.batch.append(element) + + if any(isinstance(e, list) for e in data): + for e in data: + _add_element(e) + else: + _add_element(data) + + if len(self.batch) >= self.max_batch_size: + self.insert_all() + elif not self.timer: + self._start_timer() + + def insert_all(self): + if self.batch: + self._client.insert( + self.table_name, + self.batch, + column_names=self.column_names, + ) + logger.debug( + f"Inserted {self.table_name=},{self.batch=},{self.column_names=}" + ) + self.batch = [] + + if self.timer: + 
self.timer.cancel() + + self.timer = None + + def _start_timer(self): + if self.timer: + self.timer.cancel() + + self.timer = Timer(BATCH_TIMEOUT, self.insert_all) + self.timer.start() diff --git a/src/monitoring/clickhouse_connector.py b/src/monitoring/clickhouse_connector.py new file mode 100644 index 0000000..efe652a --- /dev/null +++ b/src/monitoring/clickhouse_connector.py @@ -0,0 +1,325 @@ +import datetime +import os +import sys +import uuid +from abc import abstractmethod +from typing import Optional + +import clickhouse_connect + +sys.path.append(os.getcwd()) +from src.monitoring.clickhouse_batch_sender import ClickHouseBatchSender +from src.base.log_config import get_logger +from src.base.utils import setup_config + +logger = get_logger() + +CONFIG = setup_config() +CLICKHOUSE_HOSTNAME = CONFIG["environment"]["monitoring"]["clickhouse_server"][ + "hostname" +] +CREATE_TABLES_DIRECTORY = "src/monitoring/create_tables" # TODO: Get from config + + +class ClickHouseConnector: + def __init__(self, table_name: str, column_names: list[str]): + self._table_name = table_name + self._column_names = column_names + + self._batch_sender = ClickHouseBatchSender( + table_name=self._table_name, + column_names=self._column_names, + ) + + def prepare_table(self): + def _load_contents(file_name: str) -> str: + with open(file_name, "r") as file: + return file.read() + + filename = self._table_name + ".sql" + file_path = os.path.join(CREATE_TABLES_DIRECTORY, filename) + sql_content = _load_contents(file_path) + + with clickhouse_connect.get_client(host=CLICKHOUSE_HOSTNAME) as client: + try: + client.command(sql_content) + except Exception as e: + logger.critical("Error in CREATE TABLE statement") + raise e + + def _add_to_batch(self, data): + self._batch_sender.add(data) + + @abstractmethod + def insert(self, *args, **kwargs): + pass + + +class ServerLogsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "message_id", + "timestamp_in", + "message_text", + ] + + super().__init__("server_logs", column_names) + + def insert( + self, + message_id: uuid.UUID, + timestamp_in: datetime.datetime, + message_text: str, + ): + self._add_to_batch([message_id, timestamp_in, message_text]) + + +class ServerLogsTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "message_id", + "event", + "event_timestamp", + ] + + super().__init__("server_logs_timestamps", column_names) + + def insert( + self, + message_id: uuid.UUID, + event: str, + event_timestamp: datetime.datetime, + ): + self._add_to_batch([message_id, event, event_timestamp]) + + +class FailedDNSLoglinesConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "message_text", + "timestamp_in", + "timestamp_failed", + "reason_for_failure", + ] + + super().__init__("failed_dns_loglines", column_names) + + def insert( + self, + message_text: str, + timestamp_in: datetime.datetime, + timestamp_failed: datetime.datetime, + reason_for_failure: Optional[str] = None, + ) -> None: + self._add_to_batch( + [message_text, timestamp_in, timestamp_failed, reason_for_failure] + ) + + +class LoglineToBatchesConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "logline_id", + "batch_id", + ] + + super().__init__("logline_to_batches", column_names) + + def insert( + self, + logline_id: uuid.UUID, + batch_id: uuid.UUID, + ): + self._add_to_batch([logline_id, batch_id]) + + +class DNSLoglinesConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "logline_id", + 
"subnet_id", + "timestamp", + "status_code", + "client_ip", + "record_type", + "additional_fields", + ] + + super().__init__("dns_loglines", column_names) + + def insert( + self, + logline_id: str | uuid.UUID, + subnet_id: str, + timestamp: str | datetime.datetime, + status_code: str, + client_ip: str, + record_type: str, + additional_fields: Optional[str] = None, + ): + self._add_to_batch( + [ + logline_id, + subnet_id, + timestamp, + status_code, + client_ip, + record_type, + additional_fields, + ] + ) + + +class LoglineTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "logline_id", + "stage", + "status", + "timestamp", + "is_active", + ] + + super().__init__("logline_timestamps", column_names) + + def insert( + self, + logline_id: uuid.UUID, + stage: str, + status: str, + timestamp: datetime.datetime, + is_active: bool, + ) -> None: + self._add_to_batch( + [ + logline_id, + stage, + status, + timestamp, + is_active, + ] + ) + + +class BatchTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "batch_id", + "stage", + "status", + "timestamp", + "is_active", + "message_count", + ] + + super().__init__("batch_timestamps", column_names) + + def insert( + self, + batch_id: uuid.UUID, + stage: str, + status: str, + is_active: bool, + message_count: int, + timestamp: datetime.datetime, + ) -> None: + self._add_to_batch( + [ + batch_id, + stage, + status, + timestamp, + is_active, + message_count, + ] + ) + + +class SuspiciousBatchesToBatchConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "suspicious_batch_id", + "batch_id", + ] + + super().__init__("suspicious_batches_to_batch", column_names) + + def insert( + self, + suspicious_batch_id: uuid.UUID, + batch_id: uuid.UUID, + ) -> None: + self._add_to_batch( + [ + suspicious_batch_id, + batch_id, + ] + ) + + +class SuspiciousBatchTimestampsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "suspicious_batch_id", + "client_ip", + "stage", + "status", + "timestamp", + "is_active", + "message_count", + ] + + super().__init__("suspicious_batch_timestamps", column_names) + + def insert( + self, + suspicious_batch_id: uuid.UUID, + client_ip: str, + stage: str, + status: str, + is_active: bool, + message_count: int, + timestamp: datetime.datetime, + ) -> None: + self._add_to_batch( + [ + suspicious_batch_id, + client_ip, + stage, + status, + timestamp, + is_active, + message_count, + ] + ) + + +class AlertsConnector(ClickHouseConnector): + def __init__(self): + column_names = [ + "client_ip", + "alert_timestamp", + "suspicious_batch_id", + "overall_score", + "result", + ] + + super().__init__("alerts", column_names) + + def insert( + self, + client_ip: str, + alert_timestamp: datetime.datetime, + suspicious_batch_id: uuid.UUID, + overall_score: float, + result: str, + ) -> None: + self._add_to_batch( + [ + client_ip, + alert_timestamp, + suspicious_batch_id, + overall_score, + result, + ] + ) diff --git a/src/monitoring/create_tables/alerts.sql b/src/monitoring/create_tables/alerts.sql new file mode 100644 index 0000000..92e4ad1 --- /dev/null +++ b/src/monitoring/create_tables/alerts.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS alerts ( + client_ip String NOT NULL, + alert_timestamp DateTime64(6) NOT NULL, + suspicious_batch_id UUID NOT NULL, + overall_score Float32 NOT NULL, + result String, +) +ENGINE = MergeTree +PRIMARY KEY(client_ip, alert_timestamp); diff --git a/src/monitoring/create_tables/batch_timestamps.sql 
b/src/monitoring/create_tables/batch_timestamps.sql new file mode 100644 index 0000000..45ef849 --- /dev/null +++ b/src/monitoring/create_tables/batch_timestamps.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS batch_timestamps ( + batch_id UUID NOT NULL, + stage String NOT NULL, + status String NOT NULL, + timestamp DateTime64(6) NOT NULL, + message_count UInt32, + is_active Bool NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (batch_id); diff --git a/src/monitoring/create_tables/dns_loglines.sql b/src/monitoring/create_tables/dns_loglines.sql new file mode 100644 index 0000000..c3468f7 --- /dev/null +++ b/src/monitoring/create_tables/dns_loglines.sql @@ -0,0 +1,11 @@ +CREATE TABLE IF NOT EXISTS dns_loglines ( + logline_id UUID NOT NULL, + subnet_id String NOT NULL, + timestamp DateTime64(6) NOT NULL, + status_code String NOT NULL, + client_ip String NOT NULL, + record_type String NOT NULL, + additional_fields Nullable(String) +) +ENGINE = MergeTree +PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/failed_dns_loglines.sql b/src/monitoring/create_tables/failed_dns_loglines.sql new file mode 100644 index 0000000..846f6cd --- /dev/null +++ b/src/monitoring/create_tables/failed_dns_loglines.sql @@ -0,0 +1,8 @@ +CREATE TABLE IF NOT EXISTS failed_dns_loglines ( + message_text String NOT NULL, + timestamp_in DateTime64(6) NOT NULL, + timestamp_failed DateTime64(6) NOT NULL, + reason_for_failure Nullable(String) +) +ENGINE = MergeTree +PRIMARY KEY(message_text, timestamp_in); diff --git a/src/monitoring/create_tables/logline_timestamps.sql b/src/monitoring/create_tables/logline_timestamps.sql new file mode 100644 index 0000000..d7f25ca --- /dev/null +++ b/src/monitoring/create_tables/logline_timestamps.sql @@ -0,0 +1,9 @@ +CREATE TABLE IF NOT EXISTS logline_timestamps ( + logline_id UUID NOT NULL, + stage String NOT NULL, + status String NOT NULL, + timestamp DateTime64(6) NOT NULL, + is_active Bool NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/logline_to_batches.sql b/src/monitoring/create_tables/logline_to_batches.sql new file mode 100644 index 0000000..41d4348 --- /dev/null +++ b/src/monitoring/create_tables/logline_to_batches.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS logline_to_batches ( + logline_id UUID NOT NULL, + batch_id UUID NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (logline_id); diff --git a/src/monitoring/create_tables/server_logs.sql b/src/monitoring/create_tables/server_logs.sql new file mode 100644 index 0000000..b191d83 --- /dev/null +++ b/src/monitoring/create_tables/server_logs.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS server_logs ( + message_id UUID NOT NULL, + timestamp_in DateTime64(6) NOT NULL, + message_text String NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY(message_id); diff --git a/src/monitoring/create_tables/server_logs_timestamps.sql b/src/monitoring/create_tables/server_logs_timestamps.sql new file mode 100644 index 0000000..7a6c58c --- /dev/null +++ b/src/monitoring/create_tables/server_logs_timestamps.sql @@ -0,0 +1,7 @@ +CREATE TABLE IF NOT EXISTS server_logs_timestamps ( + message_id UUID NOT NULL, + event String NOT NULL, + event_timestamp DateTime64(6) NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY(message_id); diff --git a/src/monitoring/create_tables/suspicious_batch_timestamps.sql b/src/monitoring/create_tables/suspicious_batch_timestamps.sql new file mode 100644 index 0000000..8c02e82 --- /dev/null +++ b/src/monitoring/create_tables/suspicious_batch_timestamps.sql @@ -0,0 
+1,11 @@ +CREATE TABLE IF NOT EXISTS suspicious_batch_timestamps ( + suspicious_batch_id UUID NOT NULL, + client_ip String NOT NULL, + stage String NOT NULL, + status String NOT NULL, + timestamp DateTime64(6) NOT NULL, + message_count UInt32, + is_active Bool NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (suspicious_batch_id); diff --git a/src/monitoring/create_tables/suspicious_batches_to_batch.sql b/src/monitoring/create_tables/suspicious_batches_to_batch.sql new file mode 100644 index 0000000..d587a22 --- /dev/null +++ b/src/monitoring/create_tables/suspicious_batches_to_batch.sql @@ -0,0 +1,6 @@ +CREATE TABLE IF NOT EXISTS suspicious_batches_to_batch ( + suspicious_batch_id UUID NOT NULL, + batch_id UUID NOT NULL +) +ENGINE = MergeTree +PRIMARY KEY (suspicious_batch_id); diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py new file mode 100644 index 0000000..030dca2 --- /dev/null +++ b/src/monitoring/monitoring_agent.py @@ -0,0 +1,87 @@ +import asyncio +import os +import sys +from dataclasses import asdict + +import marshmallow_dataclass + +sys.path.append(os.getcwd()) +from src.monitoring.clickhouse_connector import * +from src.base.kafka_handler import SimpleKafkaConsumeHandler +from src.base.data_classes.clickhouse_connectors import TABLE_NAME_TO_TYPE +from src.base.log_config import get_logger +from src.base.utils import setup_config + +logger = get_logger() + +CONFIG = setup_config() +CREATE_TABLES_DIRECTORY = "src/monitoring/create_tables" # TODO: Get from config + + +def prepare_all_tables(): + def _load_contents(file_name: str) -> str: + with open(file_name, "r") as file: + return file.read() + + for filename in os.listdir(CREATE_TABLES_DIRECTORY): + if filename.endswith(".sql"): + file_path = os.path.join(CREATE_TABLES_DIRECTORY, filename) + sql_content = _load_contents(file_path) + + with clickhouse_connect.get_client(host=CLICKHOUSE_HOSTNAME) as client: + try: + client.command(sql_content) + except Exception as e: + logger.critical("Error in CREATE TABLE statement") + raise e + + +class MonitoringAgent: + def __init__(self): + self.connectors = { + "server_logs": ServerLogsConnector(), + "server_logs_timestamps": ServerLogsTimestampsConnector(), + "failed_dns_loglines": FailedDNSLoglinesConnector(), + "logline_to_batches": LoglineToBatchesConnector(), + "dns_loglines": DNSLoglinesConnector(), + "logline_timestamps": LoglineTimestampsConnector(), + "batch_timestamps": BatchTimestampsConnector(), + "suspicious_batches_to_batch": SuspiciousBatchesToBatchConnector(), + "suspicious_batch_timestamps": SuspiciousBatchTimestampsConnector(), + "alerts": AlertsConnector(), + } + + self.topics = [f"clickhouse_{table_name}" for table_name in self.connectors] + self.kafka_consumer = SimpleKafkaConsumeHandler(self.topics) + + async def start(self): + loop = asyncio.get_running_loop() + + try: + while True: + key, value, topic = await loop.run_in_executor( + None, self.kafka_consumer.consume + ) + logger.debug(f"From Kafka: {value}") + + table_name = topic.replace("clickhouse_", "") + data_schema = marshmallow_dataclass.class_schema( + TABLE_NAME_TO_TYPE.get(table_name) + )() + data = data_schema.loads(value) + + self.connectors[table_name].insert(**asdict(data)) + except KeyboardInterrupt: + logger.info("Stopped MonitoringAgent.") + except Exception as e: + logger.warning(e) + + +def main(): + prepare_all_tables() + clickhouse_consumer = MonitoringAgent() + asyncio.run(clickhouse_consumer.start()) + + +if __name__ == "__main__": # pragma: no cover + main() 
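The MonitoringAgent above is the consuming half of the monitoring path: it subscribes to one clickhouse_<table_name> topic per connector, deserializes each message through the table's marshmallow schema, and hands the result to the matching connector's insert(). The producing half is the ClickHouseKafkaSender that the pipeline stages further down in this diff (prefilter, inspector, batch handler) instantiate once per target table. The following is a minimal producer-side sketch, assuming the repository modules introduced in this diff (src.base.clickhouse_kafka_sender); the dict keys mirror the column list of batch_timestamps.sql and the concrete values are purely illustrative:

    # Producer-side sketch: emit one monitoring row through Kafka so the
    # MonitoringAgent shown above can insert it into ClickHouse.
    import datetime
    import uuid

    from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender

    # One sender per target table. The MonitoringAgent consumes the matching
    # clickhouse_batch_timestamps topic and routes the row to
    # BatchTimestampsConnector.
    batch_timestamps = ClickHouseKafkaSender("batch_timestamps")

    batch_timestamps.insert(
        dict(
            batch_id=uuid.uuid4(),
            stage="prefilter",          # illustrative stage name
            status="in_process",
            timestamp=datetime.datetime.now(),
            is_active=True,
            message_count=0,            # illustrative value
        )
    )

Routing the rows through Kafka keeps ClickHouse writes off the hot path of the emitting stage; on the consuming side the rows are buffered by ClickHouseBatchSender and flushed when the batch fills up or its timer fires, as exercised by the tests further down in this diff.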
diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index 785c04d..c766d3a 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -1,9 +1,14 @@ -import ast +import datetime import json import os import sys +import uuid + +import marshmallow_dataclass sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender +from src.base.data_classes.batch import Batch from src.base.logline_handler import LoglineHandler from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, @@ -39,6 +44,7 @@ class Prefilter: """ def __init__(self): + self.batch_id = None self.begin_timestamp = None self.end_timestamp = None self.subnet_id = None @@ -51,25 +57,36 @@ def __init__(self): transactional_id = generate_unique_transactional_id(module_name, KAFKA_BROKERS) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler(transactional_id) + # databases + self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") + self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") + def get_and_fill_data(self) -> None: """ - Clears data already stored and consumes new data. Unpacks the data and checks if it is empty. If that is the - case, an info message is shown, otherwise the data is stored internally, including timestamps. + Clears data already stored and consumes new data. Unpacks the data and checks if it is empty. Data is stored + internally, including timestamps. """ - logger.debug("Checking for existing data...") - if self.unfiltered_data: - logger.warning("Overwriting existing data by new message...") - self.clear_data() - logger.debug("Cleared existing data.") + self.clear_data() # clear in case we already have data stored - logger.debug("Calling KafkaConsumeHandler for consuming JSON data...") - key, data = self.kafka_consume_handler.consume_as_json() + key, data = self.kafka_consume_handler.consume_as_object() self.subnet_id = key - if data: - self.begin_timestamp = data.get("begin_timestamp") - self.end_timestamp = data.get("end_timestamp") - self.unfiltered_data = data.get("data") + if data.data: + self.batch_id = data.batch_id + self.begin_timestamp = data.begin_timestamp + self.end_timestamp = data.end_timestamp + self.unfiltered_data = data.data + + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="in_process", + timestamp=datetime.datetime.now(), + is_active=True, + message_count=len(self.unfiltered_data), + ) + ) if not self.unfiltered_data: logger.info( @@ -83,53 +100,58 @@ def get_and_fill_data(self) -> None: f"subnet_id: '{self.subnet_id}'." ) - logger.debug("Received consumer message as JSON data.") - logger.debug(f"{data=}") - def filter_by_error(self) -> None: """ Applies the filter to the data in ``unfiltered_data``, i.e. all loglines whose error status is in the given error types are kept and added to ``filtered_data``, all other ones are discarded. 
""" - logger.debug("Filtering data...") - for e in self.unfiltered_data: - e_as_json = ast.literal_eval(e) - if self.logline_handler.check_relevance(e_as_json): + if self.logline_handler.check_relevance(e): self.filtered_data.append(e) - - logger.debug("Data filtered and now available in filtered_data.") - logger.info("Data successfully filtered.") + else: # not relevant, filtered out + logline_id = uuid.UUID(e.get("logline_id")) + + self.logline_timestamps.insert( + dict( + logline_id=logline_id, + stage=module_name, + status="filtered_out", + timestamp=datetime.datetime.now(), + is_active=False, + ) + ) def send_filtered_data(self): """ Sends the filtered data if available via the :class:`KafkaProduceHandler`. """ - if not self.unfiltered_data: - logger.debug("No unfiltered or filtered data is available.") - return - if not self.filtered_data: - logger.info("No errors in filtered data.") - logger.debug("No data sent. No filtered or unfiltered data exists.") raise ValueError("Failed to send data: No filtered data.") data_to_send = { + "batch_id": self.batch_id, "begin_timestamp": self.begin_timestamp, "end_timestamp": self.end_timestamp, "data": self.filtered_data, } - logger.debug("Calling KafkaProduceHandler...") - logger.debug(f"{data_to_send=}") + + self.batch_timestamps.insert( + dict( + batch_id=self.batch_id, + stage=module_name, + status="finished", + timestamp=datetime.datetime.now(), + is_active=True, + message_count=len(self.filtered_data), + ) + ) + + batch_schema = marshmallow_dataclass.class_schema(Batch)() self.kafka_produce_handler.produce( topic=PRODUCE_TOPIC, - data=json.dumps(data_to_send), + data=batch_schema.dumps(data_to_send), key=self.subnet_id, ) - logger.debug( - f"Sent filtered data with time frame from {self.begin_timestamp} to {self.end_timestamp} and data" - f" ({len(self.filtered_data)} message(s))." - ) logger.info( f"Filtered data was successfully sent:\n" f" ⤷ Contains data field of {len(self.filtered_data)} message(s). Originally: " @@ -137,17 +159,14 @@ def send_filtered_data(self): ) def clear_data(self): - """ - Clears the data in the internal data structures. - """ + """Clears the data in the internal data structures.""" self.unfiltered_data = [] self.filtered_data = [] - logger.debug("Cleared data.") def main(one_iteration: bool = False) -> None: """ - Runs the main loop with by + Runs the main loop by 1. Retrieving new data, 2. Filtering the data and @@ -158,29 +177,18 @@ def main(one_iteration: bool = False) -> None: Args: one_iteration (bool): Only one iteration is done if True (for testing purposes). False by default. 
""" - logger.info("Starting Prefilter...") prefilter = Prefilter() - logger.info(f"Prefilter started.") iterations = 0 - while True: if one_iteration and iterations > 0: break iterations += 1 try: - logger.debug("Before getting and filling data") prefilter.get_and_fill_data() - logger.debug("After getting and filling data") - - logger.debug("Before filtering by error") prefilter.filter_by_error() - logger.debug("After filtering by error") - - logger.debug("Before adding filtered data to batch") prefilter.send_filtered_data() - logger.debug("After adding filtered data to batch") except IOError as e: logger.error(e) raise diff --git a/tests/clickhouse/__init__.py b/tests/clickhouse/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/clickhouse/test_clickhouse_batch_sender.py b/tests/clickhouse/test_clickhouse_batch_sender.py new file mode 100644 index 0000000..f152cef --- /dev/null +++ b/tests/clickhouse/test_clickhouse_batch_sender.py @@ -0,0 +1,275 @@ +import unittest +from unittest.mock import patch, Mock + +from src.monitoring.clickhouse_batch_sender import ClickHouseBatchSender + + +class TestInit(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.BATCH_SIZE", 50) + @patch("src.monitoring.clickhouse_batch_sender.BATCH_TIMEOUT", 0.5) + @patch("src.monitoring.clickhouse_batch_sender.CLICKHOUSE_HOSTNAME", "test_name") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_init(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + + # Act + sut = ClickHouseBatchSender(table_name, column_names) + + # Assert + self.assertEqual(table_name, sut.table_name) + self.assertEqual(column_names, sut.column_names) + self.assertEqual(50, sut.max_batch_size) + self.assertEqual(0.5, sut.batch_timeout) + self.assertIsNone(sut.timer) + self.assertEqual([], sut.batch) + + mock_clickhouse_connect.get_client.assert_called_once_with(host="test_name") + + +class TestDel(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_del(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + # Act + with patch( + "src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all" + ) as mock_insert_all: + del sut + + # Assert + mock_insert_all.assert_called_once() + + +class TestAdd(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_str_successful( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"]], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_called_once() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_timer_already_started( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + 
): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + sut.timer = Mock() + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"]], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_not_called() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_max_size_reached_and_timer_already_started( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + sut.timer = Mock() + sut.max_batch_size = 1 + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"]], sut.batch) + + mock_insert_all.assert_called_once() + mock_start_timer.assert_not_called() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_str_wrong_field_number( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = ["entry_1", "entry_2"] + + # Act + with self.assertRaises(ValueError): + sut.add(data) + + # Assert + self.assertEqual([], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_not_called() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_lists_successful( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = [["entry_1", "entry_2"], ["entry_3", "entry_4"]] + + # Act + sut.add(data) + + # Assert + self.assertEqual([["entry_1", "entry_2"], ["entry_3", "entry_4"]], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_called_once() + + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender.insert_all") + @patch("src.monitoring.clickhouse_batch_sender.ClickHouseBatchSender._start_timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_add_list_of_lists_wrong_field_number( + self, mock_clickhouse_connect, mock_start_timer, mock_insert_all + ): + # Arrange + table_name = "test_table_name" + column_names = ["col_1"] + sut = ClickHouseBatchSender(table_name, column_names) + + data = [["entry_1", "entry_2"], ["entry_3"]] + + # Act + with self.assertRaises(ValueError): + sut.add(data) + + # Assert + self.assertEqual([], sut.batch) + + mock_insert_all.assert_not_called() + mock_start_timer.assert_not_called() + + +class TestInsertAll(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_insert_all(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + 
column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + sut._client = Mock() + sut.batch = [["entry_1", "entry_2"], ["entry_3", "entry_4"]] + + # Act + sut.insert_all() + + # Assert + self.assertEqual([], sut.batch) + self.assertIsNone(sut.timer) + + sut._client.insert.assert_called_once_with( + table_name, + [["entry_1", "entry_2"], ["entry_3", "entry_4"]], + column_names=column_names, + ) + + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_insert_all_with_timer(self, mock_clickhouse_connect): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + sut._client = Mock() + sut.timer = Mock() + sut.batch = [["entry_1", "entry_2"]] + + # Act + sut.insert_all() + + # Assert + self.assertEqual([], sut.batch) + self.assertIsNone(sut.timer) + + sut._client.insert.assert_called_once_with( + table_name, + [["entry_1", "entry_2"]], + column_names=column_names, + ) + + +class TestStartTimer(unittest.TestCase): + @patch("src.monitoring.clickhouse_batch_sender.BATCH_TIMEOUT", 0.5) + @patch("src.monitoring.clickhouse_batch_sender.Timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_start_timer(self, mock_clickhouse_connect, mock_timer): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + + # Act + sut._start_timer() + + # Assert + mock_timer.assert_called_once_with( + 0.5, + sut.insert_all, + ) + mock_timer.cancel.assert_not_called() + sut.timer.start.assert_called_once() + + @patch("src.monitoring.clickhouse_batch_sender.BATCH_TIMEOUT", 0.5) + @patch("src.monitoring.clickhouse_batch_sender.Timer") + @patch("src.monitoring.clickhouse_batch_sender.clickhouse_connect") + def test_start_timer_with_running_timer(self, mock_clickhouse_connect, mock_timer): + # Arrange + table_name = "test_table_name" + column_names = ["col_1", "col_2"] + sut = ClickHouseBatchSender(table_name, column_names) + sut.timer = mock_timer + + # Act + sut._start_timer() + + # Assert + mock_timer.assert_called_once_with( + 0.5, + sut.insert_all, + ) + mock_timer.cancel.assert_called_once() + sut.timer.start.assert_called_once() diff --git a/tests/clickhouse/test_clickhouse_connector.py b/tests/clickhouse/test_clickhouse_connector.py new file mode 100644 index 0000000..bba9a89 --- /dev/null +++ b/tests/clickhouse/test_clickhouse_connector.py @@ -0,0 +1,728 @@ +import json +import unittest +from unittest.mock import patch, MagicMock, mock_open + +from src.monitoring.clickhouse_connector import * + + +class TestClickHouseConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + table_name = "test_table" + column_names = ["col_1", "col_2", "col_3"] + + # Act + sut = ClickHouseConnector(table_name, column_names) + + # Assert + self.assertEqual(table_name, sut._table_name) + self.assertEqual(column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=table_name, + column_names=column_names, + ) + + @patch("src.monitoring.clickhouse_connector.os.path.join") + @patch( + 
"src.monitoring.clickhouse_connector.open", + new_callable=mock_open, + read_data="CREATE TABLE test;", + ) + @patch("src.monitoring.clickhouse_connector.clickhouse_connect.get_client") + def test_prepare_table_success( + self, mock_get_client, mock_open_file, mock_path_join + ): + # Arrange + mock_client = MagicMock() + mock_get_client.return_value.__enter__.return_value = mock_client + mock_path_join.return_value = "/fake/path/test_table.sql" + + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + # Act + sut.prepare_table() + + # Assert + mock_open_file.assert_called_once_with("/fake/path/test_table.sql", "r") + mock_client.command.assert_called_once_with("CREATE TABLE test;") + + @patch("src.monitoring.clickhouse_connector.os.path.join") + @patch( + "src.monitoring.clickhouse_connector.open", + new_callable=mock_open, + read_data="CREATE TABLE test;", + ) + @patch("src.monitoring.clickhouse_connector.clickhouse_connect.get_client") + @patch("src.monitoring.clickhouse_connector.logger") + def test_prepare_table_failure( + self, mock_logger, mock_get_client, mock_open_file, mock_path_join + ): + mock_client = MagicMock() + mock_get_client.return_value.__enter__.return_value = mock_client + mock_path_join.return_value = "/fake/path/test_table.sql" + mock_client.command.side_effect = Exception("Test exception") + + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + with self.assertRaises(Exception): + sut.prepare_table() + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_add_to_batch(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + # Act + sut._add_to_batch("test_data") + + # Assert + mock_clickhouse_batch_sender_instance.add.assert_called_once_with("test_data") + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert(self, mock_clickhouse_batch_sender): + # Arrange + sut = ClickHouseConnector("test_table", ["col_1", "col_2", "col_3"]) + + # Act + sut.insert("test_data") + + +class TestServerLogsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "server_logs" + expected_column_names = [ + "message_id", + "timestamp_in", + "message_text", + ] + + # Act + sut = ServerLogsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + message_text = "test_message_text" + message_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + timestamp_in = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) + + sut = ServerLogsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + 
message_text=message_text, + message_id=message_id, + timestamp_in=timestamp_in, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + "test_message_text", + ] + ) + + +class TestServerLogsTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "server_logs_timestamps" + expected_column_names = [ + "message_id", + "event", + "event_timestamp", + ] + + # Act + sut = ServerLogsTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + message_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + event = "test_event" + event_timestamp = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) + + sut = ServerLogsTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + message_id=message_id, + event=event, + event_timestamp=event_timestamp, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "test_event", + datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + ] + ) + + +class TestFailedDNSLoglinesConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "failed_dns_loglines" + expected_column_names = [ + "message_text", + "timestamp_in", + "timestamp_failed", + "reason_for_failure", + ] + + # Act + sut = FailedDNSLoglinesConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + message_text = "test_message_text" + timestamp_in = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) + timestamp_failed = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + reason_for_failure = "Wrong client_ip field" + + sut = FailedDNSLoglinesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + message_text=message_text, + timestamp_in=timestamp_in, + timestamp_failed=timestamp_failed, + reason_for_failure=reason_for_failure, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + "test_message_text", + datetime.datetime(2034, 12, 13, 12, 
34, 12, 132412), + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + "Wrong client_ip field", + ] + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_none_given(self, mock_clickhouse_batch_sender): + # Arrange + message_text = "test_message_text" + timestamp_in = datetime.datetime(2034, 12, 13, 12, 34, 12, 132412) + timestamp_failed = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + + sut = FailedDNSLoglinesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + message_text=message_text, + timestamp_in=datetime.datetime(2034, 12, 13, 12, 34, 12, 132412), + timestamp_failed=datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + reason_for_failure=None, + ) + + # Assert + mock_add_to_batch.assert_called_once() + + +class TestLoglineToBatchesConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "logline_to_batches" + expected_column_names = [ + "logline_id", + "batch_id", + ] + + # Act + sut = LoglineToBatchesConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + batch_id = uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6") + + sut = LoglineToBatchesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + batch_id=batch_id, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6"), + ] + ) + + +class TestDNSLoglinesConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "dns_loglines" + expected_column_names = [ + "logline_id", + "subnet_id", + "timestamp", + "status_code", + "client_ip", + "record_type", + "additional_fields", + ] + + # Act + sut = DNSLoglinesConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = uuid.UUID("d7add097-40a5-42f6-89df-1e7b20c4a4b8") + subnet_id = "127.0.0.0_24" + timestamp = datetime.datetime(2024, 12, 6, 13, 41, 53, 589594) + status_code = 
"NXDOMAIN" + client_ip = "127.0.0.1" + record_type = "A" + additional_fields = json.dumps(dict(test="some_field")) + + sut = DNSLoglinesConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + subnet_id=subnet_id, + timestamp=timestamp, + status_code=status_code, + client_ip=client_ip, + record_type=record_type, + additional_fields=additional_fields, + ) + + # Assert + mock_add_to_batch.assert_called_once() + + +class TestLoglineTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "logline_timestamps" + expected_column_names = [ + "logline_id", + "stage", + "status", + "timestamp", + "is_active", + ] + + # Act + sut = LoglineTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + logline_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + stage = "prefilter" + status = "prefilter_out" + timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + + sut = LoglineTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + logline_id=logline_id, + stage=stage, + status=status, + timestamp=timestamp, + is_active=True, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "prefilter", + "prefilter_out", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + True, + ] + ) + + +class TestBatchTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "batch_timestamps" + expected_column_names = [ + "batch_id", + "stage", + "status", + "timestamp", + "is_active", + "message_count", + ] + + # Act + sut = BatchTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + stage = "prefilter" + status = "prefilter_out" + timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + message_count = 456 + + sut = BatchTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + 
batch_id=batch_id, + stage=stage, + status=status, + timestamp=timestamp, + is_active=True, + message_count=message_count, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "prefilter", + "prefilter_out", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + True, + 456, + ] + ) + + +class TestSuspiciousBatchesToBatchConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "suspicious_batches_to_batch" + expected_column_names = [ + "suspicious_batch_id", + "batch_id", + ] + + # Act + sut = SuspiciousBatchesToBatchConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + suspicious_batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + batch_id = uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6") + + sut = SuspiciousBatchesToBatchConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + suspicious_batch_id=suspicious_batch_id, + batch_id=batch_id, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + uuid.UUID("1f855c43-8a75-4b53-b6cd-4a13b89312d6"), + ] + ) + + +class TestSuspiciousBatchTimestampsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "suspicious_batch_timestamps" + expected_column_names = [ + "suspicious_batch_id", + "client_ip", + "stage", + "status", + "timestamp", + "is_active", + "message_count", + ] + + # Act + sut = SuspiciousBatchTimestampsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + suspicious_batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + client_ip = "127.0.0.1" + stage = "prefilter" + status = "prefilter_out" + timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + message_count = 456 + + sut = SuspiciousBatchTimestampsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + suspicious_batch_id=suspicious_batch_id, + client_ip=client_ip, + stage=stage, + status=status, + timestamp=timestamp, + is_active=True, + 
message_count=message_count, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + "127.0.0.1", + "prefilter", + "prefilter_out", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + True, + 456, + ] + ) + + +class TestAlertsConnector(unittest.TestCase): + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_init(self, mock_clickhouse_batch_sender): + # Arrange + mock_clickhouse_batch_sender_instance = MagicMock() + mock_clickhouse_batch_sender.return_value = ( + mock_clickhouse_batch_sender_instance + ) + + expected_table_name = "alerts" + expected_column_names = [ + "client_ip", + "alert_timestamp", + "suspicious_batch_id", + "overall_score", + "result", + ] + + # Act + sut = AlertsConnector() + + # Assert + self.assertEqual(expected_table_name, sut._table_name) + self.assertEqual(expected_column_names, sut._column_names) + self.assertEqual(mock_clickhouse_batch_sender_instance, sut._batch_sender) + + mock_clickhouse_batch_sender.assert_called_once_with( + table_name=expected_table_name, + column_names=expected_column_names, + ) + + @patch("src.monitoring.clickhouse_connector.ClickHouseBatchSender") + def test_insert_all_given(self, mock_clickhouse_batch_sender): + # Arrange + client_ip = "127.0.0.1" + alert_timestamp = datetime.datetime(2034, 12, 13, 12, 35, 35, 542635) + suspicious_batch_id = uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff") + overall_score = 15.4 + result = "test" + + sut = AlertsConnector() + + with patch.object(sut, "_add_to_batch", MagicMock()) as mock_add_to_batch: + # Act + sut.insert( + client_ip=client_ip, + alert_timestamp=alert_timestamp, + suspicious_batch_id=suspicious_batch_id, + overall_score=overall_score, + result=result, + ) + + # Assert + mock_add_to_batch.assert_called_once_with( + [ + "127.0.0.1", + datetime.datetime(2034, 12, 13, 12, 35, 35, 542635), + uuid.UUID("7299539b-6215-4f6b-b39f-69335aafbeff"), + 15.4, + "test", + ] + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/clickhouse/test_clickhouse_kafka_sender.py b/tests/clickhouse/test_clickhouse_kafka_sender.py new file mode 100644 index 0000000..9bb5dbc --- /dev/null +++ b/tests/clickhouse/test_clickhouse_kafka_sender.py @@ -0,0 +1,42 @@ +import unittest +from unittest.mock import patch + +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender + + +class TestInit(unittest.TestCase): + @patch("src.base.clickhouse_kafka_sender.marshmallow_dataclass") + @patch("src.base.clickhouse_kafka_sender.SimpleKafkaProduceHandler") + def test_init(self, mock_produce_handler, mock_marshmallow): + # Arrange + table_name = "test_table" + mock_produce_handler_instance = mock_produce_handler + mock_produce_handler.return_value = mock_produce_handler_instance + + # Act + sut = ClickHouseKafkaSender(table_name) + + # Assert + self.assertEqual(table_name, sut.table_name) + self.assertEqual(mock_produce_handler_instance, sut.kafka_producer) + mock_produce_handler.assert_called_once() + + +class TestInsert(unittest.TestCase): + @patch("src.base.clickhouse_kafka_sender.marshmallow_dataclass") + @patch("src.base.clickhouse_kafka_sender.SimpleKafkaProduceHandler") + def test_insert(self, mock_produce_handler, mock_marshmallow): + # Arrange + mock_produce_handler_instance = mock_produce_handler + mock_produce_handler.return_value = mock_produce_handler_instance + sut = ClickHouseKafkaSender("test_table") + + # Act + sut.insert({"test_key": "test_value"}) + + # Assert + 
mock_produce_handler_instance.produce.assert_called_once() + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/detector/__init__.py b/tests/detector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_detector.py b/tests/detector/test_detector.py similarity index 91% rename from tests/test_detector.py rename to tests/detector/test_detector.py index be7c487..6bc878d 100644 --- a/tests/test_detector.py +++ b/tests/detector/test_detector.py @@ -1,12 +1,13 @@ import os import tempfile import unittest +import uuid from datetime import datetime, timedelta from unittest.mock import MagicMock, patch, mock_open from requests import HTTPError -from src.base import Batch +from src.base.data_classes.batch import Batch from src.detector.detector import Detector, WrongChecksum @@ -134,7 +135,8 @@ class TestInit(unittest.TestCase): @patch("src.detector.detector.CONSUME_TOPIC", "test_topic") @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_init(self, mock_kafka_consume_handler, mock_logger): + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_init(self, mock_clickhouse, mock_kafka_consume_handler, mock_logger): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -148,10 +150,12 @@ def test_init(self, mock_kafka_consume_handler, mock_logger): class TestGetData(unittest.TestCase): @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") def test_get_data_without_return_data( - self, mock_kafka_consume_handler, mock_logger + self, mock_clickhouse, mock_kafka_consume_handler, mock_logger ): test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=datetime.now(), end_timestamp=datetime.now() + timedelta(0, 3), data=[], @@ -171,10 +175,14 @@ def test_get_data_without_return_data( @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_get_data_with_return_data(self, mock_kafka_consume_handler, mock_logger): + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_get_data_with_return_data( + self, mock_clickhouse, mock_kafka_consume_handler, mock_logger + ): begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[{"test": "test_message_2"}], @@ -201,6 +209,7 @@ def test_get_data_while_busy(self, mock_kafka_consume_handler, mock_logger): begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[{"test": "test_message_2"}], @@ -236,7 +245,8 @@ def setUp(self): "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_save_warning(self, mock_kafka_consume_handler): + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_save_warning(self, mock_clickhouse, mock_kafka_consume_handler): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -273,7 +283,8 @@ def test_save_warning(self, mock_kafka_consume_handler): "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_save_empty_warning(self, mock_kafka_consume_handler): + 
@patch("src.detector.detector.ClickHouseKafkaSender") + def test_save_empty_warning(self, mock_clickhouse, mock_kafka_consume_handler): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -325,7 +336,9 @@ def test_clear_data_without_existing_data( ): begin = datetime.now() end = begin + timedelta(0, 3) - test_batch = Batch(begin_timestamp=begin, end_timestamp=end, data=[]) + test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[] + ) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -347,7 +360,9 @@ def test_clear_data_with_existing_data( ): begin = datetime.now() end = begin + timedelta(0, 3) - test_batch = Batch(begin_timestamp=begin, end_timestamp=end, data=[]) + test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[] + ) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance diff --git a/tests/inspector/__init__.py b/tests/inspector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_inspector.py b/tests/inspector/test_inspector.py similarity index 87% rename from tests/test_inspector.py rename to tests/inspector/test_inspector.py index 5c5f7c7..418124d 100644 --- a/tests/test_inspector.py +++ b/tests/inspector/test_inspector.py @@ -1,11 +1,13 @@ import unittest +import uuid from datetime import datetime, timedelta from unittest.mock import MagicMock, patch -import numpy as np -import json +import marshmallow_dataclass +import numpy as np from streamad.model import ZScoreDetector, RShashDetector -from src.base import Batch + +from src.base.data_classes.batch import Batch from src.inspector.inspector import Inspector, main DEFAULT_DATA = { @@ -26,9 +28,10 @@ def get_batch(data): begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, - data=data if data != None else [], + data=data if data is not None else [], ) return test_batch @@ -54,8 +57,13 @@ class TestGetData(unittest.TestCase): @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_get_data_without_return_data( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance @@ -74,8 +82,13 @@ def test_get_data_without_return_data( @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_get_data_with_return_data( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch([{"test": "test_message_1"}, {"test": "test_message_2"}]) mock_kafka_consume_handler_instance = MagicMock() @@ -100,8 +113,13 @@ def test_get_data_with_return_data( @patch("src.inspector.inspector.logger") 
@patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_get_data_with_no_return_data( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): begin = None end = None @@ -319,7 +337,7 @@ def test_inspect_none_models( "src.inspector.inspector.MODELS", "", ) - def test_inspect_empy_models( + def test_inspect_empty_models( self, mock_kafka_consume_handler, mock_produce_handler, mock_logger ): mock_kafka_consume_handler_instance = MagicMock() @@ -344,8 +362,13 @@ def test_inspect_empy_models( ) @patch("src.inspector.inspector.TIME_TYPE", "ms") @patch("src.inspector.inspector.TIME_RANGE", 1) + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_univariate( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -384,8 +407,13 @@ def test_inspect_univariate( ) @patch("src.inspector.inspector.TIME_TYPE", "ms") @patch("src.inspector.inspector.TIME_RANGE", 1) - def test_inspect_univariate( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_univariate_2( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -421,8 +449,13 @@ def test_inspect_univariate( ) @patch("src.inspector.inspector.TIME_TYPE", "ms") @patch("src.inspector.inspector.TIME_RANGE", 1) + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_univariate_two_models( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -455,8 +488,13 @@ def test_inspect_univariate_two_models( [{"model": "RShashDetector", "module": "streamad.model", "model_args": {}}], ) @patch("src.inspector.inspector.MODE", "multivariate") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_multivariate( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -494,8 +532,13 @@ def test_inspect_multivariate( ], ) @patch("src.inspector.inspector.MODE", "multivariate") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_multivariate_window_len( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -530,8 +573,13 @@ def test_inspect_multivariate_window_len( ], ) @patch("src.inspector.inspector.MODE", "multivariate") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_multivariate_two_models( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): 
test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -575,8 +623,13 @@ def test_inspect_multivariate_two_models( }, ) @patch("src.inspector.inspector.MODE", "ensemble") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_ensemble( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -627,8 +680,13 @@ def test_inspect_ensemble( }, ) @patch("src.inspector.inspector.MODE", "ensemble") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_ensemble_window_len( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -671,8 +729,13 @@ def test_inspect_ensemble_window_len( }, ) @patch("src.inspector.inspector.MODE", "ensemble") + @patch("src.inspector.inspector.ClickHouseKafkaSender") def test_inspect_ensemble_invalid( - self, mock_kafka_consume_handler, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): test_batch = get_batch(None) test_batch.begin_timestamp = datetime.now() @@ -770,15 +833,24 @@ def test_invalid_mode(self, mock_kafka_consume_handler, mock_produce_handler): class TestSend(unittest.TestCase): + @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") @patch("src.inspector.inspector.SCORE_THRESHOLD", 0.1) @patch("src.inspector.inspector.ANOMALY_THRESHOLD", 0.01) - def test_send(self, mock_kafka_consume_handler, mock_produce_handler): + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_send( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance + batch_schema = marshmallow_dataclass.class_schema(Batch)() sut = Inspector() sut.anomalies = [0.9, 0.9] @@ -790,20 +862,60 @@ def test_send(self, mock_kafka_consume_handler, mock_produce_handler): sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT ) sut.messages = [data] - sut.send_data() - - mock_produce_handler_instance.send.assert_called_once_with( - topic="Detector", - data=json.dumps( + mock_batch_id = uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") + with patch("src.inspector.inspector.uuid") as mock_uuid: + mock_uuid.uuid4.return_value = mock_batch_id + sut.send_data() + + mock_produce_handler_instance.produce.assert_called_once_with( + topic="pipeline.inspector_to_detector", + data=batch_schema.dumps( { - "begin_timestamp": sut.begin_timestamp.strftime(TIMESTAMP_FORMAT), - "end_timestamp": sut.end_timestamp.strftime(TIMESTAMP_FORMAT), + "batch_id": mock_batch_id, + "begin_timestamp": sut.begin_timestamp, + "end_timestamp": sut.end_timestamp, "data": [data], } ), key="192.168.0.167", ) + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + 
@patch("src.inspector.inspector.SCORE_THRESHOLD", 0.1) + @patch("src.inspector.inspector.ANOMALY_THRESHOLD", 0.01) + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_send_not_suspicious( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_produce_handler_instance = MagicMock() + mock_produce_handler.return_value = mock_produce_handler_instance + batch_schema = marshmallow_dataclass.class_schema(Batch)() + + sut = Inspector() + sut.anomalies = [0.0, 0.0] + sut.X = np.array([[0.0], [0.0]]) + sut.begin_timestamp = datetime.now() + sut.end_timestamp = datetime.now() + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["timestamp"] = datetime.strftime( + sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT + ) + sut.messages = [data] + mock_batch_id = uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") + with patch("src.inspector.inspector.uuid") as mock_uuid: + mock_uuid.uuid4.return_value = mock_batch_id + sut.send_data() + + mock_produce_handler_instance.produce.assert_not_called() + class TestMainFunction(unittest.TestCase): @patch("src.inspector.inspector.logger") diff --git a/tests/kafka/__init__.py b/tests/kafka/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_exactly_once_kafka_consume_handler.py b/tests/kafka/test_exactly_once_kafka_consume_handler.py similarity index 91% rename from tests/test_exactly_once_kafka_consume_handler.py rename to tests/kafka/test_exactly_once_kafka_consume_handler.py index 901b94a..381a53a 100644 --- a/tests/test_exactly_once_kafka_consume_handler.py +++ b/tests/kafka/test_exactly_once_kafka_consume_handler.py @@ -1,11 +1,13 @@ import datetime import json import unittest +import uuid from unittest.mock import patch, Mock +import marshmallow_dataclass from confluent_kafka import KafkaException, KafkaError -from src.base import Batch +from src.base.data_classes.batch import Batch from src.base.kafka_handler import ExactlyOnceKafkaConsumeHandler CONSUMER_GROUP_ID = "test_group_id" @@ -305,10 +307,6 @@ class TestConsumeAsObject(unittest.TestCase): @patch("src.base.kafka_handler.Consumer") def setUp(self, mock_consumer): self.sut = ExactlyOnceKafkaConsumeHandler(topics="test_topic") - self.sut.batch_schema = Mock() - self.sut.batch_schema.load = Mock( - return_value=Batch(datetime.datetime.now(), datetime.datetime.now()) - ) def test_consume_as_object_no_key_no_value(self): with patch( @@ -322,14 +320,16 @@ def test_consume_as_object_no_key_no_value(self): def test_consume_as_object_valid_data(self): key = "valid_key" - value = json.dumps({"data": [{"field1": "value1", "field2": "value2"}]}) - topic = "test_topic" - batch_data = [{"field1": "value1", "field2": "value2"}] - self.sut.batch_schema.load.return_value = Batch( - datetime.datetime.now(), - datetime.datetime.now(), - batch_data, + batch_schema = marshmallow_dataclass.class_schema(Batch)() + value = batch_schema.dumps( + { + "batch_id": uuid.uuid4(), + "begin_timestamp": datetime.datetime.now(), + "end_timestamp": datetime.datetime.now(), + "data": [{"field1": "value1", "field2": "value2"}], + } ) + topic = "test_topic" with patch( "src.base.kafka_handler.ExactlyOnceKafkaConsumeHandler.consume" @@ -343,21 +343,19 @@ def test_consume_as_object_valid_data(self): def test_consume_as_object_valid_data_with_inner_strings(self): key = "valid_key" - value = json.dumps( + 
batch_schema = marshmallow_dataclass.class_schema(Batch)() + value = batch_schema.dumps( { + "batch_id": uuid.uuid4(), + "begin_timestamp": datetime.datetime.now(), + "end_timestamp": datetime.datetime.now(), "data": [ '{"field1": "value1", "field2": "value2"}', '{"field3": "value3", "field4": "value4"}', - ] + ], } ) topic = "test_topic" - batch_data = [{"field1": "value1", "field2": "value2"}] - self.sut.batch_schema.load.return_value = Batch( - datetime.datetime.now(), - datetime.datetime.now(), - batch_data, - ) with patch( "src.base.kafka_handler.ExactlyOnceKafkaConsumeHandler.consume" @@ -384,12 +382,16 @@ def test_consume_as_object_invalid_data(self): with self.assertRaises(ValueError): self.sut.consume_as_object() - def test_consume_as_object_invalid_batch(self): + @patch("src.base.kafka_handler.marshmallow_dataclass.class_schema") + def test_consume_as_object_invalid_batch(self, mock_schema): key = "valid_key" value = json.dumps({"data": [{"field1": "value1", "field2": "value2"}]}) topic = "test_topic" - self.sut.batch_schema.load.return_value = None + mock_schema_instance = Mock() + mock_schema.return_value = mock_schema_instance + + mock_schema_instance.load.return_value = None with patch( "src.base.kafka_handler.ExactlyOnceKafkaConsumeHandler.consume" diff --git a/tests/test_exactly_once_kafka_produce_handler.py b/tests/kafka/test_exactly_once_kafka_produce_handler.py similarity index 100% rename from tests/test_exactly_once_kafka_produce_handler.py rename to tests/kafka/test_exactly_once_kafka_produce_handler.py diff --git a/tests/test_kafka_consume_handler.py b/tests/kafka/test_kafka_consume_handler.py similarity index 100% rename from tests/test_kafka_consume_handler.py rename to tests/kafka/test_kafka_consume_handler.py diff --git a/tests/test_kafka_handler.py b/tests/kafka/test_kafka_handler.py similarity index 100% rename from tests/test_kafka_handler.py rename to tests/kafka/test_kafka_handler.py diff --git a/tests/test_kafka_produce_handler.py b/tests/kafka/test_kafka_produce_handler.py similarity index 100% rename from tests/test_kafka_produce_handler.py rename to tests/kafka/test_kafka_produce_handler.py diff --git a/tests/test_simple_kafka_consume_handler.py b/tests/kafka/test_simple_kafka_consume_handler.py similarity index 100% rename from tests/test_simple_kafka_consume_handler.py rename to tests/kafka/test_simple_kafka_consume_handler.py diff --git a/tests/test_simple_kafka_produce_handler.py b/tests/kafka/test_simple_kafka_produce_handler.py similarity index 100% rename from tests/test_simple_kafka_produce_handler.py rename to tests/kafka/test_simple_kafka_produce_handler.py diff --git a/tests/logcollector/__init__.py b/tests/logcollector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_batch_handler.py b/tests/logcollector/test_batch_handler.py similarity index 78% rename from tests/test_batch_handler.py rename to tests/logcollector/test_batch_handler.py index bd69c0d..cc3f56d 100644 --- a/tests/test_batch_handler.py +++ b/tests/logcollector/test_batch_handler.py @@ -1,4 +1,7 @@ +import datetime +import json import unittest +import uuid from unittest.mock import patch, MagicMock from src.logcollector.batch_handler import BufferedBatchSender @@ -8,7 +11,10 @@ class TestInit(unittest.TestCase): @patch("src.logcollector.batch_handler.PRODUCE_TOPIC", "test_topic") @patch("src.logcollector.batch_handler.BufferedBatch") @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") - def test_init_with_buffer(self, 
mock_kafka_produce_handler, mock_buffered_batch): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_init_with_buffer( + self, mock_clickhouse, mock_kafka_produce_handler, mock_buffered_batch + ): # Arrange mock_handler_instance = MagicMock() mock_kafka_produce_handler.return_value = mock_handler_instance @@ -25,7 +31,9 @@ def test_init_with_buffer(self, mock_kafka_produce_handler, mock_buffered_batch) self.assertEqual(mock_handler_instance, sut.kafka_produce_handler) mock_buffered_batch.assert_called_once() - mock_kafka_produce_handler.assert_called_once_with(transactional_id="collector") + mock_kafka_produce_handler.assert_called_once_with( + "log_collection.batch_handler" + ) class TestDel(unittest.TestCase): @@ -40,8 +48,10 @@ class TestAddMessage(unittest.TestCase): @patch("src.logcollector.batch_handler.BufferedBatchSender._reset_timer") @patch("src.logcollector.batch_handler.BufferedBatch.get_number_of_messages") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_normal( self, + mock_clickhouse, mock_send_batch, mock_get_nr_messages, mock_reset_timer, @@ -54,7 +64,12 @@ def test_add_message_normal( mock_get_nr_messages.return_value = 1 key = "test_key" - message = "test_message" + message = json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"test_message", + ) + ) sut = BufferedBatchSender() sut.timer = MagicMock() @@ -71,8 +86,9 @@ def test_add_message_normal( @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_full_messages( - self, mock_send_batch, mock_produce_handler, mock_logger + self, mock_clickhouse, mock_send_batch, mock_produce_handler, mock_logger ): # Arrange mock_produce_handler_instance = MagicMock() @@ -85,19 +101,34 @@ def test_add_message_full_messages( # Act for i in range(99): - sut.add_message(key, f"message_{i}") + test_message = json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ) + sut.add_message(key, test_message) # Assert mock_send_batch.assert_not_called() - sut.add_message(key, f"message_100") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data="message_100", + ) + ), + ) mock_send_batch.assert_called_once() @patch("src.logcollector.batch_handler.logger") @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_full_messages_with_different_keys( - self, mock_send_batch, mock_produce_handler, mock_logger + self, mock_clickhouse, mock_send_batch, mock_produce_handler, mock_logger ): # Arrange mock_produce_handler_instance = MagicMock() @@ -111,23 +142,56 @@ def test_add_message_full_messages_with_different_keys( # Act for i in range(79): - sut.add_message(key, f"message_{i}") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ), + ) for i in range(15): - sut.add_message(other_key, f"message_{i}") + sut.add_message( + other_key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ), + 
) for i in range(20): - sut.add_message(key, f"message_{i}") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data=f"message_{i}", + ) + ), + ) # Assert mock_send_batch.assert_not_called() - sut.add_message(key, f"message_100") + sut.add_message( + key, + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data="message_100", + ) + ), + ) mock_send_batch.assert_called_once() @patch("src.logcollector.batch_handler.logger") @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._reset_timer") + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_no_timer( - self, mock_reset_timer, mock_produce_handler, mock_logger + self, mock_clickhouse, mock_reset_timer, mock_produce_handler, mock_logger ): # Arrange mock_produce_handler_instance = MagicMock() @@ -137,7 +201,15 @@ def test_add_message_no_timer( sut.timer = None # Act - sut.add_message("test_key", "test_message") + sut.add_message( + "test_key", + json.dumps( + dict( + logline_id=str(uuid.uuid4()), + data="test_message", + ) + ), + ) # Assert mock_reset_timer.assert_called_once() @@ -293,7 +365,8 @@ def test_send_batch_for_key_value_error( class TestSendDataPacket(unittest.TestCase): @patch("src.logcollector.batch_handler.PRODUCE_TOPIC", "test_topic") @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") - def test_send_data_packet(self, mock_produce_handler): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_send_data_packet(self, mock_clickhouse, mock_produce_handler): # Arrange mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance @@ -301,9 +374,10 @@ def test_send_data_packet(self, mock_produce_handler): key = "test_key" data = { - "begin_timestamp": "test_begin", - "end_timestamp": "test_end", - "data": "test_data", + "batch_id": uuid.UUID("b4b6f13e-d064-4ab7-94ed-d02b46063308"), + "begin_timestamp": datetime.datetime(2024, 12, 6, 13, 12, 30, 324015), + "end_timestamp": datetime.datetime(2024, 12, 6, 13, 12, 31, 832173), + "data": ["test_data"], } sut = BufferedBatchSender() @@ -314,7 +388,9 @@ def test_send_data_packet(self, mock_produce_handler): # Assert mock_produce_handler_instance.produce.assert_called_once_with( topic="test_topic", - data='{"begin_timestamp": "test_begin", "end_timestamp": "test_end", "data": "test_data"}', + data='{"batch_id": "b4b6f13e-d064-4ab7-94ed-d02b46063308", "begin_timestamp": ' + '"2024-12-06T13:12:30.324015Z", "end_timestamp": "2024-12-06T13:12:31.832173Z", ' + '"data": ["test_data"]}', key=key, ) diff --git a/tests/test_buffered_batch.py b/tests/logcollector/test_buffered_batch.py similarity index 88% rename from tests/test_buffered_batch.py rename to tests/logcollector/test_buffered_batch.py index 56d355d..31e35ec 100644 --- a/tests/test_buffered_batch.py +++ b/tests/logcollector/test_buffered_batch.py @@ -1,4 +1,7 @@ +import datetime import unittest +import uuid +from unittest.mock import patch from src.logcollector.batch_handler import BufferedBatch @@ -14,7 +17,8 @@ def test_init(self): class TestAddMessage(unittest.TestCase): - def test_add_message_empty_batch_and_empty_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_empty_batch_and_empty_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -22,7 +26,7 @@ def 
test_add_message_empty_batch_and_empty_buffer(self): sut = BufferedBatch() # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -30,7 +34,8 @@ def test_add_message_empty_batch_and_empty_buffer(self): ) self.assertEqual({}, sut.buffer, "Buffer should remain empty") - def test_add_message_empty_batch_and_used_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_empty_batch_and_used_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -40,7 +45,7 @@ def test_add_message_empty_batch_and_used_buffer(self): sut.buffer = {key: [old_message]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -52,7 +57,8 @@ def test_add_message_empty_batch_and_used_buffer(self): "Buffer should still contain key with old message", ) - def test_add_message_used_batch_and_empty_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_used_batch_and_empty_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -62,7 +68,7 @@ def test_add_message_used_batch_and_empty_buffer(self): sut.batch = {key: [old_message]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -72,7 +78,8 @@ def test_add_message_used_batch_and_empty_buffer(self): ) self.assertEqual({}, sut.buffer, "Buffer should remain empty") - def test_add_message_used_batch_and_used_buffer(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_used_batch_and_used_buffer(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -84,7 +91,7 @@ def test_add_message_used_batch_and_used_buffer(self): sut.buffer = {key: [old_message_1]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -98,7 +105,8 @@ def test_add_message_used_batch_and_used_buffer(self): "Buffer should still contain key with old message", ) - def test_add_message_with_existing_other_key(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_add_message_with_existing_other_key(self, mock_clickhouse): # Arrange key = "test_key" message = "test_message" @@ -111,7 +119,7 @@ def test_add_message_with_existing_other_key(self): sut.buffer = {old_key: [old_message_1]} # Act - sut.add_message(key, message) + sut.add_message(key, uuid.uuid4(), message) # Assert self.assertEqual( @@ -755,50 +763,88 @@ def test_sort_unsorted_buffer(self): class TestCompleteBatch(unittest.TestCase): - def test_complete_batch_variant_1(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_complete_batch_variant_1(self, mock_clickhouse): # Arrange key = "test_key" - message_1 = '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' - message_2 = '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + message_1 = ( + '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + 
'"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' + ) + message_2 = ( + '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + ) expected_messages = [message_1, message_2] sut = BufferedBatch() # Act - sut.add_message(key, message_2) - sut.add_message(key, message_1) + sut.add_message(key, uuid.uuid4(), message_2) + sut.add_message(key, uuid.uuid4(), message_1) data = sut.complete_batch(key) # Assert - self.assertEqual("2024-05-21T08:31:28.119Z", data["begin_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.249Z", data["end_timestamp"]) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 119000), data["begin_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 249000), data["end_timestamp"] + ) self.assertEqual(expected_messages, data["data"]) - def test_complete_batch_variant_2(self): + @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") + def test_complete_batch_variant_2(self, mock_clickhouse): # Arrange key = "test_key" - message_1 = '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' - message_2 = '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' - message_3 = '{"timestamp": "2024-05-21T08:31:28.319Z", "status": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' - message_4 = '{"timestamp": "2024-05-21T08:31:28.749Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + message_1 = ( + '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' + ) + message_2 = ( + '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + ) + message_3 = ( + '{"timestamp": "2024-05-21T08:31:28.319Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' + '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' + ) + message_4 = ( + '{"timestamp": "2024-05-21T08:31:28.749Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' + '"response_ip": 
"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' + ) sut = BufferedBatch() # Act - sut.add_message(key, message_1) - sut.add_message(key, message_2) + sut.add_message(key, uuid.uuid4(), message_1) + sut.add_message(key, uuid.uuid4(), message_2) data_1 = sut.complete_batch(key) - sut.add_message(key, message_3) - sut.add_message(key, message_4) + sut.add_message(key, uuid.uuid4(), message_3) + sut.add_message(key, uuid.uuid4(), message_4) data_2 = sut.complete_batch(key) # Assert - self.assertEqual("2024-05-21T08:31:28.119Z", data_1["begin_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.249Z", data_1["end_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.119Z", data_2["begin_timestamp"]) - self.assertEqual("2024-05-21T08:31:28.749Z", data_2["end_timestamp"]) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 119000), data_1["begin_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 249000), data_1["end_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 119000), data_2["begin_timestamp"] + ) + self.assertEqual( + datetime.datetime(2024, 5, 21, 8, 31, 28, 749000), data_2["end_timestamp"] + ) self.assertEqual({key: [message_3, message_4]}, sut.buffer) self.assertEqual({}, sut.batch) diff --git a/tests/test_collector.py b/tests/logcollector/test_collector.py similarity index 77% rename from tests/test_collector.py rename to tests/logcollector/test_collector.py index 033a4ce..54f1c74 100644 --- a/tests/test_collector.py +++ b/tests/logcollector/test_collector.py @@ -1,6 +1,8 @@ import asyncio +import datetime import ipaddress import unittest +import uuid from unittest.mock import MagicMock, patch, AsyncMock from src.logcollector.collector import LogCollector, main @@ -11,8 +13,13 @@ class TestInit(unittest.TestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_valid_init( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): mock_batch_handler_instance = MagicMock() mock_logline_handler_instance = MagicMock() @@ -37,8 +44,10 @@ class TestStart(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def setUp( self, + mock_clickhouse, mock_logline_handler, mock_batch_handler, mock_kafka_consume_handler, @@ -89,8 +98,13 @@ class TestFetch(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.LoglineHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def asyncSetUp( - self, mock_kafka_handler, mock_batch_sender, mock_logline_handler + self, + mock_clickhouse, + mock_kafka_handler, + mock_batch_sender, + mock_logline_handler, ): self.sut = LogCollector() self.sut.kafka_consume_handler = AsyncMock() @@ -98,8 +112,9 @@ async def asyncSetUp( @patch("src.logcollector.collector.LogCollector.store") @patch("src.logcollector.collector.logger") @patch("asyncio.get_running_loop") + 
@patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_handle_kafka_inputs( - self, mock_get_running_loop, mock_logger, mock_store + self, mock_clickhouse, mock_get_running_loop, mock_logger, mock_store ): mock_store_instance = AsyncMock() mock_store.return_value = mock_store_instance @@ -119,7 +134,7 @@ async def test_handle_kafka_inputs( with self.assertRaises(asyncio.CancelledError): await self.sut.fetch() - mock_store.assert_called_once_with("value1") + mock_store.assert_called_once() class TestSend(unittest.IsolatedAsyncioTestCase): @@ -128,8 +143,16 @@ class TestSend(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.uuid") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_with_one_logline( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler, mock_logger + self, + mock_clickhouse, + mock_uuid, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, + mock_logger, ): # Arrange mock_batch_handler_instance = MagicMock() @@ -139,10 +162,11 @@ async def test_send_with_one_logline( KeyboardInterrupt, ] mock_logline_handler.return_value = mock_logline_handler_instance + mock_uuid.uuid4.return_value = uuid.UUID("8ac2e82b-9252-4e67-a691-4924f98bc605") mock_logline_handler_instance.validate_logline_and_get_fields_as_json.return_value = { "timestamp": "2024-05-21T08:31:28.119Z", - "status": "NOERROR", + "status_code": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", @@ -151,10 +175,10 @@ async def test_send_with_one_logline( "size": "150b", } expected_message = ( - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"timestamp": "2024-05-21T08:31:28.119Z", "status_code": "NOERROR", "client_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", ' '"record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", ' - '"size": "150b"}' + '"size": "150b", "logline_id": "8ac2e82b-9252-4e67-a691-4924f98bc605"}' ) input_logline = ( "2024-05-21T08:31:28.119Z NOERROR 192.168.0.105 8.8.8.8 www.heidelberg-botanik.de A " @@ -162,7 +186,7 @@ async def test_send_with_one_logline( ) sut = LogCollector() - await sut.store(input_logline) + await sut.store(datetime.datetime.now(), input_logline) # Act await sut.send() @@ -176,8 +200,14 @@ async def test_send_with_one_logline( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_keyboard_interrupt( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler, mock_logger + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, + mock_logger, ): # Arrange mock_batch_handler_instance = MagicMock() @@ -193,7 +223,7 @@ async def test_send_keyboard_interrupt( mock_logline_handler_instance.validate_logline_and_get_fields_as_json.return_value = { "timestamp": "2024-05-21T08:31:28.119Z", - "status": "NOERROR", + "status_code": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", @@ -207,10 +237,10 @@ async def test_send_keyboard_interrupt( ) sut = 
LogCollector() - await sut.store(input_logline) - await sut.store(input_logline) - await sut.store(input_logline) - await sut.store(input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) # Act await sut.send() @@ -224,8 +254,10 @@ async def test_send_keyboard_interrupt( @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @patch("src.logcollector.collector.asyncio.Queue") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_empty( self, + mock_clickhouse, mock_queue, mock_logline_handler, mock_batch_handler, @@ -253,20 +285,29 @@ async def test_send_empty( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.uuid") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_send_value_error( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler, mock_logger + self, + mock_clickhouse, + mock_uuid, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, + mock_logger, ): # Arrange mock_batch_handler_instance = MagicMock() mock_logline_handler_instance = MagicMock() mock_batch_handler.return_value = mock_batch_handler_instance mock_logline_handler.return_value = mock_logline_handler_instance + mock_uuid.uuid4.return_value = uuid.UUID("8ac2e82b-9252-4e67-a691-4924f98bc605") mock_logline_handler_instance.validate_logline_and_get_fields_as_json.side_effect = [ ValueError, { "timestamp": "2024-05-21T08:31:28.119Z", - "status": "NOERROR", + "status_code": "NOERROR", "client_ip": "192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", @@ -277,10 +318,10 @@ async def test_send_value_error( KeyboardInterrupt, ] expected_message = ( - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"timestamp": "2024-05-21T08:31:28.119Z", "status_code": "NOERROR", "client_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", ' '"record_type": "A", "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", ' - '"size": "150b"}' + '"size": "150b", "logline_id": "8ac2e82b-9252-4e67-a691-4924f98bc605"}' ) input_logline = ( "2024-05-21T08:31:28.119Z NOERROR 192.168.0.105 8.8.8.8 www.heidelberg-botanik.de A " @@ -288,9 +329,9 @@ async def test_send_value_error( ) sut = LogCollector() - await sut.store(input_logline) - await sut.store(input_logline) - await sut.store(input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) + await sut.store(datetime.datetime.now(), input_logline) # Act await sut.send() @@ -304,18 +345,24 @@ class TestStore(unittest.IsolatedAsyncioTestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") async def test_store( - self, mock_logline_handler, mock_batch_handler, mock_kafka_consume_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_consume_handler, ): # Arrange sut = LogCollector() self.assertTrue(sut.loglines.empty()) # 
Act - await sut.store("test_message") + await sut.store(datetime.datetime.now(), "test_message") # Assert - self.assertEqual("test_message", await sut.loglines.get()) + stored_timestamp, stored_message = await sut.loglines.get() + self.assertEqual("test_message", stored_message) self.assertTrue(sut.loglines.empty()) @@ -324,8 +371,13 @@ class TestGetSubnetId(unittest.TestCase): @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv4( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv4Address("192.168.1.1") @@ -342,8 +394,13 @@ def test_get_subnet_id_ipv4( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv4_zero( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv4Address("0.0.0.0") @@ -360,8 +417,13 @@ def test_get_subnet_id_ipv4_zero( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv4_max( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv4Address("255.255.255.255") @@ -378,8 +440,13 @@ def test_get_subnet_id_ipv4_max( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv6( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv6Address("2001:db8:85a3:1234:5678:8a2e:0370:7334") @@ -396,8 +463,13 @@ def test_get_subnet_id_ipv6( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv6_zero( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv6Address("::") @@ -414,8 +486,13 @@ def test_get_subnet_id_ipv6_zero( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_ipv6_max( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + 
mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = ipaddress.IPv6Address("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff") @@ -433,8 +510,13 @@ def test_get_subnet_id_ipv6_max( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_unsupported_type( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = "192.168.1.1" # String instead of IPv4Address or IPv6Address @@ -450,8 +532,13 @@ def test_get_subnet_id_unsupported_type( @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") + @patch("src.logcollector.collector.ClickHouseKafkaSender") def test_get_subnet_id_none( - self, mock_logline_handler, mock_batch_handler, mock_kafka_handler + self, + mock_clickhouse, + mock_logline_handler, + mock_batch_handler, + mock_kafka_handler, ): # Arrange test_address = None diff --git a/tests/logserver/__init__.py b/tests/logserver/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_server.py b/tests/logserver/test_server.py similarity index 82% rename from tests/test_server.py rename to tests/logserver/test_server.py index 06ee245..8f29c54 100644 --- a/tests/test_server.py +++ b/tests/logserver/test_server.py @@ -2,7 +2,9 @@ import os import tempfile import unittest +import uuid from unittest.mock import AsyncMock, MagicMock, patch +from uuid import UUID import aiofiles @@ -15,7 +17,10 @@ class TestInit(unittest.TestCase): @patch("src.logserver.server.CONSUME_TOPIC", "test_topic") @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") @patch("src.logserver.server.SimpleKafkaConsumeHandler") - def test_valid_init(self, mock_kafka_consume_handler, mock_kafka_produce_handler): + @patch("src.logserver.server.ClickHouseKafkaSender") + def test_valid_init( + self, mock_clickhouse, mock_kafka_consume_handler, mock_kafka_produce_handler + ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_produce_handler_instance = MagicMock() @@ -32,8 +37,10 @@ class TestStart(unittest.IsolatedAsyncioTestCase): @patch("src.logserver.server.logger") @patch("src.logserver.server.SimpleKafkaConsumeHandler") @patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + @patch("src.logserver.server.ClickHouseKafkaSender") def setUp( self, + mock_clickhouse, mock_kafka_produce_handler, mock_kafka_consume_handler, mock_logger, @@ -42,8 +49,10 @@ def setUp( @patch("src.logserver.server.LogServer.fetch_from_kafka") @patch("src.logserver.server.LogServer.fetch_from_file") + @patch("src.logserver.server.ClickHouseKafkaSender") async def test_start( self, + mock_clickhouse, mock_fetch_from_file, mock_fetch_from_kafka, ): @@ -56,8 +65,10 @@ async def test_start( @patch("src.logserver.server.LogServer.fetch_from_kafka") @patch("src.logserver.server.LogServer.fetch_from_file") + @patch("src.logserver.server.ClickHouseKafkaSender") async def test_start_handles_keyboard_interrupt( self, + mock_clickhouse, mock_fetch_from_file, mock_fetch_from_kafka, ): @@ -80,8 +91,10 @@ async def mock_gather(*args, **kwargs): class TestSend(unittest.TestCase): @patch("src.logserver.server.PRODUCE_TOPIC", "test_topic") 
@patch("src.logserver.server.ExactlyOnceKafkaProduceHandler") + @patch("src.logserver.server.ClickHouseKafkaSender") def test_send( self, + mock_clickhouse, mock_produce_handler, ): # Arrange @@ -92,7 +105,7 @@ def test_send( sut = LogServer() # Act - sut.send(message) + sut.send(uuid.uuid4(), message) # Assert mock_kafka_produce_handler_instance.produce.assert_called_once_with( @@ -107,8 +120,12 @@ class TestFetchFromKafka(unittest.IsolatedAsyncioTestCase): @patch("src.logserver.server.LogServer.send") @patch("src.logserver.server.logger") @patch("asyncio.get_running_loop") + @patch("src.logserver.server.ClickHouseKafkaSender") + @patch("src.logserver.server.uuid") async def test_handle_kafka_inputs( self, + mock_uuid, + mock_clickhouse, mock_get_running_loop, mock_logger, mock_send, @@ -117,6 +134,9 @@ async def test_handle_kafka_inputs( ): self.sut = LogServer() + mock_uuid_instance = MagicMock() + mock_uuid.return_value = mock_uuid_instance + mock_uuid.uuid4.return_value = UUID("bd72ccb4-0ef2-4100-aa22-e787122d6875") mock_send_instance = AsyncMock() mock_send.return_value = mock_send_instance mock_loop = AsyncMock() @@ -135,7 +155,9 @@ async def test_handle_kafka_inputs( with self.assertRaises(asyncio.CancelledError): await self.sut.fetch_from_kafka() - mock_send.assert_called_once_with("value1") + mock_send.assert_called_once_with( + UUID("bd72ccb4-0ef2-4100-aa22-e787122d6875"), "value1" + ) class TestFetchFromFile(unittest.IsolatedAsyncioTestCase): @@ -145,8 +167,14 @@ class TestFetchFromFile(unittest.IsolatedAsyncioTestCase): @patch("src.logserver.server.PRODUCE_TOPIC", "test_topic") @patch("src.logserver.server.LogServer.send") @patch("src.logserver.server.logger") + @patch("src.logserver.server.ClickHouseKafkaSender") async def test_fetch_from_file( - self, mock_logger, mock_send, mock_kafka_consume, mock_kafka_produce + self, + mock_clickhouse, + mock_logger, + mock_send, + mock_kafka_consume, + mock_kafka_produce, ): self.sut = LogServer() @@ -178,8 +206,7 @@ async def test_fetch_from_file( finally: os.remove(temp_file_path) - mock_send.assert_any_call("Test line 3") - mock_send.assert_any_call("Test line 4") + self.assertEqual(2, mock_send.call_count) class TestMain(unittest.TestCase): diff --git a/tests/miscellaneous/__init__.py b/tests/miscellaneous/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_field_type.py b/tests/miscellaneous/test_field_type.py similarity index 100% rename from tests/test_field_type.py rename to tests/miscellaneous/test_field_type.py diff --git a/tests/test_log_config.py b/tests/miscellaneous/test_log_config.py similarity index 100% rename from tests/test_log_config.py rename to tests/miscellaneous/test_log_config.py diff --git a/tests/test_logline_handler.py b/tests/miscellaneous/test_logline_handler.py similarity index 94% rename from tests/test_logline_handler.py rename to tests/miscellaneous/test_logline_handler.py index a88d5c2..dee6f16 100644 --- a/tests/test_logline_handler.py +++ b/tests/miscellaneous/test_logline_handler.py @@ -113,10 +113,32 @@ def test_init_missing_fields(self, mock_create): str(context.exception), "Not all needed fields are set in the configuration" ) + @patch("src.base.logline_handler.REQUIRED_FIELDS", ["field_1"]) + @patch("src.base.logline_handler.LOGLINE_FIELDS", ["field_1", "field_2"]) + @patch("src.base.logline_handler.FORBIDDEN_FIELD_NAMES", ["field_2"]) + @patch("src.base.logline_handler.LoglineHandler._create_instance_from_list_entry") + def test_init_no_fields(self, mock_create): + # 
Arrange + ip_address_instance = MagicMock() + ip_address_instance.name = "field_2" + mock_create.side_effect = [ip_address_instance] + + # Act and Assert + with self.assertRaises(ValueError) as context: + LoglineHandler() + + self.assertEqual( + str(context.exception), + "Forbidden field name included. " + "These fields are used internally and cannot be used as names: " + "['field_2']", + ) + @patch("src.base.logline_handler.REQUIRED_FIELDS", []) @patch("src.base.logline_handler.LOGLINE_FIELDS", []) + @patch("src.base.logline_handler.LOGLINE_FIELDS", []) @patch("src.base.logline_handler.LoglineHandler._create_instance_from_list_entry") - def test_init_no_fields(self, mock_create): + def test_init_forbidden_fields(self, mock_create): # Arrange mock_create.side_effect = [] diff --git a/tests/test_marshmallow.py b/tests/miscellaneous/test_marshmallow.py similarity index 91% rename from tests/test_marshmallow.py rename to tests/miscellaneous/test_marshmallow.py index 039e2be..3edf93a 100644 --- a/tests/test_marshmallow.py +++ b/tests/miscellaneous/test_marshmallow.py @@ -1,13 +1,15 @@ import unittest +import uuid import marshmallow_dataclass -from src.base import Batch +from src.base.data_classes.batch import Batch class TestClearData(unittest.TestCase): def test_clear_data_with_existing_data(self): json_data = { + "batch_id": str(uuid.uuid4()), "begin_timestamp": "2024-05-21T08:31:27.000000Z", "end_timestamp": "2024-05-21T08:31:29.000000Z", "data": [ diff --git a/tests/miscellaneous/test_monitoring_agent.py b/tests/miscellaneous/test_monitoring_agent.py new file mode 100644 index 0000000..ead50b0 --- /dev/null +++ b/tests/miscellaneous/test_monitoring_agent.py @@ -0,0 +1,190 @@ +import datetime +import unittest +import uuid +from unittest.mock import patch, AsyncMock, Mock, mock_open + +import marshmallow_dataclass + +from src.base.data_classes.clickhouse_connectors import ServerLogs +from src.monitoring.monitoring_agent import CREATE_TABLES_DIRECTORY, main +from src.monitoring.monitoring_agent import MonitoringAgent, prepare_all_tables + + +class TestPrepareAllTables(unittest.TestCase): + @patch("os.listdir", return_value=["table1.sql", "table2.sql", "not_sql.txt"]) + @patch("builtins.open", new_callable=mock_open, read_data="CREATE TABLE test;") + @patch("clickhouse_connect.get_client") + def test_prepare_all_tables(self, mock_get_client, mock_open_file, mock_listdir): + # Arrange + mock_client = Mock() + mock_get_client.return_value.__enter__.return_value = mock_client + + # Act + prepare_all_tables() + + # Assert + mock_listdir.assert_called_once_with(CREATE_TABLES_DIRECTORY) + self.assertEqual(mock_open_file.call_count, 2) + mock_client.command.assert_called_with("CREATE TABLE test;") + self.assertEqual(mock_client.command.call_count, 2) + + @patch("os.listdir", return_value=["table1.sql"]) + @patch("builtins.open", new_callable=mock_open, read_data="CREATE TABLE test;") + @patch("clickhouse_connect.get_client") + def test_prepare_all_tables_with_exception( + self, mock_get_client, mock_open_file, mock_listdir + ): + # Arrange + mock_client = Mock() + mock_get_client.return_value.__enter__.return_value = mock_client + + mock_client.command.side_effect = Exception("Simulated Error") + + # Act + with self.assertRaises(Exception) as context: + prepare_all_tables() + + # Assert + self.assertEqual(str(context.exception), "Simulated Error") + + +class TestInit(unittest.TestCase): + @patch("src.monitoring.monitoring_agent.ServerLogsConnector") + 
@patch("src.monitoring.monitoring_agent.ServerLogsTimestampsConnector") + @patch("src.monitoring.monitoring_agent.FailedDNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineToBatchesConnector") + @patch("src.monitoring.monitoring_agent.DNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineTimestampsConnector") + @patch("src.monitoring.monitoring_agent.BatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchesToBatchConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.AlertsConnector") + @patch("src.monitoring.monitoring_agent.SimpleKafkaConsumeHandler") + def test_init( + self, + mock_kafka_consumer, + mock_alerts, + mock_suspicious_batch_timestamps, + mock_suspicious_batches_to_batch, + mock_batch_timestamps, + mock_logline_timestamps, + mock_dns_loglines, + mock_logline_to_batches, + mock_failed_dns_loglines, + mock_server_logs_timestamps, + mock_server_logs, + ): + # Arrange + expected_topics = [ + "clickhouse_server_logs", + "clickhouse_server_logs_timestamps", + "clickhouse_failed_dns_loglines", + "clickhouse_logline_to_batches", + "clickhouse_dns_loglines", + "clickhouse_logline_timestamps", + "clickhouse_batch_timestamps", + "clickhouse_suspicious_batches_to_batch", + "clickhouse_suspicious_batch_timestamps", + "clickhouse_alerts", + ] + + # Act + sut = MonitoringAgent() + + # Assert + self.assertEqual( + expected_topics, + sut.topics, + ) + mock_kafka_consumer.assert_called_once_with(expected_topics) + + +class TestStart(unittest.IsolatedAsyncioTestCase): + @patch("src.monitoring.monitoring_agent.ServerLogsConnector") + @patch("src.monitoring.monitoring_agent.ServerLogsTimestampsConnector") + @patch("src.monitoring.monitoring_agent.FailedDNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineToBatchesConnector") + @patch("src.monitoring.monitoring_agent.DNSLoglinesConnector") + @patch("src.monitoring.monitoring_agent.LoglineTimestampsConnector") + @patch("src.monitoring.monitoring_agent.BatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchesToBatchConnector") + @patch("src.monitoring.monitoring_agent.SuspiciousBatchTimestampsConnector") + @patch("src.monitoring.monitoring_agent.AlertsConnector") + @patch("src.monitoring.monitoring_agent.logger") + @patch("src.monitoring.monitoring_agent.SimpleKafkaConsumeHandler") + @patch("asyncio.get_running_loop") + async def test_handle_kafka_inputs( + self, + mock_get_running_loop, + mock_kafka_consume, + mock_logger, + mock_alerts, + mock_suspicious_batch_timestamps, + mock_suspicious_batches_to_batch, + mock_batch_timestamps, + mock_logline_timestamps, + mock_dns_loglines, + mock_logline_to_batches, + mock_failed_dns_loglines, + mock_server_logs_timestamps, + mock_server_logs, + ): + # Arrange + sut = MonitoringAgent() + sut.connectors["server_logs"] = Mock() + + data_schema = marshmallow_dataclass.class_schema(ServerLogs)() + fixed_id = uuid.uuid4() + timestamp_in = datetime.datetime.now() + value = data_schema.dumps( + { + "message_id": fixed_id, + "timestamp_in": timestamp_in, + "message_text": "test_text", + } + ) + + mock_loop = AsyncMock() + mock_get_running_loop.return_value = mock_loop + sut.kafka_consumer.consume.return_value = ( + "key1", + value, + "clickhouse_server_logs", + ) + mock_loop.run_in_executor.side_effect = [ + ("key1", value, "clickhouse_server_logs"), + KeyboardInterrupt(), + ] + + # Act and Assert + await sut.start() + + 
sut.connectors["server_logs"].insert.assert_called_once_with( + message_id=fixed_id, + timestamp_in=timestamp_in, + message_text="test_text", + ) + + +class TestMain(unittest.TestCase): + @patch("src.monitoring.monitoring_agent.prepare_all_tables") + @patch("src.monitoring.monitoring_agent.MonitoringAgent") + @patch("asyncio.run") + def test_main( + self, mock_asyncio_run, mock_monitoring_agent, mock_prepare_all_tables + ): + # Arrange + mock_agent_instance = Mock() + mock_monitoring_agent.return_value = mock_agent_instance + + # Act + main() + + # Assert + mock_prepare_all_tables.assert_called_once() + mock_monitoring_agent.assert_called_once() + mock_asyncio_run.assert_called_once_with(mock_agent_instance.start()) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_utils.py b/tests/miscellaneous/test_utils.py similarity index 100% rename from tests/test_utils.py rename to tests/miscellaneous/test_utils.py diff --git a/tests/prefilter/__init__.py b/tests/prefilter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_prefilter.py b/tests/prefilter/test_prefilter.py similarity index 70% rename from tests/test_prefilter.py rename to tests/prefilter/test_prefilter.py index 04fa6cc..26caf39 100644 --- a/tests/test_prefilter.py +++ b/tests/prefilter/test_prefilter.py @@ -1,7 +1,9 @@ -import json +import datetime import unittest +import uuid from unittest.mock import MagicMock, patch +from src.base.data_classes.batch import Batch from src.base.kafka_handler import KafkaMessageFetchException from src.prefilter.prefilter import Prefilter, main @@ -36,8 +38,10 @@ class TestGetAndFillData(unittest.TestCase): @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_get_data_without_new_data( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -47,9 +51,14 @@ def test_get_data_without_new_data( mock_produce_handler.return_value = mock_produce_handler_instance mock_consume_handler_instance = MagicMock() mock_consume_handler.return_value = mock_consume_handler_instance - mock_consume_handler_instance.consume_as_json.return_value = ( + mock_consume_handler_instance.consume_as_object.return_value = ( None, - {}, + Batch( + batch_id=uuid.uuid4(), + begin_timestamp=datetime.datetime.now(), + end_timestamp=datetime.datetime.now(), + data=[], + ), ) sut = Prefilter() @@ -59,14 +68,16 @@ def test_get_data_without_new_data( self.assertEqual([], sut.filtered_data) self.assertEqual(None, sut.subnet_id) - mock_consume_handler_instance.consume_as_json.assert_called_once() + mock_consume_handler_instance.consume_as_object.assert_called_once() @patch("src.prefilter.prefilter.logger") @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_get_data_with_new_data( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -76,13 +87,14 @@ def test_get_data_with_new_data( mock_produce_handler.return_value = mock_produce_handler_instance mock_consume_handler_instance = MagicMock() mock_consume_handler.return_value = mock_consume_handler_instance - mock_consume_handler_instance.consume_as_json.return_value = ( - 
"127.0.0.0/24", - { - "begin_timestamp": "2024-05-21T08:31:28.119Z", - "end_timestamp": "2024-05-21T08:31:29.432Z", - "data": ["test_data_1", "test_data_2"], - }, + mock_consume_handler_instance.consume_as_object.return_value = ( + "127.0.0.0_24", + Batch( + batch_id=uuid.uuid4(), + begin_timestamp=datetime.datetime.now(), + end_timestamp=datetime.datetime.now(), + data=["test_data_1", "test_data_2"], + ), ) sut = Prefilter() @@ -90,16 +102,18 @@ def test_get_data_with_new_data( self.assertEqual(["test_data_1", "test_data_2"], sut.unfiltered_data) self.assertEqual([], sut.filtered_data) - self.assertEqual("127.0.0.0/24", sut.subnet_id) + self.assertEqual("127.0.0.0_24", sut.subnet_id) - mock_consume_handler_instance.consume_as_json.assert_called_once() + mock_consume_handler_instance.consume_as_object.assert_called_once() @patch("src.prefilter.prefilter.logger") @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_get_data_with_existing_data( self, + mock_clickhouse, mock_batch_handler, mock_consume_handler, mock_logline_handler, @@ -109,13 +123,14 @@ def test_get_data_with_existing_data( mock_batch_handler.return_value = mock_batch_handler_instance mock_consume_handler_instance = MagicMock() mock_consume_handler.return_value = mock_consume_handler_instance - mock_consume_handler_instance.consume_as_json.return_value = ( - "127.0.0.0/24", - { - "begin_timestamp": "2024-05-21T08:31:28.119Z", - "end_timestamp": "2024-05-21T08:31:29.432Z", - "data": ["test_data_1", "test_data_2"], - }, + mock_consume_handler_instance.consume_as_object.return_value = ( + "127.0.0.0_24", + Batch( + batch_id=uuid.uuid4(), + begin_timestamp=datetime.datetime.now(), + end_timestamp=datetime.datetime.now(), + data=["test_data_1", "test_data_2"], + ), ) sut = Prefilter() @@ -124,9 +139,9 @@ def test_get_data_with_existing_data( self.assertEqual(["test_data_1", "test_data_2"], sut.unfiltered_data) self.assertEqual([], sut.filtered_data) - self.assertEqual("127.0.0.0/24", sut.subnet_id) + self.assertEqual("127.0.0.0_24", sut.subnet_id) - mock_consume_handler_instance.consume_as_json.assert_called_once() + mock_consume_handler_instance.consume_as_object.assert_called_once() class TestFilterByError(unittest.TestCase): @@ -152,49 +167,50 @@ def test_filter_by_error_empty_data( @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_filter_by_error_with_data_no_error_types( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, mock_logger, ): - first_entry = json.dumps( - { - "timestamp": "2024-05-21T08:31:28.119Z", - "status_code": "NOERROR", - "client_ip": "192.168.0.105", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-botanik.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) - second_entry = json.dumps( - { - "timestamp": "2024-06-01T02:31:07.943Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.biotech-hei.com", - "record_type": "AAAA", - "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", - "size": "117b", - } - ) - third_entry = json.dumps( - { - "timestamp": 
"2024-06-01T01:37:41.796Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-stadtbibliothek.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) + first_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-05-21T08:31:28.119Z", + "status_code": "NOERROR", + "client_ip": "192.168.0.105", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-botanik.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } + + second_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T02:31:07.943Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.biotech-hei.com", + "record_type": "AAAA", + "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", + "size": "117b", + } + + third_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T01:37:41.796Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-stadtbibliothek.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } sut = Prefilter() sut.unfiltered_data = [first_entry, second_entry, third_entry] @@ -208,49 +224,50 @@ def test_filter_by_error_with_data_no_error_types( @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_filter_by_error_with_data_one_error_type( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, mock_logger, ): - first_entry = json.dumps( - { - "timestamp": "2024-05-21T08:31:28.119Z", - "status_code": "NOERROR", - "client_ip": "192.168.0.105", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-botanik.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) - second_entry = json.dumps( - { - "timestamp": "2024-06-01T02:31:07.943Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.biotech-hei.com", - "record_type": "AAAA", - "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", - "size": "117b", - } - ) - third_entry = json.dumps( - { - "timestamp": "2024-06-01T01:37:41.796Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-stadtbibliothek.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) + first_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-05-21T08:31:28.119Z", + "status_code": "NOERROR", + "client_ip": "192.168.0.105", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-botanik.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } + + second_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T02:31:07.943Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.biotech-hei.com", + "record_type": "AAAA", + "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", + "size": "117b", + } + + third_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": 
"2024-06-01T01:37:41.796Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-stadtbibliothek.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } sut = Prefilter() sut.unfiltered_data = [first_entry, second_entry, third_entry] @@ -264,49 +281,50 @@ def test_filter_by_error_with_data_one_error_type( @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_filter_by_error_with_data_two_error_types( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, mock_logger, ): - first_entry = json.dumps( - { - "timestamp": "2024-05-21T08:31:28.119Z", - "status_code": "NOERROR", - "client_ip": "192.168.0.105", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-botanik.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) - second_entry = json.dumps( - { - "timestamp": "2024-06-01T02:31:07.943Z", - "status_code": "NXDOMAIN", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.biotech-hei.com", - "record_type": "AAAA", - "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", - "size": "117b", - } - ) - third_entry = json.dumps( - { - "timestamp": "2024-06-01T01:37:41.796Z", - "status_code": "OTHER_TYPE", - "client_ip": "192.168.1.206", - "dns_ip": "8.8.8.8", - "host_domain_name": "www.heidelberg-stadtbibliothek.de", - "record_type": "A", - "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", - "size": "150b", - } - ) + first_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-05-21T08:31:28.119Z", + "status_code": "NOERROR", + "client_ip": "192.168.0.105", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-botanik.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } + + second_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T02:31:07.943Z", + "status_code": "NXDOMAIN", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.biotech-hei.com", + "record_type": "AAAA", + "response_ip": "4250:5939:b4f2:b3ec:36ef:752d:b325:189b", + "size": "117b", + } + + third_entry = { + "logline_id": str(uuid.uuid4()), + "timestamp": "2024-06-01T01:37:41.796Z", + "status_code": "OTHER_TYPE", + "client_ip": "192.168.1.206", + "dns_ip": "8.8.8.8", + "host_domain_name": "www.heidelberg-stadtbibliothek.de", + "record_type": "A", + "response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", + "size": "150b", + } sut = Prefilter() sut.unfiltered_data = [first_entry, second_entry, third_entry] @@ -323,8 +341,10 @@ class TestSendFilteredData(unittest.TestCase): @patch("src.prefilter.prefilter.LoglineHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaConsumeHandler") @patch("src.prefilter.prefilter.ExactlyOnceKafkaProduceHandler") + @patch("src.prefilter.prefilter.ClickHouseKafkaSender") def test_send_with_data( self, + mock_clickhouse, mock_produce_handler, mock_consume_handler, mock_logline_handler, @@ -358,10 +378,12 @@ def test_send_with_data( sut.unfiltered_data = [first_entry, second_entry] sut.filtered_data = [first_entry, second_entry] sut.subnet_id = "192.168.1.0_24" - sut.begin_timestamp = "2024-05-21T08:31:27.000Z" - sut.end_timestamp = 
"2024-05-21T08:31:29.000Z" + sut.batch_id = uuid.UUID("5236b147-5b0d-44a8-981f-bd7da8c54733") + sut.begin_timestamp = datetime.datetime(2024, 5, 21, 8, 31, 27, 000000) + sut.end_timestamp = datetime.datetime(2024, 5, 21, 8, 31, 29, 000000) expected_message = ( - '{"begin_timestamp": "2024-05-21T08:31:27.000Z", "end_timestamp": "2024-05-21T08:31:29.000Z", "data": [{' + '{"batch_id": "5236b147-5b0d-44a8-981f-bd7da8c54733", "begin_timestamp": "2024-05-21T08:31:27.000000Z", ' + '"end_timestamp": "2024-05-21T08:31:29.000000Z", "data": [{' '"timestamp": "2024-05-21T08:31:28.119Z", "status": "NXDOMAIN", "client_ip": "192.168.1.105", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}, {"timestamp": "2024-06-01T02:31:07.943Z", ' @@ -413,7 +435,8 @@ def test_send_without_data( sut.unfiltered_data = [] sut.filtered_data = [] - self.assertIsNone(sut.send_filtered_data()) + with self.assertRaises(ValueError): + sut.send_filtered_data() mock_produce_handler.add_message.assert_not_called()