Skip to content

Commit

Permalink
Update Ip analyzer
Browse files Browse the repository at this point in the history
  • Loading branch information
stefanDeveloper committed Mar 22, 2024
1 parent 28ef588 commit def7a10
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 11 deletions.
34 changes: 29 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,41 @@
# heiDGAF - DGA Finder

> ML based DNS analyzer to detect Domain Generation Algorithms (DGAs) and tunneling of malicious actors.
##
> ML-based DNS analyzer to detect Domain Generation Algorithms (DGAs), tunneling, and data exfiltration by malicious actors.
## Getting Started

```sh
python -m venv .venv
pip install .

heidgaf -h
```

Run your analysis:

```sh
heidgaf process start -r data/...
```

### Data

Currently, we support the following data format:

`{{ .timestamp }} {{ .return_code }} {{ .client_ip }} {{ .server_ip }} {{ .query }} {{ .type }} {{ .answer }} {{ .size }}b`

For training our models, we rely on the following data sets:

- CICBellDNS2021
- DGTA Benchmark
- Majestic Million

### Exploratory Data Analysis (EDA)

In the folder `./example`, we conducted an Exploratory Data Analysis (EDA) to verify the features of interest for our application.

## Literature

## Exploratory Data Analysis (EDA)
Based on the following work, we implement heiDGAF to find malicious behaviour in DNS requests.

In the folder `./example`, we conducted an Exploratory Data Analysis (EDA) to verify the features of interest for our application.
- EXPOSURE: Finding Malicious Domains Using Passive DNS Analysis
-
5 changes: 5 additions & 0 deletions heidgaf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from enum import Enum
import logging

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"], show_default=True)


class ReturnCode(Enum):
    """Symbolic names for return-code strings.

    Each member's value is the literal text of the code, so it can be
    compared directly against string data via ``ReturnCode.X.value``.
    """

    NOERROR = "NOERROR"


# set up logging to file
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
Expand Down
1 change: 1 addition & 0 deletions heidgaf/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def training_model():
@click.option("-r", "--read", "input_dir", required=True, type=click.Path())
def training_start(input_dir):
    # Build the analyzer pipeline for the given input path and execute it
    # right away; the pipeline object is not needed afterwards.
    DNSAnalyzerPipeline(input_dir).run()


if __name__ == "__main__":
Expand Down
21 changes: 16 additions & 5 deletions heidgaf/main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@

import os
import polars as pl
import fnmatch
import redis
import logging
from enum import Enum
from click import Path

from heidgaf.pre.ip_analyzer import IPAnalyzer


class FileType(Enum):
CSV = "csv"
Expand All @@ -18,16 +20,25 @@ class Separator(Enum):


class DNSAnalyzerPipeline:
def __init__(self, path: Path, redis_host="localhost", redis_port=6379, redis_db=0, redis_max_connections=20, filetype=FileType.TXT, separator=Separator.SPACE) -> None:
    """Connect to Redis, load the input data and store a copy in Redis.

    Args:
        path: A single input file, or a directory whose ``*.<filetype>``
            files are loaded together.
        redis_host: Host name of the Redis server.
        redis_port: Port of the Redis server.
        redis_db: Redis database index.
        redis_max_connections: Upper bound for the connection pool.
        filetype: File extension used when ``path`` is a directory.
        separator: Column separator passed on to ``load_data``.

    Raises:
        FileNotFoundError: If ``path`` is neither an existing file nor an
            existing directory.
    """
    logging.debug("Connect to Redis server")
    pool = redis.ConnectionPool(
        host=redis_host,
        port=redis_port,
        db=redis_db,
        max_connections=redis_max_connections,
    )
    self.redis_client = redis.Redis(connection_pool=pool)
    # Fail early if the server is unreachable.
    self.redis_client.ping()

    if os.path.isfile(path):
        logging.debug(f"Processing files: {path}")
        self.data = self.load_data(path, separator.value)
    elif os.path.isdir(path):
        logging.debug(f"Processing files: {path}/*.{filetype.value}")
        self.data = self.load_data(f'{path}/*.{filetype.value}', separator.value)
    else:
        # Without this branch self.data would stay unset and the code below
        # would fail with an unrelated AttributeError.
        raise FileNotFoundError(f"Path is neither a file nor a directory: {path}")

    # Serialize the frame as lz4-compressed IPC bytes, store it under "key",
    # then read it back and log it.
    self.redis_client.set("key", self.data.write_ipc(file=None, compression="lz4").getvalue())
    logging.info(pl.read_ipc(self.redis_client.get("key")))

def load_data(self, path, separator):
dataframes = pl.read_csv(path, separator=separator, try_parse_dates=True, has_header=False).with_columns(
dataframes = pl.read_csv(path, separator=separator, try_parse_dates=False, has_header=False).with_columns(
[
(pl.col('column_1').str.strptime(pl.Datetime).cast(pl.Datetime)),
]
Expand Down Expand Up @@ -97,4 +108,4 @@ def load_data(self, path, separator):
return dataframes

def run(self):
    """Hand the data held in ``self.data`` to ``IPAnalyzer.run``."""
    IPAnalyzer.run(self.data)
18 changes: 17 additions & 1 deletion heidgaf/pre/ip_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,24 @@
import polars as pl
import logging

from heidgaf import ReturnCode
from heidgaf.pre import Analyzer


class IPAnalyzer(Analyzer):
    """Analyzer that logs how often client IPs and DNS servers occur."""

    def __init__(self) -> None:
        super().__init__()

    @classmethod
    def run(cls, data):
        """Log value counts of ``client_ip`` and ``dns_server`` for filtered rows.

        Args:
            data: polars frame; the columns ``query``, ``return_code``,
                ``client_ip`` and ``dns_server`` are read.
        """
        # Keep rows whose query is not the literal "|", whose return code
        # differs from NOERROR, and whose query does not consist of exactly
        # one dot-separated part.
        df = (
            data.filter(pl.col("query") != "|")
            .filter(pl.col("return_code") != ReturnCode.NOERROR.value)
            .filter(pl.col("query").str.split(".").list.len() != 1)
        )

        # Frequency count of distinct client IP addresses.
        client_ip_frequency = df.select([pl.col("client_ip").value_counts()])
        logging.debug("Client IP freq: %s", client_ip_frequency)

        # Frequency count of distinct DNS servers.
        dns_server_frequency = df.select([pl.col("dns_server").value_counts()])
        logging.debug("DNS server freq: %s", dns_server_frequency)

0 comments on commit def7a10

Please sign in to comment.