From 1e7b8953fa3023890ae5e45fdc7585ebc9d9af88 Mon Sep 17 00:00:00 2001 From: msm-code Date: Tue, 24 Jan 2023 21:13:27 +0100 Subject: [PATCH] Migrate Mquery to typed config library (#324) This changes the current mquery configuration mechanism from editing config.py file, to a "real" config file like this (mquery.ini): ```ini [redis] host=redis-server.example.com [mquery] backend=tcp://ursadb-server.example.com:9281 plugins=plugins.archive:GzipPlugin ``` --- .github/workflows/test_code.yml | 9 +- .gitignore | 1 - INSTALL.md | 1 - deploy/docker/daemon.Dockerfile | 1 - deploy/docker/web.Dockerfile | 1 - docker-compose.dev.yml | 6 +- docker-compose.e2etests-local.yml | 6 + docker-compose.yml | 6 +- docs/README.md | 1 + docs/configuration.md | 135 +++++++++++++++++++++ docs/plugins.md | 12 +- requirements.plain.txt | 1 + requirements.txt | 1 + src/.dockerignore | 1 - src/app.py | 6 +- src/config.docker.py | 10 -- src/config.example.py | 8 -- src/config.py | 34 ++++++ src/daemon.py | 4 +- src/plugins/README.md | 14 --- src/plugins/__init__.py | 8 +- src/plugins/example_plugin.py | 4 +- src/plugins/example_typed_config_plugin.py | 41 +++++++ src/tasks.py | 11 +- 24 files changed, 254 insertions(+), 68 deletions(-) create mode 100644 docs/configuration.md delete mode 100644 src/config.docker.py delete mode 100644 src/config.example.py create mode 100644 src/config.py delete mode 100644 src/plugins/README.md create mode 100644 src/plugins/example_typed_config_plugin.py diff --git a/.github/workflows/test_code.yml b/.github/workflows/test_code.yml index 3b8e6d4e..8028a90e 100644 --- a/.github/workflows/test_code.yml +++ b/.github/workflows/test_code.yml @@ -28,8 +28,6 @@ jobs: run: pip3 install mypy==0.790 - name: install requirements run: pip3 install -r requirements.txt - - name: copy config - run: cp src/config.example.py src/config.py - name: run mypy run: mypy src test_python_style: @@ -41,8 +39,6 @@ jobs: uses: actions/setup-python@v1 with: python-version: '3.10' - - 
name: copy config - run: cp src/config.example.py src/config.py - name: install flake8==3.7.9 run: pip3 install flake8==3.7.9 - name: run flake8 @@ -56,8 +52,6 @@ jobs: uses: actions/setup-python@v1 with: python-version: '3.10' - - name: copy config - run: cp src/config.example.py src/config.py - name: install black run: pip3 install black==22.3.0 - name: run black @@ -115,6 +109,9 @@ jobs: run: docker-compose up --scale daemon=1 --build -d - name: run e2e tests run: docker run --net mquery_default -v $(readlink -f ./samples):/mnt/samples mquery_tests + - name: get run logs + if: always() + run: docker-compose logs - name: stop docker compose if: always() run: docker-compose down diff --git a/.gitignore b/.gitignore index 10bc4730..dcdcf80c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ *.pyc venv/ -config.py .vscode .idea .mypy_cache diff --git a/INSTALL.md b/INSTALL.md index b01f95bc..8489c8cb 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -32,7 +32,6 @@ Docker compose dedicated for developers. ``` git clone --recurse-submodules https://github.com/CERT-Polska/mquery.git cd mquery -cp src/config.docker.py src/config.py # now set SAMPLES_DIR to a directory with your files, and INDEX_DIR to # empty directory for database files to live in. By default database will # expect files in ./samples directory, and keep index in ./index. 
diff --git a/deploy/docker/daemon.Dockerfile b/deploy/docker/daemon.Dockerfile index 27bff875..ef010bd6 100644 --- a/deploy/docker/daemon.Dockerfile +++ b/deploy/docker/daemon.Dockerfile @@ -8,6 +8,5 @@ RUN ls /tmp/requirements*.txt | xargs -i,, pip --no-cache-dir install -r ,, COPY "src/" "/app" RUN chmod +x "/app/daemon.py" -COPY "src/config.docker.py" "/app/config.py" ENTRYPOINT ["/app/daemon.py"] diff --git a/deploy/docker/web.Dockerfile b/deploy/docker/web.Dockerfile index cebd038a..43287f88 100644 --- a/deploy/docker/web.Dockerfile +++ b/deploy/docker/web.Dockerfile @@ -17,5 +17,4 @@ RUN ls /tmp/requirements*.txt | xargs -i,, pip --no-cache-dir install -r ,, COPY "src/." "." COPY --from=build "/app/build" "./mqueryfront/build" -COPY "src/config.docker.py" "config.py" CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "5000"] diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 8bff3ece..2b41beae 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -27,7 +27,8 @@ services: - "redis" - "ursadb" environment: - - "MQUERY_PLUGINS=${MQUERY_PLUGINS}" + - "REDIS_HOST=redis" + - "MQUERY_BACKEND=tcp://ursadb:9281" dev-daemon: build: context: . @@ -42,7 +43,8 @@ services: - "redis" - "ursadb" environment: - - "MQUERY_PLUGINS=${MQUERY_PLUGINS}" + - "REDIS_HOST=redis" + - "MQUERY_BACKEND=tcp://ursadb:9281" ursadb: image: mqueryci/ursadb:v1.5.1 ports: diff --git a/docker-compose.e2etests-local.yml b/docker-compose.e2etests-local.yml index 9cf41afc..a270a259 100644 --- a/docker-compose.e2etests-local.yml +++ b/docker-compose.e2etests-local.yml @@ -25,6 +25,9 @@ services: depends_on: - "redis" - "ursadb" + environment: + - "REDIS_HOST=redis" + - "MQUERY_BACKEND=tcp://ursadb:9281" dev-daemon: build: context: . 
@@ -38,6 +41,9 @@ services: depends_on: - "redis" - "ursadb" + environment: + - "REDIS_HOST=redis" + - "MQUERY_BACKEND=tcp://ursadb:9281" ursadb: image: mqueryci/ursadb:v1.5.1 ports: diff --git a/docker-compose.yml b/docker-compose.yml index 980393ff..4ba4ee49 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,8 @@ services: - "redis" - "ursadb" environment: - - "MQUERY_PLUGINS=${MQUERY_PLUGINS}" + - "REDIS_HOST=redis" + - "MQUERY_BACKEND=tcp://ursadb:9281" daemon: restart: always build: @@ -30,7 +31,8 @@ services: - "redis" - "ursadb" environment: - - "MQUERY_PLUGINS=${MQUERY_PLUGINS}" + - "REDIS_HOST=redis" + - "MQUERY_BACKEND=tcp://ursadb:9281" ursadb: restart: always image: mqueryci/ursadb:v1.5.1 diff --git a/docs/README.md b/docs/README.md index 836a0ab4..85bdb20e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -3,6 +3,7 @@ ## User guide - [Installation](../INSTALL.md): Installation instruction. +- [Configuration](./configuration.md): Additional configuration options. - [Components](./components.md): More detailed description of mquery components. - [Indexing](./indexing.md): Indexing files is one of the most important things in mquery. In simple cases it can be solved without leaving the web UI, but diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 00000000..9b866520 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,135 @@ +# Configuration + +There are three different things you can configure within Mquery: core, plugins +and ursadb. Unfortunately, all are configured differently. + +## Mquery core configuration + +Mquery is configured with [typed-config](https://github.com/bwindsor/typed-config). There are two ways to pass every configuration field - with +a config file, or an environment variable. 
For example: + +```ini +[redis] +host=redis-server.example.com + +[mquery] +backend=tcp://ursadb-server.example.com:9281 +plugins=plugins.archive:GzipPlugin +``` + +This is a simple INI configuration file that mquery understands. It should +be saved in a file called `mquery.ini`. The file should be in one of the +following locations (checked in that order): + +* Mquery's working directory (usually `src` folder in the cloned repository) +* In the current user's xdg config directory: `~/.config/mquery/mquery.ini` +* In the system config directory: `/etc/mquery/mquery.ini` + +Alternatively, you can use environment variables to configure mquery. All +field names are mapped intuitively to environment variables by joining +the ini section name with a key name - for example, to change redis host +value use `REDIS_HOST`. Environment variables take precedence over values +from the config file! + +Currently, supported configuration keys are: + +- `redis.host`: Hostname of a main redis server. +- `redis.port`: Port of a main redis server. +- `mquery.backend`: URL to a ursadb instance (for example, + `tcp://ursadb-server:9281`) +- `mquery.plugins`: List of supported plugins, separated by commas (for + example `plugins.archive:GzipPlugin, plugins.custom:CustomPlugin`) + +## Mquery plugin configuration + +In contrast to the core configuration, plugins can be configured dynamically. +Every worker registers its list of active plugins in the database, and it's +possible to configure them using the web UI: + +![](./plugin-config.png) + +This configuration mechanism is used by the plugins shipped with Mquery. +Despite this, it's optional, and plugin authors don't have to use it. +Since plugins are arbitrary code, plugins can read their configuration from +anywhere they want, including the environment, other config files, etc. 
+ +It's also easy to use the same config file for Mquery and plugins - see +[example_typed_config_plugin.py](../src/plugins/example_typed_config_plugin.py) +file for an example. + +## UrsaDB configuration + +UrsaDB is not technically part of Mquery, but both systems work closely +together and depend on each other for optimal performance. + +Mquery currently does not allow you to configure UrsaDB nicely. +You have to do it "manually", by connecting with `ursacli` program to the +TCP port exposed by UrsaDB. This program is built together with UrsaDB, and +available in all official docker images. You can execute it in docker-compose +like this: + +``` +sudo docker-compose -f docker-compose.dev.yml exec ursadb ursacli +``` + +Or you can download the latest ursadb release and run a client from there. + +To set a configuration field, issue a command like this: + +``` +$ ursacli +ursadb> config set "database_workers" 10; +``` + +The configuration keys are already documented in the UrsaDB's docs here: +https://cert-polska.github.io/ursadb/configuration.html. We won't copy +all relevant information here, but the most important config keys are: + +* `database_workers` - How many tasks can be processed at once (you can + increase this for strong servers, restart the database to apply). +* `merge_max_files` - Biggest supported dataset size. UrsaDB keeps indexed + files in so-called "datasets". The fewer datasets the faster the database is, + but you might not want overly huge datasets (at some point merging datasets + becomes very slow, and you get diminishing returns for merging them). Decide + on the value before indexing your files. Good values include the default + (infinite), 10 million, and 1 million. +* `merge_max_datasets` - Maximum number of datasets merged at once. Merging is a very memory and CPU-intensive operation. + If your database OOMs during indexing, consider lowering this number and + the number of `database_workers` that are merging in parallel. 
+ +## .env file + +Finally, in the main directory of the repository there is a file named `.env`. +Mquery does not use it in any way, but it's read by Docker. + +```bash +$ cat .env +# This file is only relevant for docker-compose deployments. + +# Directory where your samples are stored. By default you have to copy them +# to ./samples subdirectory in this repository. +SAMPLES_DIR=./samples +# Directory where the index files should be saved. By default ./index +# subdirectory in this repository. +INDEX_DIR=./index +``` + +If you use docker-compose to start mquery, you can use this file to specify +a location on the host for your samples_dir and index_dir. These variables are +then used when creating containers. See for example ursadb container spec: + +```yaml + ursadb: + restart: always + image: mqueryci/ursadb:v1.5.0 + ports: + - "127.0.0.1:9281:9281" + volumes: + - "${SAMPLES_DIR}:/mnt/samples" + - "${INDEX_DIR}:/var/lib/ursadb" + user: "0:0" +``` + +As you can see, variables from `.env` are used to specify mount point for +the data volumes. You can also ignore this file, and edit docker-compose +directly to your liking. diff --git a/docs/plugins.md b/docs/plugins.md index 98dd2759..5268b6d8 100644 --- a/docs/plugins.md +++ b/docs/plugins.md @@ -13,19 +13,19 @@ by plugins. ![](plugin-config.png) -To add a new plugin to the system, you need to change PLUGINS key in -`config.py` for bare metal setup. For example: +To add a new plugin to the system, you need to change mquery.plugins key in +[the config](./configuration.md). For example: ```python -PLUGINS = ["plugins.mwdb_uploads:MalwarecageUploadsMetadata"] +[mquery] +plugins=plugins.mwdb_uploads:MalwarecageUploadsMetadata ``` To load a plugin `MalwarecageUploadsMetadata` from `plugins.mwdb_uploads` module. -To load plugins with docker-compose deployment, you can change -`MQUERY_PLUGINS` environment variable in the container to load existing -plugin, but to load your own plugin you need to create your own image. 
+Remember that you can also use environment variable MQUERY_PLUGINS to do the +same thing - this may be useful for docker-based deployments. ## Filter plugins diff --git a/requirements.plain.txt b/requirements.plain.txt index a5110953..4765c9a4 100644 --- a/requirements.plain.txt +++ b/requirements.plain.txt @@ -9,3 +9,4 @@ yara-python yaramod cachetools pyjwt[crypto] +typed-config diff --git a/requirements.txt b/requirements.txt index 8831b051..d83900ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,3 +22,4 @@ yara-python==4.1.3 yaramod==3.12.1 PyJWT[crypto]==2.3.0 rq==1.11.1 +typed-config==1.3.2 diff --git a/src/.dockerignore b/src/.dockerignore index be51a77b..1d23974a 100644 --- a/src/.dockerignore +++ b/src/.dockerignore @@ -1,4 +1,3 @@ -config.py .pytest_cache .mypy_cache __pycache__ diff --git a/src/app.py b/src/app.py index c6588e5b..ccbcd5f4 100644 --- a/src/app.py +++ b/src/app.py @@ -3,7 +3,7 @@ from threading import Lock import uvicorn # type: ignore -import config +from config import app_config from fastapi import ( FastAPI, Body, @@ -48,9 +48,9 @@ ) -db = Database(config.REDIS_HOST, config.REDIS_PORT) +db = Database(app_config.redis.host, app_config.redis.port) app = FastAPI() -plugins = PluginManager(config.PLUGINS, db) +plugins = PluginManager(app_config.mquery.plugins, db) plugin_lock = Lock() diff --git a/src/config.docker.py b/src/config.docker.py deleted file mode 100644 index 9ce55a36..00000000 --- a/src/config.docker.py +++ /dev/null @@ -1,10 +0,0 @@ -import os -from plugins import parse_plugin_list - -BACKEND = os.environ.get("MQUERY_BACKEND", "tcp://ursadb:9281") -REDIS_HOST = os.environ.get("REDIS_HOST", "redis") -REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379)) -JOB_EXPIRATION_MINUTES = int( - os.environ.get("JOB_EXPIRATION_MINUTES", 0) -) # infinite by default -PLUGINS = parse_plugin_list(os.environ.get("MQUERY_PLUGINS", "")) diff --git a/src/config.example.py b/src/config.example.py deleted file mode 100644 index 
acba7aca..00000000 --- a/src/config.example.py +++ /dev/null @@ -1,8 +0,0 @@ -from typing import List - -INDEX_DIR = "./samples" -BACKEND = "tcp://127.0.0.1:9281" -REDIS_HOST = "127.0.0.1" -REDIS_PORT = 6379 -JOB_EXPIRATION_MINUTES = 0 # infinite by default -PLUGINS: List[str] = [] diff --git a/src/config.py b/src/config.py new file mode 100644 index 00000000..12fb1f3e --- /dev/null +++ b/src/config.py @@ -0,0 +1,34 @@ +from typedconfig import Config, key, section, group_key # type: ignore +from typedconfig.source import EnvironmentConfigSource, IniFileConfigSource # type: ignore +import os + + +@section("redis") +class RedisConfig(Config): + host = key(cast=str, required=False, default="localhost") + port = key(cast=int, required=False, default=6379) + + +@section("mquery") +class MqueryConfig(Config): + backend = key(cast=str, required=False, default="tcp://127.0.0.1:9281") + plugins = key(cast=str, required=False, default="") + + +class AppConfig(Config): + redis = group_key(RedisConfig) + mquery = group_key(MqueryConfig) + + +def _config_sources(): + return [ + EnvironmentConfigSource(), + IniFileConfigSource("mquery.ini", must_exist=False), + IniFileConfigSource( + os.path.expanduser("~/.config/mquery/mquery.ini"), must_exist=False + ), + IniFileConfigSource("/etc/mquery/mquery.ini", must_exist=False), + ] + + +app_config = AppConfig(sources=_config_sources()) diff --git a/src/daemon.py b/src/daemon.py index 5c64e415..525fc457 100644 --- a/src/daemon.py +++ b/src/daemon.py @@ -4,7 +4,7 @@ import logging from util import setup_logging import tasks -import config +from config import app_config from redis import Redis from rq import Connection, Worker # type: ignore @@ -15,7 +15,7 @@ def start_worker(args: argparse.Namespace, process_index: int) -> None: "Agent [%s] running (process %s)...", args.group_id, process_index ) - with Connection(Redis(config.REDIS_HOST, config.REDIS_PORT)): + with Connection(Redis(app_config.redis.host, app_config.redis.port)): w = 
Worker([args.group_id]) w.work() diff --git a/src/plugins/README.md b/src/plugins/README.md deleted file mode 100644 index d1ad650e..00000000 --- a/src/plugins/README.md +++ /dev/null @@ -1,14 +0,0 @@ -mquery plugins -============== - -Code used for integration of other systems with mquery. - -If you want to enable plugin, go to `__init__.py` file and add it to `METADATA_PLUGINS` list: -```python -# Feel free to import plugins here and add them to list below -from .mwdb_uploads import MalwarecageUploadsMetadata - -METADATA_PLUGINS = [ - MalwarecageUploadsMetadata -] -``` diff --git a/src/plugins/__init__.py b/src/plugins/__init__.py index d58a4a94..42a46f49 100644 --- a/src/plugins/__init__.py +++ b/src/plugins/__init__.py @@ -9,8 +9,8 @@ def parse_plugin_list(plugins: str) -> List[str]: """Parses and validates a plugin list into a list of non-empty components divided by `,`. - >>> parse_plugin_list("plugins.Test, plugins.Other") - ["plugins.Test", "plugins.Other"] + >>> parse_plugin_list("plugins.Test:A, plugins.Other:A") + ["plugins.Test:A", "plugins.Other:A"] >>> parse_plugin_list("") [] @@ -38,8 +38,8 @@ def load_plugins(specs: List[str]) -> List[Type[MetadataPlugin]]: class PluginManager: - def __init__(self, specs: List[str], db: Database) -> None: - self.plugin_classes = load_plugins(specs) + def __init__(self, spec: str, db: Database) -> None: + self.plugin_classes = load_plugins(parse_plugin_list(spec)) active_plugins = [] for plugin_class in self.plugin_classes: diff --git a/src/plugins/example_plugin.py b/src/plugins/example_plugin.py index 4863824f..78711bf1 100644 --- a/src/plugins/example_plugin.py +++ b/src/plugins/example_plugin.py @@ -1,9 +1,11 @@ from db import Database - from metadata import Metadata, MetadataPlugin, MetadataPluginConfig class ExampleTagPlugin(MetadataPlugin): + """This plugin is a minimal (almost) example of extractor plugin. 
+ It will tag every processed file with configured tag and URL.""" + cacheable = True is_extractor = True config_fields = { diff --git a/src/plugins/example_typed_config_plugin.py b/src/plugins/example_typed_config_plugin.py new file mode 100644 index 00000000..91452ee8 --- /dev/null +++ b/src/plugins/example_typed_config_plugin.py @@ -0,0 +1,41 @@ +"""Plugin that serves as an example how to use existing typed-config +machinery to configure your own plugins""" + +from db import Database +from metadata import Metadata, MetadataPlugin, MetadataPluginConfig +from typedconfig import Config, key, section +from config import app_config + + +@section("plugin.example") +class ExamplePluginConfig(Config): + """Plugin configuration""" + + tag = key(cast=str) + tag_url = key(cast=str) + + +# You will need to add this to your config file (or use env vars): +# +# [plugin.example] +# tag=kot +# tag_url=http://google.com + + +class ExamplePluginWithTypedConfig(MetadataPlugin): + """This plugin serves as an example how to use typed-config and + mquery config file to configure your own plugins. 
It's equivalent + to ExamplePlugin in all except the configuration method""" + + is_extractor = True + + def __init__(self, db: Database, config: MetadataPluginConfig) -> None: + super().__init__(db, config) + my_config = ExamplePluginConfig(provider=app_config.provider) + self.tag = my_config.tag + self.tag_url = my_config.tag_url + + def extract( + self, identifier: str, matched_fname: str, current_meta: Metadata + ) -> Metadata: + return {"example_tag": {"display_text": self.tag, "url": self.tag_url}} diff --git a/src/tasks.py b/src/tasks.py index 6fec9d57..4d07be53 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -4,7 +4,7 @@ from schema import JobSchema from lib.yaraparse import parse_yara, combine_rules from plugins import PluginManager -import config +from config import app_config from rq import get_current_job, Queue # type: ignore from db import Database, JobId, MatchInfo from redis import Redis @@ -21,12 +21,13 @@ def __init__(self, group_id: str) -> None: single group, but they must all work on the same ursadb instance. Reads connection parameters and plugins from the global config.""" self.group_id = group_id - self.ursa_url = config.BACKEND - self.db = Database(config.REDIS_HOST, config.REDIS_PORT) + self.ursa_url = app_config.mquery.backend + self.db = Database(app_config.redis.host, app_config.redis.port) self.ursa = UrsaDb(self.ursa_url) - self.plugins = PluginManager(config.PLUGINS, self.db) + self.plugins = PluginManager(app_config.mquery.plugins, self.db) self.queue = Queue( - group_id, connection=Redis(config.REDIS_HOST, config.REDIS_PORT) + group_id, + connection=Redis(app_config.redis.host, app_config.redis.port), ) def register(self) -> None: