diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index bb769852..91a98060 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -22,9 +22,9 @@ jobs: codecov: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Python Tools diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index 5adbab58..6c4e59e7 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -22,9 +22,9 @@ jobs: codestyle: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Kestrel package diff --git a/.github/workflows/kaas-docker-image.yml b/.github/workflows/kaas-docker-image.yml index 1d36879e..1738aa07 100644 --- a/.github/workflows/kaas-docker-image.yml +++ b/.github/workflows/kaas-docker-image.yml @@ -14,7 +14,7 @@ jobs: run: sleep 600s shell: bash - name: Checkout - uses: actions/checkout@v3.5.3 + uses: actions/checkout@v4 - name: Info run: echo "Parameters. ${{ github.event.base_ref }}, ${{ github.ref_type }}, ${{ github.ref }}" - name: Log in to Docker Hub diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 47c52fb2..343fcf29 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -22,8 +22,8 @@ jobs: shell: bash working-directory: ./packages/${{ matrix.package }} steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install building environment diff --git a/.github/workflows/stixshifter-module-verification.yml b/.github/workflows/stixshifter-module-verification.yml index f9b1265c..66949595 100644 --- a/.github/workflows/stixshifter-module-verification.yml +++ b/.github/workflows/stixshifter-module-verification.yml @@ -15,9 +15,9 @@ jobs: shell: bash working-directory: ./packages/kestrel_datasource_stixshifter steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Python Tools diff --git a/.github/workflows/unit-testing-kestrel2.yml b/.github/workflows/unit-testing-kestrel2.yml index 06b79fde..4113a1e1 100644 --- a/.github/workflows/unit-testing-kestrel2.yml +++ b/.github/workflows/unit-testing-kestrel2.yml @@ -30,9 +30,9 @@ jobs: shell: bash working-directory: ./packages-nextgen/kestrel_core steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -42,3 +42,28 @@ jobs: - name: Unit testing run: pytest -vv + test-kestrel-interface-opensearch: + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + working-directory: ./packages-nextgen/kestrel_interface_opensearch + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - 
name: Install Python Tools + run: pip install --upgrade pip setuptools wheel pytest + - name: Install kestrel_core + working-directory: ./packages-nextgen/kestrel_core + run: pip install . + - name: Install kestrel_interface_opensearch + run: pip install . + - name: Unit testing + run: pytest -vv diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 1733bfba..8af6b843 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -23,16 +23,16 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: shell: bash working-directory: ./packages/kestrel_core steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -52,16 +52,16 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11.6'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: shell: bash working-directory: ./packages/kestrel_datasource_stixshifter steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -78,16 +78,16 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: shell: bash working-directory: ./packages/kestrel_analytics_python steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -107,16 +107,16 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: shell: bash working-directory: ./packages/kestrel_jupyter steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools diff --git a/.github/workflows/unused-import.yml b/.github/workflows/unused-import.yml index 150c9b34..e1174ba5 100644 --- a/.github/workflows/unused-import.yml +++ b/.github/workflows/unused-import.yml @@ -22,9 +22,9 @@ jobs: unusedimports: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Kestrel package diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 083ad89d..bf88f8dc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,22 @@ The format is based on `Keep a Changelog`_. 
Unreleased ========== +1.8.3 (2024-04-22) +================== + +Added +----- + +- Support for disabling certificate verification of stix-shifter v7 with config option `verify_cert` +- Documentation on how to use the `verify_cert` option in the stix-shifter interface +- Python 3.12 support (multiprocessing library behavior steering to avoid a CPU-blocking issue) +- More generic HTML parsing of PyPI for stix-shifter connector verification + +Changed +------- + +- stix-shifter upgraded to v7 (v7.0.6), the first version to drop support for invalid certificates + 1.8.2 (2024-02-20) ================== diff --git a/README.rst b/README.rst index 1edf91ad..cdcd4c06 100644 --- a/README.rst +++ b/README.rst @@ -2,31 +2,11 @@ :width: 460 :alt: Kestrel Threat Hunting Language -.. image:: https://readthedocs.org/projects/kestrel/badge/?version=latest - :target: https://kestrel.readthedocs.io/en/latest/?badge=latest - :alt: Documentation Status - -.. image:: https://img.shields.io/pypi/v/kestrel-jupyter - :target: https://pypi.python.org/pypi/kestrel-jupyter - :alt: Latest Version - -.. image:: https://img.shields.io/pypi/dm/kestrel-core - :target: https://pypistats.org/packages/kestrel-core - :alt: PyPI Downloads - -.. image:: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang/branch/develop/graph/badge.svg?token=HM4ax10IW3 - :target: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang - :alt: Code Coverage - -.. image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/psf/black - :alt: Code Style: Black - | -**[News]** Kestrel session at `Black Hat USA 2023`_ +|readthedocs| |pypi| |downloads| |codecoverage| |black| --------- +| Kestrel is a threat hunting language aiming to make cyber threat hunting *fast* by providing a layer of abstraction to build reusable, composable, and @@ -215,3 +195,24 @@ Connecting With The Community .. _contributing guideline: CONTRIBUTING.rst .. _governance documentation: GOVERNANCE.rst .. _Apache License 2.0: LICENSE.md + + +.. |readthedocs| image:: https://readthedocs.org/projects/kestrel/badge/?version=latest + :target: https://kestrel.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +.. |pypi| image:: https://img.shields.io/pypi/v/kestrel-jupyter + :target: https://pypi.python.org/pypi/kestrel-jupyter + :alt: Latest Version + +.. |downloads| image:: https://img.shields.io/pypi/dm/kestrel-core + :target: https://pypistats.org/packages/kestrel-core + :alt: PyPI Downloads + +.. |codecoverage| image:: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang/branch/develop/graph/badge.svg?token=HM4ax10IW3 + :target: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang + :alt: Code Coverage + +.. |black| image:: https://img.shields.io/badge/code%20style-black-000000.svg + :target: https://github.com/psf/black + :alt: Code Style: Black diff --git a/docs/installation/runtime.rst b/docs/installation/runtime.rst index c220f264..b70d4072 100644 --- a/docs/installation/runtime.rst +++ b/docs/installation/runtime.rst @@ -8,7 +8,11 @@ please use Python inside Windows Subsystem for Linux (WSL). General Requirements ==================== -Python 3.8 is required. Follow the `Python installation guide`_ to install or upgrade Python. +Python 3 is required. + +* End-of-life Python versions are not supported. Check `Python releases`_. + +* Follow the `Python installation guide`_ to install or upgrade Python.
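Regarding the `verify_cert` changelog entry above: the option is set per stix-shifter data source profile. A minimal sketch, assuming the option sits under the connection's `options` block (the profile name, connector, host, and credentials below are placeholders, not from this patch)::

    profiles:
      my_elastic_profile:           # hypothetical profile name
        connector: elastic_ecs
        connection:
          host: elastic.example.com
          port: 9200
          options:
            verify_cert: false      # assumed placement; disables TLS certificate verification
        config:
          auth:
            id: PLACEHOLDER
            api_key: PLACEHOLDER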
OS-specific Requirements ======================== @@ -190,6 +194,7 @@ What's to Do Next - :doc:`../language/index` .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ +.. _Python releases: https://devguide.python.org/versions/ .. _Python virtual environment: https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/ .. _Xcode: https://developer.apple.com/xcode/ .. _kestrel-lang: http://github.com/opencybersecurityalliance/kestrel-lang diff --git a/packages-nextgen/kestrel_core/pyproject.toml b/packages-nextgen/kestrel_core/pyproject.toml index 61f48941..e57a5bca 100644 --- a/packages-nextgen/kestrel_core/pyproject.toml +++ b/packages-nextgen/kestrel_core/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "mashumaro>=3.10", "networkx>=3.1", # networkx==3.2.1 only for Python>=3.9 "SQLAlchemy>=2.0.23", + "dpath>=2.1.6", ] [project.optional-dependencies] diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py index b4f5f101..4d1a94bb 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py @@ -1,23 +1,24 @@ +from __future__ import annotations from pandas import DataFrame from typing import MutableMapping from uuid import UUID from abc import abstractmethod from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER -from kestrel.interface.datasource import AbstractDataSourceInterface +from kestrel.interface import AbstractInterface -class AbstractCache(AbstractDataSourceInterface, MutableMapping): +class AbstractCache(AbstractInterface, MutableMapping): """Base class for Kestrel cache - Additional @abstractmethod from AbstractDataSourceInterface: + Additional @abstractmethod from AbstractInterface: - evaluate_graph() """ - @property - def name(self): - return CACHE_INTERFACE_IDENTIFIER + @staticmethod + def schemes() -> Iterable[str]: + return [CACHE_INTERFACE_IDENTIFIER] @abstractmethod def __del__(self): @@ -28,6 +29,8 @@ def __del__(self): def __getitem__(self, instruction_id: UUID) -> DataFrame: """Get the dataframe for the cached instruction + This method will automatically support `uuid in cache` + Parameters: instruction_id: id of the instruction @@ -57,16 +60,32 @@ def __delitem__(self, instruction_id: UUID): """ ... - def store(self, instruction_id: UUID, data: DataFrame): - self[instruction_id] = data + @abstractmethod + def get_virtual_copy(self) -> AbstractCache: + """Create a virtual cache object from this cache - def __contain__(self, instruction_id: UUID) -> bool: - """Whether the evaluated instruction is cached + This method needs to reimplement __del__, __getitem__, __setitem__, + __delitem__ to not actually hit the store media, e.g., SQLite. - Parameters: - instruction_id: id of the instruction + The virtual cache is useful for the implementation of the Explain() + instruction, pretending the dependent graphs are evaluated, so the + evaluation can continue towards the Return() instruction. + + Because Python looks up special methods on the class, not on the + instance, replacing the __getitem__, __setitem__, and __delitem__ of + the object does not help. It is better to derive a subclass and + reassign the __class__ of the object to the subclass to correctly + invoke the new set of __xitem__ methods.
+ + https://docs.python.org/3/reference/datamodel.html#special-lookup + + And the Python garbage collector could clean up the virtual cache when + not in use, so the __del__ method should be reimplemented to make + sure the store media is not closed. """ - return instruction_id in self.cache_catalog + ... + + def store(self, instruction_id: UUID, data: DataFrame): + self[instruction_id] = data def __iter__(self) -> UUID: """Return UUIDs of instructions cached diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py index e0527b9c..87557222 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py @@ -1,3 +1,4 @@ +from copy import copy from pandas import DataFrame from typeguard import typechecked from uuid import UUID @@ -6,19 +7,22 @@ MutableMapping, Optional, Iterable, + Any, ) from kestrel.cache.base import AbstractCache from kestrel.ir.graph import IRGraphEvaluable +from kestrel.display import GraphletExplanation, NativeQuery from kestrel.ir.instructions import ( Instruction, Return, + Explain, Variable, Filter, SourceInstruction, TransformingInstruction, ) -from kestrel.interface.datasource.codegen.dataframe import ( +from kestrel.interface.codegen.dataframe import ( evaluate_source_instruction, evaluate_transforming_instruction, ) @@ -44,7 +48,7 @@ def __getitem__(self, instruction_id: UUID) -> DataFrame: return self.cache[self.cache_catalog[instruction_id]] def __delitem__(self, instruction_id: UUID): - del self.cache[instruction_id] + del self.cache[self.cache_catalog[instruction_id]] del self.cache_catalog[instruction_id] def __setitem__( @@ -52,23 +56,42 @@ def __setitem__( instruction_id: UUID, data: DataFrame, ): - self.cache[instruction_id] = data - self.cache_catalog[instruction_id] = instruction_id + self.cache_catalog[instruction_id] = instruction_id.hex + self.cache[self.cache_catalog[instruction_id]] = data + + def get_virtual_copy(self) -> AbstractCache: + v = copy(self) + v.cache_catalog = copy(self.cache_catalog) + v.__class__ = InMemoryCacheVirtual + return v def evaluate_graph( self, graph: IRGraphEvaluable, instructions_to_evaluate: Optional[Iterable[Instruction]] = None, ) -> Mapping[UUID, DataFrame]: + mapping = {} if not instructions_to_evaluate: instructions_to_evaluate = graph.get_sink_nodes() + for instruction in instructions_to_evaluate: + df = self._evaluate_instruction_in_graph(graph, instruction) + self[instruction.id] = df + mapping[instruction.id] = df + return mapping + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: mapping = {} - for ins in instructions_to_evaluate: - df = self._evaluate_instruction_in_graph(graph, ins) - self[ins.id] = df - mapping[ins.id] = df - + if not instructions_to_explain: + instructions_to_explain = graph.get_sink_nodes() + for instruction in instructions_to_explain: + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + query = NativeQuery("DataFrame", "") + mapping[instruction.id] = GraphletExplanation(graph_dict, query) return mapping def _evaluate_instruction_in_graph( @@ -81,7 +104,7 @@ def _evaluate_instruction_in_graph( elif isinstance(instruction, TransformingInstruction): trunk, r2n = graph.get_trunk_n_branches(instruction) df = self._evaluate_instruction_in_graph(graph, trunk) - if
isinstance(instruction, Return): + if isinstance(instruction, (Return, Explain)): pass elif isinstance(instruction, Variable): self[instruction.id] = df @@ -99,3 +122,15 @@ def _evaluate_instruction_in_graph( else: raise NotImplementedError(f"Unknown instruction type: {instruction}") return df + + +@typechecked +class InMemoryCacheVirtual(InMemoryCache): + def __getitem__(self, instruction_id: UUID) -> Any: + return self.cache_catalog[instruction_id] + + def __delitem__(self, instruction_id: UUID): + del self.cache_catalog[instruction_id] + + def __setitem__(self, instruction_id: UUID, data: Any): + self.cache_catalog[instruction_id] = "virtual" + instruction_id.hex diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index 545513a5..97b8fb13 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -1,5 +1,6 @@ import logging -from typing import Iterable, Mapping, Optional, Union +from copy import copy +from typing import Iterable, Mapping, Optional, Union, Any from uuid import UUID import sqlalchemy @@ -8,12 +9,14 @@ from typeguard import typechecked from kestrel.cache.base import AbstractCache -from kestrel.interface.datasource.codegen.sql import SqlTranslator +from kestrel.interface.codegen.sql import SqlTranslator from kestrel.ir.graph import IRGraphEvaluable +from kestrel.display import GraphletExplanation, NativeQuery from kestrel.ir.instructions import ( Construct, Instruction, Return, + Explain, Variable, Filter, SourceInstruction, @@ -28,12 +31,13 @@ class SqliteTranslator(SqlTranslator): def __init__(self, from_obj: Union[SqlTranslator, str]): if isinstance(from_obj, SqlTranslator): - fc = from_obj.query.subquery() + fc = from_obj.query.subquery(name=from_obj.associated_variable) else: # str to represent table name fc = sqlalchemy.table(from_obj) super().__init__( sqlalchemy.dialects.sqlite.dialect(), dt_parser, "time", fc ) # FIXME: need mapping for timestamp? + self.associated_variable = None @typechecked @@ -45,12 +49,12 @@ def __init__( ): super().__init__() - basename = self.session_id or "cache" - path = f"{basename}.db" + basename = session_id or "cache" + self.db_path = f"{basename}.db" # for an absolute file path, the three slashes are followed by the absolute path # for a relative path, it's also three slashes? 
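To answer the question in the comment above: yes. In SQLAlchemy SQLite URLs, `sqlite:///` is followed directly by the path, so a relative path keeps three slashes in total, while an absolute POSIX path contributes its own leading slash, making four. A quick illustration::

    from sqlalchemy import create_engine

    create_engine("sqlite:///cache.db")       # relative to the current working directory
    create_engine("sqlite:////tmp/cache.db")  # absolute: "sqlite:///" + "/tmp/cache.db"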
- self.engine = sqlalchemy.create_engine(f"sqlite:///{path}") + self.engine = sqlalchemy.create_engine(f"sqlite:///{self.db_path}") self.connection = self.engine.connect() if initial_cache: @@ -77,6 +81,12 @@ def __setitem__( self.cache_catalog[instruction_id] = table_name data.to_sql(table_name, con=self.connection, if_exists="replace", index=False) + def get_virtual_copy(self) -> AbstractCache: + v = copy(self) + v.cache_catalog = copy(self.cache_catalog) + v.__class__ = SqliteCacheVirtual + return v + def evaluate_graph( self, graph: IRGraphEvaluable, @@ -93,6 +103,22 @@ def evaluate_graph( mapping[instruction.id] = read_sql(translator.result(), self.connection) return mapping + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + mapping = {} + if not instructions_to_explain: + instructions_to_explain = graph.get_sink_nodes() + for instruction in instructions_to_explain: + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + translator = self._evaluate_instruction_in_graph(graph, instruction) + query = NativeQuery("SQL", str(translator.result_w_literal_binds())) + mapping[instruction.id] = GraphletExplanation(graph_dict, query) + return mapping + def _evaluate_instruction_in_graph( self, graph: IRGraphEvaluable, @@ -118,12 +144,13 @@ def _evaluate_instruction_in_graph( translator = self._evaluate_instruction_in_graph(graph, trunk) if isinstance(instruction, SolePredecessorTransformingInstruction): - if isinstance(instruction, Return): + if isinstance(instruction, (Return, Explain)): pass elif isinstance(instruction, Variable): # start a new translator and use previous one as subquery # this allows using the variable as a dependent node # if the variable is a sink, `SELECT * FROM (subquery)` also works + translator.associated_variable = instruction.name translator = SqliteTranslator(translator) else: translator.add_instruction(instruction) @@ -147,3 +174,18 @@ def _evaluate_instruction_in_graph( raise NotImplementedError(f"Unknown instruction type: {instruction}") return translator + + +@typechecked +class SqliteCacheVirtual(SqliteCache): + def __getitem__(self, instruction_id: UUID) -> Any: + return self.cache_catalog[instruction_id] + + def __delitem__(self, instruction_id: UUID): + del self.cache_catalog[instruction_id] + + def __setitem__(self, instruction_id: UUID, data: Any): + self.cache_catalog[instruction_id] = instruction_id.hex + "v" + + def __del__(self): + pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py index 8911b8a7..0b912e7a 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py +++ b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py @@ -1,24 +1,33 @@ import os import yaml -import pathlib +from pathlib import Path import logging +from typeguard import typechecked +from typing import Mapping, Union from kestrel.utils import update_nested_dict, load_data_file -CONFIG_DIR_DEFAULT = pathlib.Path.home() / ".config" / "kestrel" +CONFIG_DIR_DEFAULT = Path.home() / ".config" / "kestrel" CONFIG_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "kestrel.yaml" CONFIG_PATH_ENV_VAR = "KESTREL_CONFIG" # override CONFIG_PATH_DEFAULT if provided _logger = logging.getLogger(__name__) -def load_default_config(): +@typechecked +def load_default_config() -> Mapping: _logger.debug(f"Loading default config file...") default_config = 
load_data_file("kestrel.config", "kestrel.yaml") - return yaml.safe_load(os.path.expandvars(default_config)) + config_with_envvar_expanded = os.path.expandvars(default_config) + config_content = yaml.safe_load(config_with_envvar_expanded) + return config_content -def load_user_config(config_path_env_var, config_path_default): +@typechecked +def load_user_config( + config_path_env_var: str, config_path_default: Union[str, Path] +) -> Mapping: + config_path_default = config_path_default.absolute().as_posix() config_path = os.getenv(config_path_env_var, config_path_default) config_path = os.path.expanduser(config_path) config = {} @@ -32,13 +41,10 @@ def load_user_config(config_path_env_var, config_path_default): return config -def load_config(): +@typechecked +def load_config() -> Mapping: config_default = load_default_config() config_user = load_user_config(CONFIG_PATH_ENV_VAR, CONFIG_PATH_DEFAULT) _logger.debug(f"User configuration loaded: {config_user}") _logger.debug(f"Updating default config with user config...") return update_nested_dict(config_default, config_user) - - -if __name__ == "__main__": - ... diff --git a/packages-nextgen/kestrel_core/src/kestrel/display.py b/packages-nextgen/kestrel_core/src/kestrel/display.py index 49758f4d..e6729f85 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/display.py +++ b/packages-nextgen/kestrel_core/src/kestrel/display.py @@ -1 +1,34 @@ +from typing import List, Union, Mapping +from dataclasses import dataclass +from mashumaro.mixins.json import DataClassJSONMixin +from pandas import DataFrame + + +@dataclass +class NativeQuery(DataClassJSONMixin): + # which query language + language: str + # what query statement + statement: str + + +@dataclass +class GraphletExplanation(DataClassJSONMixin): + # serialized IRGraph + graph: Mapping + # data source query + query: NativeQuery + + +@dataclass +class GraphExplanation(DataClassJSONMixin): + graphlets: List[GraphletExplanation] + + # Kestrel Display Object +Display = Union[ + str, + dict, + DataFrame, + GraphExplanation, +] diff --git a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py index ae278f9a..cd088afe 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py +++ b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py @@ -74,6 +74,10 @@ class DuplicatedReferenceInFilter(KestrelError): pass +class MissingReferenceInFilter(KestrelError): + pass + + class InvalidSerializedDatasourceInterfaceCacheCatalog(KestrelError): pass @@ -90,23 +94,19 @@ class InterfaceNotFound(KestrelError): pass -class InterfaceNameCollision(KestrelError): - pass - - class IRGraphMissingNode(KestrelError): pass -class DataSourceInterfaceNotFound(KestrelError): +class InterfaceNotConfigured(KestrelError): pass -class InvalidDataSourceInterfaceImplementation(KestrelError): +class InvalidInterfaceImplementation(KestrelError): pass -class ConflictingDataSourceInterfaceScheme(KestrelError): +class ConflictingInterfaceScheme(KestrelError): pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py index fcbab5b4..cb1f897f 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py @@ -1,5 +1,6 @@ # Lark Transformer +import logging from datetime import datetime, timedelta from functools import reduce @@ -7,6 +8,7 @@ from lark import Transformer, Token from typeguard import 
typechecked +from kestrel.mapping.data_model import translate_comparison_to_ocsf from kestrel.utils import unescape_quoted_string from kestrel.ir.filter import ( FExpression, @@ -41,10 +43,14 @@ Return, Sort, Variable, + Explain, ) from kestrel.exceptions import IRGraphMissingNode +_logger = logging.getLogger(__name__) + + DEFAULT_VARIABLE = "_" DEFAULT_SORT_ORDER = "DESC" @@ -94,17 +100,29 @@ def _map_filter_exp( if ":" not in field: field = f"{entity_name}:{field}" # map field to new syntax (e.g. STIX to OCSF) - map_result = property_map.get(field, filter_exp.field) + # TODO: ECS to OCSF? Would need to merge STIX and ECS data model maps. + map_result = translate_comparison_to_ocsf( + property_map, field, filter_exp.op, filter_exp.value + ) # Build a MultiComp if field maps to several values - if isinstance(map_result, (list, tuple)): - op = filter_exp.op - value = filter_exp.value + if len(map_result) > 1: filter_exp = MultiComp( - ExpOp.OR, [_create_comp(field, op, value) for field in map_result] + ExpOp.OR, + [_create_comp(field, op, value) for field, op, value in map_result], ) - else: # change the name of the field if it maps to a single value - filter_exp.field = map_result - + elif len(map_result) == 1: # it maps to a single value + mapping = map_result[0] + _logger.debug("mapping = %s", mapping) + field = mapping[0] + prefix = f"{entity_name}." + if field.startswith(prefix): + # Need to prune the entity name + field = field[len(prefix) :] + filter_exp.field = field + filter_exp.op = mapping[1] + filter_exp.value = mapping[2] + else: # pass-through + pass # TODO: for RefComparison, map the attribute in value (may not be possible here) elif isinstance(filter_exp, BoolExp): @@ -151,7 +169,7 @@ def __init__( self.default_sort_order = default_sort_order self.token_prefix = token_prefix self.entity_map = entity_map - self.property_map = property_map + self.property_map = property_map # TODO: rename to data_model_map? 
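To illustrate the mapping flow used by `_map_filter_exp` above: the parser reverses the shipped OCSF-to-native map into a native-to-OCSF map, and `translate_comparison_to_ocsf` turns one comparison into a list of translated triples (two or more triples are folded into a `MultiComp` with OR). A minimal sketch using the functions introduced in this patch, with a hypothetical one-entry map::

    from kestrel.mapping.data_model import (
        reverse_mapping,
        translate_comparison_to_ocsf,
    )

    # OCSF -> native (e.g., ECS/STIX) map, as shipped in the mapping YAML files
    ocsf_to_native = {"process": {"cmd_line": "process.command_line"}}

    # reversed at load time into a native -> OCSF map
    native_to_ocsf = reverse_mapping(ocsf_to_native)
    # {'process.command_line': 'process.cmd_line'}

    triples = translate_comparison_to_ocsf(
        native_to_ocsf, "process.command_line", "=", "cmd.exe"
    )
    # [('process.cmd_line', '=', 'cmd.exe')]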
super().__init__() def start(self, args): @@ -371,3 +389,10 @@ def disp(self, args): graph, root = args[0] graph.add_node(Return(), root) return graph + + def explain(self, args): + graph = IRGraph() + reference = graph.add_node(Reference(args[0].value)) + explain = graph.add_node(Explain(), reference) + graph.add_node(Return(), explain) + return graph diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark b/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark index eda6958c..1e00bfc9 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark @@ -28,10 +28,11 @@ assignment: VARIABLE "=" expression | sort ?command_no_result: apply + | explain + | describe | disp | info | save - | describe // // All commands @@ -61,6 +62,8 @@ save: "SAVE"i VARIABLE "TO"i stdpath describe: "DESCRIBE"i var_attr +explain: "EXPLAIN"i VARIABLE + // // Variable definition // diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py index e5bcbdab..0ff482c5 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py @@ -1,14 +1,20 @@ # parse Kestrel syntax, apply frontend mapping, transform to IR +import logging +import os from itertools import chain from kestrel.frontend.compile import _KestrelT +from kestrel.mapping.data_model import reverse_mapping from kestrel.utils import load_data_file from lark import Lark -import os from typeguard import typechecked import yaml + +_logger = logging.getLogger(__name__) + + frontend_mapping = {} @@ -21,9 +27,13 @@ def get_mapping(mapping_type: str, mapping_package: str, mapping_filepath: str) try: mapping_str = load_data_file(mapping_package, mapping_filepath) mapping = yaml.safe_load(mapping_str) + if mapping_type == "property": + # New data model map is always OCSF->native + mapping = reverse_mapping(mapping) frontend_mapping[mapping_type] = mapping except Exception as ex: - mapping = None + _logger.error("Failed to load %s", mapping_str, exc_info=ex) + mapping = None # FIXME: this is not a dict return mapping diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py index e69de29b..3c4b25e5 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py @@ -0,0 +1,2 @@ +from kestrel.interface.base import AbstractInterface +from kestrel.interface.manager import InterfaceManager diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py similarity index 70% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/base.py index 0e730d89..50f5601f 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py @@ -9,6 +9,7 @@ Iterable, ) +from kestrel.display import GraphletExplanation from kestrel.ir.instructions import Instruction from kestrel.ir.graph import IRGraphEvaluable from kestrel.exceptions import ( @@ -16,11 +17,11 @@ ) -MODULE_PREFIX = "kestrel_datasource_" +MODULE_PREFIX = "kestrel_interface_" -class AbstractDataSourceInterface(ABC): - """Abstract class for 
datasource interface +class AbstractInterface(ABC): + """Abstract class for datasource/analytics interface Concepts: @@ -43,7 +44,6 @@ def __init__( self, session_id: Optional[UUID] = None, ): self.session_id = session_id - self.datasources: Mapping[str, str] = {} self.cache_catalog: MutableMapping[UUID, str] = {} if serialized_cache_catalog: @@ -52,12 +52,14 @@ except: raise InvalidSerializedDatasourceInterfaceCacheCatalog() - @property + # Python 3.13 will drop chain of @classmethod and @property + # use @staticmethod instead (cannot make it a property) + @staticmethod @abstractmethod - def name(self) -> str: - """The name of the interface + def schemes() -> Iterable[str]: + """The schemes to specify the interface - The name should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*`` + Each scheme should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*`` """ ... @@ -97,7 +99,7 @@ def evaluate_graph( Parameters: - graph: The IRGraph with zero or one interface + graph: The evaluable IRGraph instructions_to_evaluate: instructions to evaluate and return; by default, it will be all Return instructions in the graph @@ -107,6 +109,26 @@ """ ... + @abstractmethod + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + """Explain how to evaluate the IRGraph + + Parameters: + + graph: The evaluable IRGraph + + instructions_to_explain: instructions to explain and return; by default, it will be all Return instructions in the graph + + Returns: + + GraphletExplanation (a Kestrel Display object) for each instruction in instructions_to_explain. + """ + ... + def cache_catalog_to_json(self) -> str: """Serialize the cache catalog to a JSON string""" return json.dumps(self.cache_catalog) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/analytics/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/analytics/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/dataframe.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/dataframe.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/kql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/kql.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/sql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/sql.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py deleted file mode 100644 index bd74f728..00000000 ---
a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kestrel.interface.datasource.base import AbstractDataSourceInterface diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py deleted file mode 100644 index d6806715..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py +++ /dev/null @@ -1,21 +0,0 @@ -from kestrel.exceptions import ( - DataSourceInterfaceNotFound, - InvalidDataSourceInterfaceImplementation, - ConflictingDataSourceInterfaceScheme, -) -from kestrel.interface.manager import InterfaceManager -from kestrel.interface.datasource.base import ( - MODULE_PREFIX, - AbstractDataSourceInterface, -) - - -class DataSourceManager(InterfaceManager): - def __init__(self): - super().__init__( - MODULE_PREFIX, - AbstractDataSourceInterface, - DataSourceInterfaceNotFound, - InvalidDataSourceInterfaceImplementation, - ConflictingDataSourceInterfaceScheme, - ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py deleted file mode 100644 index 33a49975..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -from typing import Iterable -from typeguard import typechecked - -from kestrel.interface.datasource import AbstractDataSourceInterface -from kestrel.exceptions import ( - InterfaceNotFound, - InterfaceNameCollision, -) - - -@typechecked -def get_interface_by_name( - interface_name: str, interfaces: Iterable[AbstractDataSourceInterface] -): - """Find an interface by its name - - Parameters: - interface_name: the name of an interface - interfaces: the list of interfaces - - Returns: - The interface found - """ - ifs = filter(lambda x: x.name == interface_name, interfaces) - try: - interface = next(ifs) - except StopIteration: - raise InterfaceNotFound(interface_name) - else: - try: - next(ifs) - except StopIteration: - # expected behavior - pass - else: - raise InterfaceNameCollision(interface_name) - return interface diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py index a66a1ce1..b5fd0904 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py @@ -1,94 +1,112 @@ -from abc import ABC - +from __future__ import annotations import importlib import pkgutil import logging import inspect import sys +import itertools +from copy import copy +from typeguard import typechecked +from typing import Mapping, Iterable, Type -from kestrel.exceptions import KestrelError +from kestrel.exceptions import ( + InterfaceNotConfigured, + InterfaceNotFound, + InvalidInterfaceImplementation, + ConflictingInterfaceScheme, +) +from kestrel.interface.base import MODULE_PREFIX, AbstractInterface +from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER _logger = logging.getLogger(__name__) -class InterfaceManager: - def __init__( - self, - module_name_prefix: str, - interface_class: ABC, - nonexist_interface_exception: KestrelError, - invalid_interface_exception: KestrelError, - conflict_interface_exception: KestrelError, - ): - self.scheme_to_interface: dict[str, ABC] = {} - self.nonexist_interface_exception = nonexist_interface_exception - - for iface_cls in 
_load_interfaces( - module_name_prefix, - interface_class, - invalid_interface_exception, - conflict_interface_exception, - ).values(): - iface = iface_cls() - _logger.debug("Loading data source interface '%s' (%s)", iface.name, iface) - self.scheme_to_interface[iface.name] = iface - - def interfaces(self): - return list(self.scheme_to_interface.values()) - - def schemes(self): - return list(self.scheme_to_interface.keys()) - - -def _load_interfaces( - module_name_prefix, - interface_class, - invalid_interface_exception, - conflict_interface_exception, -): - is_interface = _is_class(interface_class) - interface_names = _list_interfaces(module_name_prefix) - interfaces = {} - for interface_name in interface_names: - mod = importlib.import_module(interface_name) - _logger.debug("Imported %s from interface name %s", mod, interface_name) - cls = inspect.getmembers(sys.modules[interface_name], is_interface) +# basically a scheme to interface mapping +@typechecked +class InterfaceManager(Mapping): + def __init__(self, init_interfaces: Iterable[AbstractInterface] = []): + interface_classes = _load_interface_classes() + self.interfaces = list(init_interfaces) # copy/recreate the list + for iface_cls in interface_classes: + try: + iface = iface_cls() + _logger.debug(f"Initialize interface {iface_cls.__name__}") + self.interfaces.append(iface) + except InterfaceNotConfigured as e: + _logger.debug(f"Interface {iface_cls.__name__} not configured; ignored") + + def __getitem__(self, scheme: str) -> AbstractInterface: + for interface in self.interfaces: + if scheme in interface.schemes(): + return interface + else: + raise InterfaceNotFound(f"no interface loaded for scheme {scheme}") + + def __iter__(self) -> Iterable[str]: + return itertools.chain(*[i.schemes() for i in self.interfaces]) + + def __len__(self) -> int: + return sum(1 for _ in iter(self)) + + def copy_with_virtual_cache(self) -> InterfaceManager: + im = copy(self) + # shallow copy refers to the same list, so create/copy a new one + im.interfaces = copy(im.interfaces) + # now swap in virtual cache + cache = im[CACHE_INTERFACE_IDENTIFIER] + im.interfaces.remove(cache) + im.interfaces.append(cache.get_virtual_copy()) + return im + + def del_cache(self): + cache = self[CACHE_INTERFACE_IDENTIFIER] + self.interfaces.remove(cache) + del cache + + +def _load_interface_classes(): + interface_clss = [] + for itf_pkg_name in _list_interface_pkg_names(): + mod = importlib.import_module(itf_pkg_name) + _logger.debug(f"Imported {mod} from package {itf_pkg_name}") + cls = inspect.getmembers( + sys.modules[itf_pkg_name], _is_class(AbstractInterface) + ) if not cls: - raise invalid_interface_exception( - f'no interface class found in "{interface_name}"' + raise InvalidInterfaceImplementation( + f'no interface class found in package "{itf_pkg_name}"' ) elif len(cls) > 1: - raise invalid_interface_exception( - f'more than one interface class found in "{interface_name}"' + raise InvalidInterfaceImplementation( + f'more than one interface class found in package "{itf_pkg_name}"' ) else: - interface = cls[0][1] - interface_conflict, scheme_conflict = _search_scheme_conflict( - interface, interfaces.values() - ) - if interface_conflict: - raise conflict_interface_exception( - interface, interface_conflict, scheme_conflict - ) - interfaces[interface_name] = interface - return interfaces + interface_cls = cls[0][1] + _guard_scheme_conflict(interface_cls, interface_clss) + interface_clss.append(interface_cls) + return interface_clss -def 
_list_interfaces(module_name_prefix): +def _list_interface_pkg_names(): pkg_names = [x.name for x in pkgutil.iter_modules()] - itf_names = [pkg for pkg in pkg_names if pkg.startswith(module_name_prefix)] - return list(itf_names) + itf_names = [pkg for pkg in pkg_names if pkg.startswith(MODULE_PREFIX)] + return itf_names def _is_class(cls): return lambda obj: inspect.isclass(obj) and obj.__bases__[0] == cls -def _search_scheme_conflict(new_interface, interfaces): +@typechecked +def _guard_scheme_conflict( + new_interface: Type[AbstractInterface], + interfaces: Iterable[Type[AbstractInterface]], +): for interface in interfaces: for scheme_new in new_interface.schemes(): for scheme_old in interface.schemes(): if scheme_new == scheme_old: - return interface, scheme_new - return None, None + raise ConflictingInterfaceScheme( + f"scheme: {scheme_new} conflicting between {new_interface.__name__} and {interface.__name__}" + ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/query/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/query/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py index f948dff9..ddc41b7d 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py +++ b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py @@ -36,6 +36,7 @@ InevaluableInstruction, LargerThanOneIndegreeInstruction, DuplicatedReferenceInFilter, + MissingReferenceInFilter, DanglingReferenceInFilter, DanglingFilter, ) @@ -124,18 +125,34 @@ def add_edges_from( self.add_edge(u, v, deref) def copy(self): - """Copy the IRGraph with all nodes as reference (not deepcopy)""" + """Copy the IRGraph with all nodes as reference (not deepcopy) + + Support subclass of IRGraph to be copied. + """ g = IRGraph() g.update(self) + + # subclass support + if type(g) != type(self): + g = type(self)(g) + return g def deepcopy(self): - """Copy the IRGraph with all nodes copied as new objects""" + """Copy the IRGraph with all nodes copied as new objects + + Support subclass of IRGraph to be deep copied. 
+ """ g = IRGraph() o2n = {n: n.deepcopy() for n in self.nodes()} for u, v in self.edges(): g.add_edge(o2n[u], o2n[v]) g.add_nodes_from([o2n[n] for n in self.nodes() if self.degree(n) == 0]) + + # subclass support + if type(g) != type(self): + g = type(self)(g) + return g def get_node_by_id(self, ux: Union[UUID, str]) -> Instruction: @@ -372,6 +389,8 @@ def get_trunk_n_branches( ps = list(self.predecessors(node)) pps = [(p, pp) for p in self.predecessors(node) for pp in self.predecessors(p)] + # may need to add a patch in find_dependent_subgraphs_of_node() + # for each new case added in the if/elif, e.g., FIlter if isinstance(node, SolePredecessorTransformingInstruction): if len(ps) > 1: raise LargerThanOneIndegreeInstruction() @@ -388,8 +407,10 @@ def get_trunk_n_branches( and p.attrs == [rv.attribute] and pp.name == rv.reference ] - if len(ppfs) > 1: - raise DuplicatedReferenceInFilter(ppfs) + if not ppfs: + raise MissingReferenceInFilter(rv, node, pps) + elif len(ppfs) > 1: + raise DuplicatedReferenceInFilter(rv, node, pps) else: p = ppfs[0][0] r2n[rv] = p @@ -536,10 +557,34 @@ def find_dependent_subgraphs_of_node( ps = set().union(*[set(g.predecessors(n)) for n in a2uns[interface]]) a2uns[interface].update(ps & cached_nodes) + # a patch (corner case handling) for get_trunk_n_branches() + # add Variable/Reference node if succeeded by ProjectAttrs and Filter, + # which are in the dependent graph; the Variable is only needed by + # get_trunk_n_branches() as an auxiliary node + for interface in a2uns: + auxs = [] + for n in a2uns[interface]: + if isinstance(n, ProjectAttrs): + # need to search in `self`, not `g`, since the boundry of + # `g` is cut by the cache + p = next(self.predecessors(n)) + s = next(g.successors(n)) + if ( + isinstance(s, Filter) + and isinstance(p, (Variable, Reference)) + and s in a2uns[interface] + ): + auxs.append(p) + a2uns[interface].update(auxs) + # remove dep graphs with only one node - # e.g., `ds://a` in "y = GET file FROM ds://a WHERE x = v.x" when v.x not in cache + # e.g., `ds://a` in "y = GET file FROM ds://a WHERE x = v.x" + # when v.x not in cache dep_nodes = [ns for ns in a2uns.values() if len(ns) > 1] - dep_graphs = [IRGraphEvaluable(g.subgraph(ns)) for ns in dep_nodes] + # need to search in `self` due to the patch for get_trunk_n_branches() + dep_graphs = [ + IRGraphEvaluable(self.subgraph(ns)).deepcopy() for ns in dep_nodes + ] return dep_graphs @@ -774,7 +819,7 @@ def _add_node(self, node: Instruction, deref: bool = True) -> Instruction: class IRGraphSimpleQuery(IRGraphEvaluable): """Simple Query IRGraph - A simple query IRGraph is an evaluatable IRGraph that + A simple query IRGraph is an evaluable IRGraph that 1. 
It contains one source node diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py b/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py index 0d667ea3..8b1aa1e3 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py +++ b/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py @@ -168,6 +168,11 @@ class Reference(IntermediateInstruction): name: str +@dataclass(eq=False) +class Explain(SolePredecessorTransformingInstruction): + pass + + @dataclass(eq=False) class Limit(SolePredecessorTransformingInstruction): num: int diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py new file mode 100644 index 00000000..d05bd943 --- /dev/null +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -0,0 +1,279 @@ +import logging +from typing import Optional, Union + +import dpath +import numpy as np +import yaml +from pandas import DataFrame +from typeguard import typechecked + +from kestrel.mapping.transformers import ( + run_transformer, + run_transformer_on_series, +) +from kestrel.utils import list_folder_files + +_logger = logging.getLogger(__name__) + + +def _add_mapping(obj: dict, key: str, mapping: dict): + """Add `key` -> `mapping` to `obj`, appending if necessary""" + existing_mapping = obj.get(key) + if existing_mapping: + if isinstance(existing_mapping, str): + existing_mapping = [{"ocsf_field": existing_mapping}] + elif isinstance(existing_mapping, dict): + existing_mapping = [existing_mapping] + else: + existing_mapping = [] + existing_mapping.append(mapping) + obj[key] = existing_mapping + + +def _reverse_dict(obj: dict, k: str, v: dict): + """Reverse a single OCSF -> native mapping and add it to `obj`""" + key = v["native_field"] + mapping = {i: j for i, j in v.items() if i != "native_field"} + mapping["ocsf_field"] = k + _add_mapping(obj, key, mapping) + + +def _add_attr(obj: dict, key: str, value: str): + """Add `key` -> `value` to `obj`, appending if necessary""" + if key not in obj: + obj[key] = value + else: + existing = obj[key] + if isinstance(existing, str): + obj[key] = [existing, value] + else: + existing.append(value) + + +def reverse_mapping(obj: dict, prefix: str = None, result: dict = None) -> dict: + """Reverse the mapping; return native -> OCSF map""" + if result is None: + result = {} + for k, v in obj.items(): + k = ".".join((prefix, k)) if prefix else k + # Recurse if necessary + if isinstance(v, str): + _add_attr(result, v, k) + elif isinstance(v, list): + # Need to handle multiple mappings + for i in v: + if isinstance(i, str): + _add_attr(result, i, k) + elif "native_field" in i: + _reverse_dict(result, k, i) + else: + # Need to "deep" merge with current results + reverse_mapping(i, k, result) + elif isinstance(v, dict): + # First determine if this is a complex mapping or just another level + if "native_field" in v: + _reverse_dict(result, k, v) + else: + # Need to "deep" merge with current results + reverse_mapping(v, k, result) + + return result + + +def _get_map_triple(d: dict, prefix: str, op: str, value) -> tuple: + mapped_op = d.get(f"{prefix}_op") + transform = d.get(f"{prefix}_value") + new_value = run_transformer(transform, value) + new_op = mapped_op if mapped_op else op + return (d[f"{prefix}_field"], new_op, new_value) + + +def translate_comparison_to_native( + dmm: dict, field: str, op: str, value: Union[str, int, float] +) -> list: + """Translate the (`field`, `op`, `value`) triple using data 
model map `dmm` + + This function may be used in datasource interfaces to translate a comparison + in the OCSF data model to the native data model, according to the data model + mapping in `dmm`. + + This function translates the (`field`, `op`, `value`) triple into a list of + translated triples based on the provided data model map. The data model map + is a dictionary that maps fields from one data model to another. For + example, if you have a field named "user.name" in your data model, but the + corresponding field in the native data model is "username", then you can use + the data model map to translate the field name. + + Parameters: + dmm: A dictionary that maps fields from one data model to another. + field: The field name to be translated. + op: The comparison operator. + value: The value to be compared against. + + Returns: + A list of translated triples. + + Raises: + KeyError: If the field cannot be found in the data model map. + """ + _logger.debug("comp_to_native: %s %s %s", field, op, value) + result = [] + mapping = dmm.get(field) + if mapping: + if isinstance(mapping, str): + # Simple 1:1 field name mapping + result.append((mapping, op, value)) + else: + raise NotImplementedError("complex native mapping") + else: + parts = field.split(".") + tmp = dmm + for part in parts: + if isinstance(tmp, dict): + tmp = tmp.get(part, {}) + else: + break + if tmp: + if isinstance(tmp, list): + for i in tmp: + if isinstance(i, dict): + result.append(_get_map_triple(i, "native", op, value)) + else: + result.append((i, op, value)) + elif isinstance(tmp, dict): + result.append(_get_map_triple(tmp, "native", op, value)) + elif isinstance(tmp, str): + result.append((tmp, op, value)) + else: + # Pass-through + result.append((field, op, value)) + _logger.debug("comp_to_native: return %s", result) + return result + + +def translate_comparison_to_ocsf( + dmm: dict, field: str, op: str, value: Union[str, int, float] +) -> list: + """Translate the (`field`, `op`, `value`) triple using data model map `dmm` + + This function is used in the frontend to translate a comparison in + the STIX (or, in the future, ECS) data model to the OCSF data + model, according to the data model mapping in `dmm`. + + This function translates the (`field`, `op`, `value`) triple into a list of + translated triples based on the provided data model map. The data model map + is a dictionary that maps fields from one data model to another. For + example, if you have a field named "user.name" in your data model, but the + corresponding field in the native data model is "username", then you can use + the data model map to translate the field name. + + Parameters: + dmm: A dictionary that maps fields from one data model to another. + field: The field name to be translated. + op: The comparison operator. + value: The value to be compared against. + + Returns: + A list of translated triples. + + Raises: + KeyError: If the field cannot be found in the data model map. 
+ + """ + _logger.debug("comp_to_ocsf: %s %s %s", field, op, value) + result = [] + mapping = dmm.get(field) + if isinstance(mapping, str): + # Simple 1:1 field name mapping + result.append((mapping, op, value)) + elif isinstance(mapping, list): + for i in mapping: + if isinstance(i, dict): + result.append(_get_map_triple(i, "ocsf", op, value)) + else: + result.append((i, op, value)) + return result + + +@typechecked +def load_default_mapping( + data_model_name: str, + mapping_pkg: str = "kestrel.mapping", + submodule: str = "entityattribute", +): + result = {} + entityattr_mapping_files = list_folder_files( + mapping_pkg, submodule, prefix=data_model_name, extension="yaml" + ) + for f in entityattr_mapping_files: + with open(f, "r") as fp: + result.update(yaml.safe_load(fp)) + return result + + +@typechecked +def _get_from_mapping(mapping: Union[str, list, dict], key) -> list: + result = [] + if isinstance(mapping, list): + for i in mapping: + if isinstance(i, dict): + result.append(i[key]) + else: + result.append(i) + elif isinstance(mapping, dict): + result.append(mapping[key]) + elif isinstance(mapping, str): + result.append(mapping) + return result + + +@typechecked +def translate_projection_to_native( + dmm: dict, + entity_type: Optional[str], + attrs: Optional[list], + # TODO: optional str or callable for joining entity_type and attr? +) -> list: + result = [] + if entity_type: + dmm = dmm[entity_type] + if not attrs: + for native_field, mapping in reverse_mapping(dmm).items(): + result.extend( + [(native_field, i) for i in _get_from_mapping(mapping, "ocsf_field")] + ) + attrs = [] + for attr in attrs: + mapping = dmm.get(attr) + if not mapping: + parts = attr.split(".") + tmp = dmm + for part in parts: + if isinstance(tmp, dict): + tmp = tmp.get(part, {}) + else: + break + if tmp: + mapping = tmp + if mapping: + result.extend( + [(i, attr) for i in _get_from_mapping(mapping, "native_field")] + ) + else: + # Pass-through? + result.append((attr, attr)) # FIXME: raise exception instead? 
+ _logger.debug("proj_to_native: return %s", result) + return result + + +@typechecked +def translate_dataframe(df: DataFrame, dmm: dict) -> DataFrame: + # Translate results into Kestrel OCSF data model + # The column names of df are already mapped + df = df.replace({np.nan: None}) + for col in df.columns: + mapping = dpath.get(dmm, col, separator=".") + if isinstance(mapping, dict): + transformer_name = mapping.get("ocsf_value") + df[col] = run_transformer_on_series(transformer_name, df[col]) + return df diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml index ef3ff62c..d4a1bf75 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml @@ -1,237 +1,233 @@ -process.command_line: process.cmd_line -process.end: process.end_time -process.entity_id: process.uid -process.executable: process.file.path -process.exit_code: process.exit_code -process.name: process.name -process.pid: process.pid -process.start: process.start_time -process.thread.id: process.tid -# process.args -# process.args_count -# process.entry_meta.type -# process.env_vars -# process.interactive -# process.same_as_process -# process.thread.capabilities.effective -# process.thread.capabilities.permitted -# process.thread.name -# process.title -# process.tty -# process.uptime -# process.vpid -# process.working_directory -file.accessed: file.accessed_time -file.attributes: file.attributes -file.created: file.created_time -file.ctime: file.modified_time -file.directory: file.parent_folder -file.gid: file.xattributes.primary_group -file.mime_type: file.mime_type -file.mode: file.mode -file.mtime: file.modified_time -file.name: file.name -file.owner: file.owner -file.path: file.path -file.size: file.size -file.target_path: file.xattributes.link_name -file.type: file.type -# file.device -# file.drive_letter -# file.extension -# file.fork_name -# file.inode -# file.uid -group.name: group.name -group.id: group.uid -# group.domain -client.bytes: traffic.bytes_out -client.domain: src_endpoint.domain -client.ip: src_endpoint.ip -client.mac: src_endpoint.mac -client.packets: traffic.packets_out -client.port: src_endpoint.port -# client.address -# client.nat.ip -# client.nat.port -# client.registered_domain -# client.subdomain -# client.top_level_domain -destination.bytes: traffic.bytes_in -destination.domain: dst_endpoint.domain -destination.ip: dst_endpoint.ip -destination.mac: dst_endpoint.mac -destination.packets: traffic.packets_in -destination.port: dst_endpoint.port -# destination.address -# destination.nat.ip -# destination.nat.port -# destination.registered_domain -# destination.subdomain -# destination.top_level_domain -server.bytes: traffic.bytes_in -server.domain: dst_endpoint.domain -server.ip: dst_endpoint.ip -server.mac: dst_endpoint.mac -server.packets: traffic.packets_in -server.port: dst_endpoint.port -# server.address -# server.nat.ip -# server.nat.port -# server.registered_domain -# server.subdomain -# server.top_level_domain -source.bytes: traffic.bytes_out -source.domain: src_endpoint.domain -source.ip: src_endpoint.ip -source.mac: src_endpoint.mac -source.packets: traffic.packets_out -source.port: src_endpoint.port -# source.address -# source.nat.ip -# source.nat.port -# source.registered_domain -# source.subdomain -# source.top_level_domain - -# Network Activity [4001], HTTP Activity [4002], DNS 
-network.application: app_name
-network.bytes: traffic.bytes
-network.direction: connection_info.direction
-network.iana_number: connection_info.protocol_num
-network.packets: traffic.packets
-network.protocol: connection_info.protocol_name
-network.type: connection_info.protocol_ver_id
-# network.community_id
-# network.forwarded_ip
-# network.inner
-# network.name
-# network.transport:
-hash.md5: file.hashes[?algorithm_id == 1].value
-hash.sha1: file.hashes[?algorithm_id == 2].value
-hash.sha256: file.hashes[?algorithm_id == 3].value
-hash.sha512: file.hashes[?algorithm_id == 4].value
-hash.ssdeep: file.hashes[?algorithm_id == 5].value
-hash.tlsh: file.hashes[?algorithm_id == 6].value
-# hash.sha384
-x509.not_after: certificate.expiration_time
-x509.not_before: certificate.created_time
-x509.serial_number: certificate.serial_number
-x509.signature_algorithm: certificate.fingerprints.algorithm
-x509.version_number: certificate.version
-# x509.alternative_names
-# x509.issuer.common_name: certificate.issuer
-# x509.issuer.country: certificate.issuer
-# x509.issuer.distinguished_name: certificate.issuer
-# x509.issuer.locality: certificate.issuer
-# x509.issuer.organization: certificate.issuer
-# x509.issuer.organizational_unit: certificate.issuer
-# x509.issuer.state_or_province: certificate.issuer
-# x509.public_key_algorithm
-# x509.public_key_curve
-# x509.public_key_exponent
-# x509.public_key_size
-# x509.subject.common_name: certificate.subject
-# x509.subject.country: certificate.subject
-# x509.subject.distinguished_name: certificate.subject
-# x509.subject.locality: certificate.subject
-# x509.subject.organization: certificate.subject
-# x509.subject.organizational_unit: certificate.subject
-# x509.subject.state_or_province: certificate.subject
-as.number: device.org.number
-as.organization.name: device.org.name
-geo.city_name: location.city
-geo.continent_name: location.continent
-geo.country_iso_code: location.county
-geo.location: location.coordinates
-geo.postal_code: location.postal_code
-geo.region_iso_code: location.region
-# geo.continent_code
-# geo.country_name
-# geo.name
-# geo.region_name
-# geo.timezone
-user.domain: user.domain
-user.email: user.email_addr
-user.full_name: user.full_name
-user.id: user.uid
-user.name: user.name
-# user.roles
-# user.hash:
-
-referenced_fields:
-  process.group:
-    ref: group
-    prefix: process
-  process.hash:
-    ref: hash
-    prefix: process
-  process.parent:
-    ref: process # ECS entity used for attribute mapping
-    prefix: process # OCSF Prefix
-    target_entity: parent_process # Updated OCSF entity name
-  process.user:
-    ref: user
-    prefix: process
-  # process.code_signature: code_signature
-  # process.entry_leader: process
-  # process.entry_leader.parent: process
-  # process.entry_leader.parent.session_leader: process
-  # process.entry_meta.source: source
-  # process.group_leader: process
-  # process.macho: macho
-  # process.parent.group_leader: process
-  # process.pe: pe
-  # process.previous: process
-  # process.real_group: group
-  # process.real_user: user
-  # process.saved_group: group
-  # process.saved_user: user
-  # process.session_leader: process
-  # process.session_leader.parent: process
-  # process.session_leader.parent.session_leader: process
-  # process.supplemental_groups: group
-  file.hash:
-    ref: hash
-    prefix: null
-  file.x509:
-    ref: x509
-    prefix: tls
-  # file.code_signature.*
-  # file.pe.*
-  client.as:
-    ref: as
-    prefix: null
-  client.geo:
-    ref: geo
-    prefix: src_endpoint
-  # client.user:
-  #   ref: user
-  #   prefix: src_endpoint
-  destination.as:
-    ref: as
-    prefix: null
-  destination.geo:
-    ref: geo
-    prefix: dst_endpoint
-  # destination.user:
-  #   ref: user
-  #   prefix: dst_endpoint
-  server.as:
-    ref: as
-    prefix: null
-  server.geo:
-    ref: geo
-    prefix: dst_endpoint
-  # server.user:
-  #   ref: user
-  #   prefix: dst_endpoint
-  source.as:
-    ref: as
-    prefix: null
-  source.geo:
-    ref: geo
-    prefix: src_endpoint
-  # source.user:
-  #   ref: user
-  #   prefix: src_endpoint
+# https://schema.ocsf.io/1.1.0/objects/file
+file:
+  accessed_time: file.accessed
+  attributes: file.attributes
+  created_time: file.created
+  # This "hashes" notation comes from jmespath (filter projection)
+  # It's much easier to use the ECS notation in this case
+  hashes[?algorithm_id == 1]:
+    value: hash.md5
+  hashes[?algorithm_id == 2]:
+    value: hash.sha1
+  hashes[?algorithm_id == 3]:
+    value: hash.sha256
+  hashes[?algorithm_id == 4]:
+    value: hash.sha512
+  hashes[?algorithm_id == 5]:
+    value: hash.ssdeep
+  hashes[?algorithm_id == 6]:
+    value: hash.tlsh
+  hashes[*]:
+    value:
+      - hash.md5
+      - hash.sha1
+      - hash.sha256
+      - hash.sha512
+      - hash.ssdeep
+      - hash.tlsh
+  mime_type: file.mime_type
+  mode: file.mode
+  modified_time:
+    - file.ctime
+    - file.mtime
+  name: file.name
+  owner: file.owner
+  parent_folder: file.directory
+  path: file.path
+  size: file.size
+  type: file.type
+  xattributes:
+    primary_group: file.gid
+    link_name: file.target_path
+
+
+# https://schema.ocsf.io/1.1.0/objects/group
+group:
+  domain: group.domain
+  name: group.name
+  uid: group.id
+
+
+# https://schema.ocsf.io/1.1.0/objects/process
+process:
+  cmd_line: process.command_line
+  name: process.name
+  pid: process.pid
+  uid: process.entity_id
+  file:
+    name:
+      native_field: process.executable
+      native_op: LIKE
+      native_value: endswith
+      ocsf_value: basename
+    path: process.executable
+    parent_folder:
+      native_field: process.executable
+      native_op: LIKE
+      native_value: startswith
+      ocsf_value: dirname
+    # This "hashes" notation comes from jmespath (filter projection)
+    # It's much easier to use the ECS notation in this case
+    hashes[?algorithm_id == 1]:
+      value: process.hash.md5
+    hashes[?algorithm_id == 2]:
+      value: process.hash.sha1
+    hashes[?algorithm_id == 3]:
+      value: process.hash.sha256
+    hashes[?algorithm_id == 4]:
+      value: process.hash.sha512
+    hashes[?algorithm_id == 5]:
+      value: process.hash.ssdeep
+    hashes[?algorithm_id == 6]:
+      value: process.hash.tlsh
+    hashes[*]:
+      value:
+        - process.hash.md5
+        - process.hash.sha1
+        - process.hash.sha256
+        - process.hash.sha512
+        - process.hash.ssdeep
+        - process.hash.tlsh
+  parent_process:
+    cmd_line: process.parent.command_line
+    name: process.parent.name
+    pid: process.parent.pid
+    uid: process.parent.entity_id
+    file:
+      name:
+        native_field: process.parent.executable
+        native_op: LIKE
+        native_value: endswith
+        ocsf_value: basename
+      path: process.parent.executable
+      parent_folder:
+        native_field: process.parent.executable
+        native_op: LIKE
+        native_value: startswith
+        ocsf_value: dirname
+
+
+# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+src_endpoint: &src_ref
+  domain:
+    - client.domain
+    - source.domain
+  hostname:
+    - client.domain
+    - source.domain
+  ip:
+    - client.ip
+    - source.ip
+  mac:
+    - client.mac
+    - source.mac
+  port:
+    - client.port
+    - source.port
+
+
+# endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+endpoint:
+  domain:
+    - client.domain
+    - source.domain
+    - server.domain
+    - destination.domain
+  hostname:
+    - client.domain
+    - source.domain
+    - server.domain
+    - destination.domain
+  ip:
+    - client.ip
+    - source.ip
+    - server.ip
+    - destination.ip
+  mac:
+    - client.mac
+    - source.mac
+    - server.mac
+    - destination.mac
+  port:
+    - client.port
+    - source.port
+    - server.port
+    - destination.port
+
+
+# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+dst_endpoint: &dst_ref
+  domain:
+    - server.domain
+    - destination.domain
+  hostname:
+    - server.domain
+    - destination.domain
+  ip:
+    - server.ip
+    - destination.ip
+  mac:
+    - server.mac
+    - destination.mac
+  port:
+    - server.port
+    - destination.port
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_traffic
+# should be `network_traffic`?
+traffic: &traffic
+  bytes: network.bytes
+  bytes_in:
+    - destination.bytes
+    - server.bytes
+  bytes_out:
+    - client.bytes
+    - source.bytes
+  packets: network.packets
+  packets_in:
+    - destination.packets
+    - server.packets
+  packets_out:
+    - client.packets
+    - source.packets
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_connection_info
+connection_info:
+  direction: network.direction #TODO: need transformer?
+  protocol_num: network.iana_number
+  protocol_name: network.transport
+  protocol_ver: network.type
+  protocol_ver_id:
+    native_field: network.type
+    native_value: ip_version_to_network_layer
+    ocsf_value: network_layer_to_ip_version
+
+
+# https://schema.ocsf.io/1.1.0/objects/certificate
+certificate:
+  expiration_time: x509.not_after
+  created_time: x509.not_before
+  serial_number: x509.serial_number
+  fingerprints[*]:
+    algorithm: x509.signature_algorithm
+  version: x509.version_number
+  issuer: x509.issuer.distinguished_name
+  subject: x509.subject.distinguished_name
+  #uid:
+
+
+# https://schema.ocsf.io/1.1.0/objects/user
+user:
+  domain: user.domain
+  full_name: user.full_name
+  name: user.name
+  uid: user.id
+
+
+# https://schema.ocsf.io/1.1.0/classes/network_activity
+# Network Activity [4001] Class
+network_activity:
+  src_endpoint: *src_ref
+  dst_endpoint: *dst_ref
+  traffic: *traffic
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml
index f0ed912a..7082e6dd 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml
+++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml
@@ -1,210 +1,143 @@
-# All Categories [*]
-autonomous-system:name: device.org.name
-autonomous-system:number: device.org.uid
-
-# File System Activity [1001]
-directory:path: file.path
-directory:accessed: file.accessed_time
-directory:created: file.created_time
-directory:modified: file.modified_time
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], Email Activity [4009]
-domain-name.value:
-  - src_endpoint.domain
-  - dst_endpoint.domain
-  - dns_query.hostname
-
-# Email Activity [4009]
-email-addr:value: user.email_addr
-email-addr:display_name: user.full_name
-# email-message:is_multipart
-# email-message:date
-# email-message:content_type
-email-message:from_ref.value: email.from
-email-message:sender_ref.value: email.smtp_from
-email-message:to_refs[*].value: email.to
-email-message:cc_refs[*].value: email.cc
-email-message:subject: email.subject
-# email-message:received_lines
-email-message:additional_header_fields: email.raw_header
-# email-message:body
-email-message:body_multipart.body_raw_ref.name: file.name
-# email-message:raw_email_ref
-# email-message:body_multipart.body: file.mime_type
-
-# File System Activity [1001], Network File Activity [4010], Email File Activity [4011]
-file:accessed: file.accessed_time
-file:created: file.created_time
-file:name: file.name
-file:size: file.size
-file:hashes.SHA-256: file.hashes[?algorithm_id == 3].value
-file:hashes.SHA-1: file.hashes[?algorithm_id == 2].value
-file:hashes.MD5: file.hashes[?algorithm_id == 1].value
-file:parent_directory_ref.path: file.parent_folder
-# file:name_enc
-# file:magic_number_hex
-file:mime_type: file.mime_type
-# file:is_encrypted
-# file:encryption_algorithm
-# file:decryption_key
-# file:contains_refs
-# file:content_ref
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-ipv4-addr:value:
-  - dst_endpoint.ip
-  - src_endpoint.ip
-  - device.ip
-# ipv4-addr.belongs_to_refs
-# ipv4-addr.resolves_to_refs
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-ipv6-addr:value:
-  - dst_endpoint.ip
-  - src_endpoint.ip
-  - device.ip
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-mac-addr:value:
-  - dst_endpoint.mac
-  - src_endpoint.mac
-  - device.mac
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003]
-network-traffic:dst_byte_count: traffic.bytes_in
-network-traffic:dst_packets: traffic.packets_in
-network-traffic:dst_port: dst_endpoint.port
-network-traffic:dst_ref.value: dst_endpoint.ip
-network-traffic:protocols[*]:
-  - connection_info.protocol_num
-  - connection_info.protocol_name
-  - connection_info.protocol_ver_id
-network-traffic:src_byte_count: traffic.bytes_out
-network-traffic:src_packets: traffic.packets_out
-network-traffic:src_port: src_endpoint.port
-network-traffic:src_ref.value: src_endpoint.ip
-network_traffic:start: start_time
-network_traffic:end: end_time
-# network_traffic:is_active
-# network_traffic:ipfix
-# network_traffic:src_payload_ref
-# network_traffic:dst_payload_ref
-# network_traffic:encapsulates_refs
-# network_traffic:encapsulated_by_ref
-
-# Process Activity [1007]
-process:binary_ref.name: file.name
-process:command_line: process.cmd_line
-process:created: process.created_time
-process:mime_type: mime_type
-process:name: process.name
-process:pid: process.pid
-process:x_unique_id: process.uid
-process:parent_ref.name:
-  - actor.process.name
-  - process.parent_process.name
-
-# Base Event [0]
-software:extension.product.feature_name: metadata.product.feature.name
-software:extension.product.feature_uid: metadata.product.feature.uid
-software:extension.product.feature_version: metadata.product.feature.version
-software:extension.product.path: metadata.product.path
-software:extension.product.uid: metadata.product.uid
-software:languages: metadata.product.lang
-software:name: metadata.product.name
-software:vendor: metadata.product.vendor_name
-software:version: metadata.product.version
-
-# HTTP Activity [4002]
-url:value: http_request.url
-
-# Account Change [3001], Authentication [3002], Authorize Session [3003], User Access Management [3005]
-user-account:account_type: user.account.type
-user-account:display_name: user.account.name
-user-account:user_id: user.account.uid
-
-# Base Event [0]
-x-ibm-finding:alert_id:
-  - observables.type_id
-  - finding.uid
-x-ibm-finding:description: observables.value
-x-ibm-finding:dst_ip_ref.value: dst_endpoint.ip
-x-ibm-finding:end: end_time
-x-ibm-finding:event_count: count
-x-ibm-finding:finding_type: observables.type
-x-ibm-finding:name:
-  - observables.name
-  - finding.title
-x-ibm-finding:severity: severity_id
-x-ibm-finding:src_ip_ref.value: src_endpoint.ip
-x-ibm-finding:start: finding.created_time
-x-ibm-finding:time_observed: finding.first_seen_time
-x-ibm-finding:types: finding.types
-
-# All Categories [*]
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.tactic_id: attacks[*].tactics.uid
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.tactic_name: attacks[*].tactics.name
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.technique_id: attacks[*].technique.uid
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.version: attacks[*].version
-x-ibm-ttp-tagging:name: attacks[*].technique.name
-
-# All Categories [*]
-x-oca-asset:name:
-  - dst_endpoint.name
-  - src_endpoint.name
-  - device.name
-x-oca-asset:os_name: device.os.name
-x-oca-asset:hostname: device.hostname
-x-oca-asset:device_id: device.uid
-x-oca-asset:ip_refs[*].value: device.network_interfaces[*].ip
-x-oca-asset:mac_refs[*].value: device.network_interfaces[*].mac
-x-oca-asset:os_ref: device.os
-x-oca-asset:architecture: device.hw_info
-x-oca-asset:host_type: device.type
-x-oca-asset:ingress: device.network_interfaces
-x-oca-asset:egress: device.network_interfaces
-x-oca-asset:geo_ref: device.location
-
-# Base Event [0]
-x-oca-event:action:
-  - activity
-  - activity_name
-x-oca-event:category: category_name
-x-oca-event:code:
-  - activity_id
-  - category_uid
-x-oca-event:confidence: confidence
-x-oca-event:created: time
-x-oca-event:duration: duration
-x-oca-event:module: class_name
-x-oca-event:network_ref.dst_ref.value: dst_endpoint.ip
-x-oca-event:network_ref.src_ref.value: src_endpoint.ip
-x-oca-event:timezone: timezone_offset
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-x509-certificate:hashes.SHA-256: file.hashes[?algorithm_id == 3].value
-x509-certificate:hashes.SHA-1: file.hashes[?algorithm_id == 2].value
-x509-certificate:hashes.MD5: file.hashes[?algorithm_id == 1].value
-x509-certificate:version: tls.certificate.version
-x509-certificate:serial_number: tls.certificate.serial_number
-x509-certificate:issuer: tls.certificate.issuer
-x509-certificate:validity_not_before: tls.certificate.created_time
-x509-certificate:validity_not_after: tls.certificate.expiration_time
-x509-certificate:subject: tls.certificate.subject
-x509-certificate:x509_v3_extensions: tls.extension_list
-x509-certificate:signature_algorithm: tls.certificate.fingerprints.algorithm
-
-# Registry Key Activity [201001]
-windows-registry-key:key: win/registry_key.path
-
-# Additional mapping for STIX 2.1
-# File System Activity [1001]
-directory:atime: file.accessed_time
-directory:ctime: file.created_time
-directory:mtime: file.modified_time
-file:atime: file.accessed_time
-file:ctime: file.created_time
-file:mtime: file.modified_time
-
-# Process Activity [1007]
-process:image_ref.name: file.name
+# https://schema.ocsf.io/1.1.0/objects/file
+file:
+  name: file:name
+  size: file:size
+  accessed_time: file:accessed
+  created_time: file:created
+  modified_time: file:modified
+  # This "hashes" notation comes from jmespath (filter projection)
+  # It's much easier to use the ECS notation in this case
+  hashes[?algorithm_id == 1]:
+    value: file:hashes.MD5
+  hashes[?algorithm_id == 2]:
+    value: "file:hashes.'SHA-1'"
+  hashes[?algorithm_id == 3]:
+    value: "file:hashes.'SHA-256'"
+  hashes[?algorithm_id == 4]:
+    value: "file:hashes.'SHA-512'"
+  hashes[?algorithm_id == 5]:
+    value: file:hashes.SSDEEP
+  hashes[?algorithm_id == 6]:
+    value: file:hashes.TLSH
+  hashes[*]:
+    value:
+      - file:hashes.MD5
+      - "file:hashes.'SHA-1'"
+      - "file:hashes.'SHA-256'"
+      - "file:hashes.'SHA-512'"
+      - file:hashes.SSDEEP
+      - file:hashes.TLSH
+
+
+# https://schema.ocsf.io/1.1.0/objects/group
+# group:
+#   domain:
+#   name:
+#   uid:
+
+
+# https://schema.ocsf.io/1.1.0/objects/process
+process:
+  cmd_line: process:command_line
+  name: process:name
+  pid: process:pid
+  uid: process:x_unique_id
+  file:
+    name: process:binary_ref.name
+    parent_folder: process:binary_ref.parent_directory_ref.path
+    # This "hashes" notation comes from jmespath (filter projection)
+    # It's much easier to use the ECS notation in this case
+    hashes[?algorithm_id == 1]:
+      value: process:binary_ref.hashes.MD5
+    hashes[?algorithm_id == 2]:
+      value: process:binary_ref.hashes.'SHA-1'
+    hashes[?algorithm_id == 3]:
+      value: process:binary_ref.hashes.'SHA-256'
+    hashes[?algorithm_id == 4]:
+      value: process:binary_ref.hashes.'SHA-512'
+    hashes[?algorithm_id == 5]:
+      value: process:binary_ref.hashes.SSDEEP
+    hashes[?algorithm_id == 6]:
+      value: process:binary_ref.hashes.TLSH
+    hashes[*]:
+      value:
+        - process:binary_ref.hashes.MD5
+        - process:binary_ref.hashes.'SHA-1'
+        - process:binary_ref.hashes.'SHA-256'
+        - process:binary_ref.hashes.'SHA-512'
+        - process:binary_ref.hashes.SSDEEP
+        - process:binary_ref.hashes.TLSH
+  parent_process:
+    cmd_line: process:parent_ref.command_line
+    name: process:parent_ref.name
+    pid: process:parent_ref.pid
+    uid: process:parent_ref.x_unique_id
+    file:
+      name: process:parent_ref.binary_ref.name
+      parent_folder: process:parent_ref.binary_ref.parent_directory_ref.path
+
+
+# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+dst_endpoint:
+  ip:
+    - network-traffic:dst_ref.value
+    - ipv4-addr:value
+  port: network-traffic:dst_port
+
+
+# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+src_endpoint:
+  ip:
+    - network-traffic:src_ref.value
+    - ipv4-addr:value
+  port: network-traffic:src_port
+
+
+# https://schema.ocsf.io/1.1.0/objects/endpoint
+endpoint:
+  ip: ipv4-addr:value
+
+
+# https://schema.ocsf.io/1.1.0/objects/device
+device:
+  ip: ipv4-addr:value
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_traffic
+traffic: # should be `network_traffic`?
+  #TODO: bytes: sum of byte counts?
+  bytes_in: network-traffic:dst_byte_count
+  bytes_out: network-traffic:src_byte_count
+  #TODO: packets: sum of packet counts?
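+  # For example (illustrative): the OCSF attribute `traffic.packets_in` below
+  # corresponds to the STIX field `network-traffic:dst_packets`, so the
+  # functions in kestrel.mapping.data_model can rewrite a comparison on one
+  # into a comparison on the other.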
+  packets_in: network-traffic:dst_packets
+  packets_out: network-traffic:src_packets
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_connection_info
+# connection_info:
+#   direction:
+#   protocol_num:
+#   protocol_name:
+#   protocol_ver:
+#   protocol_ver_id:
+
+
+# https://schema.ocsf.io/1.1.0/objects/certificate
+certificate:
+  expiration_time: x509-certificate:validity_not_after
+  created_time: x509-certificate:validity_not_before
+  serial_number: x509-certificate:serial_number
+  fingerprints[*]:
+    algorithm: x509-certificate:signature_algorithm
+  version: x509-certificate:version_number
+  issuer: x509-certificate:issuer
+  subject: x509-certificate:subject
+  #uid:
+
+
+# https://schema.ocsf.io/1.1.0/objects/user
+user:
+  full_name: user-account:display_name
+  name: user-account:account_login
+  type: user-account:account_type
+  uid: user-account:user_id
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py
new file mode 100644
index 00000000..82202dcb
--- /dev/null
+++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py
@@ -0,0 +1,110 @@
+"""Kestrel Data Model Map value transformers"""
+
+from datetime import datetime, timezone
+from typing import Callable
+
+from pandas import Series
+
+
+# Dict of "registered" transformers
+_transformers = {}
+
+
+def transformer(func: Callable) -> Callable:
+    """A decorator for registering a transformer"""
+    _transformers[func.__name__] = func
+    return func
+
+
+@transformer
+def to_epoch_ms(value: str) -> int:
+    """Convert a time value to milliseconds since the epoch"""
+    if "." in value:
+        time_pattern = "%Y-%m-%dT%H:%M:%S.%fZ"
+    else:
+        time_pattern = "%Y-%m-%dT%H:%M:%SZ"
+    dt = datetime.strptime(value, time_pattern).replace(tzinfo=timezone.utc)
+    return int(dt.timestamp() * 1000)
+
+
+@transformer
+def dirname(path: str) -> str:  # TODO: rename to winpath_dirname?
+    """Get the directory part of `path`"""
+    path_dir, _, _ = path.rpartition("\\")
+    return path_dir
+
+
+@transformer
+def basename(path: str) -> str:  # TODO: rename to winpath_basename?
+    """Get the filename part of `path`"""
+    _, _, path_file = path.rpartition("\\")
+    return path_file
+
+
+@transformer
+def startswith(value: str) -> str:  # TODO: rename to winpath_startswith?
+    return f"{value}\\%"
+
+
+@transformer
+def endswith(value: str) -> str:  # TODO: rename to winpath_endswith?
+    return f"%\\{value}"
+
+
+@transformer
+def to_int(value) -> int:
+    """Ensure `value` is an int"""
+    try:
+        return int(value)
+    except ValueError:
+        # Maybe it's a hexadecimal string?
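+        # e.g., to_int("0x4d2") raises ValueError above, then returns 1234 here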
+        return int(value, 16)
+
+
+@transformer
+def to_str(value) -> str:
+    """Ensure `value` is a str"""
+    return str(value)
+
+
+@transformer
+def ip_version_to_network_layer(value: int) -> str:
+    if value == 4:
+        return "ipv4"
+    elif value == 6:
+        return "ipv6"
+    elif value == 99:
+        return "other"
+    return "unknown"
+
+
+@transformer
+def network_layer_to_ip_version(val: str) -> int:
+    value = val.lower()
+    if value == "ipv4":
+        return 4
+    elif value == "ipv6":
+        return 6
+    elif value == "other":
+        return 99
+    return 0
+
+
+def run_transformer(transformer_name: str, value):
+    """Run the registered transformer with name `transformer_name` on `value`"""
+    func = _transformers.get(transformer_name)
+    if func:
+        result = func(value)
+    else:
+        raise NameError(transformer_name)
+    return result
+
+
+def run_transformer_on_series(transformer_name: str, value: Series):
+    """Run the registered transformer with name `transformer_name` on `value`"""
+    func = _transformers.get(transformer_name)
+    if func:
+        result = value.apply(func)
+    else:
+        raise NameError(transformer_name)
+    return result
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py
deleted file mode 100644
index 3e15b036..00000000
--- a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py
+++ /dev/null
@@ -1,172 +0,0 @@
-from kestrel.exceptions import MappingParseError
-from kestrel.utils import load_data_file, list_folder_files
-import os
-from typeguard import typechecked
-from typing import (
-    Iterable,
-    Union,
-)
-import yaml
-
-
-# _entityname_mapping and _entityattr_mapping are dictionaries that contain
-# the info needed to translate:
-# a. queries between:
-#    1. STIX and OCSF
-#    2. ECS and OCSF
-#    3. OCSF and ECS
-# b. results between:
-#    1. ECS and OCSF
-_entityname_mapping = {}
-_entityattr_mapping = {}
-
-
-@typechecked
-def load_standard_config(mapping_pkg: str):
-    global _entityname_mapping
-    global entityattr_mapping
-    if len(_entityname_mapping) > 0 and len(_entityattr_mapping) > 0:
-        return
-    entityname_mapping_files = list_folder_files(
-        mapping_pkg, "entityname", suffix=".yaml"
-    )
-    for f in entityname_mapping_files:
-        parse_entityname_mapping_file(mapping_pkg, f.name)
-    entityattr_mapping_files = list_folder_files(
-        mapping_pkg, "entityattribute", suffix=".yaml"
-    )
-    for f in entityattr_mapping_files:
-        parse_entityattr_mapping_file(mapping_pkg, f.name)
-
-
-@typechecked
-def parse_entityname_mapping_file(mapping_pkg: str, filename: str):
-    global _entityname_mapping
-    mapping_fpath = os.path.join("entityname", filename)
-    filename_no_ext, _ = filename.split(".")
-    src_lang = "stix" if filename_no_ext == "alias" else filename_no_ext
-    dst_lang = "ocsf"
-    src_dict = _entityname_mapping.get(src_lang, {})
-    dst_dict = src_dict.get(dst_lang, {})
-    try:
-        mapping_str = load_data_file(mapping_pkg, mapping_fpath)
-        mapping = yaml.safe_load(mapping_str)
-        dst_dict.update(mapping)
-    except Exception as ex:
-        raise MappingParseError()
-    src_dict[dst_lang] = dst_dict
-    _entityname_mapping[src_lang] = src_dict
-
-
-@typechecked
-def expand_referenced_field(mapping: dict, key: str, value: dict) -> dict:
-    res = {}
-    ref = value.get("ref")
-    prefix = value.get("prefix")
-    target_entity = value.get("target_entity")
-    for k, v in mapping.items():
-        if k.startswith(f"{ref}."):
-            k_no_ref = k[len(ref) + 1 :]
-            ref_key = ".".join([key, k_no_ref])
-            if prefix is None:
-                ref_value = v
-            else:
-                prefix_tokens = prefix.split(".")
-                v_tokens = v.split(".")
-                if target_entity is not None:
-                    v_tokens[0] = target_entity
-                ref_value = ".".join(prefix_tokens + v_tokens)
-            res[ref_key] = ref_value
-    return res
-
-
-@typechecked
-def parse_entityattr_mapping_file(mapping_pkg: str, filename: str):
-    global _entityattr_mapping
-    mapping_fpath = os.path.join("entityattribute", filename)
-    filename_no_ext, _ = filename.split(".")
-    src_lang = "stix" if filename_no_ext == "alias" else filename_no_ext
-    dst_lang = "ocsf"
-    src_dict = _entityattr_mapping.get(src_lang, {})
-    dst_dict = src_dict.get(dst_lang, {})
-    try:
-        mapping_str = load_data_file(mapping_pkg, mapping_fpath)
-        mapping = yaml.safe_load(mapping_str)
-        mapping_referenced_fields = mapping.pop("referenced_fields", {})
-        expanded_refs = {}
-        for key, value in mapping_referenced_fields.items():
-            expanded_ref = expand_referenced_field(mapping, key, value)
-            expanded_refs.update(expanded_ref)
-        mapping.update(expanded_refs)
-        dst_dict.update(mapping)
-    except Exception as ex:
-        raise MappingParseError()
-    src_dict[dst_lang] = dst_dict
-    _entityattr_mapping[src_lang] = src_dict
-
-
-def load_custom_config():
-    # ~/.config/kestrel/mapping/entity/*.yaml
-    # ~/.config/kestrel/mapping/property/*.yaml
-    return
-
-
-@typechecked
-def normalize_entity(
-    entityname: str, src_lang: str, dst_lang: str
-) -> Union[str, Iterable[str]]:
-    return (
-        _entityname_mapping.get(src_lang, {})
-        .get(dst_lang, {})
-        .get(entityname, entityname)
-    )
-
-
-@typechecked
-def normalize_property(
-    entityattr: str, src_lang: str, dst_lang: str
-) -> Union[str, Iterable[str]]:
-    return (
-        _entityattr_mapping.get(src_lang, {})
-        .get(dst_lang, {})
-        .get(entityattr, entityattr)
-    )
-
-
-@typechecked
-def from_ocsf_key_value_pair(from_ocsf_dict: dict, key: str, value: str):
-    values = from_ocsf_dict.get(key, [])
-    if value not in values:
-        values.append(value)
-    from_ocsf_dict[key] = values
-
-
-@typechecked
-def from_ocsf_dictionary(to_oscf_dict: dict) -> dict:
-    from_ocsf_dict = {}
-    for key, value in to_oscf_dict.items():
-        if isinstance(value, list):
-            for val in value:
-                from_ocsf_key_value_pair(from_ocsf_dict, val, key)
-        else:
-            from_ocsf_key_value_pair(from_ocsf_dict, value, key)
-    return from_ocsf_dict
-
-
-@typechecked
-def generate_from_ocsf_dictionaries(source_schema_name: str) -> (dict, dict):
-    attr_map = _entityattr_mapping.get(source_schema_name, {}).get("ocsf", {})
-    name_map = _entityname_mapping.get(source_schema_name, {}).get("ocsf", {})
-    from_ocsf_names = from_ocsf_dictionary(name_map)
-    from_ocsf_attrs = from_ocsf_dictionary(attr_map)
-    return (from_ocsf_names, from_ocsf_attrs)
-
-
-# if __name__ == "__main__":
-#     load_standard_config("kestrel.mapping")
-#     res = normalize_entity("ecs", "ocsf", "process")
-#     from_ocsf_names, from_ocsf_attrs = generate_from_ocsf_dictionaries("ecs")
-#     print("\n\n\n NAMES ")
-#     print(yaml.dump(from_ocsf_names))
-#     print("\n\n\n ATTRIBUTES ")
-#     print(yaml.dump(from_ocsf_attrs))
diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py
index bbbe1ad4..48ebf1f8 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/session.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/session.py
@@ -1,17 +1,17 @@
 import logging
 from contextlib import AbstractContextManager
-from typing import Iterable
 from uuid import UUID, uuid4
-
-from pandas import DataFrame
+from typing import Iterable
 from typeguard import typechecked
 
+from kestrel.display import Display, GraphExplanation
 from kestrel.ir.graph import IRGraph
+from kestrel.ir.instructions import Instruction, Explain
 from kestrel.frontend.parser import parse_kestrel
 from kestrel.cache import AbstractCache, SqliteCache
-from kestrel.interface.datasource import AbstractDataSourceInterface
-from kestrel.interface.datasource.manager import DataSourceManager
-from kestrel.interface.datasource.utils import get_interface_by_name
+from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER
+from kestrel.interface import AbstractInterface, InterfaceManager
+from kestrel.exceptions import InstructionNotFound
 
 _logger = logging.getLogger(__name__)
 
@@ -22,19 +22,14 @@ class Session(AbstractContextManager):
     """Kestrel huntflow execution session"""
 
     def __init__(self):
-        self.session_id: UUID = uuid4()
-        self.irgraph: IRGraph = IRGraph()
-        self.cache: AbstractCache = SqliteCache()
+        self.session_id = uuid4()
+        self.irgraph = IRGraph()
 
-        # Datasource interfaces in this session
-        # Cache is a special datasource interface and should always be added
-        self.interfaces: Iterable[AbstractDataSourceInterface] = [self.cache]
+        # load all interfaces; cache is a special interface
+        cache = SqliteCache()
+        self.interface_manager = InterfaceManager([cache])
 
-        # Load data sources and add to list
-        data_source_manager = DataSourceManager()
-        self.interfaces.extend(data_source_manager.interfaces())
-
-    def execute(self, huntflow_block: str) -> Iterable[DataFrame]:
+    def execute(self, huntflow_block: str) -> Iterable[Display]:
         """Execute a Kestrel huntflow block.
         Execute a Kestrel statement or multiple consecutive statements (a
@@ -50,7 +45,7 @@ def execute(self, huntflow_block: str) -> Iterable[DataFrame]:
         """
         return list(self.execute_to_generate(huntflow_block))
 
-    def execute_to_generate(self, huntflow_block: str) -> Iterable[DataFrame]:
+    def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]:
        """Execute a Kestrel huntflow and put results in a generator.
 
         Parameters:
@@ -59,24 +54,55 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[DataFrame]:
             huntflow_block: the code block to be executed
 
         Yields:
             Evaluated result per Return instruction
         """
-
-        # TODO: return type generalization
-
         irgraph_new = parse_kestrel(huntflow_block)
         self.irgraph.update(irgraph_new)
 
         for ret in irgraph_new.get_returns():
-            ret_df = None
-            while ret_df is None:
-                for g in self.irgraph.find_dependent_subgraphs_of_node(ret, self.cache):
-                    interface = get_interface_by_name(g.interface, self.interfaces)
-                    for iid, df in interface.evaluate_graph(g).items():
-                        if g.interface != self.cache.name:
-                            self.cache[iid] = df
-                        if iid == ret.id:
-                            ret_df = df
-            else:
-                yield ret_df
+            yield self.evaluate_instruction(ret)
+
+    def evaluate_instruction(self, ins: Instruction) -> Display:
+        """Evaluate a single Instruction.
+
+        Parameters:
+            ins: the instruction to evaluate
+
+        Returns:
+            Evaluated result (Kestrel Display object)
+        """
+        if ins not in self.irgraph:
+            raise InstructionNotFound(ins.to_dict())
+
+        pred = self.irgraph.get_trunk_n_branches(ins)[0]
+        is_explain = isinstance(pred, Explain)
+        display = GraphExplanation([])
+
+        _interface_manager = (
+            self.interface_manager.copy_with_virtual_cache()
+            if is_explain
+            else self.interface_manager
+        )
+        _cache = _interface_manager[CACHE_INTERFACE_IDENTIFIER]
+
+        # The current logic caches results from non-cache interfaces and
+        # evaluates in the cache last.
+        # TODO: may evaluate cache first, then push dependent variables to the
+        # last interface to eval; this requires priority of interfaces
+        while True:
+            for g in self.irgraph.find_dependent_subgraphs_of_node(ins, _cache):
+                interface = _interface_manager[g.interface]
+                for iid, _display in (
+                    interface.explain_graph(g)
+                    if is_explain
+                    else interface.evaluate_graph(g)
+                ).items():
+                    if is_explain:
+                        display.graphlets.append(_display)
+                    else:
+                        display = _display
+                    if interface is not _cache:
+                        _cache[iid] = display
+                    if iid == ins.id:
+                        return display
 
     def do_complete(self, huntflow_block: str, cursor_pos: int):
         """Kestrel code auto-completion.
@@ -97,9 +123,8 @@ def close(self):
         """
         # Note there are two conditions that trigger this function, so it is probably executed twice
         # Be careful to write the logic in this function to avoid deleting nonexist files/dirs
-        if self.cache:
-            del self.cache
-            self.cache = None
+        if CACHE_INTERFACE_IDENTIFIER in self.interface_manager:
+            self.interface_manager.del_cache()
 
     def __exit__(self, exception_type, exception_value, traceback):
         self.close()
diff --git a/packages-nextgen/kestrel_core/src/kestrel/utils.py b/packages-nextgen/kestrel_core/src/kestrel/utils.py
index 70db2ae3..02cbb5b3 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/utils.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/utils.py
@@ -5,10 +5,11 @@
 from pathlib import Path
 from pkgutil import get_data
 from typeguard import typechecked
-from typing import Union, Mapping
+from typing import Optional, Mapping, Iterable
 
 
-def load_data_file(package_name, file_name):
+@typechecked
+def load_data_file(package_name: str, file_name: str) -> str:
     try:
         # resources.files() is introduced in Python 3.9
         content = resources.files(package_name).joinpath(file_name).read_text()
@@ -20,7 +21,16 @@ def load_data_file(package_name, file_name):
     return content
 
 
-def list_folder_files(package_name, folder_name, prefix=None, suffix=None):
+@typechecked
+def list_folder_files(
+    package_name: str,
+    folder_name: str,
+    prefix: Optional[str] = None,
+    extension: Optional[str] = None,
+) -> Iterable[str]:
+    # preprocess `extension`: add the leading dot if not there
+    if extension and extension[0] != ".":
+        extension = "." + extension
     try:
         file_paths = resources.files(package_name).joinpath(folder_name).iterdir()
     except AttributeError:
@@ -41,7 +51,7 @@ def list_folder_files(package_name, folder_name, prefix=None, suffix=None):
         for f in file_paths
         if (
             f.is_file()
-            and (f.name.endswith(suffix) if suffix else True)
+            and (f.name.endswith(extension) if extension else True)
            and (f.name.startswith(prefix) if prefix else True)
         )
     )
@@ -57,7 +67,7 @@ def unescape_quoted_string(s: str) -> str:
 
 
 @typechecked
-def update_nested_dict(dict_old: Mapping, dict_new: Union[Mapping, None]):
+def update_nested_dict(dict_old: Mapping, dict_new: Optional[Mapping]) -> Mapping:
     if dict_new:
         for k, v in dict_new.items():
             if isinstance(v, collections.abc.Mapping) and k in dict_old:
diff --git a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py b/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py
index f750c38d..1a0bb9ca 100644
--- a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py
+++ b/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py
@@ -3,6 +3,7 @@
 from uuid import uuid4
 
 from kestrel.cache import InMemoryCache
+from kestrel.cache.inmemory import InMemoryCacheVirtual
 from kestrel.ir.graph import IRGraph, IRGraphEvaluable
 from kestrel.frontend.parser import parse_kestrel
 
@@ -84,3 +85,37 @@ def test_eval_filter_with_ref():
     assert len(rets) == 1
     df = mapping[rets[0].id]
     assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ]
+
+def test_get_virtual_copy():
+    stmt = """
+proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
+                       , {"name": "explorer.exe", "pid": 99}
+                       , {"name": "firefox.exe", "pid": 201}
+                       , {"name": "chrome.exe", "pid": 205}
+                       ]
+browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe'
+"""
+    graph = IRGraphEvaluable(parse_kestrel(stmt))
+    c = InMemoryCache()
+    mapping = c.evaluate_graph(graph)
+    v = c.get_virtual_copy()
+    new_entry = uuid4()
+    v[new_entry] = True
+
+    # v[new_entry] calls the right method
+    assert isinstance(v, InMemoryCacheVirtual)
+    assert v[new_entry].startswith("virtual")
+
+    # v[new_entry] does not hit v.cache
+    assert len(c.cache) == 2
+    assert len(v.cache) == 2
+
+    # the two cache_catalog are different
+    assert new_entry not in c
+    assert new_entry in v
+    del v[new_entry]
+    assert new_entry not in v
+    for u in c:
+        del v[u]
+    assert len(v) == 0
+    assert len(c) == 2
diff --git a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py
index f5b99090..5db07fb6 100644
--- a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py
+++ b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py
@@ -1,7 +1,8 @@
 from uuid import uuid4
 from pandas import DataFrame
 
-from kestrel.cache.sqlite import SqliteCache
+from kestrel.cache import SqliteCache
+from kestrel.cache.sqlite import SqliteCacheVirtual
 from kestrel.ir.graph import IRGraphEvaluable
 from kestrel.frontend.parser import parse_kestrel
 
@@ -150,3 +151,33 @@ def test_eval_filter_with_ref():
     assert len(rets) == 1
     df = mapping[rets[0].id]
     assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ]
+
+def test_get_virtual_copy():
+    stmt = """
+proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
+                       , {"name": "explorer.exe", "pid": 99}
+                       , {"name": "firefox.exe", "pid": 201}
+                       , {"name": "chrome.exe", "pid": 205}
+                       ]
+browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe'
+"""
+    graph = IRGraphEvaluable(parse_kestrel(stmt))
+    c = SqliteCache()
+    mapping = c.evaluate_graph(graph)
+    v = c.get_virtual_copy()
+    new_entry = uuid4()
+    v[new_entry] = True
+
+    # v[new_entry] calls the right method
+    assert isinstance(v, SqliteCacheVirtual)
+    assert v[new_entry].endswith("v")
+
+    # the two cache_catalog are different
+    assert new_entry not in c
+    assert new_entry in v
+    del v[new_entry]
+    assert new_entry not in v
+    for u in c:
+        del v[u]
+    assert len(v) == 0
+    assert len(c) == 1
diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py
index dc6164b6..4f9f7507 100644
--- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py
+++ b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py
@@ -1,7 +1,7 @@
 import pytest
 from pandas import DataFrame
 
-from kestrel.interface.datasource.codegen.dataframe import (
+from kestrel.interface.codegen.dataframe import (
     evaluate_source_instruction,
     evaluate_transforming_instruction,
 )
@@ -56,7 +56,7 @@ def test_evaluate_ProjectAttrs():
 
 
 def test_evaluate_Construct_Filter_ProjectAttrs():
-    stmt = """
+    stmt = r"""
 proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
                        , {"name": "explorer.exe", "pid": 99}
                        , {"name": "firefox.exe", "pid": 201}
diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py
index 27e0aca4..1cc3c46c 100644
--- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py
+++ b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 from dateutil import parser
 
-from kestrel.interface.datasource.codegen.sql import SqlTranslator
+from kestrel.interface.codegen.sql import SqlTranslator
 from kestrel.ir.filter import (
     BoolExp,
     ExpOp,
diff --git a/packages-nextgen/kestrel_core/tests/test_ir_graph.py b/packages-nextgen/kestrel_core/tests/test_ir_graph.py
index 38fa0c1c..cd77da7d 100644
--- a/packages-nextgen/kestrel_core/tests/test_ir_graph.py
+++ b/packages-nextgen/kestrel_core/tests/test_ir_graph.py
@@ -332,22 +332,22 @@ def test_find_dependent_subgraphs_of_node():
     assert len(c) == 2
     gs = graph.find_dependent_subgraphs_of_node(ret, c)
     assert len(gs) == 1
-    assert len(gs[0]) == 10
+    assert len(gs[0]) == 11
     assert p2 in gs[0]
     assert p21 in gs[0]
     assert p4 in gs[0]
-    assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Filter, Filter, Variable, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
+    assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Filter, Filter, Variable, Variable, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
 
     p4_projattr = next(graph.successors(p4))
     c[p4_projattr.id] = DataFrame()
     gs = graph.find_dependent_subgraphs_of_node(ret, c)
     assert len(gs) == 1
-    assert len(gs[0]) == 7
+    assert len(gs[0]) == 8
     assert p4_projattr.id in c
     assert p4_projattr in gs[0]
     assert p5 in gs[0]
     assert ret in gs[0]
-    assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Return, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
+    assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Return, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
 
 
 def test_find_simple_query_subgraphs():
@@ -400,7 +400,7 @@ def test_find_simple_query_subgraphs():
     gs = graph.find_dependent_subgraphs_of_node(graph.get_returns()[0], c)
     assert len(gs) == 1
     assert sink in gs[0]
-    assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Filter, ProjectAttrs, DataSource, Return, ProjectEntity])
+    assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Filter, ProjectAttrs, DataSource, Return, ProjectEntity, Variable])
     for g in gs[0].find_simple_query_subgraphs(c):
         assert Counter(map(type, g.nodes())) == Counter([ProjectAttrs, Variable, Filter, ProjectEntity, DataSource])
         assert sink in g
diff --git a/packages-nextgen/kestrel_core/tests/test_mapping.py b/packages-nextgen/kestrel_core/tests/test_mapping.py
deleted file mode 100644
index c0860c42..00000000
--- a/packages-nextgen/kestrel_core/tests/test_mapping.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import kestrel.mapping.utils as mapping_utils
-
-
-def test_mapping_load_config():
-    mapping_utils.load_standard_config("kestrel.mapping")
-    entity_name_map = mapping_utils._entityname_mapping
-    assert "stix" in entity_name_map
-    assert "ocsf" in entity_name_map.get("stix", {})
-    assert "ecs" in entity_name_map
-    assert "ocsf" in entity_name_map.get("ecs", {})
-    entity_attr_map = mapping_utils._entityattr_mapping
-    assert "stix" in entity_attr_map
-    assert "ocsf" in entity_attr_map.get("stix", {})
-    assert "ecs" in entity_attr_map
-    assert "ocsf" in entity_attr_map.get("ecs", {})
-
-
-def test_mapping_entity_names():
-    res = mapping_utils.normalize_entity("process", "ecs", "ocsf")
-    assert res == "process"
-    res = mapping_utils.normalize_entity("i_dont_exist", "ecs", "ocsf")
-    assert res == "i_dont_exist"
-    res = mapping_utils.normalize_entity("network", "ecs", "ocsf")
-    assert res == "network_activity"
-
-
-def test_mapping_entity_attributes():
-    res = mapping_utils.normalize_property("process.parent.executable",
-                                           "ecs", "ocsf")
-    assert res == "process.parent_process.file.path"
-    res = mapping_utils.normalize_property("process.hash.md5", "ecs", "ocsf")
-    assert res == "process.file.hashes[?algorithm_id == 1].value"
-    res = mapping_utils.normalize_property("process.group.id", "ecs", "ocsf")
-    assert res == "process.group.uid"
-    res = mapping_utils.normalize_property("processx.non.existent",
-                                           "ecs", "ocsf")
-    assert res == "processx.non.existent"
-    res = mapping_utils.normalize_property("file.hash.md5", "ecs", "ocsf")
-    assert res == "file.hashes[?algorithm_id == 1].value"
-
-
-def test_from_ocsf_dicionaries():
-    from_ocsf_names, from_ocsf_attrs = mapping_utils.generate_from_ocsf_dictionaries("ecs")
-    res = from_ocsf_names.get("process")
-    assert (len(res) == 1 and "process" in res)
-    res = from_ocsf_names.get("network_endpoint")
-    assert (len(res) == 4 and "client" in res and "destination" in res and
-            "server" in res and "source" in res)
-    res = from_ocsf_attrs.get("process.name")
-    assert (len(res) == 1 and "process.name" in res)
-    res = from_ocsf_attrs.get("process.cmd_line")
-    assert (len(res) == 1 and "process.command_line" in res)
-    res = from_ocsf_attrs.get("process.file.hashes[?algorithm_id == 1].value")
-    assert (len(res) == 1 and "process.hash.md5" in res)
-    res = from_ocsf_attrs.get("process.file.path")
-    assert (len(res) == 1 and "process.executable" in res)
-    res = from_ocsf_attrs.get("process.parent_process.file.path")
-    assert (len(res) == 1 and "process.parent.executable" in res)
-    res = from_ocsf_attrs.get("process.parent_process.tid")
-    assert (len(res) == 1 and "process.parent.thread.id" in res)
-    res = from_ocsf_attrs.get("src_endpoint.domain")
-    assert (len(res) == 2 and "client.domain" in res and
-            "source.domain" in res)
-    res = from_ocsf_attrs.get("src_endpoint.location.city")
-    assert (len(res) == 2 and "client.geo.city_name" in res and
-            "source.geo.city_name" in res)
-    res = from_ocsf_attrs.get("tls.certificate.created_time")
-    assert (len(res) == 1 and "file.x509.not_before" in res)
-    res = from_ocsf_attrs.get("tls.certificate.expiration_time")
-    assert (len(res) == 1 and "file.x509.not_after" in res)
-    res = from_ocsf_attrs.get("tls.certificate.fingerprints.algorithm")
-    assert (len(res) == 1 and "file.x509.signature_algorithm" in res)
-    res = from_ocsf_attrs.get("traffic.packets_in")
-    assert (len(res) == 2 and "destination.packets" in res and
-            "server.packets" in res)
-    res = from_ocsf_attrs.get("file.hashes[?algorithm_id == 4].value")
-    assert (len(res) == 2 and "hash.sha512" in res and
-            "file.hash.sha512" in res)
diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py
new file mode 100644
index 00000000..93abe83e
--- /dev/null
+++ b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py
@@ -0,0 +1,200 @@
+import pytest
+
+import pandas as pd
+
+from kestrel.mapping.data_model import (
+    load_default_mapping,
+    reverse_mapping,
+    translate_comparison_to_native,
+    translate_comparison_to_ocsf,
+    translate_dataframe,
+    translate_projection_to_native,
+)
+
+
+# A "custom" mapping for an opensearch/elasticsearch datasource.
+# This mapping works with data from Blue Team Village's 2023 DefCon CTF, for example.
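+# For instance, translating the OCSF comparison ("process.file.name", "=",
+# "foo.exe") with this mapping yields
+# [("winlog.event_data.Image", "LIKE", "%\\foo.exe")] (see the parametrized
+# test cases below).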
+WINLOGBEAT_MAPPING = {
+    "file": {
+        "path": "file.path",
+        "name": "file.name"
+    },
+    "process": {
+        "cmd_line": "winlog.event_data.CommandLine",
+        "pid": {
+            "native_field": "winlog.event_data.ProcessId",
+            "native_value": "to_str",
+            "ocsf_value": "to_int"
+        },
+        "uid": "winlog.event_data.ProcessGuid",
+        "file": {
+            "path": "winlog.event_data.Image",
+            "name": [
+                {
+                    "native_field": "winlog.event_data.Image",
+                    "native_op": "LIKE",
+                    "native_value": "endswith",
+                    "ocsf_value": "basename"
+                }
+            ],
+            "parent_folder": [
+                {
+                    "native_field": "winlog.event_data.Image",
+                    "native_op": "LIKE",
+                    "native_value": "startswith",
+                    "ocsf_value": "dirname"
+                }
+            ]
+        },
+        "parent_process": {
+            "cmd_line": "winlog.event_data.ParentCommandLine",
+            "pid": "winlog.event_data.ParentProcessId",
+            "uid": "winlog.event_data.ParentProcessGuid",
+            "file": {
+                "path": "winlog.event_data.ParentImage",
+                "name": [
+                    {
+                        "native_field": "winlog.event_data.ParentImage",
+                        "native_op": "LIKE",
+                        "native_value": "endswith",
+                        "ocsf_value": "basename"
+                    }
+                ],
+                "parent_folder": [
+                    {
+                        "native_field": "winlog.event_data.ParentImage",
+                        "native_op": "LIKE",
+                        "native_value": "startswith",
+                        "ocsf_value": "dirname"
+                    }
+                ]
+            }
+        }
+    },
+    "dst_endpoint": {
+        "ip": "winlog.event_data.DestinationIp",
+        "port": "winlog.event_data.DestinationPort"
+    },
+    "src_endpoint": {
+        "ip": "winlog.event_data.SourceIp",
+        "port": "winlog.event_data.SourcePort"
+    }
+}
+
+
+# Simplified subset of the standard mapping
+STIX_MAPPING = {
+    "device": {
+        "ip": "ipv4-addr:value"
+    },
+    "endpoint": {
+        "ip": "ipv4-addr:value"
+    },
+}
+
+
+# This mapping is used in 2 places:
+# - frontend comparison from ECS to OCSF
+# - backend comparison from OCSF to ECS (datasource)
+ECS_MAPPING = load_default_mapping("ecs")
+
+
+def test_reverse_mapping_ipv4():
+    reverse_map = reverse_mapping(STIX_MAPPING)
+    ipv4 = reverse_map["ipv4-addr:value"]
+    assert isinstance(ipv4, list)
+    assert set(ipv4) == {"device.ip", "endpoint.ip"}
+
+
+def test_reverse_mapping_executable():
+    reverse_map = reverse_mapping(ECS_MAPPING)
+    exe = reverse_map["process.executable"]
+    assert isinstance(exe, list)
+    assert "process.file.path" in exe
+    for item in exe:
+        if isinstance(item, dict):
+            assert "ocsf_field" in item
+            if item["ocsf_field"] == "process.file.name":
+                # Make sure all metadata from the mapping got reversed
+                assert item["native_value"] == "endswith"
+                assert item["native_op"] == "LIKE"
+                assert item["ocsf_value"] == "basename"
+
+
+@pytest.mark.parametrize(
+    "dmm, field, op, value, expected_result",
+    [
+        (WINLOGBEAT_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe",
+         [("winlog.event_data.Image", "=", "C:\\TMP\\foo.exe")]),
+        (WINLOGBEAT_MAPPING, "process.file.name", "=", "foo.exe",
+         [("winlog.event_data.Image", "LIKE", "%\\foo.exe")]),
+        (ECS_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe",
+         [("process.executable", "=", "C:\\TMP\\foo.exe")]),
+        (ECS_MAPPING, "process.file.name", "=", "foo.exe",
+         [("process.executable", "LIKE", "%\\foo.exe")]),
+    ],
+)
+def test_translate_comparison_to_native(dmm, field, op, value, expected_result):
+    assert translate_comparison_to_native(dmm, field, op, value) == expected_result
+
+
+@pytest.mark.parametrize(
+    "dmm, field, op, value, expected_result",
+    [
+        (ECS_MAPPING, "process.executable", "=", "C:\\TMP\\foo.exe",
+         [
+             ("process.file.path", "=", "C:\\TMP\\foo.exe"),
+             ("process.file.name", "=", "foo.exe"),
+             ("process.file.parent_folder", "=", "C:\\TMP"),
+         ]),
+        (ECS_MAPPING, "process.executable", "LIKE", "%\\foo.exe",
"LIKE", "%\\foo.exe", + [ + ("process.file.path", "LIKE", "%\\foo.exe"), + ("process.file.name", "LIKE", "foo.exe"), #TODO: could optimize this to "=" + ("process.file.parent_folder", "LIKE", "%"), #TODO: could eliminate this? + ]), + (STIX_MAPPING, "ipv4-addr:value", "=", "198.51.100.13", + [ + ("device.ip", "=", "198.51.100.13"), + ("endpoint.ip", "=", "198.51.100.13"), + ]), + ], +) +def test_translate_comparison_to_ocsf(dmm, field, op, value, expected_result): + """Test the translate function.""" + reverse_dmm = reverse_mapping(dmm) # Make the dmms fixtures? + assert set(translate_comparison_to_ocsf(reverse_dmm, field, op, value)) == set(expected_result) + + +@pytest.mark.parametrize( + "dmm, entity, field, expected_result", + [ + (WINLOGBEAT_MAPPING, "process", ["file.name", "pid"], + [("winlog.event_data.Image", "file.name"), ("winlog.event_data.ProcessId", "pid")]), + (WINLOGBEAT_MAPPING, "process", None, + [("winlog.event_data.CommandLine", "cmd_line"), + ("winlog.event_data.ProcessId", "pid"), + ("winlog.event_data.ProcessGuid", "uid"), + ("winlog.event_data.Image", "file.path"), + ("winlog.event_data.Image", "file.name"), + ("winlog.event_data.Image", "file.parent_folder"), + ("winlog.event_data.ParentCommandLine", "parent_process.cmd_line"), + ("winlog.event_data.ParentProcessId", "parent_process.pid"), + ("winlog.event_data.ParentProcessGuid", "parent_process.uid"), + ("winlog.event_data.ParentImage", "parent_process.file.path"), + ("winlog.event_data.ParentImage", "parent_process.file.name"), + ("winlog.event_data.ParentImage", "parent_process.file.parent_folder"), + ]), + ], +) +def test_translate_projection_to_native(dmm, entity, field, expected_result): + assert translate_projection_to_native(dmm, entity, field) == expected_result + + +def test_translate_dataframe(): #TODO: more testing here + df = pd.DataFrame({"file.path": [r"C:\Windows\System32\cmd.exe", r"C:\TMP"], + "pid": [1, 2]}) + dmm = load_default_mapping("ecs") + df = translate_dataframe(df, dmm["process"]) + #TODO:assert df["file.name"].iloc[0] == "cmd.exe" diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py new file mode 100644 index 00000000..9e454925 --- /dev/null +++ b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py @@ -0,0 +1,35 @@ +import pandas as pd +import pytest + +from kestrel.mapping.transformers import ( + run_transformer, + run_transformer_on_series, +) + + +@pytest.mark.parametrize( + "transform, value, expected", [ + ("dirname", r"C:\Windows\System32\cmd.exe", r"C:\Windows\System32"), + ("basename", r"C:\Windows\System32\cmd.exe", r"cmd.exe"), + ("startswith", r"C:\Windows\System32", r"C:\Windows\System32\%"), + ("endswith", "cmd.exe", r"%\cmd.exe"), + ("to_int", 1234, 1234), + ("to_int", 1234.1234, 1234), # Maybe this should fail? 
+ ("to_int", "1234", 1234), + ("to_int", "0x4d2", 1234), + ("to_str", "1234", "1234"), + ("to_str", 1234, "1234"), + ("to_epoch_ms", "2024-03-29T12:57:56.926Z", 1711717076926), + ("to_epoch_ms", "2024-03-29T12:57:56.92Z", 1711717076920), + ("to_epoch_ms", "2024-03-29T12:57:56.9Z", 1711717076900), + ("to_epoch_ms", "2024-03-29T12:57:56Z", 1711717076000), + ] +) +def test_run_transformer(transform, value, expected): + assert run_transformer(transform, value) == expected + + +def test_run_series_basename(): + data = pd.Series([r"C:\Windows\System32\cmd.exe", r"C:\TMP"]) + result = list(run_transformer_on_series("basename", data)) + assert result == ["cmd.exe", "TMP"] diff --git a/packages-nextgen/kestrel_core/tests/test_parser.py b/packages-nextgen/kestrel_core/tests/test_parser.py index 14faa856..1ca5d314 100644 --- a/packages-nextgen/kestrel_core/tests/test_parser.py +++ b/packages-nextgen/kestrel_core/tests/test_parser.py @@ -1,5 +1,6 @@ import json import pytest +from collections import Counter from datetime import datetime, timedelta, timezone from kestrel.frontend.parser import parse_kestrel @@ -16,6 +17,8 @@ Reference, Sort, Variable, + Explain, + Return, ) @@ -108,10 +111,10 @@ def test_parser_mapping_single_comparison_to_multiple_values(): stmt = "x = GET ipv4-addr FROM if://ds WHERE value = '192.168.22.3'" parse_filter = get_parsed_filter_exp(stmt) comps = parse_filter.comps - assert isinstance(comps, list) and len(comps) == 3 + assert isinstance(comps, list) and len(comps) == 4 fields = [x.field for x in comps] assert ("dst_endpoint.ip" in fields and "src_endpoint.ip" in fields and - "device.ip" in fields) + "device.ip" in fields and "endpoint.ip" in fields) def test_parser_mapping_multiple_comparison_to_multiple_values(): @@ -121,12 +124,9 @@ def test_parser_mapping_multiple_comparison_to_multiple_values(): field1 = parse_filter.lhs.field assert field1 == 'file.name' field2 = parse_filter.rhs.lhs.field - assert field2 == 'process.name' - comps3 = parse_filter.rhs.rhs.comps - assert isinstance(comps3, list) and len(comps3) == 2 - fields3 = [x.field for x in comps3] - assert ("actor.process.name" in fields3 and - "process.parent_process.name" in fields3) + assert field2 == 'name' # 'process.name' + field3 = parse_filter.rhs.rhs.field + assert field3 == "parent_process.name" def test_parser_new_json(): @@ -265,3 +265,26 @@ def test_parser_disp_after_new(): assert (proj, limit) in graph.edges assert (limit, offset) in graph.edges assert (offset, ret) in graph.edges + + +def test_parser_explain_alone(): + stmt = "EXPLAIN abc" + graph = parse_kestrel(stmt) + assert len(graph) == 3 + assert len(graph.edges) == 2 + assert Counter(map(type, graph.nodes())) == Counter([Reference, Explain, Return]) + + +def test_parser_explain_dereferred(): + stmt = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +EXPLAIN proclist +""" + graph = parse_kestrel(stmt) + assert len(graph) == 4 + assert len(graph.edges) == 3 + assert Counter(map(type, graph.nodes())) == Counter([Construct, Variable, Explain, Return]) diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index bcbfdeb0..115154d4 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -1,6 +1,14 @@ import pytest +import os from kestrel import Session from pandas import DataFrame 
+from uuid import uuid4 + +from kestrel.display import GraphExplanation +from kestrel.ir.instructions import Construct +from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER +from kestrel.frontend.parser import parse_kestrel +from kestrel.cache import SqliteCache def test_execute_in_cache(): @@ -26,3 +34,153 @@ def test_execute_in_cache(): assert b2.equals(next(res)) with pytest.raises(StopIteration): next(res) + + +def test_double_deref_in_cache(): + # When the Filter node is dereferred twice + # The node should be deepcopied each time to avoid issue + hf = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +px = proclist WHERE name != "cmd.exe" AND pid = 205 +chrome = proclist WHERE pid IN px.pid +DISP chrome +DISP chrome +""" + df = DataFrame([ {"name": "chrome.exe", "pid": 205} ]) + with Session() as session: + res = session.execute_to_generate(hf) + assert df.equals(next(res)) + assert df.equals(next(res)) + with pytest.raises(StopIteration): + next(res) + + +def test_explain_in_cache(): + hf = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +browsers = proclist WHERE name != "cmd.exe" +chrome = browsers WHERE pid = 205 +EXPLAIN chrome +""" + with Session() as session: + ress = session.execute_to_generate(hf) + res = next(ress) + assert isinstance(res, GraphExplanation) + assert len(res.graphlets) == 1 + ge = res.graphlets[0] + assert ge.graph == session.irgraph.to_dict() + construct = session.irgraph.get_nodes_by_type(Construct)[0] + assert ge.query.language == "SQL" + stmt = ge.query.statement.replace('"', '') + assert stmt == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {construct.id.hex}v) AS proclist \nWHERE name != \'cmd.exe\') AS browsers \nWHERE pid = 205) AS chrome' + with pytest.raises(StopIteration): + next(ress) + + +def test_multi_interface_explain(): + + class DataLake(SqliteCache): + @staticmethod + def schemes(): + return ["datalake"] + + class Gateway(SqliteCache): + @staticmethod + def schemes(): + return ["gateway"] + + extra_db = [] + with Session() as session: + stmt1 = """ +procs = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +DISP procs +""" + session.execute(stmt1) + session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = DataLake + session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "datalake" + + new_cache = SqliteCache(session_id = uuid4()) + extra_db.append(new_cache.db_path) + session.interface_manager.interfaces.append(new_cache) + stmt2 = """ +nt = NEW network [ {"pid": 123, "source": "192.168.1.1", "destination": "1.1.1.1"} + , {"pid": 205, "source": "192.168.1.1", "destination": "1.1.1.2"} + ] +DISP nt +""" + session.execute(stmt2) + session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = Gateway + session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "gateway" + + new_cache = SqliteCache(session_id = uuid4()) + extra_db.append(new_cache.db_path) + session.interface_manager.interfaces.append(new_cache) + stmt3 = """ +domain = NEW domain [ {"ip": "1.1.1.1", "domain": "cloudflare.com"} + 
, {"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"} + ] +DISP domain +""" + session.execute(stmt3) + + stmt = """ +p2 = procs WHERE name IN ("firefox.exe", "chrome.exe") +ntx = nt WHERE pid IN p2.pid +d2 = domain WHERE ip IN ntx.destination +EXPLAIN d2 +DISP d2 +""" + ress = session.execute_to_generate(stmt) + disp = next(ress) + df_res = next(ress) + + with pytest.raises(StopIteration): + next(ress) + + assert isinstance(disp, GraphExplanation) + assert len(disp.graphlets) == 4 + + assert len(disp.graphlets[0].graph["nodes"]) == 5 + query = disp.graphlets[0].query.statement.replace('"', '') + procs = session.irgraph.get_variable("procs") + c1 = next(session.irgraph.predecessors(procs)) + assert query == f"SELECT pid \nFROM (SELECT * \nFROM (SELECT * \nFROM {c1.id.hex}) AS procs \nWHERE name IN ('firefox.exe', 'chrome.exe')) AS p2" + + assert len(disp.graphlets[1].graph["nodes"]) == 2 + query = disp.graphlets[1].query.statement.replace('"', '') + nt = session.irgraph.get_variable("nt") + c2 = next(session.irgraph.predecessors(nt)) + assert query == f"SELECT * \nFROM (SELECT * \nFROM {c2.id.hex}) AS nt" + + # the current session.execute_to_generate() logic does not store + # in cache if evaluated by cache; the behavior may change in the future + assert len(disp.graphlets[2].graph["nodes"]) == 2 + query = disp.graphlets[2].query.statement.replace('"', '') + domain = session.irgraph.get_variable("domain") + c3 = next(session.irgraph.predecessors(domain)) + assert query == f"SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain" + + assert len(disp.graphlets[3].graph["nodes"]) == 12 + print(disp.graphlets[3].graph["nodes"]) + query = disp.graphlets[3].query.statement.replace('"', '') + p2 = session.irgraph.get_variable("p2") + p2pa = next(session.irgraph.successors(p2)) + assert query == f"SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain \nWHERE ip IN (SELECT destination \nFROM (SELECT * \nFROM {nt.id.hex}v \nWHERE pid IN (SELECT * \nFROM {p2pa.id.hex}v)) AS ntx)) AS d2" + + df_ref = DataFrame([{"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"}]) + assert df_ref.equals(df_res) + + for db_file in extra_db: + os.remove(db_file) diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py b/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py deleted file mode 100644 index f932e879..00000000 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kestrel_datasource_opensearch.interface import OpenSearchInterface diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/__init__.py b/packages-nextgen/kestrel_datasource_opensearch/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_datasource_opensearch/pyproject.toml b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml similarity index 96% rename from packages-nextgen/kestrel_datasource_opensearch/pyproject.toml rename to packages-nextgen/kestrel_interface_opensearch/pyproject.toml index 6d5017a0..6270f6d0 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/pyproject.toml +++ b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools >= 68.2.2", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "kestrel_datasource_opensearch" +name = "kestrel_interface_opensearch" version = "2.0.0" description = "Kestrel OpenSearch Datasource 
Interface" readme = "README.rst" diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py new file mode 100644 index 00000000..3ee389ca --- /dev/null +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py @@ -0,0 +1 @@ +from kestrel_interface_opensearch.interface import OpenSearchInterface diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py similarity index 62% rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py index add15f4a..26d02ccf 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py @@ -1,6 +1,6 @@ import logging from dataclasses import dataclass, field -from typing import Dict, Optional +from typing import Dict, Mapping, Optional import yaml from mashumaro.mixins.json import DataClassJSONMixin @@ -9,10 +9,8 @@ CONFIG_DIR_DEFAULT, load_user_config, ) -from kestrel.mapping.utils import ( - generate_from_ocsf_dictionaries, - load_standard_config, -) +from kestrel.exceptions import InterfaceNotConfigured +from kestrel.mapping.data_model import load_default_mapping PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "opensearch.yaml" @@ -42,22 +40,16 @@ class Index(DataClassJSONMixin): connection: str timestamp: str timestamp_format: str - data_model_mapping: Optional[str] = None - data_model_map: dict = field(default_factory=dict) + data_model_mapping: Optional[str] = None # Filename for mapping + data_model_map: Mapping = field(default_factory=dict) def __post_init__(self): if self.data_model_mapping: with open(self.data_model_mapping, "r") as fp: - data_model_map = yaml.safe_load(fp) - # Reverse it so it's ocsf -> native - self.data_model_map = { - v: k for k, v in data_model_map.items() if isinstance(v, str) - } + self.data_model_map = yaml.safe_load(fp) else: # Default to the built-in ECS mapping - load_standard_config("kestrel.mapping") - _, data_model_map = generate_from_ocsf_dictionaries("ecs") - self.data_model_map = {k: v[0] for k, v in data_model_map.items()} + self.data_model_map = load_default_mapping("ecs") @dataclass @@ -71,4 +63,7 @@ def __post_init__(self): def load_config(): - return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + try: + return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + except TypeError: + raise InterfaceNotConfigured() diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py similarity index 73% rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py index c1406abc..8c70eb95 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py @@ -5,8 +5,9 @@ from 
opensearchpy import OpenSearch from pandas import DataFrame, Series, concat +from kestrel.display import GraphletExplanation from kestrel.exceptions import DataSourceError -from kestrel.interface.datasource.base import AbstractDataSourceInterface +from kestrel.interface import AbstractInterface from kestrel.ir.graph import IRGraphEvaluable from kestrel.ir.instructions import ( DataSource, @@ -18,9 +19,10 @@ TransformingInstruction, SolePredecessorTransformingInstruction, ) +from kestrel.mapping.data_model import translate_dataframe -from kestrel_datasource_opensearch.config import load_config -from kestrel_datasource_opensearch.ossql import OpenSearchTranslator +from kestrel_interface_opensearch.config import load_config +from kestrel_interface_opensearch.ossql import OpenSearchTranslator _logger = logging.getLogger(__name__) @@ -32,11 +34,12 @@ def _jdbc2df(schema: dict, datarows: dict) -> DataFrame: return DataFrame(datarows, columns=columns) -def read_sql(sql: str, conn: OpenSearch) -> DataFrame: +def read_sql(sql: str, conn: OpenSearch, dmm: Optional[dict] = None) -> DataFrame: """Execute `sql` and return the results as a DataFrame, a la pandas.read_sql""" # https://opensearch.org/docs/latest/search-plugins/sql/sql-ppl-api/#query-api body = { - "fetch_size": 10000, # Should we make this configurable? + # Temporarily comment out fetch_size due to https://github.com/opensearch-project/sql/issues/2579 + # FIXME: "fetch_size": 10000, # Should we make this configurable? "query": sql, } query_resp = conn.http.post("/_plugins/_sql?format=jdbc", body=body) @@ -56,7 +59,12 @@ def read_sql(sql: str, conn: OpenSearch) -> DataFrame: dfs = [] done = False while not done: - dfs.append(_jdbc2df(schema, query_resp["datarows"])) + df = _jdbc2df(schema, query_resp["datarows"]) + if dmm is not None: + # Need to use Data Model Map to do results translation + dfs.append(translate_dataframe(df, dmm)) + else: + dfs.append(df) cursor = query_resp.get("cursor") if not cursor: break @@ -68,7 +76,7 @@ def read_sql(sql: str, conn: OpenSearch) -> DataFrame: return concat(dfs) -class OpenSearchInterface(AbstractDataSourceInterface): +class OpenSearchInterface(AbstractInterface): def __init__( self, serialized_cache_catalog: Optional[str] = None, @@ -89,9 +97,9 @@ def __init__( ) self.conns[name] = client - @property - def name(self): - return "opensearch" + @staticmethod + def schemes() -> Iterable[str]: + return ["opensearch"] def store( self, @@ -111,7 +119,8 @@ def evaluate_graph( for instruction in instructions_to_evaluate: translator = self._evaluate_instruction_in_graph(graph, instruction) # TODO: may catch error in case evaluation starts from incomplete SQL - _logger.debug("SQL query generated: %s", translator.result()) + sql = translator.result() + _logger.debug("SQL query generated: %s", sql) ds = self.config.indexes[translator.table] # table == datasource conn = self.config.connections[ds.connection] client = OpenSearch( @@ -119,10 +128,28 @@ def evaluate_graph( http_auth=(conn.auth.username, conn.auth.password), verify_certs=conn.verify_certs, ) - mapping[instruction.id] = read_sql(translator.result(), client) + mapping[instruction.id] = read_sql( + sql, client, translator.from_ocsf_map[translator.entity] + ) client.close() return mapping + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + mapping = {} + if not instructions_to_explain: + instructions_to_explain = graph.get_sink_nodes() + 
for instruction in instructions_to_explain: + translator = self._evaluate_instruction_in_graph(graph, instruction) + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + query_stmt = translator.result() + mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt) + return mapping + def _evaluate_instruction_in_graph( self, graph: IRGraphEvaluable, @@ -175,7 +202,10 @@ def get_schema(self, index: str) -> dict: client = self._get_client_for_index(index) if index not in self.schemas: df = read_sql(f"DESCRIBE TABLES LIKE {index}", client) - self.schemas[index] = Series( - df["TYPE_NAME"], index=df["COLUMN_NAME"] - ).to_dict() + self.schemas[index] = ( + df[["TYPE_NAME", "COLUMN_NAME"]] + .set_index("COLUMN_NAME") + .T.to_dict("records")[0] + ) + _logger.debug("%s schema:\n%s", index, self.schemas[index]) return self.schemas[index] diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py similarity index 79% rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 55976d23..018cd4c8 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -9,7 +9,6 @@ BoolExp, ExpOp, FComparison, - ListComparison, ListOp, MultiComp, NumCompOp, @@ -26,6 +25,10 @@ Sort, SortDirection, ) +from kestrel.mapping.data_model import ( + translate_comparison_to_native, + translate_projection_to_native, +) _logger = logging.getLogger(__name__) @@ -68,6 +71,16 @@ def _or(lhs: str, rhs: Value) -> str: } +def _format_value(value): + if isinstance(value, str): + # Need to quote string values + value = f"'{value}'" + elif isinstance(value, list): + # SQL uses parens for lists + value = tuple(value) + return value + + @typechecked class OpenSearchTranslator: def __init__( @@ -102,23 +115,21 @@ def __init__( @typechecked def _render_comp(self, comp: FComparison) -> str: - if isinstance(comp, StrComparison): - # Need to quote string values - value = f"'{comp.value}'" - elif isinstance(comp, ListComparison): - # SQL uses parens for lists - value = tuple(comp.value) - else: - value = comp.value - # Need to map OCSF filter field to native - prefix = f"{self.entity}." if self.entity else "" + prefix = ( + f"{self.entity}." if (self.entity and comp.field != self.timestamp) else "" + ) ocsf_field = f"{prefix}{comp.field}" - field = self.from_ocsf_map.get(ocsf_field, comp.field) - _logger.debug("Mapped field '%s' to '%s'", ocsf_field, field) + comps = translate_comparison_to_native( + self.from_ocsf_map, ocsf_field, comp.op, comp.value + ) try: - result = f"{field} {comp2func[comp.op]} {value}" + comps = [f"{f} {comp2func[o]} {_format_value(v)}" for f, o, v in comps] + conj = " OR ".join(comps) + result = conj if len(comps) == 1 else f"({conj})" except KeyError: - raise UnsupportedOperatorError(comp.op.value) + raise UnsupportedOperatorError( + comp.op.value + ) # FIXME: need to report the mapped op, not the original return result @typechecked @@ -177,24 +188,20 @@ def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: # Just save projection and compile it later self.project = proj - def _get_ocsf_cols(self): - prefix = f"{self.entity}." 
if self.entity else "" - if not self.project: - ocsf_cols = [k for k in self.from_ocsf_map.keys() if k.startswith(prefix)] - else: - ocsf_cols = [f"{prefix}{col}" for col in self.project.attrs] - _logger.debug("OCSF fields: %s", ocsf_cols) - return ocsf_cols - def _render_proj(self): - fields = { - self.from_ocsf_map.get(col, col): col for col in self._get_ocsf_cols() - } - _logger.debug("Fields: %s", fields) + """Get a list of native cols to project with their OCSF equivalents as SQL aliases""" + projection = self.project.attrs if self.project else None + name_pairs = translate_projection_to_native( + self.from_ocsf_map, self.entity, projection + ) proj = [ - f"`{k}` AS `{v.partition('.')[2]}`" if "." in v else v - for k, v in fields.items() + f"`{k}` AS `{v}`" if k != v else f"`{k}`" + for k, v in name_pairs + if k in self.schema # Ignore mapped attrs the index doesn't have ] + if not proj: + # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? + proj = [f"`{attr}`" for attr in self.project.attrs] _logger.debug("Set projection to %s", proj) return proj @@ -230,7 +237,7 @@ def result(self) -> str: if where: stages.append(f"WHERE {where}") if self.order_by: - stages.append(f"ORDER BY {self.order_by} {self.sort_dir}") + stages.append(f"ORDER BY {self.order_by} {self.sort_dir.value}") if self.limit: # https://opensearch.org/docs/latest/search-plugins/sql/sql/basic/#limit if self.offset: diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/result/__init__.py b/packages-nextgen/kestrel_interface_opensearch/tests/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/result/__init__.py rename to packages-nextgen/kestrel_interface_opensearch/tests/__init__.py diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py similarity index 97% rename from packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py rename to packages-nextgen/kestrel_interface_opensearch/tests/test_config.py index 51964889..85241b71 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py +++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py @@ -2,7 +2,7 @@ import yaml -from kestrel_datasource_opensearch.config import ( +from kestrel_interface_opensearch.config import ( PROFILE_PATH_ENV_VAR, Connection, load_config, diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py similarity index 74% rename from packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py rename to packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py index d4e17eaf..838b57e2 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py @@ -1,7 +1,7 @@ from datetime import datetime from dateutil import parser -from kestrel_datasource_opensearch.ossql import OpenSearchTranslator +from kestrel_interface_opensearch.ossql import OpenSearchTranslator from kestrel.exceptions import UnsupportedOperatorError from kestrel.ir.filter import ( BoolExp, @@ -33,12 +33,28 @@ TIMEFMT = '%Y-%m-%dT%H:%M:%S.%fZ' +# A much-simplified test mapping data_model_map = { - "process.cmd_line": "CommandLine", - "process.file.path": "Image", - "process.pid": "ProcessId", - 
"actor.process.pid": "ParentProcessId", + "process": { + "cmd_line": "CommandLine", + "file": { + "path": "Image", + # "name": [ + # { + # "native_field": "Image", + # "native_value": "basename", + # "ocsf_op": "LIKE", + # "ocsf_value": "endswith" + # } + # ] + }, + "pid": "ProcessId", + "parent_process": { + "pid": "ParentProcessId", + }, + }, } + schema = { "CommandLine": "text", "Image": "text", @@ -68,10 +84,10 @@ def _remove_nl(s): "SELECT {} FROM my_table WHERE foo >= 0 AND timestamp >= '2023-12-06T08:17:00.000000Z' AND timestamp < '2023-12-07T08:17:00.000000Z'"), # Add a limit and projection ([Limit(3), ProjectAttrs(['foo', 'bar', 'baz']), Filter(StrComparison('foo', StrCompOp.EQ, 'abc'))], - "SELECT foo, bar, baz FROM my_table WHERE foo = 'abc' LIMIT 3"), + "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"), # Same as above but reverse order ([Filter(StrComparison('foo', StrCompOp.EQ, 'abc')), ProjectAttrs(['foo', 'bar', 'baz']), Limit(3)], - "SELECT foo, bar, baz FROM my_table WHERE foo = 'abc' LIMIT 3"), + "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"), ([Filter(ListComparison('foo', ListOp.NIN, ['abc', 'def']))], "SELECT {} FROM my_table WHERE foo NOT IN ('abc', 'def')"), ([Filter(MultiComp(ExpOp.OR, [IntComparison('foo', NumCompOp.EQ, 1), IntComparison('bar', NumCompOp.EQ, 1)]))], @@ -86,9 +102,11 @@ def _remove_nl(s): ] ) def test_opensearch_translator(iseq, sql): - cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`' - if ProjectEntity not in {type(i) for i in iseq}: - cols += ', `ParentProcessId` AS `process.pid`' + cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`, `ParentProcessId` AS `parent_process.pid`' + if ProjectEntity in {type(i) for i in iseq}: + cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`, `ParentProcessId` AS `parent_process.pid`' + else: + cols = '`CommandLine` AS `process.cmd_line`, `Image` AS `process.file.path`, `ProcessId` AS `process.pid`, `ParentProcessId` AS `process.parent_process.pid`' trans = OpenSearchTranslator(TIMEFMT, "timestamp", "my_table", data_model_map, schema) for i in iseq: trans.add_instruction(i) diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml new file mode 100644 index 00000000..c4309e70 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = ["setuptools >= 68.2.2", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "kestrel_interface_sqlalchemy" +version = "2.0.0" +description = "Kestrel SQLAlchemy Datasource Interface" +readme = "README.rst" +requires-python = ">=3.8" +license = {text = "Apache 2.0 License"} +maintainers = [ + {name = "Xiaokui Shu", email = "xiaokui.shu@ibm.com"}, + {name = "Paul Coccoli", email = "pcoccoli@us.ibm.com"}, +] +keywords = [ + "kestrel", + "cybersecurity", + "threat hunting", +] +classifiers = [ + "Topic :: Security", + "Operating System :: OS Independent", + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3", +] + +dependencies = [ + "kestrel_core>=2.0.0", +] + +[project.urls] +Homepage = "https://github.com/opencybersecurityalliance/kestrel-lang" +Documentation = "https://kestrel.readthedocs.io/" +Repository = "https://github.com/opencybersecurityalliance/kestrel-lang.git" diff --git 
a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py new file mode 100644 index 00000000..781df021 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py @@ -0,0 +1 @@ +from kestrel_interface_sqlalchemy.interface import SQLAlchemyInterface diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py new file mode 100644 index 00000000..e9d148e4 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py @@ -0,0 +1,58 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Mapping, Optional + +import yaml +from mashumaro.mixins.json import DataClassJSONMixin + +from kestrel.config.utils import ( + CONFIG_DIR_DEFAULT, + load_user_config, +) +from kestrel.exceptions import InterfaceNotConfigured +from kestrel.mapping.data_model import load_default_mapping + + +PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "sqlalchemy.yaml" +PROFILE_PATH_ENV_VAR = "KESTREL_SQLALCHEMY_CONFIG" + +_logger = logging.getLogger(__name__) + + +@dataclass +class Connection(DataClassJSONMixin): + url: str # SQLAlchemy "connection URL" or "connection string" + + +@dataclass +class Table(DataClassJSONMixin): + connection: str + timestamp: str + timestamp_format: str + data_model_mapping: Optional[str] = None # Filename for mapping + data_model_map: Mapping = field(default_factory=dict) + + def __post_init__(self): + if self.data_model_mapping: + with open(self.data_model_mapping, "r") as fp: + self.data_model_map = yaml.safe_load(fp) + else: + # Default to the built-in ECS mapping + self.data_model_map = load_default_mapping("ecs") # FIXME: need a default? 
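+        # A data_model_mapping file is a YAML dict from OCSF entity/attribute
+        # paths to native column names, in the same shape as the built-in ECS
+        # map; e.g. (hypothetical columns for illustration only):
+        #
+        #   process:
+        #     cmd_line: CommandLine
+        #     pid: ProcessId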
+ + +@dataclass +class Config(DataClassJSONMixin): + connections: Dict[str, Connection] + tables: Dict[str, Table] + + def __post_init__(self): + self.connections = {k: Connection(**v) for k, v in self.connections.items()} + self.tables = {k: Table(**v) for k, v in self.tables.items()} + + +def load_config(): + try: + return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + except TypeError: + raise InterfaceNotConfigured() diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py new file mode 100644 index 00000000..6197ab5e --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py @@ -0,0 +1,268 @@ +import logging +from functools import reduce +from typing import Callable, Iterable, Mapping, Optional +from uuid import UUID + +from pandas import DataFrame, read_sql +import sqlalchemy +from sqlalchemy import and_, column, or_ +from sqlalchemy.sql.elements import BooleanClauseList +from sqlalchemy.sql.expression import ColumnClause +from typeguard import typechecked + +from kestrel.display import GraphletExplanation +from kestrel.interface import AbstractInterface +from kestrel.interface.codegen.sql import SqlTranslator, comp2func +from kestrel.ir.filter import ( + BoolExp, + ExpOp, + FComparison, + MultiComp, + StrComparison, + StrCompOp, +) +from kestrel.ir.graph import IRGraphEvaluable +from kestrel.ir.instructions import ( + DataSource, + Filter, + Instruction, + ProjectAttrs, + ProjectEntity, + Return, + SolePredecessorTransformingInstruction, + SourceInstruction, + TransformingInstruction, + Variable, +) +from kestrel.mapping.data_model import ( + translate_comparison_to_native, + translate_dataframe, + translate_projection_to_native, +) + +from kestrel_interface_sqlalchemy.config import load_config + + +_logger = logging.getLogger(__name__) + + +@typechecked +class SQLAlchemyTranslator(SqlTranslator): + def __init__( + self, + dialect: sqlalchemy.engine.default.DefaultDialect, + timefmt: Callable, + timestamp: str, + from_obj: sqlalchemy.FromClause, + dmm: dict, + ): + super().__init__(dialect, timefmt, timestamp, from_obj) + self.dmm = dmm + self.proj = None + self.entity_type = None + + @typechecked + def _render_comp(self, comp: FComparison): + prefix = ( + f"{self.entity_type}." 
+ if (self.entity_type and comp.field != self.timestamp) + else "" + ) + ocsf_field = f"{prefix}{comp.field}" + comps = translate_comparison_to_native( + self.dmm, ocsf_field, comp.op, comp.value + ) + translated_comps = [] + for comp in comps: + field, op, value = comp + col: ColumnClause = column(field) + if op == StrCompOp.NMATCHES: + tmp = ~comp2func[op](col, value) + else: + tmp = comp2func[op](col, value) + translated_comps.append(tmp) + return reduce(or_, translated_comps) + + @typechecked + def _render_multi_comp(self, comps: MultiComp): + op = and_ if comps.op == ExpOp.AND else or_ + return reduce(op, map(self._render_comp, comps.comps)) + + # This is copied verbatim from sql.py but we need to supply our own _render_comp + def _render_exp(self, exp: BoolExp) -> BooleanClauseList: + if isinstance(exp.lhs, BoolExp): + lhs = self._render_exp(exp.lhs) + elif isinstance(exp.lhs, MultiComp): + lhs = self._render_multi_comp(exp.lhs) + else: + lhs = self._render_comp(exp.lhs) + if isinstance(exp.rhs, BoolExp): + rhs = self._render_exp(exp.rhs) + elif isinstance(exp.rhs, MultiComp): + rhs = self._render_multi_comp(exp.rhs) + else: + rhs = self._render_comp(exp.rhs) + return and_(lhs, rhs) if exp.op == ExpOp.AND else or_(lhs, rhs) + + @typechecked + def _add_filter(self) -> Optional[str]: + if not self.filt: + return None + filt = self.filt + if filt.timerange.start: + # Convert the timerange to the appropriate pair of comparisons + start_comp = StrComparison( + self.timestamp, ">=", self.timefmt(filt.timerange.start) + ) + stop_comp = StrComparison( + self.timestamp, "<", self.timefmt(filt.timerange.stop) + ) + # AND them together + time_exp = BoolExp(start_comp, ExpOp.AND, stop_comp) + # AND that with any existing filter expression + exp = BoolExp(filt.exp, ExpOp.AND, time_exp) + else: + exp = filt.exp + if isinstance(exp, BoolExp): + comp = self._render_exp(exp) + elif isinstance(exp, MultiComp): + comp = self._render_multi_comp(exp) + else: + comp = self._render_comp(exp) + self.query = self.query.where(comp) + + def add_Filter(self, filt: Filter) -> None: + # Just save filter and compile it later + # Probably need the entity projection set first + self.filt = filt + + def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: + self.proj = proj + + def add_ProjectEntity(self, proj: ProjectEntity) -> None: + self.entity_type = proj.entity_type + + def result(self) -> sqlalchemy.Compiled: + proj = self.proj.attrs if self.proj else None + pairs = translate_projection_to_native(self.dmm, self.entity_type, proj) + cols = [sqlalchemy.column(i).label(j) for i, j in pairs] + self._add_filter() + self.query = self.query.with_only_columns(*cols) # TODO: mapping? 
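+        # Compiling with the connection's dialect renders identifier quoting
+        # and parameter style for the target backend (e.g. SQLite vs. Presto),
+        # so the statement can be handed directly to the data source.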
+ return self.query.compile(dialect=self.dialect) + + +class SQLAlchemyInterface(AbstractInterface): + def __init__( + self, + serialized_cache_catalog: Optional[str] = None, + session_id: Optional[UUID] = None, + ): + _logger.debug("SQLAlchemyInterface: loading config") + super().__init__(serialized_cache_catalog, session_id) + self.config = load_config() + self.schemas: dict = {} # Schema per table (index) + self.engines: dict = {} # Map of conn name -> engine + self.conns: dict = {} # Map of conn name -> connection + for info in self.config.tables.values(): + name = info.connection + conn_info = self.config.connections[name] + if name not in self.engines: + self.engines[name] = sqlalchemy.create_engine(conn_info.url) + if name not in self.conns: + engine = self.engines[name] + self.conns[name] = engine.connect() + _logger.debug("SQLAlchemyInterface: configured %s", name) + + @staticmethod + def schemes() -> Iterable[str]: + return ["sqlalchemy"] + + def store( + self, + instruction_id: UUID, + data: DataFrame, + ): + raise NotImplementedError("SQLAlchemyInterface.store") # TEMP + + def evaluate_graph( + self, + graph: IRGraphEvaluable, + instructions_to_evaluate: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, DataFrame]: + mapping = {} + if not instructions_to_evaluate: + instructions_to_evaluate = graph.get_sink_nodes() + for instruction in instructions_to_evaluate: + translator = self._evaluate_instruction_in_graph(graph, instruction) + # TODO: may catch error in case evaluation starts from incomplete SQL + sql = translator.result() + _logger.debug("SQL query generated: %s", sql) + # Get the "from" table for this query + tables = translator.query.selectable.get_final_froms() + table = tables[0].name # TODO: what if there's more than 1? + # Get the data source's SQLAlchemy connection object + conn = self.conns[self.config.tables[table].connection] + df = read_sql(sql, conn) + dmm = translator.dmm[ + translator.entity_type + ] # TODO: need a method for this? 
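+            # The per-entity data model map drives reverse translation of the
+            # result columns back to OCSF names/values (e.g. a mapping like the
+            # built-in ECS one can derive process.file.name from a native
+            # executable-path column via the basename transformer).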
+ mapping[instruction.id] = translate_dataframe(df, dmm) + return mapping + + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + mapping = {} + if not instructions_to_explain: + instructions_to_explain = graph.get_sink_nodes() + for instruction in instructions_to_explain: + translator = self._evaluate_instruction_in_graph(graph, instruction) + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + query_stmt = translator.result() + mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt) + return mapping + + def _evaluate_instruction_in_graph( + self, + graph: IRGraphEvaluable, + instruction: Instruction, + ) -> SQLAlchemyTranslator: + _logger.debug("instruction: %s", str(instruction)) + translator = None + if isinstance(instruction, TransformingInstruction): + trunk, _r2n = graph.get_trunk_n_branches(instruction) + translator = self._evaluate_instruction_in_graph(graph, trunk) + + if isinstance(instruction, SolePredecessorTransformingInstruction): + if isinstance(instruction, Return): + pass + elif isinstance(instruction, Variable): + pass + else: + translator.add_instruction(instruction) + + elif isinstance(instruction, Filter): + translator.add_instruction(instruction) + + else: + raise NotImplementedError(f"Unknown instruction type: {instruction}") + + elif isinstance(instruction, SourceInstruction): + if isinstance(instruction, DataSource): + ds = self.config.tables[instruction.datasource] + connection = ds.connection + dialect = self.engines[connection].dialect + translator = SQLAlchemyTranslator( + dialect, + lambda dt: dt.strftime(ds.timestamp_format), + ds.timestamp, + sqlalchemy.table(instruction.datasource), + ds.data_model_map, + ) + else: + raise NotImplementedError(f"Unhandled instruction type: {instruction}") + + return translator diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py b/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py new file mode 100644 index 00000000..a19d97a6 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py @@ -0,0 +1,42 @@ +import os + +import yaml + +from kestrel_interface_sqlalchemy.config import ( + PROFILE_PATH_ENV_VAR, + Connection, + load_config, +) + + +def test_load_config(tmp_path): + config = { + "connections": { + "localhost": { + "url": "sqlite:////home/jdoe/test.db", + }, + "some-data-lake": { + "url": "presto://jdoe@example.com:8889/hive", + } + }, + "tables": { + "cloud_table": { + "connection": "some-data-lake", + "timestamp": "eventTime", + "timestamp_format": "%Y-%m-%d %H:%M:%S.%f", + "data_model_mapping": str(tmp_path / "mapping.yaml") + } + } + } + map_file = tmp_path / "mapping.yaml" + with open(map_file, 'w') as fp: + fp.write("some.field: other.field\n") + config_file = tmp_path / "sqlalchemy.yaml" + with open(config_file, 'w') as fp: + yaml.dump(config, fp) + os.environ[PROFILE_PATH_ENV_VAR] = str(config_file) + read_config = load_config() + conn: Connection = read_config.connections["localhost"] + assert conn.url == config["connections"]["localhost"]["url"] + assert read_config.connections["localhost"].url == config["connections"]["localhost"]["url"] + assert read_config.tables["cloud_table"].timestamp == config["tables"]["cloud_table"]["timestamp"] diff --git a/packages-nextgen/kestrel_jupyter/pyproject.toml b/packages-nextgen/kestrel_jupyter/pyproject.toml index 
99bfea56..3cc31435 100644
--- a/packages-nextgen/kestrel_jupyter/pyproject.toml
+++ b/packages-nextgen/kestrel_jupyter/pyproject.toml
@@ -31,6 +31,9 @@ dependencies = [
     "jupyterlab",
     "jupyter_client",
     "nbclassic",
+    "sqlparse==0.4.4",
+    "pygments==2.17.2",
+    "matplotlib==3.8.3",
 ]
 
 [project.optional-dependencies]
diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py
new file mode 100644
index 00000000..21e10883
--- /dev/null
+++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py
@@ -0,0 +1,68 @@
+from pandas import DataFrame
+import tempfile
+import base64
+import sqlparse
+from typing import Iterable, Mapping
+from pygments import highlight
+from pygments.lexers import guess_lexer
+from pygments.lexers.sql import SqlLexer
+from pygments.lexers.kusto import KustoLexer
+from pygments.formatters import HtmlFormatter
+import networkx as nx
+import matplotlib.pyplot as plt
+
+from kestrel.display import Display, GraphExplanation
+from kestrel.ir.graph import IRGraph
+from kestrel.ir.instructions import Instruction, DataSource, Variable, Construct
+
+
+def gen_label_mapping(g: IRGraph) -> Mapping[Instruction, str]:
+    d = {}
+    for n in g:
+        if isinstance(n, Variable):
+            d[n] = n.name
+        elif isinstance(n, Construct):
+            d[n] = n.id.hex[:4]
+        elif isinstance(n, DataSource):
+            d[n] = n.datasource
+        else:
+            d[n] = f"[{n.instruction.upper()}]"
+    return d
+
+
+def to_html_blocks(d: Display) -> Iterable[str]:
+    if isinstance(d, DataFrame):
+        yield d.to_html()
+    elif isinstance(d, GraphExplanation):
+        for graphlet in d.graphlets:
+            graph = IRGraph(graphlet.graph)
+            plt.figure(figsize=(4, 2))
+            nx.draw(
+                graph,
+                with_labels=True,
+                labels=gen_label_mapping(graph),
+                font_size=8,
+                node_size=260,
+                node_color="#bfdff5",
+            )
+            with tempfile.NamedTemporaryFile(delete_on_close=False) as tf:
+                tf.close()
+                plt.savefig(tf.name, format="png")
+                with open(tf.name, "rb") as tfx:
+                    data = tfx.read()
+
+            img = data_uri = base64.b64encode(data).decode("utf-8")
+            imgx = f'<img src="data:image/png;base64,{data_uri}">'
+            yield imgx
+
+            query = graphlet.query.statement
+            if graphlet.query.language == "SQL":
+                lexer = SqlLexer()
+                query = sqlparse.format(query, reindent=True, keyword_case="upper")
+            elif graphlet.query.language == "KQL":
+                lexer = KustoLexer()
+            else:
+                lexer = guess_lexer(query)
+            query = highlight(query, lexer, HtmlFormatter())
+            style = "<style>" + HtmlFormatter().get_style_defs() + "</style>"
+            yield style + query
diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py
index 2d935317..456cde96 100644
--- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py
+++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py
@@ -1,7 +1,9 @@
 from ipykernel.kernelbase import Kernel
 import logging
+import networkx as nx
 
 from kestrel.session import Session
+from kestrel_jupyter_kernel.display import to_html_blocks
 
 _logger = logging.getLogger(__name__)
 
@@ -35,11 +37,14 @@ def do_execute(
         if not silent:
             try:
                 for result in self.kestrel_session.execute_to_generate(code):
-                    self.send_response(
-                        self.iopub_socket,
-                        "display_data",
-                        {"data": {"text/html": result.to_html()}, "metadata": {}},
-                    )
+                    for html in to_html_blocks(result):
+                        self.send_response(
+                            self.iopub_socket,
+                            "display_data",
+                            {"data": {"text/html": html}, "metadata": {}},
+                        )
+                    # how to clear output (if needed in the future):
+                    # self.send_response(self.iopub_socket, "clear_output")
             except
Exception as e: _logger.error("Exception occurred", exc_info=True) diff --git a/packages/kestrel_analytics_docker/pyproject.toml b/packages/kestrel_analytics_docker/pyproject.toml index 1f668918..3b9c9283 100644 --- a/packages/kestrel_analytics_docker/pyproject.toml +++ b/packages/kestrel_analytics_docker/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_analytics_docker" -version = "1.8.0" +version = "1.8.1" description = "Kestrel Docker Analytics Interface" readme = "README.rst" requires-python = ">=3.8" @@ -28,7 +28,7 @@ classifiers = [ dependencies = [ "kestrel_core>=1.8.0", - "docker>=6.1.3", + "docker>=7.0.0", ] [project.urls] diff --git a/packages/kestrel_core/pyproject.toml b/packages/kestrel_core/pyproject.toml index e8fcfa87..6d38a007 100644 --- a/packages/kestrel_core/pyproject.toml +++ b/packages/kestrel_core/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_core" -version = "1.8.1" +version = "1.8.2" description = "Kestrel Threat Hunting Language" readme = "README.rst" requires-python = ">=3.8" @@ -30,13 +30,13 @@ classifiers = [ ] dependencies = [ - "typeguard>=4.1.5", + "typeguard>=4.2.1", "pyyaml>=6.0.1", - "lark>=1.1.7", - "pandas>=2.0.3", - "pyarrow>=13.0.0", + "lark>=1.1.9", + "pandas>=2.0.3", # last version supporting Python 3.8 + "pyarrow>=15.0.2", "tabulate>=0.9.0", - "firepit>=2.3.32", + "firepit>=2.3.33", ] [project.optional-dependencies] diff --git a/packages/kestrel_core/src/kestrel/config.yaml b/packages/kestrel_core/src/kestrel/config.yaml index 182ddfe9..2470f465 100644 --- a/packages/kestrel_core/src/kestrel/config.yaml +++ b/packages/kestrel_core/src/kestrel/config.yaml @@ -5,7 +5,7 @@ language: default_datasource_schema: "stixshifter" default_analytics_schema: "python" -# how a Kestrel session is executed +# Kestrel session execution session: cache_directory_prefix: "kestrel-session-" # under system temp directory local_database_path: "local.db" diff --git a/packages/kestrel_datasource_stixshifter/pyproject.toml b/packages/kestrel_datasource_stixshifter/pyproject.toml index b4e4f830..05e831f7 100644 --- a/packages/kestrel_datasource_stixshifter/pyproject.toml +++ b/packages/kestrel_datasource_stixshifter/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_datasource_stixshifter" -version = "1.8.1" +version = "1.8.2" description = "Kestrel STIX-shifter Datasource Interface" readme = "README.rst" requires-python = ">=3.8" @@ -28,11 +28,11 @@ classifiers = [ dependencies = [ "kestrel_core>=1.8.1", - "lxml>=4.9.3", + "lxml>=5.2.1", "requests>=2.31.0", - "nest-asyncio>=1.5.8", - "stix-shifter==6.2.2", - "stix-shifter-utils==6.2.2", + "nest-asyncio>=1.6.0", + "stix-shifter==7.0.6", + "stix-shifter-utils==7.0.6", ] [project.optional-dependencies] diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py index 27df919a..73eb8ff8 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py @@ -22,6 +22,7 @@ SINGLE_BATCH_TIMEOUT = 60 COOL_DOWN_AFTER_TRANSMISSION = 0 ALLOW_DEV_CONNECTOR = False +VERIFY_CERT = True FAST_TRANSLATE_CONNECTORS = [] # Suggested: ["qradar", "elastic_ecs"] @@ -175,6 +176,14 @@ def get_datasource_from_profiles(profile_name, profiles): profile_name, ) + 
verify_cert = _extract_param_from_connection_config( + "verify_cert", + bool, + VERIFY_CERT, + connection, + profile_name, + ) + return ( connector_name, connection, @@ -182,6 +191,7 @@ def get_datasource_from_profiles(profile_name, profiles): retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, + verify_cert, ) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py index 370eecea..d090f003 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py @@ -12,8 +12,12 @@ _logger = logging.getLogger(__name__) -XPATH_PYPI_PKG_HOME = "/html/body/main/div[4]/div/div/div[1]/div[2]/ul/li[1]/a/@href" -XPATH_PYPI_PKG_SOURCE = "/html/body/main/div[4]/div/div/div[1]/div[2]/ul/li[2]/a/@href" +XPATH_PYPI_PKG_HOME = [ + f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[1]/a/@href" for i in range(5) +] +XPATH_PYPI_PKG_SOURCE = [ + f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[2]/a/@href" for i in range(5) +] STIX_SHIFTER_HOMEPAGE = "https://github.com/opencybersecurityalliance/stix-shifter" @@ -39,8 +43,16 @@ def verify_package_origin(connector_name, stixshifter_version, requests_verify=T ) try: - p_homepage = pypi_etree.xpath(XPATH_PYPI_PKG_HOME)[0] - p_source = pypi_etree.xpath(XPATH_PYPI_PKG_SOURCE)[0] + p_homepage = [ + urls + for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_HOME] + if urls + ][0][0] + p_source = [ + urls + for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_SOURCE] + if urls + ][0][0] except: raise DataSourceError( f'STIX-shifter connector for "{connector_name}" is not installed ' diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py index 487f7944..c3631f7a 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py @@ -11,6 +11,9 @@ from kestrel_datasource_stixshifter.worker import STOP_SIGN from kestrel_datasource_stixshifter.query import translate_query from kestrel_datasource_stixshifter.worker.transmitter import Transmitter +from kestrel_datasource_stixshifter.worker.utils import ( + disable_cert_verification_on_transmission, +) from stix_shifter.stix_transmission import stix_transmission @@ -26,6 +29,7 @@ def __init__(self, datasource_name): self.retrieval_batch_size, self.cool_down_after_transmission, self.allow_dev_connector, + self.verify_cert, ) = get_datasource_from_profiles(datasource_name, self.profiles) self.if_fast_translation = ( self.connector_name in self.kestrel_options["fast_translate"] @@ -72,6 +76,9 @@ def diagnose_ping(self): self.configuration_dict, ) + if not self.verify_cert: + disable_cert_verification_on_transmission(transmission) + result = transmission.ping() print() @@ -125,6 +132,7 @@ def diagnose_run_query_and_retrieval_result(self, stix_patterns, max_batch_cnt): self.configuration_dict, self.retrieval_batch_size, self.cool_down_after_transmission, + self.verify_cert, query, result_queue, max_batch_cnt * self.retrieval_batch_size, diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py 
b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py index a662ea4b..9435cebe 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py @@ -26,9 +26,9 @@ connection: host: elastic.securitylog.company.com port: 9200 - selfSignedCert: false # this means do NOT check cert indices: host101 options: # use any of this section when needed + verify_cert: false # allow invalid/expired/self-signed certificate retrieval_batch_size: 10000 # set to 10000 to match default Elasticsearch page size; Kestrel default across connectors: 2000 single_batch_timeout: 120 # increase it if hit 60 seconds (Kestrel default) timeout error for each batch of retrieval cool_down_after_transmission: 2 # seconds to cool down between data source API calls, required by some API such as sentinelone; Kestrel default: 0 @@ -127,11 +127,15 @@ """ +import multiprocessing from kestrel.datasource import AbstractDataSourceInterface from kestrel_datasource_stixshifter.config import load_profiles from kestrel_datasource_stixshifter.query import query_datasource +multiprocessing.set_start_method("spawn", force=True) + + class StixShifterInterface(AbstractDataSourceInterface): @staticmethod def schemes(): diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py index aeadfc83..cdb1a719 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py @@ -22,6 +22,7 @@ def transmit( retrieval_batch_size: int, translators_count: int, cool_down_after_transmission: int, + verify_cert: bool, queries: list, raw_records_queue: Queue, limit: Optional[int], @@ -34,6 +35,7 @@ def transmit( retrieval_batch_size, translators_count, cool_down_after_transmission, + verify_cert, queries, raw_records_queue, limit, diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py index fa0d61e5..46b07b7f 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py @@ -83,6 +83,7 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None): retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, + verify_cert, ) = map( copy.deepcopy, get_datasource_from_profiles(profile, config["profiles"]) ) @@ -123,6 +124,7 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None): retrieval_batch_size, config["options"]["translation_workers_count"], cool_down_after_transmission, + verify_cert, dsl["queries"], raw_records_queue, profile_limit, diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py index ca4cd1c0..31534781 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py @@ -6,7 +6,11 @@ from stix_shifter.stix_transmission import 
stix_transmission from kestrel_datasource_stixshifter.worker import STOP_SIGN -from kestrel_datasource_stixshifter.worker.utils import TransmissionResult, WorkerLog +from kestrel_datasource_stixshifter.worker.utils import ( + TransmissionResult, + WorkerLog, + disable_cert_verification_on_transmission, +) @typechecked @@ -19,6 +23,7 @@ def __init__( retrieval_batch_size: int, number_of_translators: int, cool_down_after_transmission: int, + verify_cert: bool, queries: list, output_queue: Queue, limit: Optional[int], @@ -31,6 +36,7 @@ def __init__( self.retrieval_batch_size = retrieval_batch_size self.number_of_translators = number_of_translators self.cool_down_after_transmission = cool_down_after_transmission + self.verify_cert = verify_cert self.queries = queries self.queue = output_queue self.limit = limit @@ -43,6 +49,7 @@ def run(self): self.configuration_dict, self.retrieval_batch_size, self.cool_down_after_transmission, + self.verify_cert, query, self.queue, self.limit, @@ -65,6 +72,7 @@ def __init__( configuration_dict: dict, retrieval_batch_size: int, cool_down_after_transmission: int, + verify_cert: bool, query: str, output_queue: Queue, limit: Optional[int], @@ -76,6 +84,7 @@ def __init__( self.configuration_dict = configuration_dict self.retrieval_batch_size = retrieval_batch_size self.cool_down_after_transmission = cool_down_after_transmission + self.verify_cert = verify_cert self.query = query self.queue = output_queue self.limit = limit @@ -87,6 +96,11 @@ def run(self): self.connection_dict, self.configuration_dict, ) + + # hack stix-shifter v7 to support "disable certificate verification" + if not self.verify_cert: + disable_cert_verification_on_transmission(self.transmission) + search_meta_result = self.transmission.query(self.query) if search_meta_result["success"]: diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py index 9a8d00af..406b4570 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py @@ -1,6 +1,8 @@ +import ssl from typing import Optional, Union, List from dataclasses import dataclass from pandas import DataFrame +from stix_shifter.stix_transmission.stix_transmission import StixTransmission STOP_SIGN = "STOP" @@ -30,3 +32,18 @@ class TranslationResult: success: bool data: Union[None, dict, DataFrame] log: Optional[WorkerLog] + + +def disable_cert_verification_on_transmission(trans: StixTransmission): + ot = trans.entry_point.transmission() + + # currently all the following attributes point to the same object + # iterate through them in case stix-shifter code changes in the future + for attr in [ + x + for x in dir(ot) + if x.startswith("_BaseEntryPoint__") and x.endswith("_connector") + ]: + c = getattr(ot, attr) + c.api_client.client.ssl_context.check_hostname = False + c.api_client.client.ssl_context.verify_mode = ssl.CERT_NONE diff --git a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py index 89b62efa..610a513c 100644 --- a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py +++ b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py @@ -78,6 +78,7 @@ def test_yaml_profiles_refresh(tmp_path): single_batch_timeout: 120 cool_down_after_transmission: 5 
allow_dev_connector: True + verify_cert: false dialects: - beats config: @@ -106,7 +107,7 @@ def test_yaml_profiles_refresh(tmp_path): ss_config = s.config["datasources"]["kestrel_datasource_stixshifter"] ss_profiles = ss_config["profiles"] - connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector = get_datasource_from_profiles("host101", ss_profiles) + connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles) assert connector_name == "elastic_ecs" assert configuration["auth"]["id"] == "profileA" assert configuration["auth"]["api_key"] == "qwer" @@ -114,6 +115,7 @@ def test_yaml_profiles_refresh(tmp_path): assert connection["options"]["result_limit"] == 2000 * 2 assert retrieval_batch_size == 2000 assert cool_down_after_transmission == 0 + assert verify_cert == True with open(profile_file, "w") as pf: pf.write(profileB) @@ -122,7 +124,7 @@ def test_yaml_profiles_refresh(tmp_path): # need to refresh the pointers since the dict is updated ss_profiles = ss_config["profiles"] - connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector = get_datasource_from_profiles("host101", ss_profiles) + connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles) assert connector_name == "elastic_ecs" assert configuration["auth"]["id"] == "profileB" assert configuration["auth"]["api_key"] == "xxxxxx" @@ -131,5 +133,6 @@ def test_yaml_profiles_refresh(tmp_path): assert retrieval_batch_size == 10000 assert cool_down_after_transmission == 5 assert allow_dev_connector == True + assert verify_cert == False del os.environ["KESTREL_STIXSHIFTER_CONFIG"] diff --git a/packages/kestrel_jupyter/pyproject.toml b/packages/kestrel_jupyter/pyproject.toml index 70887889..888a3cac 100644 --- a/packages/kestrel_jupyter/pyproject.toml +++ b/packages/kestrel_jupyter/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_jupyter" -version = "1.8.2" +version = "1.8.3" description = "Kestrel Jupyter Kernel" readme = "README.rst" requires-python = ">=3.8" @@ -26,11 +26,11 @@ classifiers = [ ] dependencies = [ - "kestrel_core==1.8.1", + "kestrel_core==1.8.2", "kestrel_datasource_stixbundle==1.8.0", - "kestrel_datasource_stixshifter==1.8.1", + "kestrel_datasource_stixshifter==1.8.2", "kestrel_analytics_python==1.8.0", - "kestrel_analytics_docker==1.8.0", + "kestrel_analytics_docker==1.8.1", "jupyterlab-server", "jupyterlab", "jupyter_client",