diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml
index bb769852..91a98060 100644
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -22,9 +22,9 @@ jobs:
codecov:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install Python Tools
diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml
index 5adbab58..6c4e59e7 100644
--- a/.github/workflows/code-style.yml
+++ b/.github/workflows/code-style.yml
@@ -22,9 +22,9 @@ jobs:
codestyle:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v3
+ uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install Kestrel package
diff --git a/.github/workflows/kaas-docker-image.yml b/.github/workflows/kaas-docker-image.yml
index 1d36879e..1738aa07 100644
--- a/.github/workflows/kaas-docker-image.yml
+++ b/.github/workflows/kaas-docker-image.yml
@@ -14,7 +14,7 @@ jobs:
run: sleep 600s
shell: bash
- name: Checkout
- uses: actions/checkout@v3.5.3
+ uses: actions/checkout@v4
- name: Info
run: echo "Parameters. ${{ github.event.base_ref }}, ${{ github.ref_type }}, ${{ github.ref }}"
- name: Log in to Docker Hub
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
index 47c52fb2..343fcf29 100644
--- a/.github/workflows/publish-to-pypi.yml
+++ b/.github/workflows/publish-to-pypi.yml
@@ -22,8 +22,8 @@ jobs:
shell: bash
working-directory: ./packages/${{ matrix.package }}
steps:
- - uses: actions/checkout@v3
- - uses: actions/setup-python@v3
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install building environment
diff --git a/.github/workflows/stixshifter-module-verification.yml b/.github/workflows/stixshifter-module-verification.yml
index f9b1265c..66949595 100644
--- a/.github/workflows/stixshifter-module-verification.yml
+++ b/.github/workflows/stixshifter-module-verification.yml
@@ -15,9 +15,9 @@ jobs:
shell: bash
working-directory: ./packages/kestrel_datasource_stixshifter
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v3
+ uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install Python Tools
diff --git a/.github/workflows/unit-testing-kestrel2.yml b/.github/workflows/unit-testing-kestrel2.yml
index 06b79fde..4113a1e1 100644
--- a/.github/workflows/unit-testing-kestrel2.yml
+++ b/.github/workflows/unit-testing-kestrel2.yml
@@ -30,9 +30,9 @@ jobs:
shell: bash
working-directory: ./packages-nextgen/kestrel_core
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Python Tools
@@ -42,3 +42,28 @@ jobs:
- name: Unit testing
run: pytest -vv
+ test-kestrel-interface-opensearch:
+ strategy:
+ matrix:
+ os: [ubuntu-latest, macos-latest]
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ runs-on: ${{ matrix.os }}
+ defaults:
+ run:
+ shell: bash
+ working-directory: ./packages-nextgen/kestrel_interface_opensearch
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install Python Tools
+ run: pip install --upgrade pip setuptools wheel pytest
+ - name: Install kestrel_core
+ working-directory: ./packages-nextgen/kestrel_core
+ run: pip install .
+ - name: Install kestrel_interface_opensearch
+ run: pip install .
+ - name: Unit testing
+ run: pytest -vv
diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml
index 1733bfba..8af6b843 100644
--- a/.github/workflows/unit-testing.yml
+++ b/.github/workflows/unit-testing.yml
@@ -23,16 +23,16 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
- python-version: ['3.8', '3.9', '3.10', '3.11']
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash
working-directory: ./packages/kestrel_core
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Python Tools
@@ -52,16 +52,16 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
- python-version: ['3.8', '3.9', '3.10', '3.11.6']
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash
working-directory: ./packages/kestrel_datasource_stixshifter
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Python Tools
@@ -78,16 +78,16 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
- python-version: ['3.8', '3.9', '3.10', '3.11']
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash
working-directory: ./packages/kestrel_analytics_python
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Python Tools
@@ -107,16 +107,16 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
- python-version: ['3.8', '3.9', '3.10', '3.11']
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash
working-directory: ./packages/kestrel_jupyter
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install Python Tools
diff --git a/.github/workflows/unused-import.yml b/.github/workflows/unused-import.yml
index 150c9b34..e1174ba5 100644
--- a/.github/workflows/unused-import.yml
+++ b/.github/workflows/unused-import.yml
@@ -22,9 +22,9 @@ jobs:
unusedimports:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Set up Python
- uses: actions/setup-python@v3
+ uses: actions/setup-python@v5
with:
python-version: '3.10'
- name: Install Kestrel package
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 083ad89d..bf88f8dc 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -9,6 +9,22 @@ The format is based on `Keep a Changelog`_.
Unreleased
==========
+1.8.3 (2024-04-22)
+==================
+
+Added
+-----
+
+- Support for disabling certificate verification in stix-shifter v7 via the config option `verify_cert` (sketched below)
+- Documentation on how to use the `verify_cert` option in the stix-shifter interface
+- Python 3.12 support (steering multiprocessing library behavior to avoid a CPU-blocking issue)
+- More generic HTML parsing of PyPI pages for stix-shifter connector verification
+
+Changed
+-------
+
+- stix-shifter upgraded to v7 (v7.0.6), the first version that drops support for invalid certificates
+
1.8.2 (2024-02-20)
==================
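
The `verify_cert` option above is set per connection in the stix-shifter interface config. A hypothetical sketch of a stixshifter.yaml profile (profile name, connector, and connection fields are illustrative; only `verify_cert` is the new option):

    profiles:
      my_datasource:           # illustrative profile name
        connector: elastic_ecs
        connection:
          host: elastic.example.com
          port: 9200
          options:
            verify_cert: false # new option: disable TLS certificate verification
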
diff --git a/README.rst b/README.rst
index 1edf91ad..cdcd4c06 100644
--- a/README.rst
+++ b/README.rst
@@ -2,31 +2,11 @@
:width: 460
:alt: Kestrel Threat Hunting Language
-.. image:: https://readthedocs.org/projects/kestrel/badge/?version=latest
- :target: https://kestrel.readthedocs.io/en/latest/?badge=latest
- :alt: Documentation Status
-
-.. image:: https://img.shields.io/pypi/v/kestrel-jupyter
- :target: https://pypi.python.org/pypi/kestrel-jupyter
- :alt: Latest Version
-
-.. image:: https://img.shields.io/pypi/dm/kestrel-core
- :target: https://pypistats.org/packages/kestrel-core
- :alt: PyPI Downloads
-
-.. image:: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang/branch/develop/graph/badge.svg?token=HM4ax10IW3
- :target: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang
- :alt: Code Coverage
-
-.. image:: https://img.shields.io/badge/code%20style-black-000000.svg
- :target: https://github.com/psf/black
- :alt: Code Style: Black
-
|
-**[News]** Kestrel session at `Black Hat USA 2023`_
+|readthedocs| |pypi| |downloads| |codecoverage| |black|
---------
+|
Kestrel is a threat hunting language aiming to make cyber threat hunting *fast*
by providing a layer of abstraction to build reusable, composable, and
@@ -215,3 +195,24 @@ Connecting With The Community
.. _contributing guideline: CONTRIBUTING.rst
.. _governance documentation: GOVERNANCE.rst
.. _Apache License 2.0: LICENSE.md
+
+
+.. |readthedocs| image:: https://readthedocs.org/projects/kestrel/badge/?version=latest
+ :target: https://kestrel.readthedocs.io/en/latest/?badge=latest
+ :alt: Documentation Status
+
+.. |pypi| image:: https://img.shields.io/pypi/v/kestrel-jupyter
+ :target: https://pypi.python.org/pypi/kestrel-jupyter
+ :alt: Latest Version
+
+.. |downloads| image:: https://img.shields.io/pypi/dm/kestrel-core
+ :target: https://pypistats.org/packages/kestrel-core
+ :alt: PyPI Downloads
+
+.. |codecoverage| image:: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang/branch/develop/graph/badge.svg?token=HM4ax10IW3
+ :target: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang
+ :alt: Code Coverage
+
+.. |black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
+ :target: https://github.com/psf/black
+ :alt: Code Style: Black
diff --git a/docs/installation/runtime.rst b/docs/installation/runtime.rst
index c220f264..b70d4072 100644
--- a/docs/installation/runtime.rst
+++ b/docs/installation/runtime.rst
@@ -8,7 +8,11 @@ please use Python inside Windows Subsystem for Linux (WSL).
General Requirements
====================
-Python 3.8 is required. Follow the `Python installation guide`_ to install or upgrade Python.
+Python 3 is required.
+
+* End-of-life Python versions are not supported. Check `Python releases`_.
+
+* Follow the `Python installation guide`_ to install or upgrade Python.
OS-specific Requirements
========================
@@ -190,6 +194,7 @@ What's to Do Next
- :doc:`../language/index`
.. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
+.. _Python releases: https://devguide.python.org/versions/
.. _Python virtual environment: https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/
.. _Xcode: https://developer.apple.com/xcode/
.. _kestrel-lang: http://github.com/opencybersecurityalliance/kestrel-lang
diff --git a/packages-nextgen/kestrel_core/pyproject.toml b/packages-nextgen/kestrel_core/pyproject.toml
index 61f48941..e57a5bca 100644
--- a/packages-nextgen/kestrel_core/pyproject.toml
+++ b/packages-nextgen/kestrel_core/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
"mashumaro>=3.10",
"networkx>=3.1", # networkx==3.2.1 only for Python>=3.9
"SQLAlchemy>=2.0.23",
+ "dpath>=2.1.6",
]
[project.optional-dependencies]
diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py
index b4f5f101..4d1a94bb 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py
@@ -1,23 +1,24 @@
+from __future__ import annotations
from pandas import DataFrame
-from typing import MutableMapping
+from typing import Iterable, MutableMapping
from uuid import UUID
from abc import abstractmethod
from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER
-from kestrel.interface.datasource import AbstractDataSourceInterface
+from kestrel.interface import AbstractInterface
-class AbstractCache(AbstractDataSourceInterface, MutableMapping):
+class AbstractCache(AbstractInterface, MutableMapping):
"""Base class for Kestrel cache
- Additional @abstractmethod from AbstractDataSourceInterface:
+ Additional @abstractmethod from AbstractInterface:
- evaluate_graph()
"""
- @property
- def name(self):
- return CACHE_INTERFACE_IDENTIFIER
+ @staticmethod
+ def schemes() -> Iterable[str]:
+ return [CACHE_INTERFACE_IDENTIFIER]
@abstractmethod
def __del__(self):
@@ -28,6 +29,8 @@ def __del__(self):
def __getitem__(self, instruction_id: UUID) -> DataFrame:
"""Get the dataframe for the cached instruction
+ This method also makes `uuid in cache` membership tests work automatically
+
Parameters:
instruction_id: id of the instruction
@@ -57,16 +60,32 @@ def __delitem__(self, instruction_id: UUID):
"""
...
- def store(self, instruction_id: UUID, data: DataFrame):
- self[instruction_id] = data
+ @abstractmethod
+ def get_virtual_copy(self) -> AbstractCache:
+ """Create a virtual cache object from this cache
- def __contain__(self, instruction_id: UUID) -> bool:
- """Whether the evaluated instruction is cached
+ This method needs to reimplement __del__, __getitem__, __setitem__, and
+ __delitem__ so they do not actually hit the storage medium, e.g., SQLite.
- Parameters:
- instruction_id: id of the instruction
+ The virtual cache is useful for the implementation of the Explain()
+ instruction, pretending the dependent graphs are evaluated, so the
+ evaluation can continue towards the Return() instruction.
+
+ Because Python looks up special methods on the type rather than the
+ instance, replacing __getitem__, __setitem__, and __delitem__ on the
+ object does not help. It is better to derive a subclass and replace the
+ object's __class__ with the subclass so the new methods are invoked.
+
+ https://docs.python.org/3/reference/datamodel.html#special-lookup
+
+ Also, the Python garbage collector may clean up the virtual cache when
+ it is no longer referenced, so the __del__ method should be reimplemented
+ to make sure the storage medium is not closed.
"""
- return instruction_id in self.cache_catalog
+ ...
+
+ def store(self, instruction_id: UUID, data: DataFrame):
+ self[instruction_id] = data
def __iter__(self) -> UUID:
"""Return UUIDs of instructions cached
diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py
index e0527b9c..87557222 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py
@@ -1,3 +1,4 @@
+from copy import copy
from pandas import DataFrame
from typeguard import typechecked
from uuid import UUID
@@ -6,19 +7,22 @@
MutableMapping,
Optional,
Iterable,
+ Any,
)
from kestrel.cache.base import AbstractCache
from kestrel.ir.graph import IRGraphEvaluable
+from kestrel.display import GraphletExplanation, NativeQuery
from kestrel.ir.instructions import (
Instruction,
Return,
+ Explain,
Variable,
Filter,
SourceInstruction,
TransformingInstruction,
)
-from kestrel.interface.datasource.codegen.dataframe import (
+from kestrel.interface.codegen.dataframe import (
evaluate_source_instruction,
evaluate_transforming_instruction,
)
@@ -44,7 +48,7 @@ def __getitem__(self, instruction_id: UUID) -> DataFrame:
return self.cache[self.cache_catalog[instruction_id]]
def __delitem__(self, instruction_id: UUID):
- del self.cache[instruction_id]
+ del self.cache[self.cache_catalog[instruction_id]]
del self.cache_catalog[instruction_id]
def __setitem__(
@@ -52,23 +56,42 @@ def __setitem__(
instruction_id: UUID,
data: DataFrame,
):
- self.cache[instruction_id] = data
- self.cache_catalog[instruction_id] = instruction_id
+ self.cache_catalog[instruction_id] = instruction_id.hex
+ self.cache[self.cache_catalog[instruction_id]] = data
+
+ def get_virtual_copy(self) -> AbstractCache:
+ v = copy(self)
+ v.cache_catalog = copy(self.cache_catalog)
+ v.__class__ = InMemoryCacheVirtual
+ return v
def evaluate_graph(
self,
graph: IRGraphEvaluable,
instructions_to_evaluate: Optional[Iterable[Instruction]] = None,
) -> Mapping[UUID, DataFrame]:
+ mapping = {}
if not instructions_to_evaluate:
instructions_to_evaluate = graph.get_sink_nodes()
+ for instruction in instructions_to_evaluate:
+ df = self._evaluate_instruction_in_graph(graph, instruction)
+ self[instruction.id] = df
+ mapping[instruction.id] = df
+ return mapping
+ def explain_graph(
+ self,
+ graph: IRGraphEvaluable,
+ instructions_to_explain: Optional[Iterable[Instruction]] = None,
+ ) -> Mapping[UUID, GraphletExplanation]:
mapping = {}
- for ins in instructions_to_evaluate:
- df = self._evaluate_instruction_in_graph(graph, ins)
- self[ins.id] = df
- mapping[ins.id] = df
-
+ if not instructions_to_explain:
+     instructions_to_explain = graph.get_sink_nodes()
+ for instruction in instructions_to_explain:
+ dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction)
+ graph_dict = dep_graph.to_dict()
+ query = NativeQuery("DataFrame", "")
+ mapping[instruction.id] = GraphletExplanation(graph_dict, query)
return mapping
def _evaluate_instruction_in_graph(
@@ -81,7 +104,7 @@ def _evaluate_instruction_in_graph(
elif isinstance(instruction, TransformingInstruction):
trunk, r2n = graph.get_trunk_n_branches(instruction)
df = self._evaluate_instruction_in_graph(graph, trunk)
- if isinstance(instruction, Return):
+ if isinstance(instruction, (Return, Explain)):
pass
elif isinstance(instruction, Variable):
self[instruction.id] = df
@@ -99,3 +122,15 @@ def _evaluate_instruction_in_graph(
else:
raise NotImplementedError(f"Unknown instruction type: {instruction}")
return df
+
+
+@typechecked
+class InMemoryCacheVirtual(InMemoryCache):
+ def __getitem__(self, instruction_id: UUID) -> Any:
+ return self.cache_catalog[instruction_id]
+
+ def __delitem__(self, instruction_id: UUID):
+ del self.cache_catalog[instruction_id]
+
+ def __setitem__(self, instruction_id: UUID, data: Any):
+ self.cache_catalog[instruction_id] = "virtual" + instruction_id.hex
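
A minimal usage sketch of the virtual copy, assuming `InMemoryCache` is constructible with no arguments:

    from uuid import uuid4
    from pandas import DataFrame
    from kestrel.cache.inmemory import InMemoryCache

    cache = InMemoryCache()
    uid = uuid4()
    cache[uid] = DataFrame({"pid": [1, 2]})

    virtual = cache.get_virtual_copy()  # now an InMemoryCacheVirtual
    virtual[uuid4()] = DataFrame()      # only records a catalog entry
    print(virtual[uid])                 # the catalog string, not a DataFrame
    print(cache[uid].shape)             # real cache unaffected: (2, 1)
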
diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py
index 545513a5..97b8fb13 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py
@@ -1,5 +1,6 @@
import logging
-from typing import Iterable, Mapping, Optional, Union
+from copy import copy
+from typing import Iterable, Mapping, Optional, Union, Any
from uuid import UUID
import sqlalchemy
@@ -8,12 +9,14 @@
from typeguard import typechecked
from kestrel.cache.base import AbstractCache
-from kestrel.interface.datasource.codegen.sql import SqlTranslator
+from kestrel.interface.codegen.sql import SqlTranslator
from kestrel.ir.graph import IRGraphEvaluable
+from kestrel.display import GraphletExplanation, NativeQuery
from kestrel.ir.instructions import (
Construct,
Instruction,
Return,
+ Explain,
Variable,
Filter,
SourceInstruction,
@@ -28,12 +31,13 @@
class SqliteTranslator(SqlTranslator):
def __init__(self, from_obj: Union[SqlTranslator, str]):
if isinstance(from_obj, SqlTranslator):
- fc = from_obj.query.subquery()
+ fc = from_obj.query.subquery(name=from_obj.associated_variable)
else: # str to represent table name
fc = sqlalchemy.table(from_obj)
super().__init__(
sqlalchemy.dialects.sqlite.dialect(), dt_parser, "time", fc
) # FIXME: need mapping for timestamp?
+ self.associated_variable = None
@typechecked
@@ -45,12 +49,12 @@ def __init__(
):
super().__init__()
- basename = self.session_id or "cache"
- path = f"{basename}.db"
+ basename = session_id or "cache"
+ self.db_path = f"{basename}.db"
# for an absolute file path, the three slashes are followed by the absolute path
# for a relative path, it's also three slashes?
- self.engine = sqlalchemy.create_engine(f"sqlite:///{path}")
+ self.engine = sqlalchemy.create_engine(f"sqlite:///{self.db_path}")
self.connection = self.engine.connect()
if initial_cache:
@@ -77,6 +81,12 @@ def __setitem__(
self.cache_catalog[instruction_id] = table_name
data.to_sql(table_name, con=self.connection, if_exists="replace", index=False)
+ def get_virtual_copy(self) -> AbstractCache:
+ v = copy(self)
+ v.cache_catalog = copy(self.cache_catalog)
+ v.__class__ = SqliteCacheVirtual
+ return v
+
def evaluate_graph(
self,
graph: IRGraphEvaluable,
@@ -93,6 +103,22 @@ def evaluate_graph(
mapping[instruction.id] = read_sql(translator.result(), self.connection)
return mapping
+ def explain_graph(
+ self,
+ graph: IRGraphEvaluable,
+ instructions_to_explain: Optional[Iterable[Instruction]] = None,
+ ) -> Mapping[UUID, GraphletExplanation]:
+ mapping = {}
+ if not instructions_to_explain:
+ instructions_to_explain = graph.get_sink_nodes()
+ for instruction in instructions_to_explain:
+ dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction)
+ graph_dict = dep_graph.to_dict()
+ translator = self._evaluate_instruction_in_graph(graph, instruction)
+ query = NativeQuery("SQL", str(translator.result_w_literal_binds()))
+ mapping[instruction.id] = GraphletExplanation(graph_dict, query)
+ return mapping
+
def _evaluate_instruction_in_graph(
self,
graph: IRGraphEvaluable,
@@ -118,12 +144,13 @@ def _evaluate_instruction_in_graph(
translator = self._evaluate_instruction_in_graph(graph, trunk)
if isinstance(instruction, SolePredecessorTransformingInstruction):
- if isinstance(instruction, Return):
+ if isinstance(instruction, (Return, Explain)):
pass
elif isinstance(instruction, Variable):
# start a new translator and use previous one as subquery
# this allows using the variable as a dependent node
# if the variable is a sink, `SELECT * FROM (subquery)` also works
+ translator.associated_variable = instruction.name
translator = SqliteTranslator(translator)
else:
translator.add_instruction(instruction)
@@ -147,3 +174,18 @@ def _evaluate_instruction_in_graph(
raise NotImplementedError(f"Unknown instruction type: {instruction}")
return translator
+
+
+@typechecked
+class SqliteCacheVirtual(SqliteCache):
+ def __getitem__(self, instruction_id: UUID) -> Any:
+ return self.cache_catalog[instruction_id]
+
+ def __delitem__(self, instruction_id: UUID):
+ del self.cache_catalog[instruction_id]
+
+ def __setitem__(self, instruction_id: UUID, data: Any):
+ self.cache_catalog[instruction_id] = instruction_id.hex + "v"
+
+ def __del__(self):
+ pass
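
Passing the variable name to `subquery(name=...)` aliases each subquery after its Kestrel variable, so the SQL surfaced by EXPLAIN reads naturally. A plain SQLAlchemy sketch of the effect (output formatting approximate):

    import sqlalchemy

    events = sqlalchemy.table("events", sqlalchemy.column("pid"))
    inner = sqlalchemy.select(events).subquery(name="procs")
    outer = sqlalchemy.select(inner)
    print(outer)
    # SELECT procs.pid FROM (SELECT events.pid FROM events) AS procs
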
diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py
index 8911b8a7..0b912e7a 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py
@@ -1,24 +1,33 @@
import os
import yaml
-import pathlib
+from pathlib import Path
import logging
+from typeguard import typechecked
+from typing import Mapping, Union
from kestrel.utils import update_nested_dict, load_data_file
-CONFIG_DIR_DEFAULT = pathlib.Path.home() / ".config" / "kestrel"
+CONFIG_DIR_DEFAULT = Path.home() / ".config" / "kestrel"
CONFIG_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "kestrel.yaml"
CONFIG_PATH_ENV_VAR = "KESTREL_CONFIG" # override CONFIG_PATH_DEFAULT if provided
_logger = logging.getLogger(__name__)
-def load_default_config():
+@typechecked
+def load_default_config() -> Mapping:
_logger.debug(f"Loading default config file...")
default_config = load_data_file("kestrel.config", "kestrel.yaml")
- return yaml.safe_load(os.path.expandvars(default_config))
+ config_with_envvar_expanded = os.path.expandvars(default_config)
+ config_content = yaml.safe_load(config_with_envvar_expanded)
+ return config_content
-def load_user_config(config_path_env_var, config_path_default):
+@typechecked
+def load_user_config(
+ config_path_env_var: str, config_path_default: Union[str, Path]
+) -> Mapping:
+ config_path_default = config_path_default.absolute().as_posix()
config_path = os.getenv(config_path_env_var, config_path_default)
config_path = os.path.expanduser(config_path)
config = {}
@@ -32,13 +41,10 @@ def load_user_config(config_path_env_var, config_path_default):
return config
-def load_config():
+@typechecked
+def load_config() -> Mapping:
config_default = load_default_config()
config_user = load_user_config(CONFIG_PATH_ENV_VAR, CONFIG_PATH_DEFAULT)
_logger.debug(f"User configuration loaded: {config_user}")
_logger.debug(f"Updating default config with user config...")
return update_nested_dict(config_default, config_user)
-
-
-if __name__ == "__main__":
- ...
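
A minimal sketch of the override flow above (the config path is hypothetical):

    import os

    # point Kestrel at a user config file before loading
    os.environ["KESTREL_CONFIG"] = "/tmp/kestrel-test.yaml"

    from kestrel.config.utils import load_config

    config = load_config()  # defaults deep-merged with user overrides
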
diff --git a/packages-nextgen/kestrel_core/src/kestrel/display.py b/packages-nextgen/kestrel_core/src/kestrel/display.py
index 49758f4d..e6729f85 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/display.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/display.py
@@ -1 +1,34 @@
+from typing import List, Union, Mapping
+from dataclasses import dataclass
+from mashumaro.mixins.json import DataClassJSONMixin
+from pandas import DataFrame
+
+
+@dataclass
+class NativeQuery(DataClassJSONMixin):
+ # which query language
+ language: str
+ # what query statement
+ statement: str
+
+
+@dataclass
+class GraphletExplanation(DataClassJSONMixin):
+ # serialized IRGraph
+ graph: Mapping
+ # data source query
+ query: NativeQuery
+
+
+@dataclass
+class GraphExplanation(DataClassJSONMixin):
+ graphlets: List[GraphletExplanation]
+
+
# Kestrel Display Object
+Display = Union[
+ str,
+ dict,
+ DataFrame,
+ GraphExplanation,
+]
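
Since `Display` is a plain `Union`, a consumer dispatches on type. A minimal sketch (the `render` helper is hypothetical) grounded on the dataclasses above:

    from pandas import DataFrame
    from kestrel.display import Display, GraphExplanation

    def render(display: Display) -> str:
        if isinstance(display, GraphExplanation):
            return "\n".join(
                f"[{g.query.language}] {g.query.statement}"
                for g in display.graphlets
            )
        if isinstance(display, DataFrame):
            return display.to_string()
        return str(display)
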
diff --git a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py
index ae278f9a..cd088afe 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py
@@ -74,6 +74,10 @@ class DuplicatedReferenceInFilter(KestrelError):
pass
+class MissingReferenceInFilter(KestrelError):
+ pass
+
+
class InvalidSerializedDatasourceInterfaceCacheCatalog(KestrelError):
pass
@@ -90,23 +94,19 @@ class InterfaceNotFound(KestrelError):
pass
-class InterfaceNameCollision(KestrelError):
- pass
-
-
class IRGraphMissingNode(KestrelError):
pass
-class DataSourceInterfaceNotFound(KestrelError):
+class InterfaceNotConfigured(KestrelError):
pass
-class InvalidDataSourceInterfaceImplementation(KestrelError):
+class InvalidInterfaceImplementation(KestrelError):
pass
-class ConflictingDataSourceInterfaceScheme(KestrelError):
+class ConflictingInterfaceScheme(KestrelError):
pass
diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py
index fcbab5b4..cb1f897f 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py
@@ -1,5 +1,6 @@
# Lark Transformer
+import logging
from datetime import datetime, timedelta
from functools import reduce
@@ -7,6 +8,7 @@
from lark import Transformer, Token
from typeguard import typechecked
+from kestrel.mapping.data_model import translate_comparison_to_ocsf
from kestrel.utils import unescape_quoted_string
from kestrel.ir.filter import (
FExpression,
@@ -41,10 +43,14 @@
Return,
Sort,
Variable,
+ Explain,
)
from kestrel.exceptions import IRGraphMissingNode
+_logger = logging.getLogger(__name__)
+
+
DEFAULT_VARIABLE = "_"
DEFAULT_SORT_ORDER = "DESC"
@@ -94,17 +100,29 @@ def _map_filter_exp(
if ":" not in field:
field = f"{entity_name}:{field}"
# map field to new syntax (e.g. STIX to OCSF)
- map_result = property_map.get(field, filter_exp.field)
+ # TODO: ECS to OCSF? Would need to merge STIX and ECS data model maps.
+ map_result = translate_comparison_to_ocsf(
+ property_map, field, filter_exp.op, filter_exp.value
+ )
# Build a MultiComp if field maps to several values
- if isinstance(map_result, (list, tuple)):
- op = filter_exp.op
- value = filter_exp.value
+ if len(map_result) > 1:
filter_exp = MultiComp(
- ExpOp.OR, [_create_comp(field, op, value) for field in map_result]
+ ExpOp.OR,
+ [_create_comp(field, op, value) for field, op, value in map_result],
)
- else: # change the name of the field if it maps to a single value
- filter_exp.field = map_result
-
+ elif len(map_result) == 1: # it maps to a single value
+ mapping = map_result[0]
+ _logger.debug("mapping = %s", mapping)
+ field = mapping[0]
+ prefix = f"{entity_name}."
+ if field.startswith(prefix):
+ # Need to prune the entity name
+ field = field[len(prefix) :]
+ filter_exp.field = field
+ filter_exp.op = mapping[1]
+ filter_exp.value = mapping[2]
+ else: # pass-through
+ pass
# TODO: for RefComparison, map the attribute in value (may not be possible here)
elif isinstance(filter_exp, BoolExp):
@@ -151,7 +169,7 @@ def __init__(
self.default_sort_order = default_sort_order
self.token_prefix = token_prefix
self.entity_map = entity_map
- self.property_map = property_map
+ self.property_map = property_map # TODO: rename to data_model_map?
super().__init__()
def start(self, args):
@@ -371,3 +389,10 @@ def disp(self, args):
graph, root = args[0]
graph.add_node(Return(), root)
return graph
+
+ def explain(self, args):
+ graph = IRGraph()
+ reference = graph.add_node(Reference(args[0].value))
+ explain = graph.add_node(Explain(), reference)
+ graph.add_node(Return(), explain)
+ return graph
diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark b/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark
index eda6958c..1e00bfc9 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark
+++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark
@@ -28,10 +28,11 @@ assignment: VARIABLE "=" expression
| sort
?command_no_result: apply
+ | explain
+ | describe
| disp
| info
| save
- | describe
//
// All commands
@@ -61,6 +62,8 @@ save: "SAVE"i VARIABLE "TO"i stdpath
describe: "DESCRIBE"i var_attr
+explain: "EXPLAIN"i VARIABLE
+
//
// Variable definition
//
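
The new `explain` rule admits hunt steps like the following sketch (the GET line follows existing Kestrel syntax; the data source URI is hypothetical):

    procs = GET process FROM stixshifter://edr WHERE name = "powershell.exe"
    EXPLAIN procs
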
diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py
index e5bcbdab..0ff482c5 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py
@@ -1,14 +1,20 @@
# parse Kestrel syntax, apply frontend mapping, transform to IR
+import logging
+import os
from itertools import chain
from kestrel.frontend.compile import _KestrelT
+from kestrel.mapping.data_model import reverse_mapping
from kestrel.utils import load_data_file
from lark import Lark
-import os
from typeguard import typechecked
import yaml
+
+_logger = logging.getLogger(__name__)
+
+
frontend_mapping = {}
@@ -21,9 +27,13 @@ def get_mapping(mapping_type: str, mapping_package: str, mapping_filepath: str)
try:
mapping_str = load_data_file(mapping_package, mapping_filepath)
mapping = yaml.safe_load(mapping_str)
+ if mapping_type == "property":
+ # New data model map is always OCSF->native
+ mapping = reverse_mapping(mapping)
frontend_mapping[mapping_type] = mapping
except Exception as ex:
- mapping = None
+ _logger.error("Failed to load %s", mapping_str, exc_info=ex)
+ mapping = None # FIXME: this is not a dict
return mapping
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py
index e69de29b..3c4b25e5 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py
@@ -0,0 +1,2 @@
+from kestrel.interface.base import AbstractInterface
+from kestrel.interface.manager import InterfaceManager
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py
similarity index 70%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/base.py
index 0e730d89..50f5601f 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py
@@ -9,6 +9,7 @@
Iterable,
)
+from kestrel.display import GraphletExplanation
from kestrel.ir.instructions import Instruction
from kestrel.ir.graph import IRGraphEvaluable
from kestrel.exceptions import (
@@ -16,11 +17,11 @@
)
-MODULE_PREFIX = "kestrel_datasource_"
+MODULE_PREFIX = "kestrel_interface_"
-class AbstractDataSourceInterface(ABC):
- """Abstract class for datasource interface
+class AbstractInterface(ABC):
+ """Abstract class for datasource/analytics interface
Concepts:
@@ -43,7 +44,6 @@ def __init__(
session_id: Optional[UUID] = None,
):
self.session_id = session_id
- self.datasources: Mapping[str, str] = {}
self.cache_catalog: MutableMapping[UUID, str] = {}
if serialized_cache_catalog:
@@ -52,12 +52,14 @@ def __init__(
except:
raise InvalidSerializedDatasourceInterfaceCacheCatalog()
- @property
+ # Python 3.13 will drop support for chaining @classmethod and @property,
+ # so use @staticmethod instead (cannot make it a property)
+ @staticmethod
@abstractmethod
- def name(self) -> str:
- """The name of the interface
+ def schemes() -> Iterable[str]:
+ """The schemes to specify the interface
- The name should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*``
+ Each scheme should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*``
"""
...
@@ -97,7 +99,7 @@ def evaluate_graph(
Parameters:
- graph: The IRGraph with zero or one interface
+ graph: The evaluable IRGraph
instructions_to_evaluate: instructions to evaluate and return; by default, it will be all Return instructions in the graph
@@ -107,6 +109,26 @@ def evaluate_graph(
"""
...
+ @abstractmethod
+ def explain_graph(
+ self,
+ graph: IRGraphEvaluable,
+ instructions_to_explain: Optional[Iterable[Instruction]] = None,
+ ) -> Mapping[UUID, GraphletExplanation]:
+ """Explain how to evaluate the IRGraph
+
+ Parameters:
+
+ graph: The evaluable IRGraph
+
+ instructions_to_explain: instructions to explain and return; by default, it will be all Return instructions in the graph
+
+ Returns:
+
+ GraphletExplanation (a Kestrel Display object) for each instruction in instructions_to_explain.
+ """
+ ...
+
def cache_catalog_to_json(self) -> str:
"""Serialize the cache catalog to a JSON string"""
return json.dumps(self.cache_catalog)
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/analytics/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/analytics/__init__.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/dataframe.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/dataframe.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/kql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/kql.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/sql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/sql.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py
deleted file mode 100644
index bd74f728..00000000
--- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from kestrel.interface.datasource.base import AbstractDataSourceInterface
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py
deleted file mode 100644
index d6806715..00000000
--- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py
+++ /dev/null
@@ -1,21 +0,0 @@
-from kestrel.exceptions import (
- DataSourceInterfaceNotFound,
- InvalidDataSourceInterfaceImplementation,
- ConflictingDataSourceInterfaceScheme,
-)
-from kestrel.interface.manager import InterfaceManager
-from kestrel.interface.datasource.base import (
- MODULE_PREFIX,
- AbstractDataSourceInterface,
-)
-
-
-class DataSourceManager(InterfaceManager):
- def __init__(self):
- super().__init__(
- MODULE_PREFIX,
- AbstractDataSourceInterface,
- DataSourceInterfaceNotFound,
- InvalidDataSourceInterfaceImplementation,
- ConflictingDataSourceInterfaceScheme,
- )
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py
deleted file mode 100644
index 33a49975..00000000
--- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from typing import Iterable
-from typeguard import typechecked
-
-from kestrel.interface.datasource import AbstractDataSourceInterface
-from kestrel.exceptions import (
- InterfaceNotFound,
- InterfaceNameCollision,
-)
-
-
-@typechecked
-def get_interface_by_name(
- interface_name: str, interfaces: Iterable[AbstractDataSourceInterface]
-):
- """Find an interface by its name
-
- Parameters:
- interface_name: the name of an interface
- interfaces: the list of interfaces
-
- Returns:
- The interface found
- """
- ifs = filter(lambda x: x.name == interface_name, interfaces)
- try:
- interface = next(ifs)
- except StopIteration:
- raise InterfaceNotFound(interface_name)
- else:
- try:
- next(ifs)
- except StopIteration:
- # expected behavior
- pass
- else:
- raise InterfaceNameCollision(interface_name)
- return interface
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py
index a66a1ce1..b5fd0904 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py
@@ -1,94 +1,112 @@
-from abc import ABC
-
+from __future__ import annotations
import importlib
import pkgutil
import logging
import inspect
import sys
+import itertools
+from copy import copy
+from typeguard import typechecked
+from typing import Mapping, Iterable, Type
-from kestrel.exceptions import KestrelError
+from kestrel.exceptions import (
+ InterfaceNotConfigured,
+ InterfaceNotFound,
+ InvalidInterfaceImplementation,
+ ConflictingInterfaceScheme,
+)
+from kestrel.interface.base import MODULE_PREFIX, AbstractInterface
+from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER
_logger = logging.getLogger(__name__)
-class InterfaceManager:
- def __init__(
- self,
- module_name_prefix: str,
- interface_class: ABC,
- nonexist_interface_exception: KestrelError,
- invalid_interface_exception: KestrelError,
- conflict_interface_exception: KestrelError,
- ):
- self.scheme_to_interface: dict[str, ABC] = {}
- self.nonexist_interface_exception = nonexist_interface_exception
-
- for iface_cls in _load_interfaces(
- module_name_prefix,
- interface_class,
- invalid_interface_exception,
- conflict_interface_exception,
- ).values():
- iface = iface_cls()
- _logger.debug("Loading data source interface '%s' (%s)", iface.name, iface)
- self.scheme_to_interface[iface.name] = iface
-
- def interfaces(self):
- return list(self.scheme_to_interface.values())
-
- def schemes(self):
- return list(self.scheme_to_interface.keys())
-
-
-def _load_interfaces(
- module_name_prefix,
- interface_class,
- invalid_interface_exception,
- conflict_interface_exception,
-):
- is_interface = _is_class(interface_class)
- interface_names = _list_interfaces(module_name_prefix)
- interfaces = {}
- for interface_name in interface_names:
- mod = importlib.import_module(interface_name)
- _logger.debug("Imported %s from interface name %s", mod, interface_name)
- cls = inspect.getmembers(sys.modules[interface_name], is_interface)
+# essentially a scheme-to-interface mapping
+@typechecked
+class InterfaceManager(Mapping):
+ def __init__(self, init_interfaces: Iterable[AbstractInterface] = []):
+ interface_classes = _load_interface_classes()
+ self.interfaces = list(init_interfaces) # copy/recreate the list
+ for iface_cls in interface_classes:
+ try:
+ iface = iface_cls()
+ _logger.debug(f"Initialize interface {iface_cls.__name__}")
+ self.interfaces.append(iface)
+ except InterfaceNotConfigured as e:
+ _logger.debug(f"Interface {iface_cls.__name__} not configured; ignored")
+
+ def __getitem__(self, scheme: str) -> AbstractInterface:
+ for interface in self.interfaces:
+ if scheme in interface.schemes():
+ return interface
+ else:
+ raise InterfaceNotFound(f"no interface loaded for scheme {scheme}")
+
+ def __iter__(self) -> Iterable[str]:
+ return itertools.chain(*[i.schemes() for i in self.interfaces])
+
+ def __len__(self) -> int:
+ return sum(1 for _ in iter(self))
+
+ def copy_with_virtual_cache(self) -> InterfaceManager:
+ im = copy(self)
+ # shallow copy refers to the same list, so create/copy a new one
+ im.interfaces = copy(im.interfaces)
+ # now swap in virtual cache
+ cache = im[CACHE_INTERFACE_IDENTIFIER]
+ im.interfaces.remove(cache)
+ im.interfaces.append(cache.get_virtual_copy())
+ return im
+
+ def del_cache(self):
+ cache = self[CACHE_INTERFACE_IDENTIFIER]
+ self.interfaces.remove(cache)
+ del cache
+
+
+def _load_interface_classes():
+ interface_clss = []
+ for itf_pkg_name in _list_interface_pkg_names():
+ mod = importlib.import_module(itf_pkg_name)
+ _logger.debug(f"Imported {mod} from package {itf_pkg_name}")
+ cls = inspect.getmembers(
+ sys.modules[itf_pkg_name], _is_class(AbstractInterface)
+ )
if not cls:
- raise invalid_interface_exception(
- f'no interface class found in "{interface_name}"'
+ raise InvalidInterfaceImplementation(
+ f'no interface class found in package "{itf_pkg_name}"'
)
elif len(cls) > 1:
- raise invalid_interface_exception(
- f'more than one interface class found in "{interface_name}"'
+ raise InvalidInterfaceImplementation(
+ f'more than one interface class found in package "{itf_pkg_name}"'
)
else:
- interface = cls[0][1]
- interface_conflict, scheme_conflict = _search_scheme_conflict(
- interface, interfaces.values()
- )
- if interface_conflict:
- raise conflict_interface_exception(
- interface, interface_conflict, scheme_conflict
- )
- interfaces[interface_name] = interface
- return interfaces
+ interface_cls = cls[0][1]
+ _guard_scheme_conflict(interface_cls, interface_clss)
+ interface_clss.append(interface_cls)
+ return interface_clss
-def _list_interfaces(module_name_prefix):
+def _list_interface_pkg_names():
pkg_names = [x.name for x in pkgutil.iter_modules()]
- itf_names = [pkg for pkg in pkg_names if pkg.startswith(module_name_prefix)]
- return list(itf_names)
+ itf_names = [pkg for pkg in pkg_names if pkg.startswith(MODULE_PREFIX)]
+ return itf_names
def _is_class(cls):
return lambda obj: inspect.isclass(obj) and obj.__bases__[0] == cls
-def _search_scheme_conflict(new_interface, interfaces):
+@typechecked
+def _guard_scheme_conflict(
+ new_interface: Type[AbstractInterface],
+ interfaces: Iterable[Type[AbstractInterface]],
+):
for interface in interfaces:
for scheme_new in new_interface.schemes():
for scheme_old in interface.schemes():
if scheme_new == scheme_old:
- return interface, scheme_new
- return None, None
+ raise ConflictingInterfaceScheme(
+ f"scheme: {scheme_new} conflicting between {new_interface.__name__} and {interface.__name__}"
+ )
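
Because `InterfaceManager` now implements the `Mapping` protocol, callers resolve interfaces by scheme. A minimal sketch (the "opensearch" scheme is hypothetical, assuming the corresponding interface package is installed):

    from kestrel.interface import InterfaceManager

    manager = InterfaceManager()        # discovers installed kestrel_interface_* packages
    print(list(manager))                # every scheme across loaded interfaces
    opensearch = manager["opensearch"]  # scheme -> interface; raises InterfaceNotFound if absent
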
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/__init__.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/__init__.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/query/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/query/__init__.py
rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py
diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py
index f948dff9..ddc41b7d 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py
@@ -36,6 +36,7 @@
InevaluableInstruction,
LargerThanOneIndegreeInstruction,
DuplicatedReferenceInFilter,
+ MissingReferenceInFilter,
DanglingReferenceInFilter,
DanglingFilter,
)
@@ -124,18 +125,34 @@ def add_edges_from(
self.add_edge(u, v, deref)
def copy(self):
- """Copy the IRGraph with all nodes as reference (not deepcopy)"""
+ """Copy the IRGraph with all nodes as reference (not deepcopy)
+
+ Support subclass of IRGraph to be copied.
+ """
g = IRGraph()
g.update(self)
+
+ # subclass support
+ if type(g) != type(self):
+ g = type(self)(g)
+
return g
def deepcopy(self):
- """Copy the IRGraph with all nodes copied as new objects"""
+ """Copy the IRGraph with all nodes copied as new objects
+
+ Support subclass of IRGraph to be deep copied.
+ """
g = IRGraph()
o2n = {n: n.deepcopy() for n in self.nodes()}
for u, v in self.edges():
g.add_edge(o2n[u], o2n[v])
g.add_nodes_from([o2n[n] for n in self.nodes() if self.degree(n) == 0])
+
+ # subclass support
+ if type(g) != type(self):
+ g = type(self)(g)
+
return g
def get_node_by_id(self, ux: Union[UUID, str]) -> Instruction:
@@ -372,6 +389,8 @@ def get_trunk_n_branches(
ps = list(self.predecessors(node))
pps = [(p, pp) for p in self.predecessors(node) for pp in self.predecessors(p)]
+ # may need to add a patch in find_dependent_subgraphs_of_node()
+ # for each new case added in the if/elif, e.g., Filter
if isinstance(node, SolePredecessorTransformingInstruction):
if len(ps) > 1:
raise LargerThanOneIndegreeInstruction()
@@ -388,8 +407,10 @@ def get_trunk_n_branches(
and p.attrs == [rv.attribute]
and pp.name == rv.reference
]
- if len(ppfs) > 1:
- raise DuplicatedReferenceInFilter(ppfs)
+ if not ppfs:
+ raise MissingReferenceInFilter(rv, node, pps)
+ elif len(ppfs) > 1:
+ raise DuplicatedReferenceInFilter(rv, node, pps)
else:
p = ppfs[0][0]
r2n[rv] = p
@@ -536,10 +557,34 @@ def find_dependent_subgraphs_of_node(
ps = set().union(*[set(g.predecessors(n)) for n in a2uns[interface]])
a2uns[interface].update(ps & cached_nodes)
+ # a patch (corner case handling) for get_trunk_n_branches()
+ # add Variable/Reference node if succeeded by ProjectAttrs and Filter,
+ # which are in the dependent graph; the Variable is only needed by
+ # get_trunk_n_branches() as an auxiliary node
+ for interface in a2uns:
+ auxs = []
+ for n in a2uns[interface]:
+ if isinstance(n, ProjectAttrs):
+ # need to search in `self`, not `g`, since the boundry of
+ # `g` is cut by the cache
+ p = next(self.predecessors(n))
+ s = next(g.successors(n))
+ if (
+ isinstance(s, Filter)
+ and isinstance(p, (Variable, Reference))
+ and s in a2uns[interface]
+ ):
+ auxs.append(p)
+ a2uns[interface].update(auxs)
+
# remove dep graphs with only one node
- # e.g., `ds://a` in "y = GET file FROM ds://a WHERE x = v.x" when v.x not in cache
+ # e.g., `ds://a` in "y = GET file FROM ds://a WHERE x = v.x"
+ # when v.x not in cache
dep_nodes = [ns for ns in a2uns.values() if len(ns) > 1]
- dep_graphs = [IRGraphEvaluable(g.subgraph(ns)) for ns in dep_nodes]
+ # need to search in `self` due to the patch for get_trunk_n_branches()
+ dep_graphs = [
+ IRGraphEvaluable(self.subgraph(ns)).deepcopy() for ns in dep_nodes
+ ]
return dep_graphs
@@ -774,7 +819,7 @@ def _add_node(self, node: Instruction, deref: bool = True) -> Instruction:
class IRGraphSimpleQuery(IRGraphEvaluable):
"""Simple Query IRGraph
- A simple query IRGraph is an evaluatable IRGraph that
+ A simple query IRGraph is an evaluable IRGraph that
1. It contains one source node
diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py b/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py
index 0d667ea3..8b1aa1e3 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py
@@ -168,6 +168,11 @@ class Reference(IntermediateInstruction):
name: str
+@dataclass(eq=False)
+class Explain(SolePredecessorTransformingInstruction):
+ pass
+
+
@dataclass(eq=False)
class Limit(SolePredecessorTransformingInstruction):
num: int
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py
new file mode 100644
index 00000000..d05bd943
--- /dev/null
+++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py
@@ -0,0 +1,279 @@
+import logging
+from typing import Optional, Union
+
+import dpath
+import numpy as np
+import yaml
+from pandas import DataFrame
+from typeguard import typechecked
+
+from kestrel.mapping.transformers import (
+ run_transformer,
+ run_transformer_on_series,
+)
+from kestrel.utils import list_folder_files
+
+_logger = logging.getLogger(__name__)
+
+
+def _add_mapping(obj: dict, key: str, mapping: dict):
+ """Add `key` -> `mapping` to `obj`, appending if necessary"""
+ existing_mapping = obj.get(key)
+ if existing_mapping:
+ if isinstance(existing_mapping, str):
+ existing_mapping = [{"ocsf_field": existing_mapping}]
+ elif isinstance(existing_mapping, dict):
+ existing_mapping = [existing_mapping]
+ else:
+ existing_mapping = []
+ existing_mapping.append(mapping)
+ obj[key] = existing_mapping
+
+
+def _reverse_dict(obj: dict, k: str, v: dict):
+ """Reverse a single OCSF -> native mapping and add it to `obj`"""
+ key = v["native_field"]
+ mapping = {i: j for i, j in v.items() if i != "native_field"}
+ mapping["ocsf_field"] = k
+ _add_mapping(obj, key, mapping)
+
+
+def _add_attr(obj: dict, key: str, value: str):
+ """Add `key` -> `value` to `obj`, appending if necessary"""
+ if key not in obj:
+ obj[key] = value
+ else:
+ existing = obj[key]
+ if isinstance(existing, str):
+ obj[key] = [existing, value]
+ else:
+ existing.append(value)
+
+
+def reverse_mapping(obj: dict, prefix: Optional[str] = None, result: Optional[dict] = None) -> dict:
+ """Reverse the mapping; return native -> OCSF map"""
+ if result is None:
+ result = {}
+ for k, v in obj.items():
+ k = ".".join((prefix, k)) if prefix else k
+ # Recurse if necessary
+ if isinstance(v, str):
+ _add_attr(result, v, k)
+ elif isinstance(v, list):
+ # Need to handle multiple mappings
+ for i in v:
+ if isinstance(i, str):
+ _add_attr(result, i, k)
+ elif "native_field" in i:
+ _reverse_dict(result, k, i)
+ else:
+ # Need to "deep" merge with current results
+ reverse_mapping(i, k, result)
+ elif isinstance(v, dict):
+ # First determine if this is a complex mapping or just another level
+ if "native_field" in v:
+ _reverse_dict(result, k, v)
+ else:
+ # Need to "deep" merge with current results
+ reverse_mapping(v, k, result)
+
+ return result
+
+
+def _get_map_triple(d: dict, prefix: str, op: str, value) -> tuple:
+ mapped_op = d.get(f"{prefix}_op")
+ transform = d.get(f"{prefix}_value")
+ new_value = run_transformer(transform, value)
+ new_op = mapped_op if mapped_op else op
+ return (d[f"{prefix}_field"], new_op, new_value)
+
+
+def translate_comparison_to_native(
+ dmm: dict, field: str, op: str, value: Union[str, int, float]
+) -> list:
+ """Translate the (`field`, `op`, `value`) triple using data model map `dmm`
+
+ This function may be used in datasource interfaces to translate a comparison
+ in the OCSF data model to the native data model, according to the data model
+ mapping in `dmm`.
+
+ This function translates the (`field`, `op`, `value`) triple into a list of
+ translated triples based on the provided data model map. The data model map
+ is a dictionary that maps fields from one data model to another. For
+ example, if you have a field named "user.name" in your data model, but the
+ corresponding field in the native data model is "username", then you can use
+ the data model map to translate the field name.
+
+ Parameters:
+ dmm: A dictionary that maps fields from one data model to another.
+ field: The field name to be translated.
+ op: The comparison operator.
+ value: The value to be compared against.
+
+ Returns:
+ A list of translated triples.
+
+ Raises:
+ KeyError: If the field cannot be found in the data model map.
+ """
+ _logger.debug("comp_to_native: %s %s %s", field, op, value)
+ result = []
+ mapping = dmm.get(field)
+ if mapping:
+ if isinstance(mapping, str):
+ # Simple 1:1 field name mapping
+ result.append((mapping, op, value))
+ else:
+ raise NotImplementedError("complex native mapping")
+ else:
+ parts = field.split(".")
+ tmp = dmm
+ for part in parts:
+ if isinstance(tmp, dict):
+ tmp = tmp.get(part, {})
+ else:
+ break
+ if tmp:
+ if isinstance(tmp, list):
+ for i in tmp:
+ if isinstance(i, dict):
+ result.append(_get_map_triple(i, "native", op, value))
+ else:
+ result.append((i, op, value))
+ elif isinstance(tmp, dict):
+ result.append(_get_map_triple(tmp, "native", op, value))
+ elif isinstance(tmp, str):
+ result.append((tmp, op, value))
+ else:
+ # Pass-through
+ result.append((field, op, value))
+ _logger.debug("comp_to_native: return %s", result)
+ return result
+
+
+def translate_comparison_to_ocsf(
+ dmm: dict, field: str, op: str, value: Union[str, int, float]
+) -> list:
+ """Translate the (`field`, `op`, `value`) triple using data model map `dmm`
+
+ This function is used in the frontend to translate a comparison in
+ the STIX (or, in the future, ECS) data model to the OCSF data
+ model, according to the data model mapping in `dmm`.
+
+ This function translates the (`field`, `op`, `value`) triple into a list of
+ translated triples based on the provided data model map. Here the map is a
+ dictionary from native fields to OCSF fields. For example, if the native
+ data model names a field "username" and the corresponding OCSF field is
+ "user.name", the map translates the former to the latter.
+
+ Parameters:
+ dmm: A dictionary that maps fields from one data model to another.
+ field: The field name to be translated.
+ op: The comparison operator.
+ value: The value to be compared against.
+
+ Returns:
+ A list of translated triples.
+
+ Raises:
+ KeyError: If the field cannot be found in the data model map.
+
+ """
+ _logger.debug("comp_to_ocsf: %s %s %s", field, op, value)
+ result = []
+ mapping = dmm.get(field)
+ if isinstance(mapping, str):
+ # Simple 1:1 field name mapping
+ result.append((mapping, op, value))
+ elif isinstance(mapping, list):
+ for i in mapping:
+ if isinstance(i, dict):
+ result.append(_get_map_triple(i, "ocsf", op, value))
+ else:
+ result.append((i, op, value))
+ return result
+
+
+@typechecked
+def load_default_mapping(
+ data_model_name: str,
+ mapping_pkg: str = "kestrel.mapping",
+ submodule: str = "entityattribute",
+):
+ result = {}
+ entityattr_mapping_files = list_folder_files(
+ mapping_pkg, submodule, prefix=data_model_name, extension="yaml"
+ )
+ for f in entityattr_mapping_files:
+ with open(f, "r") as fp:
+ result.update(yaml.safe_load(fp))
+ return result
+
+
+@typechecked
+def _get_from_mapping(mapping: Union[str, list, dict], key) -> list:
+ result = []
+ if isinstance(mapping, list):
+ for i in mapping:
+ if isinstance(i, dict):
+ result.append(i[key])
+ else:
+ result.append(i)
+ elif isinstance(mapping, dict):
+ result.append(mapping[key])
+ elif isinstance(mapping, str):
+ result.append(mapping)
+ return result
+
+
+@typechecked
+def translate_projection_to_native(
+ dmm: dict,
+ entity_type: Optional[str],
+ attrs: Optional[list],
+ # TODO: optional str or callable for joining entity_type and attr?
+) -> list:
+ result = []
+ if entity_type:
+ dmm = dmm[entity_type]
+ if not attrs:
+ for native_field, mapping in reverse_mapping(dmm).items():
+ result.extend(
+ [(native_field, i) for i in _get_from_mapping(mapping, "ocsf_field")]
+ )
+ attrs = []
+ for attr in attrs:
+ mapping = dmm.get(attr)
+ if not mapping:
+ parts = attr.split(".")
+ tmp = dmm
+ for part in parts:
+ if isinstance(tmp, dict):
+ tmp = tmp.get(part, {})
+ else:
+ break
+ if tmp:
+ mapping = tmp
+ if mapping:
+ result.extend(
+ [(i, attr) for i in _get_from_mapping(mapping, "native_field")]
+ )
+ else:
+ # Pass-through?
+ result.append((attr, attr)) # FIXME: raise exception instead?
+ _logger.debug("proj_to_native: return %s", result)
+ return result
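+
+# Usage sketch (field names from the Winlogbeat mapping in the unit tests):
+#     translate_projection_to_native(dmm, "process", ["pid"])
+#     # -> [("winlog.event_data.ProcessId", "pid")]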
+
+
+@typechecked
+def translate_dataframe(df: DataFrame, dmm: dict) -> DataFrame:
+ # Translate results into Kestrel OCSF data model
+ # The column names of df are already mapped
+ df = df.replace({np.nan: None})
+ for col in df.columns:
+ mapping = dpath.get(dmm, col, separator=".")
+ if isinstance(mapping, dict):
+ transformer_name = mapping.get("ocsf_value")
+ df[col] = run_transformer_on_series(transformer_name, df[col])
+ return df
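+
+# Usage sketch: the DataFrame columns are already OCSF names; any per-column
+# "ocsf_value" transformer (e.g., basename) is applied to the values:
+#     df = translate_dataframe(df, load_default_mapping("ecs")["process"])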
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml
index ef3ff62c..d4a1bf75 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml
+++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml
@@ -1,237 +1,233 @@
-process.command_line: process.cmd_line
-process.end: process.end_time
-process.entity_id: process.uid
-process.executable: process.file.path
-process.exit_code: process.exit_code
-process.name: process.name
-process.pid: process.pid
-process.start: process.start_time
-process.thread.id: process.tid
-# process.args
-# process.args_count
-# process.entry_meta.type
-# process.env_vars
-# process.interactive
-# process.same_as_process
-# process.thread.capabilities.effective
-# process.thread.capabilities.permitted
-# process.thread.name
-# process.title
-# process.tty
-# process.uptime
-# process.vpid
-# process.working_directory
-file.accessed: file.accessed_time
-file.attributes: file.attributes
-file.created: file.created_time
-file.ctime: file.modified_time
-file.directory: file.parent_folder
-file.gid: file.xattributes.primary_group
-file.mime_type: file.mime_type
-file.mode: file.mode
-file.mtime: file.modified_time
-file.name: file.name
-file.owner: file.owner
-file.path: file.path
-file.size: file.size
-file.target_path: file.xattributes.link_name
-file.type: file.type
-# file.device
-# file.drive_letter
-# file.extension
-# file.fork_name
-# file.inode
-# file.uid
-group.name: group.name
-group.id: group.uid
-# group.domain
-client.bytes: traffic.bytes_out
-client.domain: src_endpoint.domain
-client.ip: src_endpoint.ip
-client.mac: src_endpoint.mac
-client.packets: traffic.packets_out
-client.port: src_endpoint.port
-# client.address
-# client.nat.ip
-# client.nat.port
-# client.registered_domain
-# client.subdomain
-# client.top_level_domain
-destination.bytes: traffic.bytes_in
-destination.domain: dst_endpoint.domain
-destination.ip: dst_endpoint.ip
-destination.mac: dst_endpoint.mac
-destination.packets: traffic.packets_in
-destination.port: dst_endpoint.port
-# destination.address
-# destination.nat.ip
-# destination.nat.port
-# destination.registered_domain
-# destination.subdomain
-# destination.top_level_domain
-server.bytes: traffic.bytes_in
-server.domain: dst_endpoint.domain
-server.ip: dst_endpoint.ip
-server.mac: dst_endpoint.mac
-server.packets: traffic.packets_in
-server.port: dst_endpoint.port
-# server.address
-# server.nat.ip
-# server.nat.port
-# server.registered_domain
-# server.subdomain
-# server.top_level_domain
-source.bytes: traffic.bytes_out
-source.domain: src_endpoint.domain
-source.ip: src_endpoint.ip
-source.mac: src_endpoint.mac
-source.packets: traffic.packets_out
-source.port: src_endpoint.port
-# source.address
-# source.nat.ip
-# source.nat.port
-# source.registered_domain
-# source.subdomain
-# source.top_level_domain
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], Email Activity [4009]
-network.application: app_name
-network.bytes: traffic.bytes
-network.direction: connection_info.direction
-network.iana_number: connection_info.protocol_num
-network.packets: traffic.packets
-network.protocol: connection_info.protocol_name
-network.type: connection_info.protocol_ver_id
-# network.community_id
-# network.forwarded_ip
-# network.inner
-# network.name
-# network.transport:
-hash.md5: file.hashes[?algorithm_id == 1].value
-hash.sha1: file.hashes[?algorithm_id == 2].value
-hash.sha256: file.hashes[?algorithm_id == 3].value
-hash.sha512: file.hashes[?algorithm_id == 4].value
-hash.ssdeep: file.hashes[?algorithm_id == 5].value
-hash.tlsh: file.hashes[?algorithm_id == 6].value
-# hash.sha384
-x509.not_after: certificate.expiration_time
-x509.not_before: certificate.created_time
-x509.serial_number: certificate.serial_number
-x509.signature_algorithm: certificate.fingerprints.algorithm
-x509.version_number: certificate.version
-# x509.alternative_names
-# x509.issuer.common_name: certificate.issuer
-# x509.issuer.country: certificate.issuer
-# x509.issuer.distinguished_name: certificate.issuer
-# x509.issuer.locality: certificate.issuer
-# x509.issuer.organization: certificate.issuer
-# x509.issuer.organizational_unit: certificate.issuer
-# x509.issuer.state_or_province: certificate.issuer
-# x509.public_key_algorithm
-# x509.public_key_curve
-# x509.public_key_exponent
-# x509.public_key_size
-# x509.subject.common_name: certificate.subject
-# x509.subject.country: certificate.subject
-# x509.subject.distinguished_name: certificate.subject
-# x509.subject.locality: certificate.subject
-# x509.subject.organization: certificate.subject
-# x509.subject.organizational_unit: certificate.subject
-# x509.subject.state_or_province: certificate.subject
-as.number: device.org.number
-as.organization.name: device.org.name
-geo.city_name: location.city
-geo.continent_name: location.continent
-geo.country_iso_code: location.county
-geo.location: location.coordinates
-geo.postal_code: location.postal_code
-geo.region_iso_code: location.region
-# geo.continent_code
-# geo.country_name
-# geo.name
-# geo.region_name
-# geo.timezone
-user.domain: user.domain
-user.email: user.email_addr
-user.full_name: user.full_name
-user.id: user.uid
-user.name: user.name
-# user.roles
-# user.hash:
-
-referenced_fields:
- process.group:
- ref: group
- prefix: process
- process.hash:
- ref: hash
- prefix: process
- process.parent:
- ref: process # ECS entity used for attribute mapping
- prefix: process # OCSF Prefix
- target_entity: parent_process # Updated OCSF entity name
- process.user:
- ref: user
- prefix: process
- # process.code_signature: code_signature
- # process.entry_leader: process
- # process.entry_leader.parent: process
- # process.entry_leader.parent.session_leader: process
- # process.entry_meta.source: source
- # process.group_leader: process
- # process.macho: macho
- # process.parent.group_leader: process
- # process.pe: pe
- # process.previous: process
- # process.real_group: group
- # process.real_user: user
- # process.saved_group: group
- # process.saved_user: user
- # process.session_leader: process
- # process.session_leader.parent: process
- # process.session_leader.parent.session_leader: process
- # process.supplemental_groups: group
- file.hash:
- ref: hash
- prefix: null
- file.x509:
- ref: x509
- prefix: tls
- # file.code_signature.*
- # file.pe.*
- client.as:
- ref: as
- prefix: null
- client.geo:
- ref: geo
- prefix: src_endpoint
- # client.user:
- # ref: user
- # prefix: src_endpoint
- destination.as:
- ref: as
- prefix: null
- destination.geo:
- ref: geo
- prefix: dst_endpoint
- # destination.user:
- # ref: user
- # prefix: dst_endpoint
- server.as:
- ref: as
- prefix: null
- server.geo:
- ref: geo
- prefix: dst_endpoint
- # server.user:
- # ref: user
- # prefix: dst_endpoint
- source.as:
- ref: as
- prefix: null
- source.geo:
- ref: geo
- prefix: src_endpoint
- # source.user:
- # ref: user
- # prefix: src_endpoint
+# https://schema.ocsf.io/1.1.0/objects/file
+file:
+ accessed_time: file.accessed
+ attributes: file.attributes
+ created_time: file.created
+ # This "hashes" notation comes from jmespath (filter projection)
+ # It's much easier to use the ECS notation in this case
+ hashes[?algorithm_id == 1]:
+ value: hash.md5
+ hashes[?algorithm_id == 2]:
+ value: hash.sha1
+ hashes[?algorithm_id == 3]:
+ value: hash.sha256
+ hashes[?algorithm_id == 4]:
+ value: hash.sha512
+ hashes[?algorithm_id == 5]:
+ value: hash.ssdeep
+ hashes[?algorithm_id == 6]:
+ value: hash.tlsh
+ hashes[*]:
+ value:
+ - hash.md5
+ - hash.sha1
+ - hash.sha256
+ - hash.sha512
+ - hash.ssdeep
+ - hash.tlsh
+  mime_type: file.mime_type
+  mode: file.mode
+  modified_time:
+    - file.ctime
+    - file.mtime
+ name: file.name
+ owner: file.owner
+ parent_folder: file.directory
+ path: file.path
+ size: file.size
+ type: file.type
+ xattributes:
+ primary_group: file.gid
+ link_name: file.target_path
+
+
+# https://schema.ocsf.io/1.1.0/objects/group
+group:
+ domain: group.domain
+ name: group.name
+ uid: group.id
+
+
+# https://schema.ocsf.io/1.1.0/objects/process
+process:
+ cmd_line: process.command_line
+ name: process.name
+ pid: process.pid
+ uid: process.entity_id
+ file:
+ name:
+ native_field: process.executable
+ native_op: LIKE
+ native_value: endswith
+ ocsf_value: basename
+ path: process.executable
+ parent_folder:
+ native_field: process.executable
+ native_op: LIKE
+ native_value: startswith
+ ocsf_value: dirname
+ # This "hashes" notation comes from jmespath (filter projection)
+ # It's much easier to use the ECS notation in this case
+ hashes[?algorithm_id == 1]:
+ value: process.hash.md5
+ hashes[?algorithm_id == 2]:
+ value: process.hash.sha1
+ hashes[?algorithm_id == 3]:
+ value: process.hash.sha256
+ hashes[?algorithm_id == 4]:
+ value: process.hash.sha512
+ hashes[?algorithm_id == 5]:
+ value: process.hash.ssdeep
+ hashes[?algorithm_id == 6]:
+ value: process.hash.tlsh
+ hashes[*]:
+ value:
+ - process.hash.md5
+ - process.hash.sha1
+ - process.hash.sha256
+ - process.hash.sha512
+ - process.hash.ssdeep
+ - process.hash.tlsh
+ parent_process:
+ cmd_line: process.parent.command_line
+ name: process.parent.name
+ pid: process.parent.pid
+ uid: process.parent.entity_id
+ file:
+ name:
+ native_field: process.parent.executable
+ native_op: LIKE
+ native_value: endswith
+ ocsf_value: basename
+ path: process.parent.executable
+ parent_folder:
+ native_field: process.parent.executable
+ native_op: LIKE
+ native_value: startswith
+ ocsf_value: dirname
+
+
+# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+src_endpoint: &src_ref
+ domain:
+ - client.domain
+ - source.domain
+ hostname:
+ - client.domain
+ - source.domain
+ ip:
+ - client.ip
+ - source.ip
+ mac:
+ - client.mac
+ - source.mac
+ port:
+ - client.port
+ - source.port
+
+
+# endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+endpoint:
+ domain:
+ - client.domain
+ - source.domain
+ - server.domain
+ - destination.domain
+ hostname:
+ - client.domain
+ - source.domain
+ - server.domain
+ - destination.domain
+ ip:
+ - client.ip
+ - source.ip
+ - server.ip
+ - destination.ip
+ mac:
+ - client.mac
+ - source.mac
+ - server.mac
+ - destination.mac
+ port:
+ - client.port
+ - source.port
+ - server.port
+ - destination.port
+
+
+# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+dst_endpoint: &dst_ref
+ domain:
+ - server.domain
+ - destination.domain
+ hostname:
+ - server.domain
+ - destination.domain
+ ip:
+ - server.ip
+ - destination.ip
+ mac:
+ - server.mac
+ - destination.mac
+ port:
+ - server.port
+ - destination.port
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_traffic
+# should be `network_traffic`?
+traffic: &traffic
+ bytes: network.bytes
+ bytes_in:
+ - destination.bytes
+ - server.bytes
+ bytes_out:
+ - client.bytes
+ - source.bytes
+ packets: network.packets
+ packets_in:
+ - destination.packets
+ - server.packets
+ packets_out:
+ - client.packets
+ - source.packets
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_connection_info
+connection_info:
+ direction: network.direction #TODO: need transformer?
+ protocol_num: network.iana_number
+ protocol_name: network.transport
+ protocol_ver: network.type
+ protocol_ver_id:
+ native_field: network.type
+ native_value: ip_version_to_network_layer
+ ocsf_value: network_layer_to_ip_version
+
+
+# https://schema.ocsf.io/1.1.0/objects/certificate
+certificate:
+ expiration_time: x509.not_after
+ created_time: x509.not_before
+ serial_number: x509.serial_number
+ fingerprints[*]:
+ algorithm: x509.signature_algorithm
+ version: x509.version_number
+ issuer: x509.issuer.distinguished_name
+ subject: x509.subject.distinguished_name
+ #uid:
+
+
+# https://schema.ocsf.io/1.1.0/objects/user
+user:
+ domain: user.domain
+ full_name: user.full_name
+ name: user.name
+ uid: user.id
+
+
+# https://schema.ocsf.io/1.1.0/classes/network_activity
+# Network Activity [4001] Class
+network_activity:
+ src_endpoint: *src_ref
+ dst_endpoint: *dst_ref
+ traffic: *traffic
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml
index f0ed912a..7082e6dd 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml
+++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml
@@ -1,210 +1,143 @@
-# All Categories [*]
-autonomous-system:name: device.org.name
-autonomous-system:number: device.org.uid
-
-# File System Activity [1001]
-directory:path: file.path
-directory:accessed: file.accessed_time
-directory:created: file.created_time
-directory:modified: file.modified_time
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], Email Activity [4009]
-domain-name.value:
- - src_endpoint.domain
- - dst_endpoint.domain
- - dns_query.hostname
-
-# Email Activity [4009]
-email-addr:value: user.email_addr
-email-addr:display_name: user.full_name
-# email-message:is_multipart
-# email-message:date
-# email-message:content_type
-email-message:from_ref.value: email.from
-email-message:sender_ref.value: email.smtp_from
-email-message:to_refs[*].value: email.to
-email-message:cc_refs[*].value: email.cc
-email-message:subject: email.subject
-# email-message:received_lines
-email-message:additional_header_fields: email.raw_header
-# email-message:body
-email-message:body_multipart.body_raw_ref.name: file.name
-# email-message:raw_email_ref
-# email-message:body_multipart.body: file.mime_type
-
-# File System Activity [1001], Network File Activity [4010], Email File Activity [4011]
-file:accessed: file.accessed_time
-file:created: file.created_time
-file:name: file.name
-file:size: file.size
-file:hashes.SHA-256: file.hashes[?algorithm_id == 3].value
-file:hashes.SHA-1: file.hashes[?algorithm_id == 2].value
-file:hashes.MD5: file.hashes[?algorithm_id == 1].value
-file:parent_directory_ref.path: file.parent_folder
-# file:name_enc
-# file:magic_number_hex
-file:mime_type: file.mime_type
-# file:is_encrypted
-# file:encryption_algorithm
-# file:decryption_key
-# file:contains_refs
-# file:content_ref
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-ipv4-addr:value:
- - dst_endpoint.ip
- - src_endpoint.ip
- - device.ip
-# ipv4-addr.belongs_to_refs
-# ipv4-addr.resolves_to_refs
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-ipv6-addr:value:
- - dst_endpoint.ip
- - src_endpoint.ip
- - device.ip
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-mac-addr:value:
- - dst_endpoint.mac
- - src_endpoint.mac
- - device.mac
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003]
-network-traffic:dst_byte_count: traffic.bytes_in
-network-traffic:dst_packets: traffic.packets_in
-network-traffic:dst_port: dst_endpoint.port
-network-traffic:dst_ref.value: dst_endpoint.ip
-network-traffic:protocols[*]:
- - connection_info.protocol_num
- - connection_info.protocol_name
- - connection_info.protocol_ver_id
-network-traffic:src_byte_count: traffic.bytes_out
-network-traffic:src_packets: traffic.packets_out
-network-traffic:src_port: src_endpoint.port
-network-traffic:src_ref.value: src_endpoint.ip
-network_traffic:start: start_time
-network_traffic:end: end_time
-# network_traffic:is_active
-# network_traffic:ipfix
-# network_traffic:src_payload_ref
-# network_traffic:dst_payload_ref
-# network_traffic:encapsulates_refs
-# network_traffic:encapsulated_by_ref
-
-# Process Activity [1007]
-process:binary_ref.name: file.name
-process:command_line: process.cmd_line
-process:created: process.created_time
-process:mime_type: mime_type
-process:name: process.name
-process:pid: process.pid
-process:x_unique_id: process.uid
-process:parent_ref.name:
- - actor.process.name
- - process.parent_process.name
-
-# Base Event [0]
-software:extension.product.feature_name: metadata.product.feature.name
-software:extension.product.feature_uid: metadata.product.feature.uid
-software:extension.product.feature_version: metadata.product.feature.version
-software:extension.product.path: metadata.product.path
-software:extension.product.uid: metadata.product.uid
-software:languages: metadata.product.lang
-software:name: metadata.product.name
-software:vendor: metadata.product.vendor_name
-software:version: metadata.product.version
-
-# HTTP Activity [4002]
-url:value: http_request.url
-
-# Account Change [3001], Authentication [3002], Authorize Session [3003], User Access Management [3005]
-user-account:account_type: user.account.type
-user-account:display_name: user.account.name
-user-account:user_id: user.account.uid
-
-# Base Event [0]
-x-ibm-finding:alert_id:
- - observables.type_id
- - finding.uid
-x-ibm-finding:description: observables.value
-x-ibm-finding:dst_ip_ref.value: dst_endpoint.ip
-x-ibm-finding:end: end_time
-x-ibm-finding:event_count: count
-x-ibm-finding:finding_type: observables.type
-x-ibm-finding:name:
- - observables.name
- - finding.title
-x-ibm-finding:severity: severity_id
-x-ibm-finding:src_ip_ref.value: src_endpoint.ip
-x-ibm-finding:start: finding.created_time
-x-ibm-finding:time_observed: finding.first_seen_time
-x-ibm-finding:types: finding.types
-
-# All Categories [*]
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.tactic_id: attacks[*].tactics.uid
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.tactic_name: attacks[*].tactics.name
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.technique_id: attacks[*].technique.uid
-x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.version: attacks[*].version
-x-ibm-ttp-tagging:name: attacks[*].technique.name
-
-# All Categories [*]
-x-oca-asset:name:
- - dst_endpoint.name
- - src_endpoint.name
- - device.name
-x-oca-asset:os_name: device.os.name
-x-oca-asset:hostname: device.hostname
-x-oca-asset:device_id: device.uid
-x-oca-asset:ip_refs[*].value: device.network_interfaces[*].ip
-x-oca-asset:mac_refs[*].value: device.network_interfaces[*].mac
-x-oca-asset:os_ref: device.os
-x-oca-asset:architecture: device.hw_info
-x-oca-asset:host_type: device.type
-x-oca-asset:ingress: device.network_interfaces
-x-oca-asset:egress: device.network_interfaces
-x-oca-asset:geo_ref: device.location
-
-# Base Event [0]
-x-oca-event:action:
- - activity
- - activity_name
-x-oca-event:category: category_name
-x-oca-event:code:
- - activity_id
- - category_uid
-x-oca-event:confidence: confidence
-x-oca-event:created: time
-x-oca-event:duration: duration
-x-oca-event:module: class_name
-x-oca-event:network_ref.dst_ref.value: dst_endpoint.ip
-x-oca-event:network_ref.src_ref.value: src_endpoint.ip
-x-oca-event:timezone: timezone_offset
-
-# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007]
-x509-certificate:hashes.SHA-256: file.hashes[?algorithm_id == 3].value
-x509-certificate:hashes.SHA-1: file.hashes[?algorithm_id == 2].value
-x509-certificate:hashes.MD5: file.hashes[?algorithm_id == 1].value
-x509-certificate:version: tls.certificate.version
-x509-certificate:serial_number: tls.certificate.serial_number
-x509-certificate:issuer: tls.certificate.issuer
-x509-certificate:validity_not_before: tls.certificate.created_time
-x509-certificate:validity_not_after: tls.certificate.expiration_time
-x509-certificate:subject: tls.certificate.subject
-x509-certificate:x509_v3_extensions: tls.extension_list
-x509-certificate:signature_algorithm: tls.certificate.fingerprints.algorithm
-
-# Registry Key Activity [201001]
-windows-registry-key:key: win/registry_key.path
-
-# Additional mapping for STIX 2.1
-# File System Activity [1001]
-directory:atime: file.accessed_time
-directory:ctime: file.created_time
-directory:mtime: file.modified_time
-file:atime: file.accessed_time
-file:ctime: file.created_time
-file:mtime: file.modified_time
-
-# Process Activity [1007]
-process:image_ref.name: file.name
+# https://schema.ocsf.io/1.1.0/objects/file
+file:
+ name: file:name
+ size: file:size
+ accessed_time: file:accessed
+ created_time: file:created
+ modified_time: file:modified
+ # This "hashes" notation comes from jmespath (filter projection)
+ # It's much easier to use the ECS notation in this case
+ hashes[?algorithm_id == 1]:
+ value: file:hashes.MD5
+ hashes[?algorithm_id == 2]:
+ value: "file:hashes.'SHA-1'"
+ hashes[?algorithm_id == 3]:
+ value: "file:hashes.'SHA-256'"
+ hashes[?algorithm_id == 4]:
+ value: "file:hashes.'SHA-512'"
+ hashes[?algorithm_id == 5]:
+ value: file:hashes.SSDEEP
+ hashes[?algorithm_id == 6]:
+ value: file:hashes.TLSH
+ hashes[*]:
+ value:
+ - file:hashes.MD5
+ - "file:hashes.'SHA-1'"
+ - "file:hashes.'SHA-256'"
+ - "file:hashes.'SHA-512'"
+ - file:hashes.SSDEEP
+ - file:hashes.TLSH
+
+
+# https://schema.ocsf.io/1.1.0/objects/group
+# group:
+# domain:
+# name:
+# uid:
+
+
+# https://schema.ocsf.io/1.1.0/objects/process
+process:
+ cmd_line: process:command_line
+ name: process:name
+ pid: process:pid
+ uid: process:x_unique_id
+ file:
+ name: process:binary_ref.name
+ parent_folder: process:binary_ref.parent_directory_ref.path
+ # This "hashes" notation comes from jmespath (filter projection)
+ # It's much easier to use the ECS notation in this case
+ hashes[?algorithm_id == 1]:
+ value: process:binary_ref.hashes.MD5
+ hashes[?algorithm_id == 2]:
+ value: process:binary_ref.hashes.'SHA-1'
+ hashes[?algorithm_id == 3]:
+ value: process:binary_ref.hashes.'SHA-256'
+ hashes[?algorithm_id == 4]:
+ value: process:binary_ref.hashes.'SHA-512'
+ hashes[?algorithm_id == 5]:
+ value: process:binary_ref.hashes.SSDEEP
+ hashes[?algorithm_id == 6]:
+ value: process:binary_ref.hashes.TLSH
+ hashes[*]:
+ value:
+ - process:binary_ref.hashes.MD5
+ - process:binary_ref.hashes.'SHA-1'
+ - process:binary_ref.hashes.'SHA-256'
+ - process:binary_ref.hashes.'SHA-512'
+ - process:binary_ref.hashes.SSDEEP
+ - process:binary_ref.hashes.TLSH
+ parent_process:
+ cmd_line: process:parent_ref.command_line
+ name: process:parent_ref.name
+ pid: process:parent_ref.pid
+ uid: process:parent_ref.x_unique_id
+ file:
+ name: process:parent_ref.binary_ref.name
+ parent_folder: process:parent_ref.binary_ref.parent_directory_ref.path
+
+
+# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+dst_endpoint:
+ ip:
+ - network-traffic:dst_ref.value
+ - ipv4-addr:value
+ port: network-traffic:dst_port
+
+
+# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint
+src_endpoint:
+ ip:
+ - network-traffic:src_ref.value
+ - ipv4-addr:value
+ port: network-traffic:src_port
+
+
+# https://schema.ocsf.io/1.1.0/objects/endpoint
+endpoint:
+ ip: ipv4-addr:value
+
+
+# https://schema.ocsf.io/1.1.0/objects/device
+device:
+ ip: ipv4-addr:value
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_traffic
+traffic: # should be `network_traffic`?
+ #TODO: bytes: sum of byte counts?
+ bytes_in: network-traffic:dst_byte_count
+ bytes_out: network-traffic:src_byte_count
+ #TODO: packets: sum of packet counts?
+ packets_in: network-traffic:dst_packets
+ packets_out: network-traffic:src_packets
+
+
+# https://schema.ocsf.io/1.1.0/objects/network_connection_info
+# connection_info:
+# direction:
+# protocol_num:
+# protocol_name:
+# protocol_ver:
+# protocol_ver_id:
+
+
+# https://schema.ocsf.io/1.1.0/objects/certificate
+certificate:
+ expiration_time: x509-certificate:validity_not_after
+ created_time: x509-certificate:validity_not_before
+ serial_number: x509-certificate:serial_number
+ fingerprints[*]:
+ algorithm: x509-certificate:signature_algorithm
+  version: x509-certificate:version
+ issuer: x509-certificate:issuer
+ subject: x509-certificate:subject
+ #uid:
+
+
+# https://schema.ocsf.io/1.1.0/objects/user
+user:
+ full_name: user-account:display_name
+ name: user-account:account_login
+ type: user-account:account_type
+ uid: user-account:user_id
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py
new file mode 100644
index 00000000..82202dcb
--- /dev/null
+++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py
@@ -0,0 +1,110 @@
+"""Kestrel Data Model Map value transformers"""
+
+from datetime import datetime, timezone
+from typing import Callable
+
+from pandas import Series
+
+
+# Dict of "registered" transformers
+_transformers = {}
+
+
+def transformer(func: Callable) -> Callable:
+ """A decorator for registering a transformer"""
+ _transformers[func.__name__] = func
+ return func
+
+
+@transformer
+def to_epoch_ms(value: str) -> int:
+ """Convert a time value to milliseconds since the epoch"""
+ if "." in value:
+ time_pattern = "%Y-%m-%dT%H:%M:%S.%fZ"
+ else:
+ time_pattern = "%Y-%m-%dT%H:%M:%SZ"
+ dt = datetime.strptime(value, time_pattern).replace(tzinfo=timezone.utc)
+ return int(dt.timestamp() * 1000)
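+
+# Example values (mirrored from the unit tests in this change):
+#     to_epoch_ms("2024-03-29T12:57:56.926Z")  # -> 1711717076926
+#     to_epoch_ms("2024-03-29T12:57:56Z")      # -> 1711717076000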
+
+
+@transformer
+def dirname(path: str) -> str: # TODO: rename to winpath_dirname?
+ """Get the directory part of `path`"""
+ path_dir, _, _ = path.rpartition("\\")
+ return path_dir
+
+
+@transformer
+def basename(path: str) -> str:  # TODO: rename to winpath_basename?
+ """Get the filename part of `path`"""
+ _, _, path_file = path.rpartition("\\")
+ return path_file
+
+
+@transformer
+def startswith(value: str) -> str:  # TODO: rename to winpath_startswith?
+    """Format `value` as a LIKE pattern matching paths that start with it"""
+    return f"{value}\\%"
+
+
+@transformer
+def endswith(value: str) -> str:  # TODO: rename to winpath_endswith?
+    """Format `value` as a LIKE pattern matching paths that end with it"""
+    return f"%\\{value}"
+
+
+@transformer
+def to_int(value) -> int:
+ """Ensure `value` is an int"""
+ try:
+ return int(value)
+ except ValueError:
+ # Maybe it's a hexadecimal string?
+ return int(value, 16)
+
+
+@transformer
+def to_str(value) -> str:
+ """Ensure `value` is a str"""
+ return str(value)
+
+
+@transformer
+def ip_version_to_network_layer(value: int) -> str:
+ if value == 4:
+ return "ipv4"
+ elif value == 6:
+ return "ipv6"
+ elif value == 99:
+ return "other"
+ return "unknown"
+
+
+@transformer
+def network_layer_to_ip_version(val: str) -> int:
+ value = val.lower()
+ if value == "ipv4":
+ return 4
+ elif value == "ipv6":
+ return 6
+ elif value == "other":
+ return 99
+ return 0
+
+
+def run_transformer(transformer_name: str, value):
+ """Run the registered transformer with name `transformer_name` on `value`"""
+ func = _transformers.get(transformer_name)
+ if func:
+ result = func(value)
+ else:
+ raise NameError(transformer_name)
+ return result
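+
+# Usage sketch (values mirrored from the unit tests):
+#     run_transformer("basename", r"C:\Windows\System32\cmd.exe")  # -> "cmd.exe"
+#     run_transformer("to_int", "0x4d2")  # -> 1234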
+
+
+def run_transformer_on_series(transformer_name: str, value: Series):
+ """Run the registered transformer with name `transformer_name` on `value`"""
+ func = _transformers.get(transformer_name)
+ if func:
+ result = value.apply(func)
+ else:
+ raise NameError(transformer_name)
+ return result
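+
+# Usage sketch (mirrored from the unit tests):
+#     run_transformer_on_series("basename", Series([r"C:\TMP\cmd.exe"]))
+#     # -> a Series containing ["cmd.exe"]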
diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py
deleted file mode 100644
index 3e15b036..00000000
--- a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py
+++ /dev/null
@@ -1,172 +0,0 @@
-from kestrel.exceptions import MappingParseError
-from kestrel.utils import load_data_file, list_folder_files
-import os
-from typeguard import typechecked
-from typing import (
- Iterable,
- Union,
-)
-import yaml
-
-
-# _entityname_mapping and _entityattr_mapping are dictionaries that contain
-# the info needed to translate:
-# a. queries between:
-# 1. STIX and OCSF
-# 2. ECS and OCSF
-# 3. OCSF and ECS
-# b. results between:
-# 1. ECS and OCSF
-_entityname_mapping = {}
-_entityattr_mapping = {}
-
-
-@typechecked
-def load_standard_config(mapping_pkg: str):
- global _entityname_mapping
- global entityattr_mapping
- if len(_entityname_mapping) > 0 and len(_entityattr_mapping) > 0:
- return
- entityname_mapping_files = list_folder_files(
- mapping_pkg, "entityname", suffix=".yaml"
- )
- for f in entityname_mapping_files:
- parse_entityname_mapping_file(mapping_pkg, f.name)
- entityattr_mapping_files = list_folder_files(
- mapping_pkg, "entityattribute", suffix=".yaml"
- )
- for f in entityattr_mapping_files:
- parse_entityattr_mapping_file(mapping_pkg, f.name)
-
-
-@typechecked
-def parse_entityname_mapping_file(mapping_pkg: str, filename: str):
- global _entityname_mapping
- mapping_fpath = os.path.join("entityname", filename)
- filename_no_ext, _ = filename.split(".")
- src_lang = "stix" if filename_no_ext == "alias" else filename_no_ext
- dst_lang = "ocsf"
- src_dict = _entityname_mapping.get(src_lang, {})
- dst_dict = src_dict.get(dst_lang, {})
- try:
- mapping_str = load_data_file(mapping_pkg, mapping_fpath)
- mapping = yaml.safe_load(mapping_str)
- dst_dict.update(mapping)
- except Exception as ex:
- raise MappingParseError()
- src_dict[dst_lang] = dst_dict
- _entityname_mapping[src_lang] = src_dict
-
-
-@typechecked
-def expand_referenced_field(mapping: dict, key: str, value: dict) -> dict:
- res = {}
- ref = value.get("ref")
- prefix = value.get("prefix")
- target_entity = value.get("target_entity")
- for k, v in mapping.items():
- if k.startswith(f"{ref}."):
- k_no_ref = k[len(ref) + 1 :]
- ref_key = ".".join([key, k_no_ref])
- if prefix is None:
- ref_value = v
- else:
- prefix_tokens = prefix.split(".")
- v_tokens = v.split(".")
- if target_entity is not None:
- v_tokens[0] = target_entity
- ref_value = ".".join(prefix_tokens + v_tokens)
- res[ref_key] = ref_value
- return res
-
-
-@typechecked
-def parse_entityattr_mapping_file(mapping_pkg: str, filename: str):
- global _entityattr_mapping
- mapping_fpath = os.path.join("entityattribute", filename)
- filename_no_ext, _ = filename.split(".")
- src_lang = "stix" if filename_no_ext == "alias" else filename_no_ext
- dst_lang = "ocsf"
- src_dict = _entityattr_mapping.get(src_lang, {})
- dst_dict = src_dict.get(dst_lang, {})
- try:
- mapping_str = load_data_file(mapping_pkg, mapping_fpath)
- mapping = yaml.safe_load(mapping_str)
- mapping_referenced_fields = mapping.pop("referenced_fields", {})
- expanded_refs = {}
- for key, value in mapping_referenced_fields.items():
- expanded_ref = expand_referenced_field(mapping, key, value)
- expanded_refs.update(expanded_ref)
- mapping.update(expanded_refs)
- dst_dict.update(mapping)
- except Exception as ex:
- raise MappingParseError()
- src_dict[dst_lang] = dst_dict
- _entityattr_mapping[src_lang] = src_dict
-
-
-def load_custom_config():
- # ~/.config/kestrel/mapping/entity/*.yaml
- # ~/.config/kestrel/mapping/property/*.yaml
- return
-
-
-@typechecked
-def normalize_entity(
- entityname: str, src_lang: str, dst_lang: str
-) -> Union[str, Iterable[str]]:
- return (
- _entityname_mapping.get(src_lang, {})
- .get(dst_lang, {})
- .get(entityname, entityname)
- )
-
-
-@typechecked
-def normalize_property(
- entityattr: str, src_lang: str, dst_lang: str
-) -> Union[str, Iterable[str]]:
- return (
- _entityattr_mapping.get(src_lang, {})
- .get(dst_lang, {})
- .get(entityattr, entityattr)
- )
-
-
-@typechecked
-def from_ocsf_key_value_pair(from_ocsf_dict: dict, key: str, value: str):
- values = from_ocsf_dict.get(key, [])
- if value not in values:
- values.append(value)
- from_ocsf_dict[key] = values
-
-
-@typechecked
-def from_ocsf_dictionary(to_oscf_dict: dict) -> dict:
- from_ocsf_dict = {}
- for key, value in to_oscf_dict.items():
- if isinstance(value, list):
- for val in value:
- from_ocsf_key_value_pair(from_ocsf_dict, val, key)
- else:
- from_ocsf_key_value_pair(from_ocsf_dict, value, key)
- return from_ocsf_dict
-
-
-@typechecked
-def generate_from_ocsf_dictionaries(source_schema_name: str) -> (dict, dict):
- attr_map = _entityattr_mapping.get(source_schema_name, {}).get("ocsf", {})
- name_map = _entityname_mapping.get(source_schema_name, {}).get("ocsf", {})
- from_ocsf_names = from_ocsf_dictionary(name_map)
- from_ocsf_attrs = from_ocsf_dictionary(attr_map)
- return (from_ocsf_names, from_ocsf_attrs)
-
-
-# if __name__ == "__main__":
-# load_standard_config("kestrel.mapping")
-# res = normalize_entity("ecs", "ocsf", "process")
-# from_ocsf_names, from_ocsf_attrs = generate_from_ocsf_dictionaries("ecs")
-# print("\n\n\n NAMES ")
-# print(yaml.dump(from_ocsf_names))
-# print("\n\n\n ATTRIBUTES ")
-# print(yaml.dump(from_ocsf_attrs))
diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py
index bbbe1ad4..48ebf1f8 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/session.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/session.py
@@ -1,17 +1,17 @@
import logging
from contextlib import AbstractContextManager
-from typing import Iterable
from uuid import UUID, uuid4
-
-from pandas import DataFrame
+from typing import Iterable
from typeguard import typechecked
+from kestrel.display import Display, GraphExplanation
from kestrel.ir.graph import IRGraph
+from kestrel.ir.instructions import Instruction, Explain
from kestrel.frontend.parser import parse_kestrel
from kestrel.cache import AbstractCache, SqliteCache
-from kestrel.interface.datasource import AbstractDataSourceInterface
-from kestrel.interface.datasource.manager import DataSourceManager
-from kestrel.interface.datasource.utils import get_interface_by_name
+from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER
+from kestrel.interface import AbstractInterface, InterfaceManager
+from kestrel.exceptions import InstructionNotFound
_logger = logging.getLogger(__name__)
@@ -22,19 +22,14 @@ class Session(AbstractContextManager):
"""Kestrel huntflow execution session"""
def __init__(self):
- self.session_id: UUID = uuid4()
- self.irgraph: IRGraph = IRGraph()
- self.cache: AbstractCache = SqliteCache()
+ self.session_id = uuid4()
+ self.irgraph = IRGraph()
- # Datasource interfaces in this session
- # Cache is a special datasource interface and should always be added
- self.interfaces: Iterable[AbstractDataSourceInterface] = [self.cache]
+ # load all interfaces; cache is a special interface
+ cache = SqliteCache()
+ self.interface_manager = InterfaceManager([cache])
- # Load data sources and add to list
- data_source_manager = DataSourceManager()
- self.interfaces.extend(data_source_manager.interfaces())
-
- def execute(self, huntflow_block: str) -> Iterable[DataFrame]:
+ def execute(self, huntflow_block: str) -> Iterable[Display]:
"""Execute a Kestrel huntflow block.
Execute a Kestrel statement or multiple consecutive statements (a
@@ -50,7 +45,7 @@ def execute(self, huntflow_block: str) -> Iterable[DataFrame]:
"""
return list(self.execute_to_generate(huntflow_block))
- def execute_to_generate(self, huntflow_block: str) -> Iterable[DataFrame]:
+ def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]:
"""Execute a Kestrel huntflow and put results in a generator.
Parameters:
@@ -59,24 +54,55 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[DataFrame]:
Yields:
Evaluated result per Return instruction
"""
-
- # TODO: return type generalization
-
irgraph_new = parse_kestrel(huntflow_block)
self.irgraph.update(irgraph_new)
for ret in irgraph_new.get_returns():
- ret_df = None
- while ret_df is None:
- for g in self.irgraph.find_dependent_subgraphs_of_node(ret, self.cache):
- interface = get_interface_by_name(g.interface, self.interfaces)
- for iid, df in interface.evaluate_graph(g).items():
- if g.interface != self.cache.name:
- self.cache[iid] = df
- if iid == ret.id:
- ret_df = df
- else:
- yield ret_df
+ yield self.evaluate_instruction(ret)
+
+ def evaluate_instruction(self, ins: Instruction) -> Display:
+ """Evaluate a single Instruction.
+
+ Parameters:
+ ins: the instruction to evaluate
+
+ Returns:
+ Evaluated result (Kestrel Display object)
+ """
+ if ins not in self.irgraph:
+ raise InstructionNotFound(ins.to_dict())
+
+ pred = self.irgraph.get_trunk_n_branches(ins)[0]
+ is_explain = isinstance(pred, Explain)
+ display = GraphExplanation([])
+
+ _interface_manager = (
+ self.interface_manager.copy_with_virtual_cache()
+ if is_explain
+ else self.interface_manager
+ )
+ _cache = _interface_manager[CACHE_INTERFACE_IDENTIFIER]
+
+    # The current logic caches results from non-cache interfaces and
+    # evaluates the final step in the cache.
+ # TODO: may evaluate cache first, then push dependent variables to the
+ # last interface to eval; this requires priority of interfaces
+ while True:
+ for g in self.irgraph.find_dependent_subgraphs_of_node(ins, _cache):
+ interface = _interface_manager[g.interface]
+ for iid, _display in (
+ interface.explain_graph(g)
+ if is_explain
+ else interface.evaluate_graph(g)
+ ).items():
+ if is_explain:
+ display.graphlets.append(_display)
+ else:
+ display = _display
+ if interface is not _cache:
+ _cache[iid] = display
+ if iid == ins.id:
+ return display
def do_complete(self, huntflow_block: str, cursor_pos: int):
"""Kestrel code auto-completion.
@@ -97,9 +123,8 @@ def close(self):
"""
# Note there are two conditions that trigger this function, so it is probably executed twice
# Be careful to write the logic in this function to avoid deleting nonexist files/dirs
- if self.cache:
- del self.cache
- self.cache = None
+ if CACHE_INTERFACE_IDENTIFIER in self.interface_manager:
+ self.interface_manager.del_cache()
def __exit__(self, exception_type, exception_value, traceback):
self.close()
diff --git a/packages-nextgen/kestrel_core/src/kestrel/utils.py b/packages-nextgen/kestrel_core/src/kestrel/utils.py
index 70db2ae3..02cbb5b3 100644
--- a/packages-nextgen/kestrel_core/src/kestrel/utils.py
+++ b/packages-nextgen/kestrel_core/src/kestrel/utils.py
@@ -5,10 +5,11 @@
from pathlib import Path
from pkgutil import get_data
from typeguard import typechecked
-from typing import Union, Mapping
+from typing import Optional, Mapping, Iterable
-def load_data_file(package_name, file_name):
+@typechecked
+def load_data_file(package_name: str, file_name: str) -> str:
try:
# resources.files() is introduced in Python 3.9
content = resources.files(package_name).joinpath(file_name).read_text()
@@ -20,7 +21,16 @@ def load_data_file(package_name, file_name):
return content
-def list_folder_files(package_name, folder_name, prefix=None, suffix=None):
+@typechecked
+def list_folder_files(
+ package_name: str,
+ folder_name: str,
+ prefix: Optional[str] = None,
+ extension: Optional[str] = None,
+) -> Iterable[str]:
+    # preprocess extension to add a dot if not there
+ if extension and extension[0] != ".":
+ extension = "." + extension
try:
file_paths = resources.files(package_name).joinpath(folder_name).iterdir()
except AttributeError:
@@ -41,7 +51,7 @@ def list_folder_files(package_name, folder_name, prefix=None, suffix=None):
for f in file_paths
if (
f.is_file()
- and (f.name.endswith(suffix) if suffix else True)
+ and (f.name.endswith(extension) if extension else True)
and (f.name.startswith(prefix) if prefix else True)
)
)
@@ -57,7 +67,7 @@ def unescape_quoted_string(s: str) -> str:
@typechecked
-def update_nested_dict(dict_old: Mapping, dict_new: Union[Mapping, None]):
+def update_nested_dict(dict_old: Mapping, dict_new: Optional[Mapping]) -> Mapping:
if dict_new:
for k, v in dict_new.items():
if isinstance(v, collections.abc.Mapping) and k in dict_old:
diff --git a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py b/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py
index f750c38d..1a0bb9ca 100644
--- a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py
+++ b/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py
@@ -3,6 +3,7 @@
from uuid import uuid4
from kestrel.cache import InMemoryCache
+from kestrel.cache.inmemory import InMemoryCacheVirtual
from kestrel.ir.graph import IRGraph, IRGraphEvaluable
from kestrel.frontend.parser import parse_kestrel
@@ -84,3 +85,37 @@ def test_eval_filter_with_ref():
assert len(rets) == 1
df = mapping[rets[0].id]
assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ]
+
+def test_get_virtual_copy():
+ stmt = """
+proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
+ , {"name": "explorer.exe", "pid": 99}
+ , {"name": "firefox.exe", "pid": 201}
+ , {"name": "chrome.exe", "pid": 205}
+ ]
+browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe'
+"""
+ graph = IRGraphEvaluable(parse_kestrel(stmt))
+ c = InMemoryCache()
+ mapping = c.evaluate_graph(graph)
+ v = c.get_virtual_copy()
+ new_entry = uuid4()
+ v[new_entry] = True
+
+ # v[new_entry] calls the right method
+ assert isinstance(v, InMemoryCacheVirtual)
+ assert v[new_entry].startswith("virtual")
+
+ # v[new_entry] does not hit v.cache
+ assert len(c.cache) == 2
+ assert len(v.cache) == 2
+
+ # the two cache_catalog are different
+ assert new_entry not in c
+ assert new_entry in v
+ del v[new_entry]
+ assert new_entry not in v
+ for u in c:
+ del v[u]
+ assert len(v) == 0
+ assert len(c) == 2
diff --git a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py
index f5b99090..5db07fb6 100644
--- a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py
+++ b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py
@@ -1,7 +1,8 @@
from uuid import uuid4
from pandas import DataFrame
-from kestrel.cache.sqlite import SqliteCache
+from kestrel.cache import SqliteCache
+from kestrel.cache.sqlite import SqliteCacheVirtual
from kestrel.ir.graph import IRGraphEvaluable
from kestrel.frontend.parser import parse_kestrel
@@ -150,3 +151,33 @@ def test_eval_filter_with_ref():
assert len(rets) == 1
df = mapping[rets[0].id]
assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ]
+
+def test_get_virtual_copy():
+ stmt = """
+proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
+ , {"name": "explorer.exe", "pid": 99}
+ , {"name": "firefox.exe", "pid": 201}
+ , {"name": "chrome.exe", "pid": 205}
+ ]
+browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe'
+"""
+ graph = IRGraphEvaluable(parse_kestrel(stmt))
+ c = SqliteCache()
+ mapping = c.evaluate_graph(graph)
+ v = c.get_virtual_copy()
+ new_entry = uuid4()
+ v[new_entry] = True
+
+ # v[new_entry] calls the right method
+ assert isinstance(v, SqliteCacheVirtual)
+ assert v[new_entry].endswith("v")
+
+ # the two cache_catalog are different
+ assert new_entry not in c
+ assert new_entry in v
+ del v[new_entry]
+ assert new_entry not in v
+ for u in c:
+ del v[u]
+ assert len(v) == 0
+ assert len(c) == 1
diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py
index dc6164b6..4f9f7507 100644
--- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py
+++ b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py
@@ -1,7 +1,7 @@
import pytest
from pandas import DataFrame
-from kestrel.interface.datasource.codegen.dataframe import (
+from kestrel.interface.codegen.dataframe import (
evaluate_source_instruction,
evaluate_transforming_instruction,
)
@@ -56,7 +56,7 @@ def test_evaluate_ProjectAttrs():
def test_evaluate_Construct_Filter_ProjectAttrs():
- stmt = """
+ stmt = r"""
proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
, {"name": "explorer.exe", "pid": 99}
, {"name": "firefox.exe", "pid": 201}
diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py
index 27e0aca4..1cc3c46c 100644
--- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py
+++ b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py
@@ -1,7 +1,7 @@
from datetime import datetime
from dateutil import parser
-from kestrel.interface.datasource.codegen.sql import SqlTranslator
+from kestrel.interface.codegen.sql import SqlTranslator
from kestrel.ir.filter import (
BoolExp,
ExpOp,
diff --git a/packages-nextgen/kestrel_core/tests/test_ir_graph.py b/packages-nextgen/kestrel_core/tests/test_ir_graph.py
index 38fa0c1c..cd77da7d 100644
--- a/packages-nextgen/kestrel_core/tests/test_ir_graph.py
+++ b/packages-nextgen/kestrel_core/tests/test_ir_graph.py
@@ -332,22 +332,22 @@ def test_find_dependent_subgraphs_of_node():
assert len(c) == 2
gs = graph.find_dependent_subgraphs_of_node(ret, c)
assert len(gs) == 1
- assert len(gs[0]) == 10
+ assert len(gs[0]) == 11
assert p2 in gs[0]
assert p21 in gs[0]
assert p4 in gs[0]
- assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Filter, Filter, Variable, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
+ assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Filter, Filter, Variable, Variable, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
p4_projattr = next(graph.successors(p4))
c[p4_projattr.id] = DataFrame()
gs = graph.find_dependent_subgraphs_of_node(ret, c)
assert len(gs) == 1
- assert len(gs[0]) == 7
+ assert len(gs[0]) == 8
assert p4_projattr.id in c
assert p4_projattr in gs[0]
assert p5 in gs[0]
assert ret in gs[0]
- assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Return, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
+ assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Return, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs])
def test_find_simple_query_subgraphs():
@@ -400,7 +400,7 @@ def test_find_simple_query_subgraphs():
gs = graph.find_dependent_subgraphs_of_node(graph.get_returns()[0], c)
assert len(gs) == 1
assert sink in gs[0]
- assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Filter, ProjectAttrs, DataSource, Return, ProjectEntity])
+ assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Filter, ProjectAttrs, DataSource, Return, ProjectEntity, Variable])
for g in gs[0].find_simple_query_subgraphs(c):
assert Counter(map(type, g.nodes())) == Counter([ProjectAttrs, Variable, Filter, ProjectEntity, DataSource])
assert sink in g
diff --git a/packages-nextgen/kestrel_core/tests/test_mapping.py b/packages-nextgen/kestrel_core/tests/test_mapping.py
deleted file mode 100644
index c0860c42..00000000
--- a/packages-nextgen/kestrel_core/tests/test_mapping.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import kestrel.mapping.utils as mapping_utils
-
-
-def test_mapping_load_config():
- mapping_utils.load_standard_config("kestrel.mapping")
- entity_name_map = mapping_utils._entityname_mapping
- assert "stix" in entity_name_map
- assert "ocsf" in entity_name_map.get("stix", {})
- assert "ecs" in entity_name_map
- assert "ocsf" in entity_name_map.get("ecs", {})
- entity_attr_map = mapping_utils._entityattr_mapping
- assert "stix" in entity_attr_map
- assert "ocsf" in entity_attr_map.get("stix", {})
- assert "ecs" in entity_attr_map
- assert "ocsf" in entity_attr_map.get("ecs", {})
-
-
-def test_mapping_entity_names():
- res = mapping_utils.normalize_entity("process", "ecs", "ocsf")
- assert res == "process"
- res = mapping_utils.normalize_entity("i_dont_exist", "ecs", "ocsf")
- assert res == "i_dont_exist"
- res = mapping_utils.normalize_entity("network", "ecs", "ocsf")
- assert res == "network_activity"
-
-
-def test_mapping_entity_attributes():
- res = mapping_utils.normalize_property("process.parent.executable",
- "ecs", "ocsf")
- assert res == "process.parent_process.file.path"
- res = mapping_utils.normalize_property("process.hash.md5", "ecs", "ocsf")
- assert res == "process.file.hashes[?algorithm_id == 1].value"
- res = mapping_utils.normalize_property("process.group.id", "ecs", "ocsf")
- assert res == "process.group.uid"
- res = mapping_utils.normalize_property("processx.non.existent",
- "ecs", "ocsf")
- assert res == "processx.non.existent"
- res = mapping_utils.normalize_property("file.hash.md5", "ecs", "ocsf")
- assert res == "file.hashes[?algorithm_id == 1].value"
-
-
-def test_from_ocsf_dicionaries():
- from_ocsf_names, from_ocsf_attrs = mapping_utils.generate_from_ocsf_dictionaries("ecs")
- res = from_ocsf_names.get("process")
- assert (len(res) == 1 and "process" in res)
- res = from_ocsf_names.get("network_endpoint")
- assert (len(res) == 4 and "client" in res and "destination" in res and
- "server" in res and "source" in res)
- res = from_ocsf_attrs.get("process.name")
- assert (len(res) == 1 and "process.name" in res)
- res = from_ocsf_attrs.get("process.cmd_line")
- assert (len(res) == 1 and "process.command_line" in res)
- res = from_ocsf_attrs.get("process.file.hashes[?algorithm_id == 1].value")
- assert (len(res) == 1 and "process.hash.md5" in res)
- res = from_ocsf_attrs.get("process.file.path")
- assert (len(res) == 1 and "process.executable" in res)
- res = from_ocsf_attrs.get("process.parent_process.file.path")
- assert (len(res) == 1 and "process.parent.executable" in res)
- res = from_ocsf_attrs.get("process.parent_process.tid")
- assert (len(res) == 1 and "process.parent.thread.id" in res)
- res = from_ocsf_attrs.get("src_endpoint.domain")
- assert (len(res) == 2 and "client.domain" in res and
- "source.domain" in res)
- res = from_ocsf_attrs.get("src_endpoint.location.city")
- assert (len(res) == 2 and "client.geo.city_name" in res and
- "source.geo.city_name" in res)
- res = from_ocsf_attrs.get("tls.certificate.created_time")
- assert (len(res) == 1 and "file.x509.not_before" in res)
- res = from_ocsf_attrs.get("tls.certificate.expiration_time")
- assert (len(res) == 1 and "file.x509.not_after" in res)
- res = from_ocsf_attrs.get("tls.certificate.fingerprints.algorithm")
- assert (len(res) == 1 and "file.x509.signature_algorithm" in res)
- res = from_ocsf_attrs.get("traffic.packets_in")
- assert (len(res) == 2 and "destination.packets" in res and
- "server.packets" in res)
- res = from_ocsf_attrs.get("file.hashes[?algorithm_id == 4].value")
- assert (len(res) == 2 and "hash.sha512" in res and
- "file.hash.sha512" in res)
diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py
new file mode 100644
index 00000000..93abe83e
--- /dev/null
+++ b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py
@@ -0,0 +1,200 @@
+import pytest
+
+import pandas as pd
+
+from kestrel.mapping.data_model import (
+ load_default_mapping,
+ reverse_mapping,
+ translate_comparison_to_native,
+ translate_comparison_to_ocsf,
+ translate_dataframe,
+ translate_projection_to_native,
+)
+
+
+# A "custom" mapping for an opensearch/elasticsearch datasource.
+# This mapping works with data from Blue Team Village's 2023 DefCon CTF, for example.
+WINLOGBEAT_MAPPING = {
+ "file": {
+ "path": "file.path",
+ "name": "file.name"
+ },
+ "process": {
+ "cmd_line": "winlog.event_data.CommandLine",
+ "pid": {
+ "native_field": "winlog.event_data.ProcessId",
+ "native_value": "to_str",
+ "ocsf_value": "to_int"
+ },
+ "uid": "winlog.event_data.ProcessGuid",
+ "file": {
+ "path": "winlog.event_data.Image",
+ "name": [
+ {
+ "native_field": "winlog.event_data.Image",
+ "native_op": "LIKE",
+ "native_value": "endswith",
+ "ocsf_value": "basename"
+ }
+ ],
+ "parent_folder": [
+ {
+ "native_field": "winlog.event_data.Image",
+ "native_op": "LIKE",
+ "native_value": "startswith",
+ "ocsf_value": "dirname"
+ }
+ ]
+ },
+ "parent_process": {
+ "cmd_line": "winlog.event_data.ParentCommandLine",
+ "pid": "winlog.event_data.ParentProcessId",
+ "uid": "winlog.event_data.ParentProcessGuid",
+ "file": {
+ "path": "winlog.event_data.ParentImage",
+ "name": [
+ {
+ "native_field": "winlog.event_data.ParentImage",
+ "native_op": "LIKE",
+ "native_value": "endswith",
+ "ocsf_value": "basename"
+ }
+ ],
+ "parent_folder": [
+ {
+ "native_field": "winlog.event_data.ParentImage",
+ "native_op": "LIKE",
+ "native_value": "startswith",
+ "ocsf_value": "dirname"
+ }
+ ]
+ }
+ }
+ },
+ "dst_endpoint": {
+ "ip": "winlog.event_data.DestinationIp",
+ "port": "winlog.event_data.DestinationPort"
+ },
+ "src_endpoint": {
+ "ip": "winlog.event_data.SourceIp",
+ "port": "winlog.event_data.SourcePort"
+ }
+}
+
+
+# Simplified subset of the standard mapping
+STIX_MAPPING = {
+ "device": {
+ "ip": "ipv4-addr:value"
+ },
+ "endpoint": {
+ "ip": "ipv4-addr:value"
+ },
+}
+
+
+# This mapping is used in 2 places:
+# - frontend comparison from ECS to OCSF
+# - backend comparison from OCSF to ECS (datasource)
+ECS_MAPPING = load_default_mapping("ecs")
+
+
+def test_reverse_mapping_ipv4():
+ reverse_map = reverse_mapping(STIX_MAPPING)
+ ipv4 = reverse_map["ipv4-addr:value"]
+ assert isinstance(ipv4, list)
+ assert set(ipv4) == {"device.ip", "endpoint.ip"}
+
+
+def test_reverse_mapping_executable():
+ reverse_map = reverse_mapping(ECS_MAPPING)
+ exe = reverse_map["process.executable"]
+ assert isinstance(exe, list)
+ assert "process.file.path" in exe
+ for item in exe:
+ if isinstance(item, dict):
+ assert "ocsf_field" in item
+ if item["ocsf_field"] == "process.file.name":
+ # Make sure all metadata from the mapping got reversed
+ assert item["native_value"] == "endswith"
+ assert item["native_op"] == "LIKE"
+ assert item["ocsf_value"] == "basename"
+
+
+@pytest.mark.parametrize(
+ "dmm, field, op, value, expected_result",
+ [
+ (WINLOGBEAT_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe",
+ [("winlog.event_data.Image", "=", "C:\\TMP\\foo.exe")]),
+ (WINLOGBEAT_MAPPING, "process.file.name", "=", "foo.exe",
+ [("winlog.event_data.Image", "LIKE", "%\\foo.exe")]),
+ (ECS_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe",
+ [("process.executable", "=", "C:\\TMP\\foo.exe")]),
+ (ECS_MAPPING, "process.file.name", "=", "foo.exe",
+ [("process.executable", "LIKE", "%\\foo.exe")]),
+ ],
+)
+def test_translate_comparison_to_native(dmm, field, op, value, expected_result):
+ assert translate_comparison_to_native(dmm, field, op, value) == expected_result
+
+
+@pytest.mark.parametrize(
+ "dmm, field, op, value, expected_result",
+ [
+ (ECS_MAPPING, "process.executable", "=", "C:\\TMP\\foo.exe",
+ [
+ ("process.file.path", "=", "C:\\TMP\\foo.exe"),
+ ("process.file.name", "=", "foo.exe"),
+ ("process.file.parent_folder", "=", "C:\\TMP"),
+ ]),
+ (ECS_MAPPING, "process.executable", "LIKE", "%\\foo.exe",
+ [
+ ("process.file.path", "LIKE", "%\\foo.exe"),
+ ("process.file.name", "LIKE", "foo.exe"), #TODO: could optimize this to "="
+ ("process.file.parent_folder", "LIKE", "%"), #TODO: could eliminate this?
+ ]),
+ (STIX_MAPPING, "ipv4-addr:value", "=", "198.51.100.13",
+ [
+ ("device.ip", "=", "198.51.100.13"),
+ ("endpoint.ip", "=", "198.51.100.13"),
+ ]),
+ ],
+)
+def test_translate_comparison_to_ocsf(dmm, field, op, value, expected_result):
+ """Test the translate function."""
+ reverse_dmm = reverse_mapping(dmm) # Make the dmms fixtures?
+ assert set(translate_comparison_to_ocsf(reverse_dmm, field, op, value)) == set(expected_result)
+
+
+@pytest.mark.parametrize(
+ "dmm, entity, field, expected_result",
+ [
+ (WINLOGBEAT_MAPPING, "process", ["file.name", "pid"],
+ [("winlog.event_data.Image", "file.name"), ("winlog.event_data.ProcessId", "pid")]),
+ (WINLOGBEAT_MAPPING, "process", None,
+ [("winlog.event_data.CommandLine", "cmd_line"),
+ ("winlog.event_data.ProcessId", "pid"),
+ ("winlog.event_data.ProcessGuid", "uid"),
+ ("winlog.event_data.Image", "file.path"),
+ ("winlog.event_data.Image", "file.name"),
+ ("winlog.event_data.Image", "file.parent_folder"),
+ ("winlog.event_data.ParentCommandLine", "parent_process.cmd_line"),
+ ("winlog.event_data.ParentProcessId", "parent_process.pid"),
+ ("winlog.event_data.ParentProcessGuid", "parent_process.uid"),
+ ("winlog.event_data.ParentImage", "parent_process.file.path"),
+ ("winlog.event_data.ParentImage", "parent_process.file.name"),
+ ("winlog.event_data.ParentImage", "parent_process.file.parent_folder"),
+ ]),
+ ],
+)
+def test_translate_projection_to_native(dmm, entity, field, expected_result):
+ assert translate_projection_to_native(dmm, entity, field) == expected_result
+
+
+def test_translate_dataframe(): #TODO: more testing here
+ df = pd.DataFrame({"file.path": [r"C:\Windows\System32\cmd.exe", r"C:\TMP"],
+ "pid": [1, 2]})
+ dmm = load_default_mapping("ecs")
+ df = translate_dataframe(df, dmm["process"])
+ #TODO:assert df["file.name"].iloc[0] == "cmd.exe"
diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py
new file mode 100644
index 00000000..9e454925
--- /dev/null
+++ b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import pytest
+
+from kestrel.mapping.transformers import (
+ run_transformer,
+ run_transformer_on_series,
+)
+
+
+@pytest.mark.parametrize(
+ "transform, value, expected", [
+ ("dirname", r"C:\Windows\System32\cmd.exe", r"C:\Windows\System32"),
+ ("basename", r"C:\Windows\System32\cmd.exe", r"cmd.exe"),
+ ("startswith", r"C:\Windows\System32", r"C:\Windows\System32\%"),
+ ("endswith", "cmd.exe", r"%\cmd.exe"),
+ ("to_int", 1234, 1234),
+ ("to_int", 1234.1234, 1234), # Maybe this should fail?
+ ("to_int", "1234", 1234),
+ ("to_int", "0x4d2", 1234),
+ ("to_str", "1234", "1234"),
+ ("to_str", 1234, "1234"),
+ ("to_epoch_ms", "2024-03-29T12:57:56.926Z", 1711717076926),
+ ("to_epoch_ms", "2024-03-29T12:57:56.92Z", 1711717076920),
+ ("to_epoch_ms", "2024-03-29T12:57:56.9Z", 1711717076900),
+ ("to_epoch_ms", "2024-03-29T12:57:56Z", 1711717076000),
+ ]
+)
+def test_run_transformer(transform, value, expected):
+ assert run_transformer(transform, value) == expected
+
+
+def test_run_series_basename():
+ data = pd.Series([r"C:\Windows\System32\cmd.exe", r"C:\TMP"])
+ result = list(run_transformer_on_series("basename", data))
+ assert result == ["cmd.exe", "TMP"]
diff --git a/packages-nextgen/kestrel_core/tests/test_parser.py b/packages-nextgen/kestrel_core/tests/test_parser.py
index 14faa856..1ca5d314 100644
--- a/packages-nextgen/kestrel_core/tests/test_parser.py
+++ b/packages-nextgen/kestrel_core/tests/test_parser.py
@@ -1,5 +1,6 @@
import json
import pytest
+from collections import Counter
from datetime import datetime, timedelta, timezone
from kestrel.frontend.parser import parse_kestrel
@@ -16,6 +17,8 @@
Reference,
Sort,
Variable,
+ Explain,
+ Return,
)
@@ -108,10 +111,10 @@ def test_parser_mapping_single_comparison_to_multiple_values():
stmt = "x = GET ipv4-addr FROM if://ds WHERE value = '192.168.22.3'"
parse_filter = get_parsed_filter_exp(stmt)
comps = parse_filter.comps
- assert isinstance(comps, list) and len(comps) == 3
+ assert isinstance(comps, list) and len(comps) == 4
fields = [x.field for x in comps]
assert ("dst_endpoint.ip" in fields and "src_endpoint.ip" in fields and
- "device.ip" in fields)
+ "device.ip" in fields and "endpoint.ip" in fields)
def test_parser_mapping_multiple_comparison_to_multiple_values():
@@ -121,12 +124,9 @@ def test_parser_mapping_multiple_comparison_to_multiple_values():
field1 = parse_filter.lhs.field
assert field1 == 'file.name'
field2 = parse_filter.rhs.lhs.field
- assert field2 == 'process.name'
- comps3 = parse_filter.rhs.rhs.comps
- assert isinstance(comps3, list) and len(comps3) == 2
- fields3 = [x.field for x in comps3]
- assert ("actor.process.name" in fields3 and
- "process.parent_process.name" in fields3)
+    assert field2 == 'name'  # i.e., 'process.name' relative to the process entity
+ field3 = parse_filter.rhs.rhs.field
+ assert field3 == "parent_process.name"
def test_parser_new_json():
@@ -265,3 +265,26 @@ def test_parser_disp_after_new():
assert (proj, limit) in graph.edges
assert (limit, offset) in graph.edges
assert (offset, ret) in graph.edges
+
+
+def test_parser_explain_alone():
+ stmt = "EXPLAIN abc"
+ graph = parse_kestrel(stmt)
+ assert len(graph) == 3
+ assert len(graph.edges) == 2
+ assert Counter(map(type, graph.nodes())) == Counter([Reference, Explain, Return])
+
+
+def test_parser_explain_dereferred():
+ stmt = """
+proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
+ , {"name": "explorer.exe", "pid": 99}
+ , {"name": "firefox.exe", "pid": 201}
+ , {"name": "chrome.exe", "pid": 205}
+ ]
+EXPLAIN proclist
+"""
+ graph = parse_kestrel(stmt)
+ assert len(graph) == 4
+ assert len(graph.edges) == 3
+ assert Counter(map(type, graph.nodes())) == Counter([Construct, Variable, Explain, Return])
diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py
index bcbfdeb0..115154d4 100644
--- a/packages-nextgen/kestrel_core/tests/test_session.py
+++ b/packages-nextgen/kestrel_core/tests/test_session.py
@@ -1,6 +1,14 @@
import pytest
+import os
from kestrel import Session
from pandas import DataFrame
+from uuid import uuid4
+
+from kestrel.display import GraphExplanation
+from kestrel.ir.instructions import Construct
+from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER
+from kestrel.frontend.parser import parse_kestrel
+from kestrel.cache import SqliteCache
def test_execute_in_cache():
@@ -26,3 +34,153 @@ def test_execute_in_cache():
assert b2.equals(next(res))
with pytest.raises(StopIteration):
next(res)
+
+
+def test_double_deref_in_cache():
+    # When the Filter node is dereferenced twice, it should be deepcopied each
+    # time to avoid shared-state issues between the two DISP evaluations
+ hf = """
+proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
+ , {"name": "explorer.exe", "pid": 99}
+ , {"name": "firefox.exe", "pid": 201}
+ , {"name": "chrome.exe", "pid": 205}
+ ]
+px = proclist WHERE name != "cmd.exe" AND pid = 205
+chrome = proclist WHERE pid IN px.pid
+DISP chrome
+DISP chrome
+"""
+ df = DataFrame([ {"name": "chrome.exe", "pid": 205} ])
+ with Session() as session:
+ res = session.execute_to_generate(hf)
+ assert df.equals(next(res))
+ assert df.equals(next(res))
+ with pytest.raises(StopIteration):
+ next(res)
+
+
+def test_explain_in_cache():
+ hf = """
+proclist = NEW process [ {"name": "cmd.exe", "pid": 123}
+ , {"name": "explorer.exe", "pid": 99}
+ , {"name": "firefox.exe", "pid": 201}
+ , {"name": "chrome.exe", "pid": 205}
+ ]
+browsers = proclist WHERE name != "cmd.exe"
+chrome = browsers WHERE pid = 205
+EXPLAIN chrome
+"""
+ with Session() as session:
+ ress = session.execute_to_generate(hf)
+ res = next(ress)
+ assert isinstance(res, GraphExplanation)
+ assert len(res.graphlets) == 1
+ ge = res.graphlets[0]
+ assert ge.graph == session.irgraph.to_dict()
+ construct = session.irgraph.get_nodes_by_type(Construct)[0]
+ assert ge.query.language == "SQL"
+ stmt = ge.query.statement.replace('"', '')
+ assert stmt == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {construct.id.hex}v) AS proclist \nWHERE name != \'cmd.exe\') AS browsers \nWHERE pid = 205) AS chrome'
+ with pytest.raises(StopIteration):
+ next(ress)
+
+
+def test_multi_interface_explain():
+
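+    # simulate extra interfaces by subclassing SqliteCache and overriding
+    # schemes(): the session gets fake "datalake" and "gateway" interfaces
+    # in addition to its own cache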
+ class DataLake(SqliteCache):
+ @staticmethod
+ def schemes():
+ return ["datalake"]
+
+ class Gateway(SqliteCache):
+ @staticmethod
+ def schemes():
+ return ["gateway"]
+
+ extra_db = []
+ with Session() as session:
+ stmt1 = """
+procs = NEW process [ {"name": "cmd.exe", "pid": 123}
+ , {"name": "explorer.exe", "pid": 99}
+ , {"name": "firefox.exe", "pid": 201}
+ , {"name": "chrome.exe", "pid": 205}
+ ]
+DISP procs
+"""
+ session.execute(stmt1)
+ session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = DataLake
+ session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "datalake"
+
+ new_cache = SqliteCache(session_id = uuid4())
+ extra_db.append(new_cache.db_path)
+ session.interface_manager.interfaces.append(new_cache)
+ stmt2 = """
+nt = NEW network [ {"pid": 123, "source": "192.168.1.1", "destination": "1.1.1.1"}
+ , {"pid": 205, "source": "192.168.1.1", "destination": "1.1.1.2"}
+ ]
+DISP nt
+"""
+ session.execute(stmt2)
+ session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = Gateway
+ session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "gateway"
+
+ new_cache = SqliteCache(session_id = uuid4())
+ extra_db.append(new_cache.db_path)
+ session.interface_manager.interfaces.append(new_cache)
+ stmt3 = """
+domain = NEW domain [ {"ip": "1.1.1.1", "domain": "cloudflare.com"}
+ , {"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"}
+ ]
+DISP domain
+"""
+ session.execute(stmt3)
+
+ stmt = """
+p2 = procs WHERE name IN ("firefox.exe", "chrome.exe")
+ntx = nt WHERE pid IN p2.pid
+d2 = domain WHERE ip IN ntx.destination
+EXPLAIN d2
+DISP d2
+"""
+ ress = session.execute_to_generate(stmt)
+ disp = next(ress)
+ df_res = next(ress)
+
+ with pytest.raises(StopIteration):
+ next(ress)
+
+ assert isinstance(disp, GraphExplanation)
+ assert len(disp.graphlets) == 4
+
+ assert len(disp.graphlets[0].graph["nodes"]) == 5
+ query = disp.graphlets[0].query.statement.replace('"', '')
+ procs = session.irgraph.get_variable("procs")
+ c1 = next(session.irgraph.predecessors(procs))
+ assert query == f"SELECT pid \nFROM (SELECT * \nFROM (SELECT * \nFROM {c1.id.hex}) AS procs \nWHERE name IN ('firefox.exe', 'chrome.exe')) AS p2"
+
+ assert len(disp.graphlets[1].graph["nodes"]) == 2
+ query = disp.graphlets[1].query.statement.replace('"', '')
+ nt = session.irgraph.get_variable("nt")
+ c2 = next(session.irgraph.predecessors(nt))
+ assert query == f"SELECT * \nFROM (SELECT * \nFROM {c2.id.hex}) AS nt"
+
+    # the current session.execute_to_generate() logic does not store results in
+    # the cache when they were evaluated by the cache itself; this behavior may
+    # change in the future
+ assert len(disp.graphlets[2].graph["nodes"]) == 2
+ query = disp.graphlets[2].query.statement.replace('"', '')
+ domain = session.irgraph.get_variable("domain")
+ c3 = next(session.irgraph.predecessors(domain))
+ assert query == f"SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain"
+
+ assert len(disp.graphlets[3].graph["nodes"]) == 12
+ query = disp.graphlets[3].query.statement.replace('"', '')
+ p2 = session.irgraph.get_variable("p2")
+ p2pa = next(session.irgraph.successors(p2))
+ assert query == f"SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain \nWHERE ip IN (SELECT destination \nFROM (SELECT * \nFROM {nt.id.hex}v \nWHERE pid IN (SELECT * \nFROM {p2pa.id.hex}v)) AS ntx)) AS d2"
+
+ df_ref = DataFrame([{"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"}])
+ assert df_ref.equals(df_res)
+
+ for db_file in extra_db:
+ os.remove(db_file)
diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py b/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py
deleted file mode 100644
index f932e879..00000000
--- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from kestrel_datasource_opensearch.interface import OpenSearchInterface
diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/__init__.py b/packages-nextgen/kestrel_datasource_opensearch/tests/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/packages-nextgen/kestrel_datasource_opensearch/pyproject.toml b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml
similarity index 96%
rename from packages-nextgen/kestrel_datasource_opensearch/pyproject.toml
rename to packages-nextgen/kestrel_interface_opensearch/pyproject.toml
index 6d5017a0..6270f6d0 100644
--- a/packages-nextgen/kestrel_datasource_opensearch/pyproject.toml
+++ b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools >= 68.2.2", "wheel"]
build-backend = "setuptools.build_meta"
[project]
-name = "kestrel_datasource_opensearch"
+name = "kestrel_interface_opensearch"
version = "2.0.0"
description = "Kestrel OpenSearch Datasource Interface"
readme = "README.rst"
diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py
new file mode 100644
index 00000000..3ee389ca
--- /dev/null
+++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py
@@ -0,0 +1 @@
+from kestrel_interface_opensearch.interface import OpenSearchInterface
diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py
similarity index 62%
rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py
rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py
index add15f4a..26d02ccf 100644
--- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py
+++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py
@@ -1,6 +1,6 @@
import logging
from dataclasses import dataclass, field
-from typing import Dict, Optional
+from typing import Dict, Mapping, Optional
import yaml
from mashumaro.mixins.json import DataClassJSONMixin
@@ -9,10 +9,8 @@
CONFIG_DIR_DEFAULT,
load_user_config,
)
-from kestrel.mapping.utils import (
- generate_from_ocsf_dictionaries,
- load_standard_config,
-)
+from kestrel.exceptions import InterfaceNotConfigured
+from kestrel.mapping.data_model import load_default_mapping
PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "opensearch.yaml"
@@ -42,22 +40,16 @@ class Index(DataClassJSONMixin):
connection: str
timestamp: str
timestamp_format: str
- data_model_mapping: Optional[str] = None
- data_model_map: dict = field(default_factory=dict)
+ data_model_mapping: Optional[str] = None # Filename for mapping
+ data_model_map: Mapping = field(default_factory=dict)
def __post_init__(self):
if self.data_model_mapping:
with open(self.data_model_mapping, "r") as fp:
- data_model_map = yaml.safe_load(fp)
- # Reverse it so it's ocsf -> native
- self.data_model_map = {
- v: k for k, v in data_model_map.items() if isinstance(v, str)
- }
+ self.data_model_map = yaml.safe_load(fp)
else:
# Default to the built-in ECS mapping
- load_standard_config("kestrel.mapping")
- _, data_model_map = generate_from_ocsf_dictionaries("ecs")
- self.data_model_map = {k: v[0] for k, v in data_model_map.items()}
+ self.data_model_map = load_default_mapping("ecs")
@dataclass
@@ -71,4 +63,7 @@ def __post_init__(self):
def load_config():
- return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT))
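+    # a missing config file leaves Config(**...) without its required fields,
+    # so it raises TypeError; report that to the caller as InterfaceNotConfigured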
+ try:
+ return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT))
+ except TypeError:
+ raise InterfaceNotConfigured()
diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py
similarity index 73%
rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py
rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py
index c1406abc..8c70eb95 100644
--- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py
+++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py
@@ -5,8 +5,9 @@
from opensearchpy import OpenSearch
from pandas import DataFrame, Series, concat
+from kestrel.display import GraphletExplanation
from kestrel.exceptions import DataSourceError
-from kestrel.interface.datasource.base import AbstractDataSourceInterface
+from kestrel.interface import AbstractInterface
from kestrel.ir.graph import IRGraphEvaluable
from kestrel.ir.instructions import (
DataSource,
@@ -18,9 +19,10 @@
TransformingInstruction,
SolePredecessorTransformingInstruction,
)
+from kestrel.mapping.data_model import translate_dataframe
-from kestrel_datasource_opensearch.config import load_config
-from kestrel_datasource_opensearch.ossql import OpenSearchTranslator
+from kestrel_interface_opensearch.config import load_config
+from kestrel_interface_opensearch.ossql import OpenSearchTranslator
_logger = logging.getLogger(__name__)
@@ -32,11 +34,12 @@ def _jdbc2df(schema: dict, datarows: dict) -> DataFrame:
return DataFrame(datarows, columns=columns)
-def read_sql(sql: str, conn: OpenSearch) -> DataFrame:
+def read_sql(sql: str, conn: OpenSearch, dmm: Optional[dict] = None) -> DataFrame:
"""Execute `sql` and return the results as a DataFrame, a la pandas.read_sql"""
# https://opensearch.org/docs/latest/search-plugins/sql/sql-ppl-api/#query-api
body = {
- "fetch_size": 10000, # Should we make this configurable?
+ # Temporarily comment out fetch_size due to https://github.com/opensearch-project/sql/issues/2579
+ # FIXME: "fetch_size": 10000, # Should we make this configurable?
"query": sql,
}
query_resp = conn.http.post("/_plugins/_sql?format=jdbc", body=body)
@@ -56,7 +59,12 @@ def read_sql(sql: str, conn: OpenSearch) -> DataFrame:
dfs = []
done = False
while not done:
- dfs.append(_jdbc2df(schema, query_resp["datarows"]))
+ df = _jdbc2df(schema, query_resp["datarows"])
+ if dmm is not None:
+ # Need to use Data Model Map to do results translation
+ dfs.append(translate_dataframe(df, dmm))
+ else:
+ dfs.append(df)
cursor = query_resp.get("cursor")
if not cursor:
break
@@ -68,7 +76,7 @@ def read_sql(sql: str, conn: OpenSearch) -> DataFrame:
return concat(dfs)
-class OpenSearchInterface(AbstractDataSourceInterface):
+class OpenSearchInterface(AbstractInterface):
def __init__(
self,
serialized_cache_catalog: Optional[str] = None,
@@ -89,9 +97,9 @@ def __init__(
)
self.conns[name] = client
- @property
- def name(self):
- return "opensearch"
+ @staticmethod
+ def schemes() -> Iterable[str]:
+ return ["opensearch"]
def store(
self,
@@ -111,7 +119,8 @@ def evaluate_graph(
for instruction in instructions_to_evaluate:
translator = self._evaluate_instruction_in_graph(graph, instruction)
# TODO: may catch error in case evaluation starts from incomplete SQL
- _logger.debug("SQL query generated: %s", translator.result())
+ sql = translator.result()
+ _logger.debug("SQL query generated: %s", sql)
ds = self.config.indexes[translator.table] # table == datasource
conn = self.config.connections[ds.connection]
client = OpenSearch(
@@ -119,10 +128,28 @@ def evaluate_graph(
http_auth=(conn.auth.username, conn.auth.password),
verify_certs=conn.verify_certs,
)
- mapping[instruction.id] = read_sql(translator.result(), client)
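+            # pass the entity's data model map so read_sql can translate result
+            # columns from native field names back to OCSF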
+ mapping[instruction.id] = read_sql(
+ sql, client, translator.from_ocsf_map[translator.entity]
+ )
client.close()
return mapping
+ def explain_graph(
+ self,
+ graph: IRGraphEvaluable,
+ instructions_to_explain: Optional[Iterable[Instruction]] = None,
+ ) -> Mapping[UUID, GraphletExplanation]:
+ mapping = {}
+ if not instructions_to_explain:
+ instructions_to_explain = graph.get_sink_nodes()
+ for instruction in instructions_to_explain:
+ translator = self._evaluate_instruction_in_graph(graph, instruction)
+ dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction)
+ graph_dict = dep_graph.to_dict()
+ query_stmt = translator.result()
+ mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt)
+ return mapping
+
def _evaluate_instruction_in_graph(
self,
graph: IRGraphEvaluable,
@@ -175,7 +202,10 @@ def get_schema(self, index: str) -> dict:
client = self._get_client_for_index(index)
if index not in self.schemas:
df = read_sql(f"DESCRIBE TABLES LIKE {index}", client)
- self.schemas[index] = Series(
- df["TYPE_NAME"], index=df["COLUMN_NAME"]
- ).to_dict()
+ self.schemas[index] = (
+ df[["TYPE_NAME", "COLUMN_NAME"]]
+ .set_index("COLUMN_NAME")
+ .T.to_dict("records")[0]
+ )
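+            # i.e., a plain {column_name: type_name} dict, same shape as before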
+ _logger.debug("%s schema:\n%s", index, self.schemas[index])
return self.schemas[index]
diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py
similarity index 79%
rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py
rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py
index 55976d23..018cd4c8 100644
--- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py
+++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py
@@ -9,7 +9,6 @@
BoolExp,
ExpOp,
FComparison,
- ListComparison,
ListOp,
MultiComp,
NumCompOp,
@@ -26,6 +25,10 @@
Sort,
SortDirection,
)
+from kestrel.mapping.data_model import (
+ translate_comparison_to_native,
+ translate_projection_to_native,
+)
_logger = logging.getLogger(__name__)
@@ -68,6 +71,16 @@ def _or(lhs: str, rhs: Value) -> str:
}
+def _format_value(value):
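+    # e.g., "abc" -> "'abc'" and ["a", "b"] -> ('a', 'b'), so that str(value)
+    # drops straight into the generated SQL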
+ if isinstance(value, str):
+ # Need to quote string values
+ value = f"'{value}'"
+ elif isinstance(value, list):
+ # SQL uses parens for lists
+ value = tuple(value)
+ return value
+
+
@typechecked
class OpenSearchTranslator:
def __init__(
@@ -102,23 +115,21 @@ def __init__(
@typechecked
def _render_comp(self, comp: FComparison) -> str:
- if isinstance(comp, StrComparison):
- # Need to quote string values
- value = f"'{comp.value}'"
- elif isinstance(comp, ListComparison):
- # SQL uses parens for lists
- value = tuple(comp.value)
- else:
- value = comp.value
- # Need to map OCSF filter field to native
- prefix = f"{self.entity}." if self.entity else ""
+ prefix = (
+ f"{self.entity}." if (self.entity and comp.field != self.timestamp) else ""
+ )
ocsf_field = f"{prefix}{comp.field}"
- field = self.from_ocsf_map.get(ocsf_field, comp.field)
- _logger.debug("Mapped field '%s' to '%s'", ocsf_field, field)
+ comps = translate_comparison_to_native(
+ self.from_ocsf_map, ocsf_field, comp.op, comp.value
+ )
try:
- result = f"{field} {comp2func[comp.op]} {value}"
+ comps = [f"{f} {comp2func[o]} {_format_value(v)}" for f, o, v in comps]
+ conj = " OR ".join(comps)
+ result = conj if len(comps) == 1 else f"({conj})"
except KeyError:
- raise UnsupportedOperatorError(comp.op.value)
+ raise UnsupportedOperatorError(
+ comp.op.value
+ ) # FIXME: need to report the mapped op, not the original
return result
@typechecked
@@ -177,24 +188,20 @@ def add_ProjectAttrs(self, proj: ProjectAttrs) -> None:
# Just save projection and compile it later
self.project = proj
- def _get_ocsf_cols(self):
- prefix = f"{self.entity}." if self.entity else ""
- if not self.project:
- ocsf_cols = [k for k in self.from_ocsf_map.keys() if k.startswith(prefix)]
- else:
- ocsf_cols = [f"{prefix}{col}" for col in self.project.attrs]
- _logger.debug("OCSF fields: %s", ocsf_cols)
- return ocsf_cols
-
def _render_proj(self):
- fields = {
- self.from_ocsf_map.get(col, col): col for col in self._get_ocsf_cols()
- }
- _logger.debug("Fields: %s", fields)
+ """Get a list of native cols to project with their OCSF equivalents as SQL aliases"""
+ projection = self.project.attrs if self.project else None
+ name_pairs = translate_projection_to_native(
+ self.from_ocsf_map, self.entity, projection
+ )
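+        # name_pairs are (native, ocsf) tuples, e.g. ("Image", "file.path")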
proj = [
- f"`{k}` AS `{v.partition('.')[2]}`" if "." in v else v
- for k, v in fields.items()
+ f"`{k}` AS `{v}`" if k != v else f"`{k}`"
+ for k, v in name_pairs
+ if k in self.schema # Ignore mapped attrs the index doesn't have
]
+ if not proj:
+            # If this is still empty, the attr projection presumably targets
+            # attrs outside the entity projection
+ proj = [f"`{attr}`" for attr in self.project.attrs]
_logger.debug("Set projection to %s", proj)
return proj
@@ -230,7 +237,7 @@ def result(self) -> str:
if where:
stages.append(f"WHERE {where}")
if self.order_by:
- stages.append(f"ORDER BY {self.order_by} {self.sort_dir}")
+ stages.append(f"ORDER BY {self.order_by} {self.sort_dir.value}")
if self.limit:
# https://opensearch.org/docs/latest/search-plugins/sql/sql/basic/#limit
if self.offset:
diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/result/__init__.py b/packages-nextgen/kestrel_interface_opensearch/tests/__init__.py
similarity index 100%
rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/result/__init__.py
rename to packages-nextgen/kestrel_interface_opensearch/tests/__init__.py
diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py
similarity index 97%
rename from packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py
rename to packages-nextgen/kestrel_interface_opensearch/tests/test_config.py
index 51964889..85241b71 100644
--- a/packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py
+++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py
@@ -2,7 +2,7 @@
import yaml
-from kestrel_datasource_opensearch.config import (
+from kestrel_interface_opensearch.config import (
PROFILE_PATH_ENV_VAR,
Connection,
load_config,
diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py
similarity index 74%
rename from packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py
rename to packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py
index d4e17eaf..838b57e2 100644
--- a/packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py
+++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py
@@ -1,7 +1,7 @@
from datetime import datetime
from dateutil import parser
-from kestrel_datasource_opensearch.ossql import OpenSearchTranslator
+from kestrel_interface_opensearch.ossql import OpenSearchTranslator
from kestrel.exceptions import UnsupportedOperatorError
from kestrel.ir.filter import (
BoolExp,
@@ -33,12 +33,28 @@
TIMEFMT = '%Y-%m-%dT%H:%M:%S.%fZ'
+# A much-simplified test mapping
data_model_map = {
- "process.cmd_line": "CommandLine",
- "process.file.path": "Image",
- "process.pid": "ProcessId",
- "actor.process.pid": "ParentProcessId",
+ "process": {
+ "cmd_line": "CommandLine",
+ "file": {
+ "path": "Image",
+ # "name": [
+ # {
+ # "native_field": "Image",
+ # "native_value": "basename",
+ # "ocsf_op": "LIKE",
+ # "ocsf_value": "endswith"
+ # }
+ # ]
+ },
+ "pid": "ProcessId",
+ "parent_process": {
+ "pid": "ParentProcessId",
+ },
+ },
}
+
schema = {
"CommandLine": "text",
"Image": "text",
@@ -68,10 +84,10 @@ def _remove_nl(s):
"SELECT {} FROM my_table WHERE foo >= 0 AND timestamp >= '2023-12-06T08:17:00.000000Z' AND timestamp < '2023-12-07T08:17:00.000000Z'"),
# Add a limit and projection
([Limit(3), ProjectAttrs(['foo', 'bar', 'baz']), Filter(StrComparison('foo', StrCompOp.EQ, 'abc'))],
- "SELECT foo, bar, baz FROM my_table WHERE foo = 'abc' LIMIT 3"),
+ "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"),
# Same as above but reverse order
([Filter(StrComparison('foo', StrCompOp.EQ, 'abc')), ProjectAttrs(['foo', 'bar', 'baz']), Limit(3)],
- "SELECT foo, bar, baz FROM my_table WHERE foo = 'abc' LIMIT 3"),
+ "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"),
([Filter(ListComparison('foo', ListOp.NIN, ['abc', 'def']))],
"SELECT {} FROM my_table WHERE foo NOT IN ('abc', 'def')"),
([Filter(MultiComp(ExpOp.OR, [IntComparison('foo', NumCompOp.EQ, 1), IntComparison('bar', NumCompOp.EQ, 1)]))],
@@ -86,9 +102,11 @@ def _remove_nl(s):
]
)
def test_opensearch_translator(iseq, sql):
- cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`'
- if ProjectEntity not in {type(i) for i in iseq}:
- cols += ', `ParentProcessId` AS `process.pid`'
+    if ProjectEntity in {type(i) for i in iseq}:
+        cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`, `ParentProcessId` AS `parent_process.pid`'
+    else:
+        cols = '`CommandLine` AS `process.cmd_line`, `Image` AS `process.file.path`, `ProcessId` AS `process.pid`, `ParentProcessId` AS `process.parent_process.pid`'
trans = OpenSearchTranslator(TIMEFMT, "timestamp", "my_table", data_model_map, schema)
for i in iseq:
trans.add_instruction(i)
diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml
new file mode 100644
index 00000000..c4309e70
--- /dev/null
+++ b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml
@@ -0,0 +1,35 @@
+[build-system]
+requires = ["setuptools >= 68.2.2", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "kestrel_interface_sqlalchemy"
+version = "2.0.0"
+description = "Kestrel SQLAlchemy Datasource Interface"
+readme = "README.rst"
+requires-python = ">=3.8"
+license = {text = "Apache 2.0 License"}
+maintainers = [
+ {name = "Xiaokui Shu", email = "xiaokui.shu@ibm.com"},
+ {name = "Paul Coccoli", email = "pcoccoli@us.ibm.com"},
+]
+keywords = [
+ "kestrel",
+ "cybersecurity",
+ "threat hunting",
+]
+classifiers = [
+ "Topic :: Security",
+ "Operating System :: OS Independent",
+ "Development Status :: 4 - Beta",
+ "Programming Language :: Python :: 3",
+]
+
+dependencies = [
+ "kestrel_core>=2.0.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/opencybersecurityalliance/kestrel-lang"
+Documentation = "https://kestrel.readthedocs.io/"
+Repository = "https://github.com/opencybersecurityalliance/kestrel-lang.git"
diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py
new file mode 100644
index 00000000..781df021
--- /dev/null
+++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py
@@ -0,0 +1 @@
+from kestrel_interface_sqlalchemy.interface import SQLAlchemyInterface
diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py
new file mode 100644
index 00000000..e9d148e4
--- /dev/null
+++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py
@@ -0,0 +1,58 @@
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, Mapping, Optional
+
+import yaml
+from mashumaro.mixins.json import DataClassJSONMixin
+
+from kestrel.config.utils import (
+ CONFIG_DIR_DEFAULT,
+ load_user_config,
+)
+from kestrel.exceptions import InterfaceNotConfigured
+from kestrel.mapping.data_model import load_default_mapping
+
+
+PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "sqlalchemy.yaml"
+PROFILE_PATH_ENV_VAR = "KESTREL_SQLALCHEMY_CONFIG"
+
+_logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Connection(DataClassJSONMixin):
+ url: str # SQLAlchemy "connection URL" or "connection string"
+
+
+@dataclass
+class Table(DataClassJSONMixin):
+ connection: str
+ timestamp: str
+ timestamp_format: str
+ data_model_mapping: Optional[str] = None # Filename for mapping
+ data_model_map: Mapping = field(default_factory=dict)
+
+ def __post_init__(self):
+ if self.data_model_mapping:
+ with open(self.data_model_mapping, "r") as fp:
+ self.data_model_map = yaml.safe_load(fp)
+ else:
+ # Default to the built-in ECS mapping
+ self.data_model_map = load_default_mapping("ecs") # FIXME: need a default?
+
+
+@dataclass
+class Config(DataClassJSONMixin):
+ connections: Dict[str, Connection]
+ tables: Dict[str, Table]
+
+ def __post_init__(self):
+ self.connections = {k: Connection(**v) for k, v in self.connections.items()}
+ self.tables = {k: Table(**v) for k, v in self.tables.items()}
+
+
+def load_config():
+ try:
+ return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT))
+ except TypeError:
+ raise InterfaceNotConfigured()
diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py
new file mode 100644
index 00000000..6197ab5e
--- /dev/null
+++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py
@@ -0,0 +1,268 @@
+import logging
+from functools import reduce
+from typing import Callable, Iterable, Mapping, Optional
+from uuid import UUID
+
+from pandas import DataFrame, read_sql
+import sqlalchemy
+from sqlalchemy import and_, column, or_
+from sqlalchemy.sql.elements import BooleanClauseList
+from sqlalchemy.sql.expression import ColumnClause
+from typeguard import typechecked
+
+from kestrel.display import GraphletExplanation
+from kestrel.interface import AbstractInterface
+from kestrel.interface.codegen.sql import SqlTranslator, comp2func
+from kestrel.ir.filter import (
+ BoolExp,
+ ExpOp,
+ FComparison,
+ MultiComp,
+ StrComparison,
+ StrCompOp,
+)
+from kestrel.ir.graph import IRGraphEvaluable
+from kestrel.ir.instructions import (
+ DataSource,
+ Filter,
+ Instruction,
+ ProjectAttrs,
+ ProjectEntity,
+ Return,
+ SolePredecessorTransformingInstruction,
+ SourceInstruction,
+ TransformingInstruction,
+ Variable,
+)
+from kestrel.mapping.data_model import (
+ translate_comparison_to_native,
+ translate_dataframe,
+ translate_projection_to_native,
+)
+
+from kestrel_interface_sqlalchemy.config import load_config
+
+
+_logger = logging.getLogger(__name__)
+
+
+@typechecked
+class SQLAlchemyTranslator(SqlTranslator):
+ def __init__(
+ self,
+ dialect: sqlalchemy.engine.default.DefaultDialect,
+ timefmt: Callable,
+ timestamp: str,
+ from_obj: sqlalchemy.FromClause,
+ dmm: dict,
+ ):
+ super().__init__(dialect, timefmt, timestamp, from_obj)
+ self.dmm = dmm
+ self.proj = None
+ self.entity_type = None
+
+ @typechecked
+ def _render_comp(self, comp: FComparison):
+ prefix = (
+ f"{self.entity_type}."
+ if (self.entity_type and comp.field != self.timestamp)
+ else ""
+ )
+ ocsf_field = f"{prefix}{comp.field}"
+ comps = translate_comparison_to_native(
+ self.dmm, ocsf_field, comp.op, comp.value
+ )
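+        # one OCSF comparison may fan out to several native comparisons;
+        # they are OR'd together below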
+ translated_comps = []
+        for field, op, value in comps:  # avoid shadowing the `comp` argument
+ col: ColumnClause = column(field)
+ if op == StrCompOp.NMATCHES:
+ tmp = ~comp2func[op](col, value)
+ else:
+ tmp = comp2func[op](col, value)
+ translated_comps.append(tmp)
+ return reduce(or_, translated_comps)
+
+ @typechecked
+ def _render_multi_comp(self, comps: MultiComp):
+ op = and_ if comps.op == ExpOp.AND else or_
+ return reduce(op, map(self._render_comp, comps.comps))
+
+ # This is copied verbatim from sql.py but we need to supply our own _render_comp
+ def _render_exp(self, exp: BoolExp) -> BooleanClauseList:
+ if isinstance(exp.lhs, BoolExp):
+ lhs = self._render_exp(exp.lhs)
+ elif isinstance(exp.lhs, MultiComp):
+ lhs = self._render_multi_comp(exp.lhs)
+ else:
+ lhs = self._render_comp(exp.lhs)
+ if isinstance(exp.rhs, BoolExp):
+ rhs = self._render_exp(exp.rhs)
+ elif isinstance(exp.rhs, MultiComp):
+ rhs = self._render_multi_comp(exp.rhs)
+ else:
+ rhs = self._render_comp(exp.rhs)
+ return and_(lhs, rhs) if exp.op == ExpOp.AND else or_(lhs, rhs)
+
+ @typechecked
+ def _add_filter(self) -> Optional[str]:
+ if not self.filt:
+ return None
+ filt = self.filt
+ if filt.timerange.start:
+ # Convert the timerange to the appropriate pair of comparisons
+ start_comp = StrComparison(
+ self.timestamp, ">=", self.timefmt(filt.timerange.start)
+ )
+ stop_comp = StrComparison(
+ self.timestamp, "<", self.timefmt(filt.timerange.stop)
+ )
+ # AND them together
+ time_exp = BoolExp(start_comp, ExpOp.AND, stop_comp)
+ # AND that with any existing filter expression
+ exp = BoolExp(filt.exp, ExpOp.AND, time_exp)
+ else:
+ exp = filt.exp
+ if isinstance(exp, BoolExp):
+ comp = self._render_exp(exp)
+ elif isinstance(exp, MultiComp):
+ comp = self._render_multi_comp(exp)
+ else:
+ comp = self._render_comp(exp)
+ self.query = self.query.where(comp)
+
+ def add_Filter(self, filt: Filter) -> None:
+ # Just save filter and compile it later
+ # Probably need the entity projection set first
+ self.filt = filt
+
+ def add_ProjectAttrs(self, proj: ProjectAttrs) -> None:
+ self.proj = proj
+
+ def add_ProjectEntity(self, proj: ProjectEntity) -> None:
+ self.entity_type = proj.entity_type
+
+ def result(self) -> sqlalchemy.Compiled:
+ proj = self.proj.attrs if self.proj else None
+ pairs = translate_projection_to_native(self.dmm, self.entity_type, proj)
+ cols = [sqlalchemy.column(i).label(j) for i, j in pairs]
+ self._add_filter()
+ self.query = self.query.with_only_columns(*cols) # TODO: mapping?
+ return self.query.compile(dialect=self.dialect)
+
+
+class SQLAlchemyInterface(AbstractInterface):
+ def __init__(
+ self,
+ serialized_cache_catalog: Optional[str] = None,
+ session_id: Optional[UUID] = None,
+ ):
+ _logger.debug("SQLAlchemyInterface: loading config")
+ super().__init__(serialized_cache_catalog, session_id)
+ self.config = load_config()
+ self.schemas: dict = {} # Schema per table (index)
+ self.engines: dict = {} # Map of conn name -> engine
+ self.conns: dict = {} # Map of conn name -> connection
+ for info in self.config.tables.values():
+ name = info.connection
+ conn_info = self.config.connections[name]
+ if name not in self.engines:
+ self.engines[name] = sqlalchemy.create_engine(conn_info.url)
+ if name not in self.conns:
+ engine = self.engines[name]
+ self.conns[name] = engine.connect()
+ _logger.debug("SQLAlchemyInterface: configured %s", name)
+
+ @staticmethod
+ def schemes() -> Iterable[str]:
+ return ["sqlalchemy"]
+
+ def store(
+ self,
+ instruction_id: UUID,
+ data: DataFrame,
+ ):
+ raise NotImplementedError("SQLAlchemyInterface.store") # TEMP
+
+ def evaluate_graph(
+ self,
+ graph: IRGraphEvaluable,
+ instructions_to_evaluate: Optional[Iterable[Instruction]] = None,
+ ) -> Mapping[UUID, DataFrame]:
+ mapping = {}
+ if not instructions_to_evaluate:
+ instructions_to_evaluate = graph.get_sink_nodes()
+ for instruction in instructions_to_evaluate:
+ translator = self._evaluate_instruction_in_graph(graph, instruction)
+ # TODO: may catch error in case evaluation starts from incomplete SQL
+ sql = translator.result()
+ _logger.debug("SQL query generated: %s", sql)
+ # Get the "from" table for this query
+ tables = translator.query.selectable.get_final_froms()
+ table = tables[0].name # TODO: what if there's more than 1?
+ # Get the data source's SQLAlchemy connection object
+ conn = self.conns[self.config.tables[table].connection]
+ df = read_sql(sql, conn)
+ dmm = translator.dmm[
+ translator.entity_type
+ ] # TODO: need a method for this?
+ mapping[instruction.id] = translate_dataframe(df, dmm)
+ return mapping
+
+ def explain_graph(
+ self,
+ graph: IRGraphEvaluable,
+ instructions_to_explain: Optional[Iterable[Instruction]] = None,
+ ) -> Mapping[UUID, GraphletExplanation]:
+ mapping = {}
+ if not instructions_to_explain:
+ instructions_to_explain = graph.get_sink_nodes()
+ for instruction in instructions_to_explain:
+ translator = self._evaluate_instruction_in_graph(graph, instruction)
+ dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction)
+ graph_dict = dep_graph.to_dict()
+ query_stmt = translator.result()
+ mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt)
+ return mapping
+
+ def _evaluate_instruction_in_graph(
+ self,
+ graph: IRGraphEvaluable,
+ instruction: Instruction,
+ ) -> SQLAlchemyTranslator:
+ _logger.debug("instruction: %s", str(instruction))
+ translator = None
+ if isinstance(instruction, TransformingInstruction):
+ trunk, _r2n = graph.get_trunk_n_branches(instruction)
+ translator = self._evaluate_instruction_in_graph(graph, trunk)
+
+ if isinstance(instruction, SolePredecessorTransformingInstruction):
+ if isinstance(instruction, Return):
+ pass
+ elif isinstance(instruction, Variable):
+ pass
+ else:
+ translator.add_instruction(instruction)
+
+ elif isinstance(instruction, Filter):
+ translator.add_instruction(instruction)
+
+ else:
+ raise NotImplementedError(f"Unknown instruction type: {instruction}")
+
+ elif isinstance(instruction, SourceInstruction):
+ if isinstance(instruction, DataSource):
+ ds = self.config.tables[instruction.datasource]
+ connection = ds.connection
+ dialect = self.engines[connection].dialect
+ translator = SQLAlchemyTranslator(
+ dialect,
+ lambda dt: dt.strftime(ds.timestamp_format),
+ ds.timestamp,
+ sqlalchemy.table(instruction.datasource),
+ ds.data_model_map,
+ )
+ else:
+ raise NotImplementedError(f"Unhandled instruction type: {instruction}")
+
+ return translator
diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py b/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py
new file mode 100644
index 00000000..a19d97a6
--- /dev/null
+++ b/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py
@@ -0,0 +1,42 @@
+import os
+
+import yaml
+
+from kestrel_interface_sqlalchemy.config import (
+ PROFILE_PATH_ENV_VAR,
+ Connection,
+ load_config,
+)
+
+
+def test_load_config(tmp_path):
+ config = {
+ "connections": {
+ "localhost": {
+ "url": "sqlite:////home/jdoe/test.db",
+ },
+ "some-data-lake": {
+ "url": "presto://jdoe@example.com:8889/hive",
+ }
+ },
+ "tables": {
+ "cloud_table": {
+ "connection": "some-data-lake",
+ "timestamp": "eventTime",
+ "timestamp_format": "%Y-%m-%d %H:%M:%S.%f",
+ "data_model_mapping": str(tmp_path / "mapping.yaml")
+ }
+ }
+ }
+ map_file = tmp_path / "mapping.yaml"
+ with open(map_file, 'w') as fp:
+ fp.write("some.field: other.field\n")
+ config_file = tmp_path / "sqlalchemy.yaml"
+ with open(config_file, 'w') as fp:
+ yaml.dump(config, fp)
+ os.environ[PROFILE_PATH_ENV_VAR] = str(config_file)
+ read_config = load_config()
+ conn: Connection = read_config.connections["localhost"]
+ assert conn.url == config["connections"]["localhost"]["url"]
+ assert read_config.connections["localhost"].url == config["connections"]["localhost"]["url"]
+ assert read_config.tables["cloud_table"].timestamp == config["tables"]["cloud_table"]["timestamp"]
diff --git a/packages-nextgen/kestrel_jupyter/pyproject.toml b/packages-nextgen/kestrel_jupyter/pyproject.toml
index 99bfea56..3cc31435 100644
--- a/packages-nextgen/kestrel_jupyter/pyproject.toml
+++ b/packages-nextgen/kestrel_jupyter/pyproject.toml
@@ -31,6 +31,9 @@ dependencies = [
"jupyterlab",
"jupyter_client",
"nbclassic",
+ "sqlparse==0.4.4",
+ "pygments==2.17.2",
+ "matplotlib==3.8.3",
]
[project.optional-dependencies]
diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py
new file mode 100644
index 00000000..21e10883
--- /dev/null
+++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py
@@ -0,0 +1,68 @@
+from pandas import DataFrame
+import tempfile
+import base64
+import sqlparse
+from typing import Iterable, Mapping
+from pygments import highlight
+from pygments.lexers import guess_lexer
+from pygments.lexers.sql import SqlLexer
+from pygments.lexers.kusto import KustoLexer
+from pygments.formatters import HtmlFormatter
+import networkx as nx
+import matplotlib.pyplot as plt
+
+from kestrel.display import Display, GraphExplanation
+from kestrel.ir.graph import IRGraph
+from kestrel.ir.instructions import Instruction, DataSource, Variable, Construct
+
+
+def gen_label_mapping(g: IRGraph) -> Mapping[Instruction, str]:
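+    # map each IR node to a short, human-readable label for the rendered graph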
+ d = {}
+ for n in g:
+ if isinstance(n, Variable):
+ d[n] = n.name
+ elif isinstance(n, Construct):
+ d[n] = n.id.hex[:4]
+ elif isinstance(n, DataSource):
+ d[n] = n.datasource
+ else:
+ d[n] = f"[{n.instruction.upper()}]"
+ return d
+
+
+def to_html_blocks(d: Display) -> Iterable[str]:
+ if isinstance(d, DataFrame):
+ yield d.to_html()
+ elif isinstance(d, GraphExplanation):
+ for graphlet in d.graphlets:
+ graph = IRGraph(graphlet.graph)
+ plt.figure(figsize=(4, 2))
+ nx.draw(
+ graph,
+ with_labels=True,
+ labels=gen_label_mapping(graph),
+ font_size=8,
+ node_size=260,
+ node_color="#bfdff5",
+ )
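+            # NOTE: delete_on_close requires Python 3.12+; older interpreters
+            # raise TypeError here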
+ with tempfile.NamedTemporaryFile(delete_on_close=False) as tf:
+ tf.close()
+ plt.savefig(tf.name, format="png")
+ with open(tf.name, "rb") as tfx:
+ data = tfx.read()
+
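+            # inline the PNG as a base64 data URI so the HTML block is self-contained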
+            img = base64.b64encode(data).decode("utf-8")
+            imgx = f'<img src="data:image/png;base64,{img}">'
+ yield imgx
+
+ query = graphlet.query.statement
+ if graphlet.query.language == "SQL":
+ lexer = SqlLexer()
+ query = sqlparse.format(query, reindent=True, keyword_case="upper")
+ elif graphlet.query.language == "KQL":
+ lexer = KustoLexer()
+ else:
+ lexer = guess_lexer(query)
+ query = highlight(query, lexer, HtmlFormatter())
+ style = ""
+ yield style + query
diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py
index 2d935317..456cde96 100644
--- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py
+++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py
@@ -1,7 +1,9 @@
from ipykernel.kernelbase import Kernel
import logging
+import networkx as nx
from kestrel.session import Session
+from kestrel_jupyter_kernel.display import to_html_blocks
_logger = logging.getLogger(__name__)
@@ -35,11 +37,14 @@ def do_execute(
if not silent:
try:
for result in self.kestrel_session.execute_to_generate(code):
- self.send_response(
- self.iopub_socket,
- "display_data",
- {"data": {"text/html": result.to_html()}, "metadata": {}},
- )
+ for html in to_html_blocks(result):
+ self.send_response(
+ self.iopub_socket,
+ "display_data",
+ {"data": {"text/html": html}, "metadata": {}},
+ )
+ # how to clear output (if needed in the future):
+ # self.send_response(self.iopub_socket, "clear_output")
except Exception as e:
_logger.error("Exception occurred", exc_info=True)
diff --git a/packages/kestrel_analytics_docker/pyproject.toml b/packages/kestrel_analytics_docker/pyproject.toml
index 1f668918..3b9c9283 100644
--- a/packages/kestrel_analytics_docker/pyproject.toml
+++ b/packages/kestrel_analytics_docker/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "kestrel_analytics_docker"
-version = "1.8.0"
+version = "1.8.1"
description = "Kestrel Docker Analytics Interface"
readme = "README.rst"
requires-python = ">=3.8"
@@ -28,7 +28,7 @@ classifiers = [
dependencies = [
"kestrel_core>=1.8.0",
- "docker>=6.1.3",
+ "docker>=7.0.0",
]
[project.urls]
diff --git a/packages/kestrel_core/pyproject.toml b/packages/kestrel_core/pyproject.toml
index e8fcfa87..6d38a007 100644
--- a/packages/kestrel_core/pyproject.toml
+++ b/packages/kestrel_core/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "kestrel_core"
-version = "1.8.1"
+version = "1.8.2"
description = "Kestrel Threat Hunting Language"
readme = "README.rst"
requires-python = ">=3.8"
@@ -30,13 +30,13 @@ classifiers = [
]
dependencies = [
- "typeguard>=4.1.5",
+ "typeguard>=4.2.1",
"pyyaml>=6.0.1",
- "lark>=1.1.7",
- "pandas>=2.0.3",
- "pyarrow>=13.0.0",
+ "lark>=1.1.9",
+ "pandas>=2.0.3", # last version supporting Python 3.8
+ "pyarrow>=15.0.2",
"tabulate>=0.9.0",
- "firepit>=2.3.32",
+ "firepit>=2.3.33",
]
[project.optional-dependencies]
diff --git a/packages/kestrel_core/src/kestrel/config.yaml b/packages/kestrel_core/src/kestrel/config.yaml
index 182ddfe9..2470f465 100644
--- a/packages/kestrel_core/src/kestrel/config.yaml
+++ b/packages/kestrel_core/src/kestrel/config.yaml
@@ -5,7 +5,7 @@ language:
default_datasource_schema: "stixshifter"
default_analytics_schema: "python"
-# how a Kestrel session is executed
+# Kestrel session execution
session:
cache_directory_prefix: "kestrel-session-" # under system temp directory
local_database_path: "local.db"
diff --git a/packages/kestrel_datasource_stixshifter/pyproject.toml b/packages/kestrel_datasource_stixshifter/pyproject.toml
index b4e4f830..05e831f7 100644
--- a/packages/kestrel_datasource_stixshifter/pyproject.toml
+++ b/packages/kestrel_datasource_stixshifter/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "kestrel_datasource_stixshifter"
-version = "1.8.1"
+version = "1.8.2"
description = "Kestrel STIX-shifter Datasource Interface"
readme = "README.rst"
requires-python = ">=3.8"
@@ -28,11 +28,11 @@ classifiers = [
dependencies = [
"kestrel_core>=1.8.1",
- "lxml>=4.9.3",
+ "lxml>=5.2.1",
"requests>=2.31.0",
- "nest-asyncio>=1.5.8",
- "stix-shifter==6.2.2",
- "stix-shifter-utils==6.2.2",
+ "nest-asyncio>=1.6.0",
+ "stix-shifter==7.0.6",
+ "stix-shifter-utils==7.0.6",
]
[project.optional-dependencies]
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py
index 27df919a..73eb8ff8 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py
@@ -22,6 +22,7 @@
SINGLE_BATCH_TIMEOUT = 60
COOL_DOWN_AFTER_TRANSMISSION = 0
ALLOW_DEV_CONNECTOR = False
+VERIFY_CERT = True
FAST_TRANSLATE_CONNECTORS = [] # Suggested: ["qradar", "elastic_ecs"]
@@ -175,6 +176,14 @@ def get_datasource_from_profiles(profile_name, profiles):
profile_name,
)
+ verify_cert = _extract_param_from_connection_config(
+ "verify_cert",
+ bool,
+ VERIFY_CERT,
+ connection,
+ profile_name,
+ )
+
return (
connector_name,
connection,
@@ -182,6 +191,7 @@ def get_datasource_from_profiles(profile_name, profiles):
retrieval_batch_size,
cool_down_after_transmission,
allow_dev_connector,
+ verify_cert,
)
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py
index 370eecea..d090f003 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py
@@ -12,8 +12,12 @@
_logger = logging.getLogger(__name__)
-XPATH_PYPI_PKG_HOME = "/html/body/main/div[4]/div/div/div[1]/div[2]/ul/li[1]/a/@href"
-XPATH_PYPI_PKG_SOURCE = "/html/body/main/div[4]/div/div/div[1]/div[2]/ul/li[2]/a/@href"
+XPATH_PYPI_PKG_HOME = [
+ f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[1]/a/@href" for i in range(5)
+]
+XPATH_PYPI_PKG_SOURCE = [
+ f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[2]/a/@href" for i in range(5)
+]
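+# XPath positions are 1-based, so the div[0] candidates never match; the range
+# simply probes the first few positions where PyPI may render the project links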
STIX_SHIFTER_HOMEPAGE = "https://github.com/opencybersecurityalliance/stix-shifter"
@@ -39,8 +43,16 @@ def verify_package_origin(connector_name, stixshifter_version, requests_verify=T
)
try:
- p_homepage = pypi_etree.xpath(XPATH_PYPI_PKG_HOME)[0]
- p_source = pypi_etree.xpath(XPATH_PYPI_PKG_SOURCE)[0]
+ p_homepage = [
+ urls
+ for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_HOME]
+ if urls
+ ][0][0]
+ p_source = [
+ urls
+ for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_SOURCE]
+ if urls
+ ][0][0]
except:
raise DataSourceError(
f'STIX-shifter connector for "{connector_name}" is not installed '
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py
index 487f7944..c3631f7a 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py
@@ -11,6 +11,9 @@
from kestrel_datasource_stixshifter.worker import STOP_SIGN
from kestrel_datasource_stixshifter.query import translate_query
from kestrel_datasource_stixshifter.worker.transmitter import Transmitter
+from kestrel_datasource_stixshifter.worker.utils import (
+ disable_cert_verification_on_transmission,
+)
from stix_shifter.stix_transmission import stix_transmission
@@ -26,6 +29,7 @@ def __init__(self, datasource_name):
self.retrieval_batch_size,
self.cool_down_after_transmission,
self.allow_dev_connector,
+ self.verify_cert,
) = get_datasource_from_profiles(datasource_name, self.profiles)
self.if_fast_translation = (
self.connector_name in self.kestrel_options["fast_translate"]
@@ -72,6 +76,9 @@ def diagnose_ping(self):
self.configuration_dict,
)
+ if not self.verify_cert:
+ disable_cert_verification_on_transmission(transmission)
+
result = transmission.ping()
print()
@@ -125,6 +132,7 @@ def diagnose_run_query_and_retrieval_result(self, stix_patterns, max_batch_cnt):
self.configuration_dict,
self.retrieval_batch_size,
self.cool_down_after_transmission,
+ self.verify_cert,
query,
result_queue,
max_batch_cnt * self.retrieval_batch_size,
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py
index a662ea4b..9435cebe 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py
@@ -26,9 +26,9 @@
connection:
host: elastic.securitylog.company.com
port: 9200
- selfSignedCert: false # this means do NOT check cert
indices: host101
options: # use any of this section when needed
+ verify_cert: false # allow invalid/expired/self-signed certificate
retrieval_batch_size: 10000 # set to 10000 to match default Elasticsearch page size; Kestrel default across connectors: 2000
single_batch_timeout: 120 # increase it if hit 60 seconds (Kestrel default) timeout error for each batch of retrieval
cool_down_after_transmission: 2 # seconds to cool down between data source API calls, required by some API such as sentinelone; Kestrel default: 0
@@ -127,11 +127,15 @@
"""
+import multiprocessing
from kestrel.datasource import AbstractDataSourceInterface
from kestrel_datasource_stixshifter.config import load_profiles
from kestrel_datasource_stixshifter.query import query_datasource
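+# force "spawn" so worker processes start from a clean interpreter state;
+# "fork" can inherit locks and open sockets from the parent and deadlock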
+multiprocessing.set_start_method("spawn", force=True)
+
+
class StixShifterInterface(AbstractDataSourceInterface):
@staticmethod
def schemes():
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py
index aeadfc83..cdb1a719 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py
@@ -22,6 +22,7 @@ def transmit(
retrieval_batch_size: int,
translators_count: int,
cool_down_after_transmission: int,
+ verify_cert: bool,
queries: list,
raw_records_queue: Queue,
limit: Optional[int],
@@ -34,6 +35,7 @@ def transmit(
retrieval_batch_size,
translators_count,
cool_down_after_transmission,
+ verify_cert,
queries,
raw_records_queue,
limit,
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py
index fa0d61e5..46b07b7f 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py
@@ -83,6 +83,7 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None):
retrieval_batch_size,
cool_down_after_transmission,
allow_dev_connector,
+ verify_cert,
) = map(
copy.deepcopy, get_datasource_from_profiles(profile, config["profiles"])
)
@@ -123,6 +124,7 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None):
retrieval_batch_size,
config["options"]["translation_workers_count"],
cool_down_after_transmission,
+ verify_cert,
dsl["queries"],
raw_records_queue,
profile_limit,
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py
index ca4cd1c0..31534781 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py
@@ -6,7 +6,11 @@
from stix_shifter.stix_transmission import stix_transmission
from kestrel_datasource_stixshifter.worker import STOP_SIGN
-from kestrel_datasource_stixshifter.worker.utils import TransmissionResult, WorkerLog
+from kestrel_datasource_stixshifter.worker.utils import (
+ TransmissionResult,
+ WorkerLog,
+ disable_cert_verification_on_transmission,
+)
@typechecked
@@ -19,6 +23,7 @@ def __init__(
retrieval_batch_size: int,
number_of_translators: int,
cool_down_after_transmission: int,
+ verify_cert: bool,
queries: list,
output_queue: Queue,
limit: Optional[int],
@@ -31,6 +36,7 @@ def __init__(
self.retrieval_batch_size = retrieval_batch_size
self.number_of_translators = number_of_translators
self.cool_down_after_transmission = cool_down_after_transmission
+ self.verify_cert = verify_cert
self.queries = queries
self.queue = output_queue
self.limit = limit
@@ -43,6 +49,7 @@ def run(self):
self.configuration_dict,
self.retrieval_batch_size,
self.cool_down_after_transmission,
+ self.verify_cert,
query,
self.queue,
self.limit,
@@ -65,6 +72,7 @@ def __init__(
configuration_dict: dict,
retrieval_batch_size: int,
cool_down_after_transmission: int,
+ verify_cert: bool,
query: str,
output_queue: Queue,
limit: Optional[int],
@@ -76,6 +84,7 @@ def __init__(
self.configuration_dict = configuration_dict
self.retrieval_batch_size = retrieval_batch_size
self.cool_down_after_transmission = cool_down_after_transmission
+ self.verify_cert = verify_cert
self.query = query
self.queue = output_queue
self.limit = limit
@@ -87,6 +96,11 @@ def run(self):
self.connection_dict,
self.configuration_dict,
)
+
+ # hack stix-shifter v7 to support "disable certificate verification"
+ if not self.verify_cert:
+ disable_cert_verification_on_transmission(self.transmission)
+
search_meta_result = self.transmission.query(self.query)
if search_meta_result["success"]:
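
The override is applied after StixTransmission has wired up its connector (so the underlying ssl context exists) but before the first query() goes out over the network. At the standard-library level, "disable certificate verification" means flipping two attributes on an ssl.SSLContext, and the order is significant:

import ssl

ctx = ssl.create_default_context()  # check_hostname=True, verify_mode=CERT_REQUIRED

# Order matters: check_hostname must be cleared first. Assigning
# verify_mode = ssl.CERT_NONE while check_hostname is still True raises
# ValueError, which is why the helper defined in utils.py below flips
# the attributes in exactly this sequence.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
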
diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py
index 9a8d00af..406b4570 100644
--- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py
+++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py
@@ -1,6 +1,8 @@
+import ssl
from typing import Optional, Union, List
from dataclasses import dataclass
from pandas import DataFrame
+from stix_shifter.stix_transmission.stix_transmission import StixTransmission
STOP_SIGN = "STOP"
@@ -30,3 +32,18 @@ class TranslationResult:
success: bool
data: Union[None, dict, DataFrame]
log: Optional[WorkerLog]
+
+
+def disable_cert_verification_on_transmission(trans: StixTransmission):
+ ot = trans.entry_point.transmission()
+
+ # currently all the following attributes point to the same object
+ # iterate through them in case stix-shifter code changes in the future
+ for attr in [
+ x
+ for x in dir(ot)
+ if x.startswith("_BaseEntryPoint__") and x.endswith("_connector")
+ ]:
+ c = getattr(ot, attr)
+ c.api_client.client.ssl_context.check_hostname = False
+ c.api_client.client.ssl_context.verify_mode = ssl.CERT_NONE
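
The attribute filter above relies on Python's private-name mangling: stix-shifter's BaseEntryPoint stores its connectors in double-underscore attributes, which the interpreter rewrites to _BaseEntryPoint__<name>, so scanning dir() for that prefix finds them all even if new connector slots are added. A self-contained illustration with a hypothetical class body:

class BaseEntryPoint:
    def setup(self):
        # a double-underscore attribute is name-mangled by the interpreter
        self.__demo_connector = object()  # stored as _BaseEntryPoint__demo_connector

ep = BaseEntryPoint()
ep.setup()
print([a for a in dir(ep)
       if a.startswith("_BaseEntryPoint__") and a.endswith("_connector")])
# -> ['_BaseEntryPoint__demo_connector']
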
diff --git a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py
index 89b62efa..610a513c 100644
--- a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py
+++ b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py
@@ -78,6 +78,7 @@ def test_yaml_profiles_refresh(tmp_path):
single_batch_timeout: 120
cool_down_after_transmission: 5
allow_dev_connector: True
+ verify_cert: false
dialects:
- beats
config:
@@ -106,7 +107,7 @@ def test_yaml_profiles_refresh(tmp_path):
ss_config = s.config["datasources"]["kestrel_datasource_stixshifter"]
ss_profiles = ss_config["profiles"]
- connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector = get_datasource_from_profiles("host101", ss_profiles)
+ connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles)
assert connector_name == "elastic_ecs"
assert configuration["auth"]["id"] == "profileA"
assert configuration["auth"]["api_key"] == "qwer"
@@ -114,6 +115,7 @@ def test_yaml_profiles_refresh(tmp_path):
assert connection["options"]["result_limit"] == 2000 * 2
assert retrieval_batch_size == 2000
assert cool_down_after_transmission == 0
+ assert verify_cert == True
with open(profile_file, "w") as pf:
pf.write(profileB)
@@ -122,7 +124,7 @@ def test_yaml_profiles_refresh(tmp_path):
# need to refresh the pointers since the dict is updated
ss_profiles = ss_config["profiles"]
- connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector = get_datasource_from_profiles("host101", ss_profiles)
+ connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles)
assert connector_name == "elastic_ecs"
assert configuration["auth"]["id"] == "profileB"
assert configuration["auth"]["api_key"] == "xxxxxx"
@@ -131,5 +133,6 @@ def test_yaml_profiles_refresh(tmp_path):
assert retrieval_batch_size == 10000
assert cool_down_after_transmission == 5
assert allow_dev_connector == True
+ assert verify_cert == False
del os.environ["KESTREL_STIXSHIFTER_CONFIG"]
diff --git a/packages/kestrel_jupyter/pyproject.toml b/packages/kestrel_jupyter/pyproject.toml
index 70887889..888a3cac 100644
--- a/packages/kestrel_jupyter/pyproject.toml
+++ b/packages/kestrel_jupyter/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "kestrel_jupyter"
-version = "1.8.2"
+version = "1.8.3"
description = "Kestrel Jupyter Kernel"
readme = "README.rst"
requires-python = ">=3.8"
@@ -26,11 +26,11 @@ classifiers = [
]
dependencies = [
- "kestrel_core==1.8.1",
+ "kestrel_core==1.8.2",
"kestrel_datasource_stixbundle==1.8.0",
- "kestrel_datasource_stixshifter==1.8.1",
+ "kestrel_datasource_stixshifter==1.8.2",
"kestrel_analytics_python==1.8.0",
- "kestrel_analytics_docker==1.8.0",
+ "kestrel_analytics_docker==1.8.1",
"jupyterlab-server",
"jupyterlab",
"jupyter_client",