From 6de9078b2c7b8c91bd0dbc576051043377fbb266 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 23 Feb 2024 16:45:08 -0500 Subject: [PATCH 01/61] add comment on how to clear output --- .../kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py index 2d935317..ba361f0d 100644 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py +++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py @@ -40,6 +40,8 @@ def do_execute( "display_data", {"data": {"text/html": result.to_html()}, "metadata": {}}, ) + # how to clear output (if needed in the future): + # self.send_response(self.iopub_socket, "clear_output") except Exception as e: _logger.error("Exception occurred", exc_info=True) From 8e74c2d09827d38894c2f2d3788e83b150392a9b Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 27 Feb 2024 10:41:12 -0500 Subject: [PATCH 02/61] add EXPLAIN in front-end --- .../src/kestrel/frontend/compile.py | 8 ++++++ .../src/kestrel/frontend/kestrel.lark | 5 +++- .../src/kestrel/ir/instructions.py | 5 ++++ .../kestrel_core/tests/test_parser.py | 26 +++++++++++++++++++ 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py index fcbab5b4..7e720d86 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py @@ -41,6 +41,7 @@ Return, Sort, Variable, + Explain, ) from kestrel.exceptions import IRGraphMissingNode @@ -371,3 +372,10 @@ def disp(self, args): graph, root = args[0] graph.add_node(Return(), root) return graph + + def explain(self, args): + graph = IRGraph() + reference = graph.add_node(Reference(args[0].value)) + explain = 
graph.add_node(Explain(), reference) + graph.add_node(Return(), explain) + return graph diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark b/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark index eda6958c..1e00bfc9 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/kestrel.lark @@ -28,10 +28,11 @@ assignment: VARIABLE "=" expression | sort ?command_no_result: apply + | explain + | describe | disp | info | save - | describe // // All commands @@ -61,6 +62,8 @@ save: "SAVE"i VARIABLE "TO"i stdpath describe: "DESCRIBE"i var_attr +explain: "EXPLAIN"i VARIABLE + // // Variable definition // diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py b/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py index 0d667ea3..8b1aa1e3 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py +++ b/packages-nextgen/kestrel_core/src/kestrel/ir/instructions.py @@ -168,6 +168,11 @@ class Reference(IntermediateInstruction): name: str +@dataclass(eq=False) +class Explain(SolePredecessorTransformingInstruction): + pass + + @dataclass(eq=False) class Limit(SolePredecessorTransformingInstruction): num: int diff --git a/packages-nextgen/kestrel_core/tests/test_parser.py b/packages-nextgen/kestrel_core/tests/test_parser.py index 14faa856..3e7310d5 100644 --- a/packages-nextgen/kestrel_core/tests/test_parser.py +++ b/packages-nextgen/kestrel_core/tests/test_parser.py @@ -1,5 +1,6 @@ import json import pytest +from collections import Counter from datetime import datetime, timedelta, timezone from kestrel.frontend.parser import parse_kestrel @@ -16,6 +17,8 @@ Reference, Sort, Variable, + Explain, + Return, ) @@ -265,3 +268,26 @@ def test_parser_disp_after_new(): assert (proj, limit) in graph.edges assert (limit, offset) in graph.edges assert (offset, ret) in graph.edges + + +def test_parser_explain_alone(): + stmt = "EXPLAIN abc" + 
graph = parse_kestrel(stmt) + assert len(graph) == 3 + assert len(graph.edges) == 2 + assert Counter(map(type, graph.nodes())) == Counter([Reference, Explain, Return]) + + +def test_parser_explain_dereferred(): + stmt = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +EXPLAIN proclist +""" + graph = parse_kestrel(stmt) + assert len(graph) == 4 + assert len(graph.edges) == 3 + assert Counter(map(type, graph.nodes())) == Counter([Construct, Variable, Explain, Return]) From e3e4d4f1f8bf63909858bf040f922eb08d7ecc4b Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Wed, 28 Feb 2024 10:31:11 -0500 Subject: [PATCH 03/61] implement EXPLAIN command --- .../kestrel_core/src/kestrel/cache/base.py | 29 ++++++++--- .../src/kestrel/cache/inmemory.py | 52 +++++++++++++++---- .../kestrel_core/src/kestrel/cache/sqlite.py | 41 ++++++++++++++- .../kestrel_core/src/kestrel/display.py | 25 +++++++++ .../src/kestrel/interface/datasource/base.py | 23 +++++++- .../kestrel_core/src/kestrel/ir/graph.py | 2 +- .../kestrel_core/src/kestrel/session.py | 41 +++++++++------ .../kestrel_core/tests/test_cache_inmemory.py | 35 +++++++++++++ .../kestrel_core/tests/test_cache_sqlite.py | 33 +++++++++++- .../kestrel_core/tests/test_session.py | 26 ++++++++++ .../interface.py | 17 ++++++ 11 files changed, 288 insertions(+), 36 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py index b4f5f101..d9cd0ea5 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py @@ -1,3 +1,4 @@ +from __future__ import annotations from pandas import DataFrame from typing import MutableMapping from uuid import UUID @@ -28,6 +29,8 @@ def __del__(self): def __getitem__(self, instruction_id: UUID) -> DataFrame: """Get the dataframe for 
the cached instruction + This method will automatically support `uuid in cache` + Parameters: instruction_id: id of the instruction @@ -57,16 +60,28 @@ def __delitem__(self, instruction_id: UUID): """ ... - def store(self, instruction_id: UUID, data: DataFrame): - self[instruction_id] = data + @abstractmethod + def get_virtual_copy(self) -> AbstractCache: + """Create a virtual cache object from this cache - def __contain__(self, instruction_id: UUID) -> bool: - """Whether the evaluated instruction is cached + This method needs to reimplement __getitem__, __setitem__, __delitem__ + to not actually hit the store media of the cache, e.g., SQLite. - Parameters: - instruction_id: id of the instruction + The virtual cache is useful for the implementation of the Explain() + instruction, pretending the dependent graphs are evaluated, so the + evaluation can continue towards the Return() instruction. + + Because Python invokes special methods from class methods, replacing + the __getitem__, __setitem__, and __delitem__ in the object does not + help. It is better to derive a subclass and replace __class__ of the + object to the subclass to correctly invoke the new set of __xitem__. + + https://docs.python.org/3/reference/datamodel.html#special-lookup """ - return instruction_id in self.cache_catalog + ... 
+ + def store(self, instruction_id: UUID, data: DataFrame): + self[instruction_id] = data def __iter__(self) -> UUID: """Return UUIDs of instructions cached diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py index e0527b9c..82f3f3fb 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py @@ -1,3 +1,4 @@ +from copy import copy from pandas import DataFrame from typeguard import typechecked from uuid import UUID @@ -6,13 +7,16 @@ MutableMapping, Optional, Iterable, + Any, ) from kestrel.cache.base import AbstractCache from kestrel.ir.graph import IRGraphEvaluable +from kestrel.display import GraphletExplanation from kestrel.ir.instructions import ( Instruction, Return, + Explain, Variable, Filter, SourceInstruction, @@ -44,7 +48,7 @@ def __getitem__(self, instruction_id: UUID) -> DataFrame: return self.cache[self.cache_catalog[instruction_id]] def __delitem__(self, instruction_id: UUID): - del self.cache[instruction_id] + del self.cache[self.cache_catalog[instruction_id]] del self.cache_catalog[instruction_id] def __setitem__( @@ -52,23 +56,41 @@ def __setitem__( instruction_id: UUID, data: DataFrame, ): - self.cache[instruction_id] = data - self.cache_catalog[instruction_id] = instruction_id + self.cache_catalog[instruction_id] = instruction_id.hex + self.cache[self.cache_catalog[instruction_id]] = data + + def get_virtual_copy(self) -> AbstractCache: + v = copy(self) + v.cache_catalog = copy(self.cache_catalog) + v.__class__ = InMemoryCacheVirtual + return v def evaluate_graph( self, graph: IRGraphEvaluable, instructions_to_evaluate: Optional[Iterable[Instruction]] = None, ) -> Mapping[UUID, DataFrame]: + mapping = {} if not instructions_to_evaluate: instructions_to_evaluate = graph.get_sink_nodes() + for instruction in instructions_to_evaluate: + df = self._evaluate_instruction_in_graph(graph, 
instruction) + self[instruction.id] = df + mapping[instruction.id] = df + return mapping + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: mapping = {} - for ins in instructions_to_evaluate: - df = self._evaluate_instruction_in_graph(graph, ins) - self[ins.id] = df - mapping[ins.id] = df - + if not instructions_to_evaluate: + instructions_to_evaluate = graph.get_sink_nodes() + for instruction in instructions_to_evaluate: + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + mapping[instruction.id] = GraphletExplanation(graph_dict, "") return mapping def _evaluate_instruction_in_graph( @@ -81,7 +103,7 @@ def _evaluate_instruction_in_graph( elif isinstance(instruction, TransformingInstruction): trunk, r2n = graph.get_trunk_n_branches(instruction) df = self._evaluate_instruction_in_graph(graph, trunk) - if isinstance(instruction, Return): + if isinstance(instruction, (Return, Explain)): pass elif isinstance(instruction, Variable): self[instruction.id] = df @@ -99,3 +121,15 @@ def _evaluate_instruction_in_graph( else: raise NotImplementedError(f"Unknown instruction type: {instruction}") return df + + +@typechecked +class InMemoryCacheVirtual(InMemoryCache): + def __getitem__(self, instruction_id: UUID) -> Any: + return self.cache_catalog[instruction_id] + + def __delitem__(self, instruction_id: UUID): + del self.cache_catalog[instruction_id] + + def __setitem__(self, instruction_id: UUID, data: Any): + self.cache_catalog[instruction_id] = "virtual" + instruction_id.hex diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index 545513a5..4cfeae4b 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -1,5 +1,6 @@ import logging -from typing 
import Iterable, Mapping, Optional, Union +from copy import copy +from typing import Iterable, Mapping, Optional, Union, Any from uuid import UUID import sqlalchemy @@ -10,10 +11,12 @@ from kestrel.cache.base import AbstractCache from kestrel.interface.datasource.codegen.sql import SqlTranslator from kestrel.ir.graph import IRGraphEvaluable +from kestrel.display import GraphletExplanation from kestrel.ir.instructions import ( Construct, Instruction, Return, + Explain, Variable, Filter, SourceInstruction, @@ -77,6 +80,12 @@ def __setitem__( self.cache_catalog[instruction_id] = table_name data.to_sql(table_name, con=self.connection, if_exists="replace", index=False) + def get_virtual_copy(self) -> AbstractCache: + v = copy(self) + v.cache_catalog = copy(self.cache_catalog) + v.__class__ = SqliteCacheVirtual + return v + def evaluate_graph( self, graph: IRGraphEvaluable, @@ -93,6 +102,22 @@ def evaluate_graph( mapping[instruction.id] = read_sql(translator.result(), self.connection) return mapping + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + mapping = {} + if not instructions_to_explain: + instructions_to_explain = graph.get_sink_nodes() + for instruction in instructions_to_explain: + translator = self._evaluate_instruction_in_graph(graph, instruction) + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + query_stmt = str(translator.result_w_literal_binds()) + mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt) + return mapping + def _evaluate_instruction_in_graph( self, graph: IRGraphEvaluable, @@ -118,7 +143,7 @@ def _evaluate_instruction_in_graph( translator = self._evaluate_instruction_in_graph(graph, trunk) if isinstance(instruction, SolePredecessorTransformingInstruction): - if isinstance(instruction, Return): + if isinstance(instruction, (Return, Explain)): pass elif 
isinstance(instruction, Variable): # start a new translator and use previous one as subquery @@ -147,3 +172,15 @@ def _evaluate_instruction_in_graph( raise NotImplementedError(f"Unknown instruction type: {instruction}") return translator + + +@typechecked +class SqliteCacheVirtual(SqliteCache): + def __getitem__(self, instruction_id: UUID) -> Any: + return self.cache_catalog[instruction_id] + + def __delitem__(self, instruction_id: UUID): + del self.cache_catalog[instruction_id] + + def __setitem__(self, instruction_id: UUID, data: Any): + self.cache_catalog[instruction_id] = "virtual" + instruction_id.hex diff --git a/packages-nextgen/kestrel_core/src/kestrel/display.py b/packages-nextgen/kestrel_core/src/kestrel/display.py index 49758f4d..c5aefeb4 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/display.py +++ b/packages-nextgen/kestrel_core/src/kestrel/display.py @@ -1 +1,26 @@ +from typing import List, Union +from dataclasses import dataclass +from mashumaro.mixins.json import DataClassJSONMixin +from pandas import DataFrame + + +@dataclass +class GraphletExplanation(DataClassJSONMixin): + # serialized IRGraph + graph: dict + # SQL/KQL query statement + query: str + + +@dataclass +class GraphExplanation(DataClassJSONMixin): + graphlets: List[GraphletExplanation] + + # Kestrel Display Object +Display = Union[ + str, + dict, + DataFrame, + GraphExplanation, +] diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py index 0e730d89..b838ecd6 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py @@ -9,6 +9,7 @@ Iterable, ) +from kestrel.display import GraphletExplanation from kestrel.ir.instructions import Instruction from kestrel.ir.graph import IRGraphEvaluable from kestrel.exceptions import ( @@ -97,7 +98,7 @@ def evaluate_graph( Parameters: - graph: 
The IRGraph with zero or one interface + graph: The evaluable IRGraph instructions_to_evaluate: instructions to evaluate and return; by default, it will be all Return instructions in the graph @@ -107,6 +108,26 @@ def evaluate_graph( """ ... + @abstractmethod + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + """Explain how to evaluate the IRGraph + + Parameters: + + graph: The evaluable IRGraph + + instructions_to_explain: instructions to explain and return; by default, it will be all Return instructions in the graph + + Returns: + + GraphletExplanation (a Kestrel Display object) for each instruction in instructions_to_explain. + """ + ... + def cache_catalog_to_json(self) -> str: """Serialize the cache catalog to a JSON string""" return json.dumps(self.cache_catalog) diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py index f948dff9..e5729cda 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py +++ b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py @@ -774,7 +774,7 @@ def _add_node(self, node: Instruction, deref: bool = True) -> Instruction: class IRGraphSimpleQuery(IRGraphEvaluable): """Simple Query IRGraph - A simple query IRGraph is an evaluatable IRGraph that + A simple query IRGraph is an evaluable IRGraph that 1. 
It contains one source node diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py index bbbe1ad4..91f0af44 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/session.py +++ b/packages-nextgen/kestrel_core/src/kestrel/session.py @@ -1,12 +1,12 @@ import logging from contextlib import AbstractContextManager -from typing import Iterable from uuid import UUID, uuid4 - -from pandas import DataFrame +from typing import Iterable from typeguard import typechecked +from kestrel.display import Display, GraphExplanation from kestrel.ir.graph import IRGraph +from kestrel.ir.instructions import Explain from kestrel.frontend.parser import parse_kestrel from kestrel.cache import AbstractCache, SqliteCache from kestrel.interface.datasource import AbstractDataSourceInterface @@ -34,7 +34,7 @@ def __init__(self): data_source_manager = DataSourceManager() self.interfaces.extend(data_source_manager.interfaces()) - def execute(self, huntflow_block: str) -> Iterable[DataFrame]: + def execute(self, huntflow_block: str) -> Iterable[Display]: """Execute a Kestrel huntflow block. Execute a Kestrel statement or multiple consecutive statements (a @@ -50,7 +50,7 @@ def execute(self, huntflow_block: str) -> Iterable[DataFrame]: """ return list(self.execute_to_generate(huntflow_block)) - def execute_to_generate(self, huntflow_block: str) -> Iterable[DataFrame]: + def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]: """Execute a Kestrel huntflow and put results in a generator. 
Parameters: @@ -60,23 +60,34 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[DataFrame]: Evaluated result per Return instruction """ - # TODO: return type generalization - irgraph_new = parse_kestrel(huntflow_block) self.irgraph.update(irgraph_new) for ret in irgraph_new.get_returns(): - ret_df = None - while ret_df is None: - for g in self.irgraph.find_dependent_subgraphs_of_node(ret, self.cache): + is_explain = isinstance(irgraph_new.get_trunk_n_branches(ret)[0], Explain) + is_complete = False + display = GraphExplanation([]) + cache = self.cache.get_virtual_copy() if is_explain else self.cache + while not is_complete: + for g in self.irgraph.find_dependent_subgraphs_of_node(ret, cache): interface = get_interface_by_name(g.interface, self.interfaces) - for iid, df in interface.evaluate_graph(g).items(): - if g.interface != self.cache.name: - self.cache[iid] = df + # intermediate result dictionary + ird = ( + interface.explain_graph(g) + if is_explain + else interface.evaluate_graph(g) + ) + for iid, _display in ird.items(): + if is_explain: + display.graphlets.append(_display) + else: + display = _display + if g.interface != cache.name: + cache[iid] = True if iid == ret.id: - ret_df = df + is_complete = True else: - yield ret_df + yield display def do_complete(self, huntflow_block: str, cursor_pos: int): """Kestrel code auto-completion. 
diff --git a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py b/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py index f750c38d..1a0bb9ca 100644 --- a/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py +++ b/packages-nextgen/kestrel_core/tests/test_cache_inmemory.py @@ -3,6 +3,7 @@ from uuid import uuid4 from kestrel.cache import InMemoryCache +from kestrel.cache.inmemory import InMemoryCacheVirtual from kestrel.ir.graph import IRGraph, IRGraphEvaluable from kestrel.frontend.parser import parse_kestrel @@ -84,3 +85,37 @@ def test_eval_filter_with_ref(): assert len(rets) == 1 df = mapping[rets[0].id] assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ] + +def test_get_virtual_copy(): + stmt = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' +""" + graph = IRGraphEvaluable(parse_kestrel(stmt)) + c = InMemoryCache() + mapping = c.evaluate_graph(graph) + v = c.get_virtual_copy() + new_entry = uuid4() + v[new_entry] = True + + # v[new_entry] calls the right method + assert isinstance(v, InMemoryCacheVirtual) + assert v[new_entry].startswith("virtual") + + # v[new_entry] does not hit v.cache + assert len(c.cache) == 2 + assert len(v.cache) == 2 + + # the two cache_catalog are different + assert new_entry not in c + assert new_entry in v + del v[new_entry] + assert new_entry not in v + for u in c: + del v[u] + assert len(v) == 0 + assert len(c) == 2 diff --git a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py index f5b99090..2dd71d51 100644 --- a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py +++ b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py @@ -1,7 +1,8 @@ from uuid import uuid4 from pandas import DataFrame -from 
kestrel.cache.sqlite import SqliteCache +from kestrel.cache import SqliteCache +from kestrel.cache.sqlite import SqliteCacheVirtual from kestrel.ir.graph import IRGraphEvaluable from kestrel.frontend.parser import parse_kestrel @@ -150,3 +151,33 @@ def test_eval_filter_with_ref(): assert len(rets) == 1 df = mapping[rets[0].id] assert df.to_dict("records") == [ {"name": "firefox.exe", "pid": 201} ] + +def test_get_virtual_copy(): + stmt = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +browsers = proclist WHERE name = 'firefox.exe' OR name = 'chrome.exe' +""" + graph = IRGraphEvaluable(parse_kestrel(stmt)) + c = SqliteCache() + mapping = c.evaluate_graph(graph) + v = c.get_virtual_copy() + new_entry = uuid4() + v[new_entry] = True + + # v[new_entry] calls the right method + assert isinstance(v, SqliteCacheVirtual) + assert v[new_entry].startswith("virtual") + + # the two cache_catalog are different + assert new_entry not in c + assert new_entry in v + del v[new_entry] + assert new_entry not in v + for u in c: + del v[u] + assert len(v) == 0 + assert len(c) == 1 diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index bcbfdeb0..263d86b2 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -2,6 +2,9 @@ from kestrel import Session from pandas import DataFrame +from kestrel.display import GraphExplanation +from kestrel.ir.instructions import Construct + def test_execute_in_cache(): hf = """ @@ -26,3 +29,26 @@ def test_execute_in_cache(): assert b2.equals(next(res)) with pytest.raises(StopIteration): next(res) + +def test_explain_in_cache(): + hf = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , 
{"name": "chrome.exe", "pid": 205} + ] +browsers = proclist WHERE name != "cmd.exe" +chrome = browsers WHERE pid = 205 +EXPLAIN chrome +""" + with Session() as session: + ress = session.execute_to_generate(hf) + res = next(ress) + assert isinstance(res, GraphExplanation) + assert len(res.graphlets) == 1 + ge = res.graphlets[0] + assert ge.graph == session.irgraph.to_dict() + construct = session.irgraph.get_nodes_by_type(Construct)[0] + assert ge.query == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM "{construct.id.hex}") AS anon_3 \nWHERE name != \'cmd.exe\') AS anon_2 \nWHERE pid = 205) AS anon_1' + with pytest.raises(StopIteration): + next(ress) diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py b/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py index c1406abc..29666511 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py +++ b/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py @@ -8,6 +8,7 @@ from kestrel.exceptions import DataSourceError from kestrel.interface.datasource.base import AbstractDataSourceInterface from kestrel.ir.graph import IRGraphEvaluable +from kestrel.display import GraphletExplanation from kestrel.ir.instructions import ( DataSource, Instruction, @@ -123,6 +124,22 @@ def evaluate_graph( client.close() return mapping + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + mapping = {} + if not instructions_to_explain: + instructions_to_explain = graph.get_sink_nodes() + for instruction in instructions_to_explain: + translator = self._evaluate_instruction_in_graph(graph, instruction) + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + query_stmt = 
translator.result() + mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt) + return mapping + def _evaluate_instruction_in_graph( self, graph: IRGraphEvaluable, From 137c7910886f4aaddc31076eae56023953ff64ff Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Wed, 28 Feb 2024 17:03:13 -0500 Subject: [PATCH 04/61] add Jupyter kernel support of EXPLAIN command --- .../kestrel_jupyter/pyproject.toml | 3 ++ .../src/kestrel_jupyter_kernel/display.py | 38 +++++++++++++++++++ .../src/kestrel_jupyter_kernel/kernel.py | 13 ++++--- 3 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py diff --git a/packages-nextgen/kestrel_jupyter/pyproject.toml b/packages-nextgen/kestrel_jupyter/pyproject.toml index 99bfea56..3cc31435 100644 --- a/packages-nextgen/kestrel_jupyter/pyproject.toml +++ b/packages-nextgen/kestrel_jupyter/pyproject.toml @@ -31,6 +31,9 @@ dependencies = [ "jupyterlab", "jupyter_client", "nbclassic", + "sqlparse==0.4.4", + "pygments==2.17.2", + "matplotlib==3.8.3", ] [project.optional-dependencies] diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py new file mode 100644 index 00000000..f93f5169 --- /dev/null +++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py @@ -0,0 +1,38 @@ +from pandas import DataFrame +import tempfile +import base64 +import sqlparse +from pygments import highlight +from pygments.lexers.sql import SqlLexer +from pygments.formatters import HtmlFormatter +import networkx as nx +import matplotlib.pyplot as plt + +from kestrel.display import Display, GraphExplanation +from kestrel.ir.graph import IRGraph + + +def to_html_blocks(d: Display) -> str: + if isinstance(d, DataFrame): + yield d.to_html() + elif isinstance(d, GraphExplanation): + for graphlet in d.graphlets: + graph = IRGraph(graphlet.graph) + plt.figure(figsize=(4, 
2)) + nx.draw(graph) + with tempfile.NamedTemporaryFile(delete_on_close=False) as tf: + tf.close() + plt.savefig(tf.name, format="png") + with open(tf.name, "rb") as tfx: + data = tfx.read() + + img = data_uri = base64.b64encode(data).decode("utf-8") + imgx = f'' + yield imgx + + query_indented = sqlparse.format( + graphlet.query, reindent=True, keyword_case="upper" + ) + query = highlight(query_indented, SqlLexer(), HtmlFormatter()) + style = "" + yield style + query diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py index ba361f0d..456cde96 100644 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py +++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/kernel.py @@ -1,7 +1,9 @@ from ipykernel.kernelbase import Kernel import logging +import networkx as nx from kestrel.session import Session +from kestrel_jupyter_kernel.display import to_html_blocks _logger = logging.getLogger(__name__) @@ -35,11 +37,12 @@ def do_execute( if not silent: try: for result in self.kestrel_session.execute_to_generate(code): - self.send_response( - self.iopub_socket, - "display_data", - {"data": {"text/html": result.to_html()}, "metadata": {}}, - ) + for html in to_html_blocks(result): + self.send_response( + self.iopub_socket, + "display_data", + {"data": {"text/html": html}, "metadata": {}}, + ) # how to clear output (if needed in the future): # self.send_response(self.iopub_socket, "clear_output") From f22cb703d20dcef70bbdafec32adcebc394c3d2b Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 29 Feb 2024 22:23:00 -0500 Subject: [PATCH 05/61] improve Display object and Jupyter rendering --- .../src/kestrel/cache/inmemory.py | 5 +-- .../kestrel_core/src/kestrel/cache/sqlite.py | 6 ++-- .../kestrel_core/src/kestrel/display.py | 16 ++++++--- .../kestrel_core/tests/test_session.py | 3 +- .../src/kestrel_jupyter_kernel/display.py | 35 
+++++++++++++++---- 5 files changed, 49 insertions(+), 16 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py index 82f3f3fb..b96e00bd 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py @@ -12,7 +12,7 @@ from kestrel.cache.base import AbstractCache from kestrel.ir.graph import IRGraphEvaluable -from kestrel.display import GraphletExplanation +from kestrel.display import GraphletExplanation, NativeQuery from kestrel.ir.instructions import ( Instruction, Return, @@ -90,7 +90,8 @@ def explain_graph( for instruction in instructions_to_evaluate: dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) graph_dict = dep_graph.to_dict() - mapping[instruction.id] = GraphletExplanation(graph_dict, "") + query = NativeQuery("DataFrame", "") + mapping[instruction.id] = GraphletExplanation(graph_dict, query) return mapping def _evaluate_instruction_in_graph( diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index 4cfeae4b..0001e0f4 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -11,7 +11,7 @@ from kestrel.cache.base import AbstractCache from kestrel.interface.datasource.codegen.sql import SqlTranslator from kestrel.ir.graph import IRGraphEvaluable -from kestrel.display import GraphletExplanation +from kestrel.display import GraphletExplanation, NativeQuery from kestrel.ir.instructions import ( Construct, Instruction, @@ -114,8 +114,8 @@ def explain_graph( translator = self._evaluate_instruction_in_graph(graph, instruction) dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) graph_dict = dep_graph.to_dict() - query_stmt = str(translator.result_w_literal_binds()) - mapping[instruction.id] = 
GraphletExplanation(graph_dict, query_stmt) + query = NativeQuery("SQL", str(translator.result_w_literal_binds())) + mapping[instruction.id] = GraphletExplanation(graph_dict, query) return mapping def _evaluate_instruction_in_graph( diff --git a/packages-nextgen/kestrel_core/src/kestrel/display.py b/packages-nextgen/kestrel_core/src/kestrel/display.py index c5aefeb4..e6729f85 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/display.py +++ b/packages-nextgen/kestrel_core/src/kestrel/display.py @@ -1,15 +1,23 @@ -from typing import List, Union +from typing import List, Union, Mapping from dataclasses import dataclass from mashumaro.mixins.json import DataClassJSONMixin from pandas import DataFrame +@dataclass +class NativeQuery(DataClassJSONMixin): + # which query language + language: str + # what query statement + statement: str + + @dataclass class GraphletExplanation(DataClassJSONMixin): # serialized IRGraph - graph: dict - # SQL/KQL query statement - query: str + graph: Mapping + # data source query + query: NativeQuery @dataclass diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index 263d86b2..b4c0b47b 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -49,6 +49,7 @@ def test_explain_in_cache(): ge = res.graphlets[0] assert ge.graph == session.irgraph.to_dict() construct = session.irgraph.get_nodes_by_type(Construct)[0] - assert ge.query == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM "{construct.id.hex}") AS anon_3 \nWHERE name != \'cmd.exe\') AS anon_2 \nWHERE pid = 205) AS anon_1' + assert ge.query.language == "SQL" + assert ge.query.statement == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM "{construct.id.hex}") AS anon_3 \nWHERE name != \'cmd.exe\') AS anon_2 \nWHERE pid = 205) AS anon_1' with pytest.raises(StopIteration): next(ress) diff --git 
a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py index f93f5169..ef161029 100644 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py +++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py @@ -2,24 +2,42 @@ import tempfile import base64 import sqlparse +from typing import Iterable, Mapping from pygments import highlight +from pygments.lexers import guess_lexer from pygments.lexers.sql import SqlLexer +from pygments.lexers.kusto import KustoLexer from pygments.formatters import HtmlFormatter import networkx as nx import matplotlib.pyplot as plt from kestrel.display import Display, GraphExplanation from kestrel.ir.graph import IRGraph +from kestrel.ir.instructions import Instruction, DataSource, Variable -def to_html_blocks(d: Display) -> str: +def gen_label_mapping(g: IRGraph) -> Mapping[Instruction, str]: + d = {} + for n in g: + if isinstance(n, Variable): + d[n] = n.name + elif isinstance(n, DataSource): + d[n] = n.datasource + else: + d[n] = f"[{n.instruction.upper()}]" + return d + + +def to_html_blocks(d: Display) -> Iterable[str]: if isinstance(d, DataFrame): yield d.to_html() elif isinstance(d, GraphExplanation): for graphlet in d.graphlets: graph = IRGraph(graphlet.graph) plt.figure(figsize=(4, 2)) - nx.draw(graph) + nx.draw( + graph, with_labels=True, labels=gen_label_mapping(graph), font_size=8 + ) with tempfile.NamedTemporaryFile(delete_on_close=False) as tf: tf.close() plt.savefig(tf.name, format="png") @@ -30,9 +48,14 @@ def to_html_blocks(d: Display) -> str: imgx = f'' yield imgx - query_indented = sqlparse.format( - graphlet.query, reindent=True, keyword_case="upper" - ) - query = highlight(query_indented, SqlLexer(), HtmlFormatter()) + query = graphlet.query.statement + if graphlet.query.language == "SQL": + lexer = SqlLexer() + query = sqlparse.format(query, reindent=True, keyword_case="upper") + elif 
graphlet.query.language == "KQL": + lexer = KustoLexer() + else: + lexer = guess_lexer(query) + query = highlight(query, lexer, HtmlFormatter()) style = "" yield style + query From 94545092f9a1229089fb0ddb478e6555b1d8f535 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 29 Feb 2024 22:28:11 -0500 Subject: [PATCH 06/61] fix test --- packages-nextgen/kestrel_core/tests/test_session.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index b4c0b47b..d366b785 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -50,6 +50,7 @@ def test_explain_in_cache(): assert ge.graph == session.irgraph.to_dict() construct = session.irgraph.get_nodes_by_type(Construct)[0] assert ge.query.language == "SQL" - assert ge.query.statement == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM "{construct.id.hex}") AS anon_3 \nWHERE name != \'cmd.exe\') AS anon_2 \nWHERE pid = 205) AS anon_1' + stmt = ge.query.statement.replace('"', '') + assert stmt == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {construct.id.hex}) AS anon_3 \nWHERE name != \'cmd.exe\') AS anon_2 \nWHERE pid = 205) AS anon_1' with pytest.raises(StopIteration): next(ress) From 9ee76c2007f66df0267bb8c0eeccc23e50b60415 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 29 Feb 2024 22:55:38 -0500 Subject: [PATCH 07/61] add comment --- packages-nextgen/kestrel_core/tests/test_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index d366b785..961738af 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -50,7 +50,7 @@ def test_explain_in_cache(): assert ge.graph == 
session.irgraph.to_dict() construct = session.irgraph.get_nodes_by_type(Construct)[0] assert ge.query.language == "SQL" - stmt = ge.query.statement.replace('"', '') + stmt = ge.query.statement.replace('"', '') # macOS Python 3.8 does not generate double quotes assert stmt == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {construct.id.hex}) AS anon_3 \nWHERE name != \'cmd.exe\') AS anon_2 \nWHERE pid = 205) AS anon_1' with pytest.raises(StopIteration): next(ress) From 9944541b377a518068b4372de08e4237820aae67 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 1 Mar 2024 17:05:28 -0500 Subject: [PATCH 08/61] improved SQL in SqliteCache --- .../kestrel_core/src/kestrel/cache/sqlite.py | 4 +++- .../src/kestrel_jupyter_kernel/display.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index 0001e0f4..39afaee6 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -31,12 +31,13 @@ class SqliteTranslator(SqlTranslator): def __init__(self, from_obj: Union[SqlTranslator, str]): if isinstance(from_obj, SqlTranslator): - fc = from_obj.query.subquery() + fc = from_obj.query.subquery(name=from_obj.associated_variable) else: # str to represent table name fc = sqlalchemy.table(from_obj) super().__init__( sqlalchemy.dialects.sqlite.dialect(), dt_parser, "time", fc ) # FIXME: need mapping for timestamp? 
+ self.associated_variable = None @typechecked @@ -149,6 +150,7 @@ def _evaluate_instruction_in_graph( # start a new translator and use previous one as subquery # this allows using the variable as a dependent node # if the variable is a sink, `SELECT * FROM (subquery)` also works + translator.associated_variable = instruction.name translator = SqliteTranslator(translator) else: translator.add_instruction(instruction) diff --git a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py index ef161029..21e10883 100644 --- a/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py +++ b/packages-nextgen/kestrel_jupyter/src/kestrel_jupyter_kernel/display.py @@ -13,7 +13,7 @@ from kestrel.display import Display, GraphExplanation from kestrel.ir.graph import IRGraph -from kestrel.ir.instructions import Instruction, DataSource, Variable +from kestrel.ir.instructions import Instruction, DataSource, Variable, Construct def gen_label_mapping(g: IRGraph) -> Mapping[Instruction, str]: @@ -21,6 +21,8 @@ def gen_label_mapping(g: IRGraph) -> Mapping[Instruction, str]: for n in g: if isinstance(n, Variable): d[n] = n.name + elif isinstance(n, Construct): + d[n] = n.id.hex[:4] elif isinstance(n, DataSource): d[n] = n.datasource else: @@ -36,7 +38,12 @@ def to_html_blocks(d: Display) -> Iterable[str]: graph = IRGraph(graphlet.graph) plt.figure(figsize=(4, 2)) nx.draw( - graph, with_labels=True, labels=gen_label_mapping(graph), font_size=8 + graph, + with_labels=True, + labels=gen_label_mapping(graph), + font_size=8, + node_size=260, + node_color="#bfdff5", ) with tempfile.NamedTemporaryFile(delete_on_close=False) as tf: tf.close() From 8cca11ea2f970339ef07303e58b099b4bfb6f910 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Sat, 2 Mar 2024 18:17:45 -0500 Subject: [PATCH 09/61] add the multi_interface_explain session test --- .../kestrel_core/src/kestrel/cache/sqlite.py | 10 +- 
.../kestrel_core/src/kestrel/exceptions.py | 4 + .../kestrel_core/src/kestrel/ir/graph.py | 57 +++++++- .../kestrel_core/src/kestrel/session.py | 27 ++-- .../kestrel_core/tests/test_cache_sqlite.py | 2 +- .../kestrel_core/tests/test_ir_graph.py | 10 +- .../kestrel_core/tests/test_session.py | 127 +++++++++++++++++- 7 files changed, 210 insertions(+), 27 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index 39afaee6..4360f62b 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -49,12 +49,12 @@ def __init__( ): super().__init__() - basename = self.session_id or "cache" - path = f"{basename}.db" + basename = session_id or "cache" + self.db_path = f"{basename}.db" # for an absolute file path, the three slashes are followed by the absolute path # for a relative path, it's also three slashes? - self.engine = sqlalchemy.create_engine(f"sqlite:///{path}") + self.engine = sqlalchemy.create_engine(f"sqlite:///{self.db_path}") self.connection = self.engine.connect() if initial_cache: @@ -112,9 +112,9 @@ def explain_graph( if not instructions_to_explain: instructions_to_explain = graph.get_sink_nodes() for instruction in instructions_to_explain: - translator = self._evaluate_instruction_in_graph(graph, instruction) dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) graph_dict = dep_graph.to_dict() + translator = self._evaluate_instruction_in_graph(graph, instruction) query = NativeQuery("SQL", str(translator.result_w_literal_binds())) mapping[instruction.id] = GraphletExplanation(graph_dict, query) return mapping @@ -185,4 +185,4 @@ def __delitem__(self, instruction_id: UUID): del self.cache_catalog[instruction_id] def __setitem__(self, instruction_id: UUID, data: Any): - self.cache_catalog[instruction_id] = "virtual" + instruction_id.hex + self.cache_catalog[instruction_id] = 
instruction_id.hex + "v" diff --git a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py index ae278f9a..1032ff27 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py +++ b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py @@ -74,6 +74,10 @@ class DuplicatedReferenceInFilter(KestrelError): pass +class MissingReferenceInFilter(KestrelError): + pass + + class InvalidSerializedDatasourceInterfaceCacheCatalog(KestrelError): pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py index e5729cda..ddc41b7d 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py +++ b/packages-nextgen/kestrel_core/src/kestrel/ir/graph.py @@ -36,6 +36,7 @@ InevaluableInstruction, LargerThanOneIndegreeInstruction, DuplicatedReferenceInFilter, + MissingReferenceInFilter, DanglingReferenceInFilter, DanglingFilter, ) @@ -124,18 +125,34 @@ def add_edges_from( self.add_edge(u, v, deref) def copy(self): - """Copy the IRGraph with all nodes as reference (not deepcopy)""" + """Copy the IRGraph with all nodes as reference (not deepcopy) + + Support subclass of IRGraph to be copied. + """ g = IRGraph() g.update(self) + + # subclass support + if type(g) != type(self): + g = type(self)(g) + return g def deepcopy(self): - """Copy the IRGraph with all nodes copied as new objects""" + """Copy the IRGraph with all nodes copied as new objects + + Support subclass of IRGraph to be deep copied. 
+ """ g = IRGraph() o2n = {n: n.deepcopy() for n in self.nodes()} for u, v in self.edges(): g.add_edge(o2n[u], o2n[v]) g.add_nodes_from([o2n[n] for n in self.nodes() if self.degree(n) == 0]) + + # subclass support + if type(g) != type(self): + g = type(self)(g) + return g def get_node_by_id(self, ux: Union[UUID, str]) -> Instruction: @@ -372,6 +389,8 @@ def get_trunk_n_branches( ps = list(self.predecessors(node)) pps = [(p, pp) for p in self.predecessors(node) for pp in self.predecessors(p)] + # may need to add a patch in find_dependent_subgraphs_of_node() + # for each new case added in the if/elif, e.g., Filter if isinstance(node, SolePredecessorTransformingInstruction): if len(ps) > 1: raise LargerThanOneIndegreeInstruction() @@ -388,8 +407,10 @@ def get_trunk_n_branches( and p.attrs == [rv.attribute] and pp.name == rv.reference ] - if len(ppfs) > 1: - raise DuplicatedReferenceInFilter(ppfs) + if not ppfs: + raise MissingReferenceInFilter(rv, node, pps) + elif len(ppfs) > 1: + raise DuplicatedReferenceInFilter(rv, node, pps) else: p = ppfs[0][0] r2n[rv] = p @@ -536,10 +557,34 @@ def find_dependent_subgraphs_of_node( ps = set().union(*[set(g.predecessors(n)) for n in a2uns[interface]]) a2uns[interface].update(ps & cached_nodes) + # a patch (corner case handling) for get_trunk_n_branches() + # add Variable/Reference node if succeeded by ProjectAttrs and Filter, + # which are in the dependent graph; the Variable is only needed by + # get_trunk_n_branches() as an auxiliary node + for interface in a2uns: + auxs = [] + for n in a2uns[interface]: + if isinstance(n, ProjectAttrs): + # need to search in `self`, not `g`, since the boundary of + # `g` is cut by the cache + p = next(self.predecessors(n)) + s = next(g.successors(n)) + if ( + isinstance(s, Filter) + and isinstance(p, (Variable, Reference)) + and s in a2uns[interface] + ): + auxs.append(p) + a2uns[interface].update(auxs) + # remove dep graphs with only one node - # e.g., `ds://a` in "y = GET file FROM ds://a
WHERE x = v.x" when v.x not in cache + # e.g., `ds://a` in "y = GET file FROM ds://a WHERE x = v.x" + # when v.x not in cache dep_nodes = [ns for ns in a2uns.values() if len(ns) > 1] - dep_graphs = [IRGraphEvaluable(g.subgraph(ns)) for ns in dep_nodes] + # need to search in `self` due to the patch for get_trunk_n_branches() + dep_graphs = [ + IRGraphEvaluable(self.subgraph(ns)).deepcopy() for ns in dep_nodes + ] return dep_graphs diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py index 91f0af44..1e256184 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/session.py +++ b/packages-nextgen/kestrel_core/src/kestrel/session.py @@ -63,27 +63,38 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]: irgraph_new = parse_kestrel(huntflow_block) self.irgraph.update(irgraph_new) + # The current logic leads to caching results from non-cache and lastly + # evaluate in cache. + # TODO: may evaluate cache first, then push dependent variables to the + # last interface to eval; this requires priority of interfaces for ret in irgraph_new.get_returns(): - is_explain = isinstance(irgraph_new.get_trunk_n_branches(ret)[0], Explain) + pred = irgraph_new.get_trunk_n_branches(ret)[0] + is_explain = isinstance(pred, Explain) is_complete = False display = GraphExplanation([]) cache = self.cache.get_virtual_copy() if is_explain else self.cache + interfaces = ( + [ + cache if interface is self.cache else interface + for interface in self.interfaces + ] + if is_explain + else self.interfaces + ) while not is_complete: for g in self.irgraph.find_dependent_subgraphs_of_node(ret, cache): - interface = get_interface_by_name(g.interface, self.interfaces) - # intermediate result dictionary - ird = ( + interface = get_interface_by_name(g.interface, interfaces) + for iid, _display in ( interface.explain_graph(g) if is_explain else interface.evaluate_graph(g) - ) - for iid, _display in ird.items(): + 
).items(): if is_explain: display.graphlets.append(_display) else: display = _display - if g.interface != cache.name: - cache[iid] = True + if interface is not cache: + cache[iid] = display if iid == ret.id: is_complete = True else: diff --git a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py index 2dd71d51..5db07fb6 100644 --- a/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py +++ b/packages-nextgen/kestrel_core/tests/test_cache_sqlite.py @@ -170,7 +170,7 @@ def test_get_virtual_copy(): # v[new_entry] calls the right method assert isinstance(v, SqliteCacheVirtual) - assert v[new_entry].startswith("virtual") + assert v[new_entry].endswith("v") # the two cache_catalog are different assert new_entry not in c diff --git a/packages-nextgen/kestrel_core/tests/test_ir_graph.py b/packages-nextgen/kestrel_core/tests/test_ir_graph.py index 38fa0c1c..cd77da7d 100644 --- a/packages-nextgen/kestrel_core/tests/test_ir_graph.py +++ b/packages-nextgen/kestrel_core/tests/test_ir_graph.py @@ -332,22 +332,22 @@ def test_find_dependent_subgraphs_of_node(): assert len(c) == 2 gs = graph.find_dependent_subgraphs_of_node(ret, c) assert len(gs) == 1 - assert len(gs[0]) == 10 + assert len(gs[0]) == 11 assert p2 in gs[0] assert p21 in gs[0] assert p4 in gs[0] - assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Filter, Filter, Variable, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs]) + assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Filter, Filter, Variable, Variable, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs]) p4_projattr = next(graph.successors(p4)) c[p4_projattr.id] = DataFrame() gs = graph.find_dependent_subgraphs_of_node(ret, c) assert len(gs) == 1 - assert len(gs[0]) == 7 + assert len(gs[0]) == 8 assert p4_projattr.id in c assert p4_projattr in gs[0] assert p5 in gs[0] assert ret in gs[0] - assert Counter(map(type, 
gs[0].nodes())) == Counter([Filter, Return, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs]) + assert Counter(map(type, gs[0].nodes())) == Counter([Filter, Return, Variable, Variable, ProjectEntity, DataSource, ProjectAttrs, ProjectAttrs]) def test_find_simple_query_subgraphs(): @@ -400,7 +400,7 @@ def test_find_simple_query_subgraphs(): gs = graph.find_dependent_subgraphs_of_node(graph.get_returns()[0], c) assert len(gs) == 1 assert sink in gs[0] - assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Filter, ProjectAttrs, DataSource, Return, ProjectEntity]) + assert Counter(map(type, gs[0].nodes())) == Counter([Variable, Filter, ProjectAttrs, DataSource, Return, ProjectEntity, Variable]) for g in gs[0].find_simple_query_subgraphs(c): assert Counter(map(type, g.nodes())) == Counter([ProjectAttrs, Variable, Filter, ProjectEntity, DataSource]) assert sink in g diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index 961738af..98039637 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -1,9 +1,14 @@ import pytest +import os from kestrel import Session from pandas import DataFrame +from uuid import uuid4 from kestrel.display import GraphExplanation from kestrel.ir.instructions import Construct +from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER +from kestrel.frontend.parser import parse_kestrel +from kestrel.cache import SqliteCache def test_execute_in_cache(): @@ -30,6 +35,30 @@ def test_execute_in_cache(): with pytest.raises(StopIteration): next(res) + +def test_double_deref_in_cache(): + # When the Filter node is dereferenced twice + # The node should be deepcopied each time to avoid issues + hf = """ +proclist = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +px =
proclist WHERE name != "cmd.exe" AND pid = 205 +chrome = proclist WHERE pid IN px.pid +DISP chrome +DISP chrome +""" + df = DataFrame([ {"name": "chrome.exe", "pid": 205} ]) + with Session() as session: + res = session.execute_to_generate(hf) + assert df.equals(next(res)) + assert df.equals(next(res)) + with pytest.raises(StopIteration): + next(res) + + def test_explain_in_cache(): hf = """ proclist = NEW process [ {"name": "cmd.exe", "pid": 123} @@ -50,7 +79,101 @@ def test_explain_in_cache(): assert ge.graph == session.irgraph.to_dict() construct = session.irgraph.get_nodes_by_type(Construct)[0] assert ge.query.language == "SQL" - stmt = ge.query.statement.replace('"', '') # macOS Python 3.8 does not generate double quotes - assert stmt == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {construct.id.hex}) AS anon_3 \nWHERE name != \'cmd.exe\') AS anon_2 \nWHERE pid = 205) AS anon_1' + stmt = ge.query.statement.replace('"', '') + assert stmt == f'SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {construct.id.hex}v) AS proclist \nWHERE name != \'cmd.exe\') AS browsers \nWHERE pid = 205) AS chrome' + with pytest.raises(StopIteration): + next(ress) + + +def test_multi_interface_explain(): + extra_db = [] + with Session() as session: + stmt1 = """ +procs = NEW process [ {"name": "cmd.exe", "pid": 123} + , {"name": "explorer.exe", "pid": 99} + , {"name": "firefox.exe", "pid": 201} + , {"name": "chrome.exe", "pid": 205} + ] +DISP procs +""" + session.execute(stmt1) + class DataLake(SqliteCache): + @property + def name(self): + return "datalake" + session.interfaces[0].__class__ = DataLake + session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "datalake" + + new_cache = SqliteCache(session_id = uuid4()) + extra_db.append(new_cache.db_path) + session.cache = new_cache + session.interfaces.append(new_cache) + stmt2 = """ +nt = NEW network [ {"pid": 123, "source": 
"192.168.1.1", "destination": "1.1.1.1"} + , {"pid": 205, "source": "192.168.1.1", "destination": "1.1.1.2"} + ] +DISP nt +""" + session.execute(stmt2) + class Gateway(SqliteCache): + @property + def name(self): + return "gateway" + session.interfaces[1].__class__ = Gateway + session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "gateway" + + new_cache = SqliteCache(session_id = uuid4()) + extra_db.append(new_cache.db_path) + session.cache = new_cache + session.interfaces.append(new_cache) + stmt3 = """ +domain = NEW domain [ {"ip": "1.1.1.1", "domain": "cloudflare.com"} + , {"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"} + ] +DISP domain +""" + session.execute(stmt3) + + stmt = """ +p2 = procs WHERE name IN ("firefox.exe", "chrome.exe") +ntx = nt WHERE pid IN p2.pid +d2 = domain WHERE ip IN ntx.destination +EXPLAIN d2 +""" + ress = session.execute_to_generate(stmt) + disp = next(ress) with pytest.raises(StopIteration): next(ress) + + assert isinstance(disp, GraphExplanation) + assert len(disp.graphlets) == 4 + + assert len(disp.graphlets[0].graph["nodes"]) == 5 + query = disp.graphlets[0].query.statement.replace('"', '') + procs = session.irgraph.get_variable("procs") + c1 = next(session.irgraph.predecessors(procs)) + assert query == f"SELECT pid \nFROM (SELECT * \nFROM (SELECT * \nFROM {c1.id.hex}) AS procs \nWHERE name IN ('firefox.exe', 'chrome.exe')) AS p2" + + assert len(disp.graphlets[1].graph["nodes"]) == 2 + query = disp.graphlets[1].query.statement.replace('"', '') + nt = session.irgraph.get_variable("nt") + c2 = next(session.irgraph.predecessors(nt)) + assert query == f"SELECT * \nFROM (SELECT * \nFROM {c2.id.hex}) AS nt" + + # the current session.execute_to_generate() logic does not store + # in cache if evaluated by cache; the behavior may change in the future + assert len(disp.graphlets[2].graph["nodes"]) == 2 + query = disp.graphlets[2].query.statement.replace('"', '') + domain = 
session.irgraph.get_variable("domain") + c3 = next(session.irgraph.predecessors(domain)) + assert query == f"SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain" + + assert len(disp.graphlets[3].graph["nodes"]) == 12 + print(disp.graphlets[3].graph["nodes"]) + query = disp.graphlets[3].query.statement.replace('"', '') + p2 = session.irgraph.get_variable("p2") + p2pa = next(session.irgraph.successors(p2)) + assert query == f"SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain \nWHERE ip IN (SELECT destination \nFROM (SELECT * \nFROM {nt.id.hex}v \nWHERE pid IN (SELECT * \nFROM {p2pa.id.hex}v)) AS ntx)) AS d2" + + for db_file in extra_db: + os.remove(db_file) From 585ad57a48971714886064af42027a567e205979 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 4 Mar 2024 13:56:55 -0500 Subject: [PATCH 10/61] fix #483; better garbage collection behavior --- packages-nextgen/kestrel_core/src/kestrel/cache/base.py | 8 ++++++-- packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py | 6 ++++++ .../src/kestrel/interface/datasource/utils.py | 1 + packages-nextgen/kestrel_core/tests/test_session.py | 6 ++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py index d9cd0ea5..9565ce76 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py @@ -64,8 +64,8 @@ def __delitem__(self, instruction_id: UUID): def get_virtual_copy(self) -> AbstractCache: """Create a virtual cache object from this cache - This method needs to reimplement __getitem__, __setitem__, __delitem__ - to not actually hit the store media of the cache, e.g., SQLite. + This method needs to reimplement __del__, __getitem__, __setitem__, + __delitem__ to not actually hit the store media, e.g., SQLite. 
The virtual cache is useful for the implementation of the Explain() instruction, pretending the dependent graphs are evaluated, so the @@ -77,6 +77,10 @@ def get_virtual_copy(self) -> AbstractCache: object to the subclass to correctly invoke the new set of __xitem___. https://docs.python.org/3/reference/datamodel.html#special-lookup + + And Python garbage collector could clean up the virtual cache when + not in use, so the __del__ method should be reimplemented to make + sure the store media is not closed. """ ... diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index 4360f62b..8fe26672 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -3,6 +3,9 @@ from typing import Iterable, Mapping, Optional, Union, Any from uuid import UUID +import traceback +import sys + import sqlalchemy from dateutil.parser import parse as dt_parser from pandas import DataFrame, read_sql @@ -186,3 +189,6 @@ def __delitem__(self, instruction_id: UUID): def __setitem__(self, instruction_id: UUID, data: Any): self.cache_catalog[instruction_id] = instruction_id.hex + "v" + + def __del__(self): + pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py index 33a49975..eccea8f7 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py @@ -21,6 +21,7 @@ def get_interface_by_name( Returns: The interface found """ + ifs = filter(lambda x: x.name == interface_name, interfaces) try: interface = next(ifs) diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index 98039637..59473643 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ 
b/packages-nextgen/kestrel_core/tests/test_session.py @@ -139,9 +139,12 @@ def name(self): ntx = nt WHERE pid IN p2.pid d2 = domain WHERE ip IN ntx.destination EXPLAIN d2 +DISP d2 """ ress = session.execute_to_generate(stmt) disp = next(ress) + df_res = next(ress) + with pytest.raises(StopIteration): next(ress) @@ -175,5 +178,8 @@ def name(self): p2pa = next(session.irgraph.successors(p2)) assert query == f"SELECT * \nFROM (SELECT * \nFROM (SELECT * \nFROM {c3.id.hex}) AS domain \nWHERE ip IN (SELECT destination \nFROM (SELECT * \nFROM {nt.id.hex}v \nWHERE pid IN (SELECT * \nFROM {p2pa.id.hex}v)) AS ntx)) AS d2" + df_ref = DataFrame([{"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"}]) + assert df_ref.equals(df_res) + for db_file in extra_db: os.remove(db_file) From 4d6f8faa4bb53c94869efd84a10b965f7c7d2121 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 4 Mar 2024 13:58:22 -0500 Subject: [PATCH 11/61] remove unused imports --- packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index 8fe26672..da6c3604 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -3,9 +3,6 @@ from typing import Iterable, Mapping, Optional, Union, Any from uuid import UUID -import traceback -import sys - import sqlalchemy from dateutil.parser import parse as dt_parser from pandas import DataFrame, read_sql From 85c0817cd599d11b922546e23c75d9677dbc7b69 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 2024 11:10:39 -0500 Subject: [PATCH 12/61] upgrade interface.name to interface.schemes --- .../kestrel_core/src/kestrel/interface/datasource/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py 
b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py index b838ecd6..f40f7e9a 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py @@ -55,10 +55,10 @@ def __init__( @property @abstractmethod - def name(self) -> str: - """The name of the interface + def schemes(self) -> Iterable[str]: + """The schemes to specify the interface - The name should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*`` + Each scheme should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*`` """ ... From d30b1b072de918e5a6bbea27f311acefc2d32e39 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 2024 14:23:59 -0500 Subject: [PATCH 13/61] implement Interface Manager for Kestrel2 --- .../kestrel_core/src/kestrel/cache/base.py | 10 +- .../kestrel_core/src/kestrel/exceptions.py | 10 +- .../src/kestrel/interface/__init__.py | 2 + .../interface/{datasource => }/base.py | 7 +- .../{analytics => codegen}/__init__.py | 0 .../{datasource => }/codegen/dataframe.py | 0 .../interface/{datasource => }/codegen/kql.py | 0 .../interface/{datasource => }/codegen/sql.py | 0 .../kestrel/interface/datasource/__init__.py | 1 - .../kestrel/interface/datasource/manager.py | 21 --- .../datasource/translation/result/__init__.py | 0 .../src/kestrel/interface/datasource/utils.py | 38 ----- .../src/kestrel/interface/manager.py | 140 ++++++++++-------- .../codegen => translation}/__init__.py | 0 .../query}/__init__.py | 0 .../query => translation/result}/__init__.py | 0 .../kestrel_core/src/kestrel/session.py | 38 ++--- .../kestrel_core/tests/test_session.py | 29 ++-- .../interface.py | 8 +- 19 files changed, 123 insertions(+), 181 deletions(-) rename packages-nextgen/kestrel_core/src/kestrel/interface/{datasource => }/base.py (95%) rename packages-nextgen/kestrel_core/src/kestrel/interface/{analytics => codegen}/__init__.py (100%) rename 
packages-nextgen/kestrel_core/src/kestrel/interface/{datasource => }/codegen/dataframe.py (100%) rename packages-nextgen/kestrel_core/src/kestrel/interface/{datasource => }/codegen/kql.py (100%) rename packages-nextgen/kestrel_core/src/kestrel/interface/{datasource => }/codegen/sql.py (100%) delete mode 100644 packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py delete mode 100644 packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py delete mode 100644 packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/result/__init__.py delete mode 100644 packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py rename packages-nextgen/kestrel_core/src/kestrel/interface/{datasource/codegen => translation}/__init__.py (100%) rename packages-nextgen/kestrel_core/src/kestrel/interface/{datasource/translation => translation/query}/__init__.py (100%) rename packages-nextgen/kestrel_core/src/kestrel/interface/{datasource/translation/query => translation/result}/__init__.py (100%) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py index 9565ce76..42359aa8 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py @@ -5,20 +5,20 @@ from abc import abstractmethod from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER -from kestrel.interface.datasource import AbstractDataSourceInterface +from kestrel.interface import AbstractInterface -class AbstractCache(AbstractDataSourceInterface, MutableMapping): +class AbstractCache(AbstractInterface, MutableMapping): """Base class for Kestrel cache - Additional @abstractmethod from AbstractDataSourceInterface: + Additional @abstractmethod from AbstractInterface: - evaluate_graph() """ @property - def name(self): - return CACHE_INTERFACE_IDENTIFIER + def schemes(self): + return [CACHE_INTERFACE_IDENTIFIER] 
@abstractmethod def __del__(self): diff --git a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py index 1032ff27..a0c94a07 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py +++ b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py @@ -94,23 +94,19 @@ class InterfaceNotFound(KestrelError): pass -class InterfaceNameCollision(KestrelError): - pass - - class IRGraphMissingNode(KestrelError): pass -class DataSourceInterfaceNotFound(KestrelError): +class InterfaceNotFound(KestrelError): pass -class InvalidDataSourceInterfaceImplementation(KestrelError): +class InvalidInterfaceImplementation(KestrelError): pass -class ConflictingDataSourceInterfaceScheme(KestrelError): +class ConflictingInterfaceScheme(KestrelError): pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py index e69de29b..3c4b25e5 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/__init__.py @@ -0,0 +1,2 @@ +from kestrel.interface.base import AbstractInterface +from kestrel.interface.manager import InterfaceManager diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py similarity index 95% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/base.py index f40f7e9a..4cefaa60 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py @@ -17,11 +17,11 @@ ) -MODULE_PREFIX = "kestrel_datasource_" +MODULE_PREFIX = "kestrel_interface_" -class AbstractDataSourceInterface(ABC): - """Abstract class for datasource interface +class AbstractInterface(ABC): + """Abstract class for 
datasource/analytics interface Concepts: @@ -44,7 +44,6 @@ def __init__( session_id: Optional[UUID] = None, ): self.session_id = session_id - self.datasources: Mapping[str, str] = {} self.cache_catalog: MutableMapping[UUID, str] = {} if serialized_cache_catalog: diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/analytics/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/analytics/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/dataframe.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/dataframe.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/dataframe.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/kql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/kql.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/kql.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/sql.py b/packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/sql.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/codegen/sql.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py deleted file mode 100644 index bd74f728..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/__init__.py +++ /dev/null @@ 
-1 +0,0 @@ -from kestrel.interface.datasource.base import AbstractDataSourceInterface diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py deleted file mode 100644 index d6806715..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/manager.py +++ /dev/null @@ -1,21 +0,0 @@ -from kestrel.exceptions import ( - DataSourceInterfaceNotFound, - InvalidDataSourceInterfaceImplementation, - ConflictingDataSourceInterfaceScheme, -) -from kestrel.interface.manager import InterfaceManager -from kestrel.interface.datasource.base import ( - MODULE_PREFIX, - AbstractDataSourceInterface, -) - - -class DataSourceManager(InterfaceManager): - def __init__(self): - super().__init__( - MODULE_PREFIX, - AbstractDataSourceInterface, - DataSourceInterfaceNotFound, - InvalidDataSourceInterfaceImplementation, - ConflictingDataSourceInterfaceScheme, - ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/result/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/result/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py b/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py deleted file mode 100644 index eccea8f7..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/utils.py +++ /dev/null @@ -1,38 +0,0 @@ -from typing import Iterable -from typeguard import typechecked - -from kestrel.interface.datasource import AbstractDataSourceInterface -from kestrel.exceptions import ( - InterfaceNotFound, - InterfaceNameCollision, -) - - -@typechecked -def get_interface_by_name( - interface_name: str, interfaces: Iterable[AbstractDataSourceInterface] -): - """Find an interface by its name - - Parameters: - interface_name: the name of an interface - interfaces: 
the list of interfaces - - Returns: - The interface found - """ - - ifs = filter(lambda x: x.name == interface_name, interfaces) - try: - interface = next(ifs) - except StopIteration: - raise InterfaceNotFound(interface_name) - else: - try: - next(ifs) - except StopIteration: - # expected behavior - pass - else: - raise InterfaceNameCollision(interface_name) - return interface diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py index a66a1ce1..df9cabbc 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py @@ -1,94 +1,108 @@ -from abc import ABC - +from __future__ import annotations import importlib import pkgutil import logging import inspect import sys +import itertools +from copy import copy +from typeguard import typechecked +from typing import Mapping, Iterable -from kestrel.exceptions import KestrelError +from kestrel.exceptions import ( + InterfaceNotFound, + InvalidInterfaceImplementation, + ConflictingInterfaceScheme, +) +from kestrel.interface.base import MODULE_PREFIX, AbstractInterface +from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER _logger = logging.getLogger(__name__) -class InterfaceManager: - def __init__( - self, - module_name_prefix: str, - interface_class: ABC, - nonexist_interface_exception: KestrelError, - invalid_interface_exception: KestrelError, - conflict_interface_exception: KestrelError, - ): - self.scheme_to_interface: dict[str, ABC] = {} - self.nonexist_interface_exception = nonexist_interface_exception - - for iface_cls in _load_interfaces( - module_name_prefix, - interface_class, - invalid_interface_exception, - conflict_interface_exception, - ).values(): +# basically a scheme to interface mapping +@typechecked +class InterfaceManager(Mapping): + def __init__(self, init_interfaces: Iterable[AbstractInterface] = []): + interface_classes = 
_load_interface_classes() + self.interfaces = list(init_interfaces) # copy/recreate the list + for iface_cls in interface_classes: iface = iface_cls() - _logger.debug("Loading data source interface '%s' (%s)", iface.name, iface) - self.scheme_to_interface[iface.name] = iface - - def interfaces(self): - return list(self.scheme_to_interface.values()) - - def schemes(self): - return list(self.scheme_to_interface.keys()) - + _logger.debug(f"Initialize interface {iface.__name__}") + self.interfaces.append(iface) -def _load_interfaces( - module_name_prefix, - interface_class, - invalid_interface_exception, - conflict_interface_exception, -): - is_interface = _is_class(interface_class) - interface_names = _list_interfaces(module_name_prefix) - interfaces = {} - for interface_name in interface_names: - mod = importlib.import_module(interface_name) - _logger.debug("Imported %s from interface name %s", mod, interface_name) - cls = inspect.getmembers(sys.modules[interface_name], is_interface) + def __getitem__(self, scheme: str) -> AbstractInterface: + for interface in self.interfaces: + if scheme in interface.schemes: + return interface + else: + x = [i.__class__ for i in self.interfaces] + raise InterfaceNotFound(f"no interface loaded for scheme {scheme}; {x}") + + def __iter__(self) -> Iterable[str]: + return itertools.chain(*[i.schemes for i in self.interfaces]) + + def __len__(self) -> int: + return sum(1 for _ in iter(self)) + + def copy_with_virtual_cache(self) -> InterfaceManager: + im = copy(self) + # shallow copy refers to the same list, so create/copy a new one + im.interfaces = copy(im.interfaces) + # now swap in virtual cache + cache = im[CACHE_INTERFACE_IDENTIFIER] + im.interfaces.remove(cache) + im.interfaces.append(cache.get_virtual_copy()) + return im + + def del_cache(self): + cache = self[CACHE_INTERFACE_IDENTIFIER] + self.interfaces.remove(cache) + del cache + + +def _load_interface_classes(): + interface_clss = [] + for itf_pkg_name in 
_list_interface_pkg_names(): + mod = importlib.import_module(itf_pkg_name) + _logger.debug(f"Imported {mod} from package {itf_pkg_name}") + cls = inspect.getmembers( + sys.modules[itf_pkg_name], _is_class(AbstractInterface) + ) if not cls: - raise invalid_interface_exception( - f'no interface class found in "{interface_name}"' + raise InvalidInterfaceImplementation( + f'no interface class found in package "{itf_pkg_name}"' ) elif len(cls) > 1: - raise invalid_interface_exception( - f'more than one interface class found in "{interface_name}"' + raise InvalidInterfaceImplementation( + f'more than one interface class found in package "{itf_pkg_name}"' ) else: - interface = cls[0][1] - interface_conflict, scheme_conflict = _search_scheme_conflict( - interface, interfaces.values() - ) - if interface_conflict: - raise conflict_interface_exception( - interface, interface_conflict, scheme_conflict - ) - interfaces[interface_name] = interface - return interfaces + interface_cls = cls[0][1] + _guard_scheme_conflict(interface_cls, interface_clss) + interface_clss.append(interface_cls) + return interface_clss -def _list_interfaces(module_name_prefix): +def _list_interface_pkg_names(): pkg_names = [x.name for x in pkgutil.iter_modules()] - itf_names = [pkg for pkg in pkg_names if pkg.startswith(module_name_prefix)] - return list(itf_names) + itf_names = [pkg for pkg in pkg_names if pkg.startswith(MODULE_PREFIX)] + return itf_names def _is_class(cls): return lambda obj: inspect.isclass(obj) and obj.__bases__[0] == cls -def _search_scheme_conflict(new_interface, interfaces): +@typechecked +def _guard_scheme_conflict( + new_interface: AbstractInterface, interfaces: Iterable[AbstractInterface] +): for interface in interfaces: for scheme_new in new_interface.schemes(): for scheme_old in interface.schemes(): if scheme_new == scheme_old: - return interface, scheme_new - return None, None + raise ConflictingInterfaceScheme( + f"scheme: {scheme_new} conflicting between 
{new_interface.__name__} and {interface.__name__}" + ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/codegen/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/query/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/query/__init__.py b/packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py similarity index 100% rename from packages-nextgen/kestrel_core/src/kestrel/interface/datasource/translation/query/__init__.py rename to packages-nextgen/kestrel_core/src/kestrel/interface/translation/result/__init__.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py index 1e256184..2e912ca9 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/session.py +++ b/packages-nextgen/kestrel_core/src/kestrel/session.py @@ -9,9 +9,8 @@ from kestrel.ir.instructions import Explain from kestrel.frontend.parser import parse_kestrel from kestrel.cache import AbstractCache, SqliteCache -from kestrel.interface.datasource import AbstractDataSourceInterface -from kestrel.interface.datasource.manager import DataSourceManager -from kestrel.interface.datasource.utils import get_interface_by_name +from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER +from kestrel.interface import AbstractInterface, 
InterfaceManager _logger = logging.getLogger(__name__) @@ -22,17 +21,12 @@ class Session(AbstractContextManager): """Kestrel huntflow execution session""" def __init__(self): - self.session_id: UUID = uuid4() - self.irgraph: IRGraph = IRGraph() - self.cache: AbstractCache = SqliteCache() + self.session_id = uuid4() + self.irgraph = IRGraph() - # Datasource interfaces in this session - # Cache is a special datasource interface and should always be added - self.interfaces: Iterable[AbstractDataSourceInterface] = [self.cache] - - # Load data sources and add to list - data_source_manager = DataSourceManager() - self.interfaces.extend(data_source_manager.interfaces()) + # load all interfaces; cache is a special interface + cache = SqliteCache() + self.interface_manager = InterfaceManager([cache]) def execute(self, huntflow_block: str) -> Iterable[Display]: """Execute a Kestrel huntflow block. @@ -72,18 +66,15 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]: is_explain = isinstance(pred, Explain) is_complete = False display = GraphExplanation([]) - cache = self.cache.get_virtual_copy() if is_explain else self.cache - interfaces = ( - [ - cache if interface is self.cache else interface - for interface in self.interfaces - ] + interface_manager = ( + self.interface_manager.copy_with_virtual_cache() if is_explain - else self.interfaces + else self.interface_manager ) + cache = interface_manager[CACHE_INTERFACE_IDENTIFIER] while not is_complete: for g in self.irgraph.find_dependent_subgraphs_of_node(ret, cache): - interface = get_interface_by_name(g.interface, interfaces) + interface = interface_manager[g.interface] for iid, _display in ( interface.explain_graph(g) if is_explain @@ -119,9 +110,8 @@ def close(self): """ # Note there are two conditions that trigger this function, so it is probably executed twice # Be careful to write the logic in this function to avoid deleting nonexist files/dirs - if self.cache: - del self.cache - self.cache = None 
+ if CACHE_INTERFACE_IDENTIFIER in self.interface_manager: + self.interface_manager.del_cache() def __exit__(self, exception_type, exception_value, traceback): self.close() diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index 59473643..d4edbaf5 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -86,6 +86,17 @@ def test_explain_in_cache(): def test_multi_interface_explain(): + + class DataLake(SqliteCache): + @property + def schemes(self): + return ["datalake"] + + class Gateway(SqliteCache): + @property + def schemes(self): + return ["gateway"] + extra_db = [] with Session() as session: stmt1 = """ @@ -97,17 +108,12 @@ def test_multi_interface_explain(): DISP procs """ session.execute(stmt1) - class DataLake(SqliteCache): - @property - def name(self): - return "datalake" - session.interfaces[0].__class__ = DataLake + session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = DataLake session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "datalake" new_cache = SqliteCache(session_id = uuid4()) extra_db.append(new_cache.db_path) - session.cache = new_cache - session.interfaces.append(new_cache) + session.interface_manager.interfaces.append(new_cache) stmt2 = """ nt = NEW network [ {"pid": 123, "source": "192.168.1.1", "destination": "1.1.1.1"} , {"pid": 205, "source": "192.168.1.1", "destination": "1.1.1.2"} @@ -115,17 +121,12 @@ def name(self): DISP nt """ session.execute(stmt2) - class Gateway(SqliteCache): - @property - def name(self): - return "gateway" - session.interfaces[1].__class__ = Gateway + session.interface_manager[CACHE_INTERFACE_IDENTIFIER].__class__ = Gateway session.irgraph.get_nodes_by_type_and_attributes(Construct, {"interface": CACHE_INTERFACE_IDENTIFIER})[0].interface = "gateway" new_cache = SqliteCache(session_id = uuid4()) 
extra_db.append(new_cache.db_path) - session.cache = new_cache - session.interfaces.append(new_cache) + session.interface_manager.interfaces.append(new_cache) stmt3 = """ domain = NEW domain [ {"ip": "1.1.1.1", "domain": "cloudflare.com"} , {"ip": "1.1.1.2", "domain": "xyz.cloudflare.com"} diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py b/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py index 29666511..d9780915 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py +++ b/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py @@ -6,7 +6,7 @@ from pandas import DataFrame, Series, concat from kestrel.exceptions import DataSourceError -from kestrel.interface.datasource.base import AbstractDataSourceInterface +from kestrel.interface import AbstractInterface from kestrel.ir.graph import IRGraphEvaluable from kestrel.display import GraphletExplanation from kestrel.ir.instructions import ( @@ -69,7 +69,7 @@ def read_sql(sql: str, conn: OpenSearch) -> DataFrame: return concat(dfs) -class OpenSearchInterface(AbstractDataSourceInterface): +class OpenSearchInterface(AbstractInterface): def __init__( self, serialized_cache_catalog: Optional[str] = None, @@ -91,8 +91,8 @@ def __init__( self.conns[name] = client @property - def name(self): - return "opensearch" + def schemes(self): + return ["opensearch"] def store( self, From 3f0e559c830b6985e180adc2623d8403cfc8159a Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 2024 14:28:48 -0500 Subject: [PATCH 14/61] update opensearch interface name --- .../src/kestrel_datasource_opensearch/__init__.py | 1 - .../pyproject.toml | 2 +- .../src/kestrel_datasource_opensearch/__init__.py | 1 + .../src/kestrel_datasource_opensearch/config.py | 0 .../src/kestrel_datasource_opensearch/interface.py | 4 ++-- 
.../src/kestrel_datasource_opensearch/ossql.py | 0 .../tests/__init__.py | 0 .../tests/test_config.py | 2 +- .../tests/test_ossql.py | 2 +- 9 files changed, 6 insertions(+), 6 deletions(-) delete mode 100644 packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py rename packages-nextgen/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/pyproject.toml (96%) create mode 100644 packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/__init__.py rename packages-nextgen/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/src/kestrel_datasource_opensearch/config.py (100%) rename packages-nextgen/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/src/kestrel_datasource_opensearch/interface.py (98%) rename packages-nextgen/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/src/kestrel_datasource_opensearch/ossql.py (100%) rename packages-nextgen/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/tests/__init__.py (100%) rename packages-nextgen/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/tests/test_config.py (97%) rename packages-nextgen/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/tests/test_ossql.py (98%) diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py b/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py deleted file mode 100644 index f932e879..00000000 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from kestrel_datasource_opensearch.interface import OpenSearchInterface diff --git a/packages-nextgen/kestrel_datasource_opensearch/pyproject.toml b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml similarity index 96% rename from packages-nextgen/kestrel_datasource_opensearch/pyproject.toml rename to 
packages-nextgen/kestrel_interface_opensearch/pyproject.toml index 6d5017a0..6270f6d0 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/pyproject.toml +++ b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools >= 68.2.2", "wheel"] build-backend = "setuptools.build_meta" [project] -name = "kestrel_datasource_opensearch" +name = "kestrel_interface_opensearch" version = "2.0.0" description = "Kestrel OpenSearch Datasource Interface" readme = "README.rst" diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/__init__.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/__init__.py new file mode 100644 index 00000000..3ee389ca --- /dev/null +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/__init__.py @@ -0,0 +1 @@ +from kestrel_interface_opensearch.interface import OpenSearchInterface diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/config.py similarity index 100% rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/config.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/config.py diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/interface.py similarity index 98% rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/interface.py index d9780915..a7dba14c 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/interface.py +++ 
b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/interface.py @@ -20,8 +20,8 @@ SolePredecessorTransformingInstruction, ) -from kestrel_datasource_opensearch.config import load_config -from kestrel_datasource_opensearch.ossql import OpenSearchTranslator +from kestrel_interface_opensearch.config import load_config +from kestrel_interface_opensearch.ossql import OpenSearchTranslator _logger = logging.getLogger(__name__) diff --git a/packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/ossql.py similarity index 100% rename from packages-nextgen/kestrel_datasource_opensearch/src/kestrel_datasource_opensearch/ossql.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/ossql.py diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/__init__.py b/packages-nextgen/kestrel_interface_opensearch/tests/__init__.py similarity index 100% rename from packages-nextgen/kestrel_datasource_opensearch/tests/__init__.py rename to packages-nextgen/kestrel_interface_opensearch/tests/__init__.py diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py similarity index 97% rename from packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py rename to packages-nextgen/kestrel_interface_opensearch/tests/test_config.py index 51964889..85241b71 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/tests/test_config.py +++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_config.py @@ -2,7 +2,7 @@ import yaml -from kestrel_datasource_opensearch.config import ( +from kestrel_interface_opensearch.config import ( PROFILE_PATH_ENV_VAR, Connection, load_config, diff --git a/packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py 
b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py similarity index 98% rename from packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py rename to packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py index d4e17eaf..c213a963 100644 --- a/packages-nextgen/kestrel_datasource_opensearch/tests/test_ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py @@ -1,7 +1,7 @@ from datetime import datetime from dateutil import parser -from kestrel_datasource_opensearch.ossql import OpenSearchTranslator +from kestrel_interface_opensearch.ossql import OpenSearchTranslator from kestrel.exceptions import UnsupportedOperatorError from kestrel.ir.filter import ( BoolExp, From c544910a8d06c68a92e7644029614967621140dd Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 2024 16:24:08 -0500 Subject: [PATCH 15/61] fix import error --- packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py | 2 +- packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py index b96e00bd..87557222 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/inmemory.py @@ -22,7 +22,7 @@ SourceInstruction, TransformingInstruction, ) -from kestrel.interface.datasource.codegen.dataframe import ( +from kestrel.interface.codegen.dataframe import ( evaluate_source_instruction, evaluate_transforming_instruction, ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py index da6c3604..97b8fb13 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/sqlite.py @@ -9,7 +9,7 @@ from typeguard import typechecked from kestrel.cache.base import 
AbstractCache -from kestrel.interface.datasource.codegen.sql import SqlTranslator +from kestrel.interface.codegen.sql import SqlTranslator from kestrel.ir.graph import IRGraphEvaluable from kestrel.display import GraphletExplanation, NativeQuery from kestrel.ir.instructions import ( From 31ce604fce5131871214bd454ac6a1ee46030ffd Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 2024 16:29:28 -0500 Subject: [PATCH 16/61] fix import error in tests --- .../tests/test_interface_datasource_codegen_dataframe.py | 2 +- .../kestrel_core/tests/test_interface_datasource_codegen_sql.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py index dc6164b6..e57ff1d9 100644 --- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py +++ b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py @@ -1,7 +1,7 @@ import pytest from pandas import DataFrame -from kestrel.interface.datasource.codegen.dataframe import ( +from kestrel.interface.codegen.dataframe import ( evaluate_source_instruction, evaluate_transforming_instruction, ) diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py index 27e0aca4..1cc3c46c 100644 --- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py +++ b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_sql.py @@ -1,7 +1,7 @@ from datetime import datetime from dateutil import parser -from kestrel.interface.datasource.codegen.sql import SqlTranslator +from kestrel.interface.codegen.sql import SqlTranslator from kestrel.ir.filter import ( BoolExp, ExpOp, From e31f8732424f9000e5bb078a0320c646a8491192 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 
2024 16:56:27 -0500 Subject: [PATCH 17/61] split method in Session --- .../kestrel_core/src/kestrel/session.py | 66 ++++++++++--------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py index 2e912ca9..bfeae707 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/session.py +++ b/packages-nextgen/kestrel_core/src/kestrel/session.py @@ -6,11 +6,12 @@ from kestrel.display import Display, GraphExplanation from kestrel.ir.graph import IRGraph -from kestrel.ir.instructions import Explain +from kestrel.ir.instructions import Instruction, Explain from kestrel.frontend.parser import parse_kestrel from kestrel.cache import AbstractCache, SqliteCache from kestrel.config.internal import CACHE_INTERFACE_IDENTIFIER from kestrel.interface import AbstractInterface, InterfaceManager +from kestrel.exceptions import InstructionNotFound _logger = logging.getLogger(__name__) @@ -57,39 +58,44 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]: irgraph_new = parse_kestrel(huntflow_block) self.irgraph.update(irgraph_new) + for ret in irgraph_new.get_returns(): + yield self.evaluate_instruction(ret) + + def evaluate_instruction(self, ins: Instruction) -> Display: + if ins not in self.irgraph: + raise InstructionNotFound(ins.to_dict()) + + pred = self.irgraph.get_trunk_n_branches(ins)[0] + is_explain = isinstance(pred, Explain) + display = GraphExplanation([]) + + _interface_manager = ( + self.interface_manager.copy_with_virtual_cache() + if is_explain + else self.interface_manager + ) + _cache = _interface_manager[CACHE_INTERFACE_IDENTIFIER] + # The current logic leads to caching results from non-cache and lastly # evaluate in cache. 
# TODO: may evaluate cache first, then push dependent variables to the # last interface to eval; this requires priority of interfaces - for ret in irgraph_new.get_returns(): - pred = irgraph_new.get_trunk_n_branches(ret)[0] - is_explain = isinstance(pred, Explain) - is_complete = False - display = GraphExplanation([]) - interface_manager = ( - self.interface_manager.copy_with_virtual_cache() - if is_explain - else self.interface_manager - ) - cache = interface_manager[CACHE_INTERFACE_IDENTIFIER] - while not is_complete: - for g in self.irgraph.find_dependent_subgraphs_of_node(ret, cache): - interface = interface_manager[g.interface] - for iid, _display in ( - interface.explain_graph(g) - if is_explain - else interface.evaluate_graph(g) - ).items(): - if is_explain: - display.graphlets.append(_display) - else: - display = _display - if interface is not cache: - cache[iid] = display - if iid == ret.id: - is_complete = True - else: - yield display + while True: + for g in self.irgraph.find_dependent_subgraphs_of_node(ins, _cache): + interface = _interface_manager[g.interface] + for iid, _display in ( + interface.explain_graph(g) + if is_explain + else interface.evaluate_graph(g) + ).items(): + if is_explain: + display.graphlets.append(_display) + else: + display = _display + if interface is not _cache: + _cache[iid] = display + if iid == ins.id: + return display def do_complete(self, huntflow_block: str, cursor_pos: int): """Kestrel code auto-completion. 
From a128aa4a7308f98ee709a21dc6e39aa9da146259 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 2024 16:59:51 -0500 Subject: [PATCH 18/61] add comments --- packages-nextgen/kestrel_core/src/kestrel/session.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/session.py b/packages-nextgen/kestrel_core/src/kestrel/session.py index bfeae707..48ebf1f8 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/session.py +++ b/packages-nextgen/kestrel_core/src/kestrel/session.py @@ -54,7 +54,6 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]: Yields: Evaluated result per Return instruction """ - irgraph_new = parse_kestrel(huntflow_block) self.irgraph.update(irgraph_new) @@ -62,6 +61,14 @@ def execute_to_generate(self, huntflow_block: str) -> Iterable[Display]: yield self.evaluate_instruction(ret) def evaluate_instruction(self, ins: Instruction) -> Display: + """Evaluate a single Instruction. + + Parameters: + ins: the instruction to evaluate + + Returns: + Evaluated result (Kestrel Display object) + """ if ins not in self.irgraph: raise InstructionNotFound(ins.to_dict()) From 736eb695f8e24b43dd7ff3d1a9fe2c8360e67ff0 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 5 Mar 2024 17:05:45 -0500 Subject: [PATCH 19/61] remove debug code --- packages-nextgen/kestrel_core/src/kestrel/interface/manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py index df9cabbc..5b5b27f8 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py @@ -37,8 +37,7 @@ def __getitem__(self, scheme: str) -> AbstractInterface: if scheme in interface.schemes: return interface else: - x = [i.__class__ for i in self.interfaces] - raise InterfaceNotFound(f"no interface 
loaded for scheme {scheme}; {x}") + raise InterfaceNotFound(f"no interface loaded for scheme {scheme}") def __iter__(self) -> Iterable[str]: return itertools.chain(*[i.schemes for i in self.interfaces]) From 0dce14fd0defbcc750be3a6babbf3c0eee7f526c Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Mon, 11 Mar 2024 14:36:51 -0400 Subject: [PATCH 20/61] opensearch: fix sort direction --- .../__init__.py | 0 .../config.py | 0 .../interface.py | 0 .../ossql.py | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) rename packages-nextgen/kestrel_interface_opensearch/src/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/__init__.py (100%) rename packages-nextgen/kestrel_interface_opensearch/src/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/config.py (100%) rename packages-nextgen/kestrel_interface_opensearch/src/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/interface.py (100%) rename packages-nextgen/kestrel_interface_opensearch/src/{kestrel_datasource_opensearch => kestrel_interface_opensearch}/ossql.py (99%) diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/__init__.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py similarity index 100% rename from packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/__init__.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/__init__.py diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py similarity index 100% rename from packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/config.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py diff --git 
a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py similarity index 100% rename from packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/interface.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py similarity index 99% rename from packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/ossql.py rename to packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 55976d23..8d04e7f5 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_datasource_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -230,7 +230,7 @@ def result(self) -> str: if where: stages.append(f"WHERE {where}") if self.order_by: - stages.append(f"ORDER BY {self.order_by} {self.sort_dir}") + stages.append(f"ORDER BY {self.order_by} {self.sort_dir.value}") if self.limit: # https://opensearch.org/docs/latest/search-plugins/sql/sql/basic/#limit if self.offset: From 09f25797f842a8769f3b20c830f1c89b48286aa7 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Mon, 11 Mar 2024 14:44:56 -0400 Subject: [PATCH 21/61] add kestrel_interface_opensearch to unit test workflow --- .github/workflows/unit-testing-kestrel2.yml | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/unit-testing-kestrel2.yml b/.github/workflows/unit-testing-kestrel2.yml index 06b79fde..b44d2ce8 100644 --- a/.github/workflows/unit-testing-kestrel2.yml +++ b/.github/workflows/unit-testing-kestrel2.yml @@ -42,3 +42,25 @@ jobs: - name: Unit testing run: pytest -vv 
+ test-kestrel-interface-opensearch: + strategy: + matrix: + os: [ubuntu-latest, macos-latest] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash + working-directory: ./packages-nextgen/kestrel_interface_opensearch + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install Python Tools + run: pip install --upgrade pip setuptools wheel pytest + - name: Install kestrel_interface_opensearch + run: pip install . + - name: Unit testing + run: pytest -vv From 466281c01e10922a324dbbf4fb4d54e0005778d8 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Mon, 11 Mar 2024 14:51:02 -0400 Subject: [PATCH 22/61] install kestrel_core before kestrel_interface_opensearch in unit test workflow --- .github/workflows/unit-testing-kestrel2.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/unit-testing-kestrel2.yml b/.github/workflows/unit-testing-kestrel2.yml index b44d2ce8..65c73044 100644 --- a/.github/workflows/unit-testing-kestrel2.yml +++ b/.github/workflows/unit-testing-kestrel2.yml @@ -60,6 +60,9 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install Python Tools run: pip install --upgrade pip setuptools wheel pytest + - name: Install kestrel_core + working-directory: ./packages-nextgen/kestrel_core + run: pip install . - name: Install kestrel_interface_opensearch run: pip install . 
- name: Unit testing From e0df6edb4b7c1da36423069c7f89593770d5f63e Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Mon, 11 Mar 2024 15:03:55 -0400 Subject: [PATCH 23/61] Unit test fix for Python 3.8 --- .../src/kestrel_interface_opensearch/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py index add15f4a..c7294bac 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py @@ -1,6 +1,6 @@ import logging from dataclasses import dataclass, field -from typing import Dict, Optional +from typing import Dict, Mapping, Optional import yaml from mashumaro.mixins.json import DataClassJSONMixin @@ -43,7 +43,7 @@ class Index(DataClassJSONMixin): timestamp: str timestamp_format: str data_model_mapping: Optional[str] = None - data_model_map: dict = field(default_factory=dict) + data_model_map: Mapping = field(default_factory=dict) def __post_init__(self): if self.data_model_mapping: From f6377ba38452e3d7e28acb0f1412e959f4dc3a9f Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 11 Mar 2024 17:01:25 -0400 Subject: [PATCH 24/61] fix the interface type bug for Kestrel 2 --- .../kestrel_core/src/kestrel/cache/base.py | 4 ++-- .../kestrel_core/src/kestrel/interface/base.py | 6 ++++-- .../kestrel_core/src/kestrel/interface/manager.py | 11 ++++++----- .../test_interface_datasource_codegen_dataframe.py | 2 +- packages-nextgen/kestrel_core/tests/test_session.py | 8 ++++---- .../src/kestrel_interface_opensearch/interface.py | 4 ++-- 6 files changed, 19 insertions(+), 16 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py index 42359aa8..4d1a94bb 100644 --- 
a/packages-nextgen/kestrel_core/src/kestrel/cache/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/cache/base.py @@ -16,8 +16,8 @@ class AbstractCache(AbstractInterface, MutableMapping): - evaluate_graph() """ - @property - def schemes(self): + @staticmethod + def schemes() -> Iterable[str]: return [CACHE_INTERFACE_IDENTIFIER] @abstractmethod diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/base.py b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py index 4cefaa60..50f5601f 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/base.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/base.py @@ -52,9 +52,11 @@ def __init__( except: raise InvalidSerializedDatasourceInterfaceCacheCatalog() - @property + # Python 3.13 will drop chain of @classmethod and @property + # use @staticmethod instead (cannot make it a property) + @staticmethod @abstractmethod - def schemes(self) -> Iterable[str]: + def schemes() -> Iterable[str]: """The schemes to specify the interface Each scheme should be defined as ``("_"|LETTER) ("_"|LETTER|DIGIT)*`` diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py index 5b5b27f8..9e90f777 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py @@ -7,7 +7,7 @@ import itertools from copy import copy from typeguard import typechecked -from typing import Mapping, Iterable +from typing import Mapping, Iterable, Type from kestrel.exceptions import ( InterfaceNotFound, @@ -26,7 +26,7 @@ class InterfaceManager(Mapping): def __init__(self, init_interfaces: Iterable[AbstractInterface] = []): interface_classes = _load_interface_classes() - self.interfaces = list(init_interfaces) # copy/recreate the list + self.interfaces = list(init_interfaces) # copy/recreate the list for iface_cls in interface_classes: iface = iface_cls() 
_logger.debug(f"Initialize interface {iface.__name__}") @@ -34,13 +34,13 @@ def __init__(self, init_interfaces: Iterable[AbstractInterface] = []): def __getitem__(self, scheme: str) -> AbstractInterface: for interface in self.interfaces: - if scheme in interface.schemes: + if scheme in interface.schemes(): return interface else: raise InterfaceNotFound(f"no interface loaded for scheme {scheme}") def __iter__(self) -> Iterable[str]: - return itertools.chain(*[i.schemes for i in self.interfaces]) + return itertools.chain(*[i.schemes() for i in self.interfaces]) def __len__(self) -> int: return sum(1 for _ in iter(self)) @@ -96,7 +96,8 @@ def _is_class(cls): @typechecked def _guard_scheme_conflict( - new_interface: AbstractInterface, interfaces: Iterable[AbstractInterface] + new_interface: Type[AbstractInterface], + interfaces: Iterable[Type[AbstractInterface]], ): for interface in interfaces: for scheme_new in new_interface.schemes(): diff --git a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py index e57ff1d9..4f9f7507 100644 --- a/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py +++ b/packages-nextgen/kestrel_core/tests/test_interface_datasource_codegen_dataframe.py @@ -56,7 +56,7 @@ def test_evaluate_ProjectAttrs(): def test_evaluate_Construct_Filter_ProjectAttrs(): - stmt = """ + stmt = r""" proclist = NEW process [ {"name": "cmd.exe", "pid": 123} , {"name": "explorer.exe", "pid": 99} , {"name": "firefox.exe", "pid": 201} diff --git a/packages-nextgen/kestrel_core/tests/test_session.py b/packages-nextgen/kestrel_core/tests/test_session.py index d4edbaf5..115154d4 100644 --- a/packages-nextgen/kestrel_core/tests/test_session.py +++ b/packages-nextgen/kestrel_core/tests/test_session.py @@ -88,13 +88,13 @@ def test_explain_in_cache(): def test_multi_interface_explain(): class DataLake(SqliteCache): - @property - def 
schemes(self): + @staticmethod + def schemes(): return ["datalake"] class Gateway(SqliteCache): - @property - def schemes(self): + @staticmethod + def schemes(): return ["gateway"] extra_db = [] diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py index a7dba14c..de68d270 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py @@ -90,8 +90,8 @@ def __init__( ) self.conns[name] = client - @property - def schemes(self): + @staticmethod + def schemes() -> Iterable[str]: return ["opensearch"] def store( From 0cca5d2250d9a24cfb53e65019f6ed60bf156c4f Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Thu, 14 Mar 2024 13:36:31 -0400 Subject: [PATCH 25/61] data model mapping --- .../src/kestrel/frontend/compile.py | 35 +- .../src/kestrel/frontend/parser.py | 14 +- .../src/kestrel/interface/manager.py | 2 +- .../src/kestrel/mapping/data_model.py | 277 ++++++++++++ .../kestrel/mapping/entityattribute/ecs.yaml | 421 ++++++++---------- .../kestrel/mapping/entityattribute/stix.yaml | 353 ++++++--------- .../src/kestrel/mapping/transformers.py | 86 ++++ .../kestrel_core/src/kestrel/mapping/utils.py | 122 +---- .../kestrel_core/tests/test_mapping.py | 59 --- .../tests/test_mapping_data_model.py | 175 ++++++++ .../tests/test_mapping_transformers.py | 23 + .../kestrel_core/tests/test_parser.py | 13 +- .../kestrel_interface_opensearch/config.py | 17 +- .../kestrel_interface_opensearch/interface.py | 7 +- .../src/kestrel_interface_opensearch/ossql.py | 98 ++-- .../tests/test_ossql.py | 32 +- 16 files changed, 1048 insertions(+), 686 deletions(-) create mode 100644 packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py create mode 100644 
packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py create mode 100644 packages-nextgen/kestrel_core/tests/test_mapping_data_model.py create mode 100644 packages-nextgen/kestrel_core/tests/test_mapping_transformers.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py index 7e720d86..cb1f897f 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py @@ -1,5 +1,6 @@ # Lark Transformer +import logging from datetime import datetime, timedelta from functools import reduce @@ -7,6 +8,7 @@ from lark import Transformer, Token from typeguard import typechecked +from kestrel.mapping.data_model import translate_comparison_to_ocsf from kestrel.utils import unescape_quoted_string from kestrel.ir.filter import ( FExpression, @@ -46,6 +48,9 @@ from kestrel.exceptions import IRGraphMissingNode +_logger = logging.getLogger(__name__) + + DEFAULT_VARIABLE = "_" DEFAULT_SORT_ORDER = "DESC" @@ -95,17 +100,29 @@ def _map_filter_exp( if ":" not in field: field = f"{entity_name}:{field}" # map field to new syntax (e.g. STIX to OCSF) - map_result = property_map.get(field, filter_exp.field) + # TODO: ECS to OCSF? Would need to merge STIX and ECS data model maps. 
+ map_result = translate_comparison_to_ocsf( + property_map, field, filter_exp.op, filter_exp.value + ) # Build a MultiComp if field maps to several values - if isinstance(map_result, (list, tuple)): - op = filter_exp.op - value = filter_exp.value + if len(map_result) > 1: filter_exp = MultiComp( - ExpOp.OR, [_create_comp(field, op, value) for field in map_result] + ExpOp.OR, + [_create_comp(field, op, value) for field, op, value in map_result], ) - else: # change the name of the field if it maps to a single value - filter_exp.field = map_result - + elif len(map_result) == 1: # it maps to a single value + mapping = map_result[0] + _logger.debug("mapping = %s", mapping) + field = mapping[0] + prefix = f"{entity_name}." + if field.startswith(prefix): + # Need to prune the entity name + field = field[len(prefix) :] + filter_exp.field = field + filter_exp.op = mapping[1] + filter_exp.value = mapping[2] + else: # pass-through + pass # TODO: for RefComparison, map the attribute in value (may not be possible here) elif isinstance(filter_exp, BoolExp): @@ -152,7 +169,7 @@ def __init__( self.default_sort_order = default_sort_order self.token_prefix = token_prefix self.entity_map = entity_map - self.property_map = property_map + self.property_map = property_map # TODO: rename to data_model_map? 
super().__init__() def start(self, args): diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py index e5bcbdab..0ff482c5 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/parser.py @@ -1,14 +1,20 @@ # parse Kestrel syntax, apply frontend mapping, transform to IR +import logging +import os from itertools import chain from kestrel.frontend.compile import _KestrelT +from kestrel.mapping.data_model import reverse_mapping from kestrel.utils import load_data_file from lark import Lark -import os from typeguard import typechecked import yaml + +_logger = logging.getLogger(__name__) + + frontend_mapping = {} @@ -21,9 +27,13 @@ def get_mapping(mapping_type: str, mapping_package: str, mapping_filepath: str) try: mapping_str = load_data_file(mapping_package, mapping_filepath) mapping = yaml.safe_load(mapping_str) + if mapping_type == "property": + # New data model map is always OCSF->native + mapping = reverse_mapping(mapping) frontend_mapping[mapping_type] = mapping except Exception as ex: - mapping = None + _logger.error("Failed to load %s", mapping_str, exc_info=ex) + mapping = None # FIXME: this is not a dict return mapping diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py index 9e90f777..3e155ded 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py @@ -29,7 +29,7 @@ def __init__(self, init_interfaces: Iterable[AbstractInterface] = []): self.interfaces = list(init_interfaces) # copy/recreate the list for iface_cls in interface_classes: iface = iface_cls() - _logger.debug(f"Initialize interface {iface.__name__}") + _logger.debug(f"Initialize interface {iface_cls.__name__}") self.interfaces.append(iface) def __getitem__(self, scheme: 
str) -> AbstractInterface: diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py new file mode 100644 index 00000000..c7ee086c --- /dev/null +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -0,0 +1,277 @@ +import logging +from typing import Union + +import yaml +from typeguard import typechecked + +from kestrel.mapping.transformers import run_transformer +from kestrel.utils import list_folder_files + +_logger = logging.getLogger(__name__) + + +def _add_mapping(obj: dict, key: str, mapping: dict): + """Add `key` -> `mapping` to `obj`, appending if necessary""" + existing_mapping = obj.get(key) + if existing_mapping: + if isinstance(existing_mapping, str): + existing_mapping = [{"ocsf_field": existing_mapping}] + elif isinstance(existing_mapping, dict): + existing_mapping = [existing_mapping] + else: + existing_mapping = [] + existing_mapping.append(mapping) + obj[key] = existing_mapping + + +def _reverse_dict(obj: dict, k: str, v: dict): + """Reverse a single OCSF -> native mapping and add it to `obj`""" + key = v["native_field"] + mapping = {i: j for i, j in v.items() if i != "native_field"} + mapping["ocsf_field"] = k + _add_mapping(obj, key, mapping) + + +def _add_attr(obj: dict, key: str, value: str): + """Add `key` -> `value` to `obj`, appending if necessary""" + if key not in obj: + obj[key] = value + else: + existing = obj[key] + if isinstance(existing, str): + obj[key] = [existing, value] + else: + existing.append(value) + + +def reverse_mapping(obj: dict, prefix: str = None, result: dict = None) -> dict: + """Reverse the mapping; return native -> OCSF map""" + if result is None: + result = {} + for k, v in obj.items(): + k = ".".join((prefix, k)) if prefix else k + # Recurse if necessary + if isinstance(v, str): + _add_attr(result, v, k) + elif isinstance(v, list): + # Need to handle multiple mappings + for i in v: + if isinstance(i, str): + 
_add_attr(result, i, k) + elif "native_field" in i: + _reverse_dict(result, k, i) + else: + # Need to "deep" merge with current results + reverse_mapping(i, k, result) + elif isinstance(v, dict): + # First determine if this is a complex mapping or just another level + if "native_field" in v: + _reverse_dict(result, k, v) + else: + # Need to "deep" merge with current results + reverse_mapping(v, k, result) + + return result + + +def get_simple_property_mapping( + obj: dict, prefix: str = None +) -> dict: # TODO: remove? Not really needed + """Parse the data model map `obj` and return a simple "native -> OCSF" attribute name map""" + attribute_map = {} + for k, v in obj.items(): + k = ".".join((prefix, k)) if prefix else k + # Recurse if necessary + if isinstance(v, str): + _add_attr(attribute_map, v, k) + elif isinstance(v, list): + for i in v: + if isinstance(i, str): + _add_attr(attribute_map, i, k) + else: + native_field = i.get("native_field") + if native_field: + _add_attr(attribute_map, native_field, k) + else: + # Need to "deep" merge with current results + attribute_map.update(get_simple_property_mapping(i, k)) + elif isinstance(v, dict): + # First determine if this is a complex mapping or just another level + native_field = v.get("native_field") + if native_field: + _add_attr(attribute_map, native_field, k) + else: + # Need to "deep" merge with current results + attribute_map.update(get_simple_property_mapping(v, k)) + return attribute_map + + +def _map_op(op: str, mapped_op: str) -> str: + # TODO: does native_value matter? + return mapped_op if mapped_op else op + + +def _get_map_triple(d: dict, prefix: str, op: str, value) -> tuple: + mapped_op = d.get(f"{prefix}_op") # to_native_op? + transform = d.get(f"{prefix}_value") # to_native_value? 
+ new_value = run_transformer(transform, value) + new_op = _map_op(op, mapped_op) + return (d[f"{prefix}_field"], new_op, new_value) + + +def translate_comparison_to_native( + dmm: dict, field: str, op: str, value: Union[str, int, float] +) -> list: + """Translate the (`field`, `op`, `value`) triple using data model map `dmm` + + This function may be used in datasource interfaces to translate a comparison + in the OCSF data model to the native data model, according to the data model + mapping in `dmm`. + + This function translates the (`field`, `op`, `value`) triple into a list of + translated triples based on the provided data model map. The data model map + is a dictionary that maps fields from one data model to another. For + example, if you have a field named "user.name" in your data model, but the + corresponding field in the native data model is "username", then you can use + the data model map to translate the field name. + + Parameters: + dmm: A dictionary that maps fields from one data model to another. + field: The field name to be translated. + op: The comparison operator. + value: The value to be compared against. + + Returns: + A list of translated triples. + + Raises: + KeyError: If the field cannot be found in the data model map. + """ + _logger.debug("comp_to_native: %s %s %s", field, op, value) + result = [] + mapping = dmm.get(field) + if mapping: + if isinstance(mapping, str): + # Simple 1:1 field name mapping + result.append((mapping, op, value)) + else: + raise NotImplementedError("complex native mapping") + else: + parts = field.split(".") + tmp = dmm + for part in parts: + if isinstance(tmp, dict): + tmp = tmp.get(part, {}) # tmp[part] + else: + break + if tmp: + if isinstance(tmp, list): + for i in tmp: + if isinstance(i, dict): + mapped_op = i.get("native_op") # to_native_op? + transform = i.get("native_value") # to_native_value? 
+ new_value = run_transformer(transform, value) + new_op = _map_op(op, mapped_op) + result.append((i["native_field"], new_op, new_value)) + else: + result.append((i, op, value)) + elif isinstance(tmp, dict): + result.append(_get_map_triple(tmp, "native", op, value)) + elif isinstance(tmp, str): + result.append((tmp, op, value)) + else: + # Pass-through + result.append((field, op, value)) + _logger.debug("comp_to_native: return %s", result) + return result + + +def translate_comparison_to_ocsf( + dmm: dict, field: str, op: str, value: Union[str, int, float] +) -> list: + """Translate the (`field`, `op`, `value`) triple using data model map `dmm` + + This function is used in the frontend to translate a comparison in + the STIX (or, in the future, ECS) data model to the OCSF data + model, according to the data model mapping in `dmm`. + + This function translates the (`field`, `op`, `value`) triple into a list of + translated triples based on the provided data model map. The data model map + is a dictionary that maps fields from one data model to another. For + example, if you have a field named "user.name" in your data model, but the + corresponding field in the native data model is "username", then you can use + the data model map to translate the field name. + + Parameters: + dmm: A dictionary that maps fields from one data model to another. + field: The field name to be translated. + op: The comparison operator. + value: The value to be compared against. + + Returns: + A list of translated triples. + + Raises: + KeyError: If the field cannot be found in the data model map. + + """ + _logger.debug("comp_to_ocsf: %s %s %s", field, op, value) + result = [] + mapping = dmm.get(field) + if isinstance(mapping, str): + # Simple 1:1 field name mapping + result.append((mapping, op, value)) + elif isinstance(mapping, list): + for i in mapping: + if isinstance(i, dict): + # mapped_op = i.get("ocsf_op") # to_ocsf_op? + # transform = i.get("ocst_value") # to_ocsf_value? 
+ # new_value = run_transformer(transform, value) if transform else value + # new_op = _map_op(op, mapped_op) + # result.append((i["ocsf_field"], new_op, new_value)) + result.append(_get_map_triple(i, "ocsf", op, value)) + else: + result.append((i, op, value)) + return result + + +def flatten_mapping(dmm: dict, key: str = None) -> dict: + """Flatten the nested `dmm` so that the keys are dotted "paths", optionally starting at `key` + + Parameters: + dmm: A dictionary that maps fields from one data model to another. + key: starting point in `dmm` (optional) + + Returns: + A dict with dotted path keys + """ + root = dmm[key] if key else dmm + return _flatten(root) + + +def _flatten(root: dict, result: dict = None, prefix: str = "") -> dict: + if result is None: + result = {} + for k, v in root.items(): + key = prefix + "." + k if prefix else k + if isinstance(v, dict): + _flatten(v, result, key) + else: + result[key] = v + return result + + +@typechecked +def load_mapping( + data_model_name: str, + mapping_pkg: str = "kestrel.mapping", + submodule: str = "entityattribute", +): + result = {} + entityattr_mapping_files = list_folder_files( + mapping_pkg, submodule, prefix=data_model_name, suffix=".yaml" + ) + for f in entityattr_mapping_files: + with open(f, "r") as fp: + result.update(yaml.safe_load(fp)) + return result diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml index ef3ff62c..0abe4cec 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml @@ -1,237 +1,188 @@ -process.command_line: process.cmd_line -process.end: process.end_time -process.entity_id: process.uid -process.executable: process.file.path -process.exit_code: process.exit_code -process.name: process.name -process.pid: process.pid -process.start: process.start_time 
-process.thread.id: process.tid -# process.args -# process.args_count -# process.entry_meta.type -# process.env_vars -# process.interactive -# process.same_as_process -# process.thread.capabilities.effective -# process.thread.capabilities.permitted -# process.thread.name -# process.title -# process.tty -# process.uptime -# process.vpid -# process.working_directory -file.accessed: file.accessed_time -file.attributes: file.attributes -file.created: file.created_time -file.ctime: file.modified_time -file.directory: file.parent_folder -file.gid: file.xattributes.primary_group -file.mime_type: file.mime_type -file.mode: file.mode -file.mtime: file.modified_time -file.name: file.name -file.owner: file.owner -file.path: file.path -file.size: file.size -file.target_path: file.xattributes.link_name -file.type: file.type -# file.device -# file.drive_letter -# file.extension -# file.fork_name -# file.inode -# file.uid -group.name: group.name -group.id: group.uid -# group.domain -client.bytes: traffic.bytes_out -client.domain: src_endpoint.domain -client.ip: src_endpoint.ip -client.mac: src_endpoint.mac -client.packets: traffic.packets_out -client.port: src_endpoint.port -# client.address -# client.nat.ip -# client.nat.port -# client.registered_domain -# client.subdomain -# client.top_level_domain -destination.bytes: traffic.bytes_in -destination.domain: dst_endpoint.domain -destination.ip: dst_endpoint.ip -destination.mac: dst_endpoint.mac -destination.packets: traffic.packets_in -destination.port: dst_endpoint.port -# destination.address -# destination.nat.ip -# destination.nat.port -# destination.registered_domain -# destination.subdomain -# destination.top_level_domain -server.bytes: traffic.bytes_in -server.domain: dst_endpoint.domain -server.ip: dst_endpoint.ip -server.mac: dst_endpoint.mac -server.packets: traffic.packets_in -server.port: dst_endpoint.port -# server.address -# server.nat.ip -# server.nat.port -# server.registered_domain -# server.subdomain -# 
server.top_level_domain -source.bytes: traffic.bytes_out -source.domain: src_endpoint.domain -source.ip: src_endpoint.ip -source.mac: src_endpoint.mac -source.packets: traffic.packets_out -source.port: src_endpoint.port -# source.address -# source.nat.ip -# source.nat.port -# source.registered_domain -# source.subdomain -# source.top_level_domain +# https://schema.ocsf.io/1.1.0/objects/file +file: + accessed_time: file.accessed + attributes: file.attributes + created_time: file.created + # This "hashes" notation comes from jmespath (filter projection) + # It's much easier to use the ECS notation in this case + hashes[?algorithm_id == 1]: + value: hash.md5 + hashes[?algorithm_id == 2]: + value: hash.sha1 + hashes[?algorithm_id == 3]: + value: hash.sha256 + hashes[?algorithm_id == 4]: + value: hash.sha512 + hashes[?algorithm_id == 5]: + value: hash.ssdeep + hashes[?algorithm_id == 6]: + value: hash.tlsh + hashes[*]: + value: + - hash.md5 + - hash.sha1 + - hash.sha256 + - hash.sha512 + - hash.ssdeep + - hash.tlsh + modified_time: file.ctime + mime_type: file.mime_type + mode: file.mode + modified_time: file.mtime + name: file.name + owner: file.owner + parent_folder: file.directory + path: file.path + size: file.size + type: file.type + xattributes: + primary_group: file.gid + link_name: file.target_path -# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], Email Activity [4009] -network.application: app_name -network.bytes: traffic.bytes -network.direction: connection_info.direction -network.iana_number: connection_info.protocol_num -network.packets: traffic.packets -network.protocol: connection_info.protocol_name -network.type: connection_info.protocol_ver_id -# network.community_id -# network.forwarded_ip -# network.inner -# network.name -# network.transport: -hash.md5: file.hashes[?algorithm_id == 1].value -hash.sha1: file.hashes[?algorithm_id == 2].value -hash.sha256: file.hashes[?algorithm_id == 3].value -hash.sha512: file.hashes[?algorithm_id 
== 4].value -hash.ssdeep: file.hashes[?algorithm_id == 5].value -hash.tlsh: file.hashes[?algorithm_id == 6].value -# hash.sha384 -x509.not_after: certificate.expiration_time -x509.not_before: certificate.created_time -x509.serial_number: certificate.serial_number -x509.signature_algorithm: certificate.fingerprints.algorithm -x509.version_number: certificate.version -# x509.alternative_names -# x509.issuer.common_name: certificate.issuer -# x509.issuer.country: certificate.issuer -# x509.issuer.distinguished_name: certificate.issuer -# x509.issuer.locality: certificate.issuer -# x509.issuer.organization: certificate.issuer -# x509.issuer.organizational_unit: certificate.issuer -# x509.issuer.state_or_province: certificate.issuer -# x509.public_key_algorithm -# x509.public_key_curve -# x509.public_key_exponent -# x509.public_key_size -# x509.subject.common_name: certificate.subject -# x509.subject.country: certificate.subject -# x509.subject.distinguished_name: certificate.subject -# x509.subject.locality: certificate.subject -# x509.subject.organization: certificate.subject -# x509.subject.organizational_unit: certificate.subject -# x509.subject.state_or_province: certificate.subject -as.number: device.org.number -as.organization.name: device.org.name -geo.city_name: location.city -geo.continent_name: location.continent -geo.country_iso_code: location.county -geo.location: location.coordinates -geo.postal_code: location.postal_code -geo.region_iso_code: location.region -# geo.continent_code -# geo.country_name -# geo.name -# geo.region_name -# geo.timezone -user.domain: user.domain -user.email: user.email_addr -user.full_name: user.full_name -user.id: user.uid -user.name: user.name -# user.roles -# user.hash: -referenced_fields: - process.group: - ref: group - prefix: process - process.hash: - ref: hash - prefix: process - process.parent: - ref: process # ECS entity used for attribute mapping - prefix: process # OCSF Prefix - target_entity: parent_process # Updated 
OCSF entity name - process.user: - ref: user - prefix: process - # process.code_signature: code_signature - # process.entry_leader: process - # process.entry_leader.parent: process - # process.entry_leader.parent.session_leader: process - # process.entry_meta.source: source - # process.group_leader: process - # process.macho: macho - # process.parent.group_leader: process - # process.pe: pe - # process.previous: process - # process.real_group: group - # process.real_user: user - # process.saved_group: group - # process.saved_user: user - # process.session_leader: process - # process.session_leader.parent: process - # process.session_leader.parent.session_leader: process - # process.supplemental_groups: group - file.hash: - ref: hash - prefix: null - file.x509: - ref: x509 - prefix: tls - # file.code_signature.* - # file.pe.* - client.as: - ref: as - prefix: null - client.geo: - ref: geo - prefix: src_endpoint - # client.user: - # ref: user - # prefix: src_endpoint - destination.as: - ref: as - prefix: null - destination.geo: - ref: geo - prefix: dst_endpoint - # destination.user: - # ref: user - # prefix: dst_endpoint - server.as: - ref: as - prefix: null - server.geo: - ref: geo - prefix: dst_endpoint - # server.user: - # ref: user - # prefix: dst_endpoint - source.as: - ref: as - prefix: null - source.geo: - ref: geo - prefix: src_endpoint - # source.user: - # ref: user - # prefix: src_endpoint +# https://schema.ocsf.io/1.1.0/objects/group +group: + domain: group.domain + name: group.name + uid: group.id + + +# https://schema.ocsf.io/1.1.0/objects/process +process: + cmd_line: process.command_line + name: process.name + pid: process.pid + uid: process.entity_id + file: + name: + native_field: process.executable + native_op: LIKE + native_value: endswith + ocsf_value: basename + path: process.executable + parent_folder: + native_field: process.executable + native_op: LIKE + native_value: startswith + ocsf_value: dirname + # This "hashes" notation comes from 
jmespath (filter projection) + # It's much easier to use the ECS notation in this case + hashes[?algorithm_id == 1]: + value: process.hash.md5 + hashes[?algorithm_id == 2]: + value: process.hash.sha1 + hashes[?algorithm_id == 3]: + value: process.hash.sha256 + hashes[?algorithm_id == 4]: + value: process.hash.sha512 + hashes[?algorithm_id == 5]: + value: process.hash.ssdeep + hashes[?algorithm_id == 6]: + value: process.hash.tlsh + hashes[*]: + value: + - process.hash.md5 + - process.hash.sha1 + - process.hash.sha256 + - process.hash.sha512 + - process.hash.ssdeep + - process.hash.tlsh + parent_process: + cmd_line: process.parent.command_line + name: process.parent.name + pid: process.parent.pid + uid: process.parent.entity_id + file: + name: + native_field: process.parent.executable + native_op: LIKE + native_value: endswith + ocsf_value: basename + path: process.parent.executable + parent_folder: + native_field: process.parent.executable + native_op: LIKE + native_value: startswith + ocsf_value: dirname + + +# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint +src_endpoint: + domain: + - client.domain + - source.domain + hostname: + - client.domain + - source.domain + ip: + - client.ip + - source.ip + mac: + - client.mac + - source.mac + +# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint +dst_endpoint: + domain: + - server.domain + - destination.domain + hostname: + - server.domain + - destination.domain + ip: + - server.ip + - destination.ip + mac: + - server.mac + - destination.mac + + +# https://schema.ocsf.io/1.1.0/objects/network_traffic +traffic: # should be `network_traffic`? 
+ bytes: network.bytes + bytes_in: + - destination.bytes + - server.bytes + bytes_out: + - client.bytes + - source.bytes + packets: network.packets + packets_in: + - destination.packets + - server.packets + packets_out: + - client.packets + - source.packets + + +# https://schema.ocsf.io/1.1.0/objects/network_connection_info +connection_info: + direction: network.direction #TODO: need transformer? + protocol_num: network.iana_number + protocol_name: network.transport + protocol_ver: network.type + protocol_ver_id: + native_field: network.type + native_value: ip_version_to_network_layer + ocsf_value: network_layer_to_ip_version + + +# https://schema.ocsf.io/1.1.0/objects/certificate +certificate: + expiration_time: x509.not_after + created_time: x509.not_before + serial_number: x509.serial_number + fingerprints[*]: + algorithm: x509.signature_algorithm + version: x509.version_number + issuer: x509.issuer.distinguished_name + subject: x509.subject.distinguished_name + #uid: + + +# https://schema.ocsf.io/1.1.0/objects/user +user: + full_name: user-account:display_name + name: user-account:account_login + type: user-account:account_type + uid: user-account:user_id diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml index f0ed912a..7082e6dd 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/stix.yaml @@ -1,210 +1,143 @@ -# All Categories [*] -autonomous-system:name: device.org.name -autonomous-system:number: device.org.uid - -# File System Activity [1001] -directory:path: file.path -directory:accessed: file.accessed_time -directory:created: file.created_time -directory:modified: file.modified_time - -# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], Email Activity [4009] -domain-name.value: - - src_endpoint.domain - - 
dst_endpoint.domain - - dns_query.hostname - -# Email Activity [4009] -email-addr:value: user.email_addr -email-addr:display_name: user.full_name -# email-message:is_multipart -# email-message:date -# email-message:content_type -email-message:from_ref.value: email.from -email-message:sender_ref.value: email.smtp_from -email-message:to_refs[*].value: email.to -email-message:cc_refs[*].value: email.cc -email-message:subject: email.subject -# email-message:received_lines -email-message:additional_header_fields: email.raw_header -# email-message:body -email-message:body_multipart.body_raw_ref.name: file.name -# email-message:raw_email_ref -# email-message:body_multipart.body: file.mime_type - -# File System Activity [1001], Network File Activity [4010], Email File Activity [4011] -file:accessed: file.accessed_time -file:created: file.created_time -file:name: file.name -file:size: file.size -file:hashes.SHA-256: file.hashes[?algorithm_id == 3].value -file:hashes.SHA-1: file.hashes[?algorithm_id == 2].value -file:hashes.MD5: file.hashes[?algorithm_id == 1].value -file:parent_directory_ref.path: file.parent_folder -# file:name_enc -# file:magic_number_hex -file:mime_type: file.mime_type -# file:is_encrypted -# file:encryption_algorithm -# file:decryption_key -# file:contains_refs -# file:content_ref - -# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007] -ipv4-addr:value: - - dst_endpoint.ip - - src_endpoint.ip - - device.ip -# ipv4-addr.belongs_to_refs -# ipv4-addr.resolves_to_refs - -# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007] -ipv6-addr:value: - - dst_endpoint.ip - - src_endpoint.ip - - device.ip - -# Network Activity [4001], HTTP Activity [4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007] -mac-addr:value: - - dst_endpoint.mac - - src_endpoint.mac - - device.mac - -# Network Activity [4001], HTTP Activity [4002], DNS Activity 
[4003] -network-traffic:dst_byte_count: traffic.bytes_in -network-traffic:dst_packets: traffic.packets_in -network-traffic:dst_port: dst_endpoint.port -network-traffic:dst_ref.value: dst_endpoint.ip -network-traffic:protocols[*]: - - connection_info.protocol_num - - connection_info.protocol_name - - connection_info.protocol_ver_id -network-traffic:src_byte_count: traffic.bytes_out -network-traffic:src_packets: traffic.packets_out -network-traffic:src_port: src_endpoint.port -network-traffic:src_ref.value: src_endpoint.ip -network_traffic:start: start_time -network_traffic:end: end_time -# network_traffic:is_active -# network_traffic:ipfix -# network_traffic:src_payload_ref -# network_traffic:dst_payload_ref -# network_traffic:encapsulates_refs -# network_traffic:encapsulated_by_ref - -# Process Activity [1007] -process:binary_ref.name: file.name -process:command_line: process.cmd_line -process:created: process.created_time -process:mime_type: mime_type -process:name: process.name -process:pid: process.pid -process:x_unique_id: process.uid -process:parent_ref.name: - - actor.process.name - - process.parent_process.name - -# Base Event [0] -software:extension.product.feature_name: metadata.product.feature.name -software:extension.product.feature_uid: metadata.product.feature.uid -software:extension.product.feature_version: metadata.product.feature.version -software:extension.product.path: metadata.product.path -software:extension.product.uid: metadata.product.uid -software:languages: metadata.product.lang -software:name: metadata.product.name -software:vendor: metadata.product.vendor_name -software:version: metadata.product.version - -# HTTP Activity [4002] -url:value: http_request.url - -# Account Change [3001], Authentication [3002], Authorize Session [3003], User Access Management [3005] -user-account:account_type: user.account.type -user-account:display_name: user.account.name -user-account:user_id: user.account.uid - -# Base Event [0] -x-ibm-finding:alert_id: - 
- observables.type_id - - finding.uid -x-ibm-finding:description: observables.value -x-ibm-finding:dst_ip_ref.value: dst_endpoint.ip -x-ibm-finding:end: end_time -x-ibm-finding:event_count: count -x-ibm-finding:finding_type: observables.type -x-ibm-finding:name: - - observables.name - - finding.title -x-ibm-finding:severity: severity_id -x-ibm-finding:src_ip_ref.value: src_endpoint.ip -x-ibm-finding:start: finding.created_time -x-ibm-finding:time_observed: finding.first_seen_time -x-ibm-finding:types: finding.types - -# All Categories [*] -x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.tactic_id: attacks[*].tactics.uid -x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.tactic_name: attacks[*].tactics.name -x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.technique_id: attacks[*].technique.uid -x-ibm-ttp-tagging:extensions.'mitre-attack-ext'.version: attacks[*].version -x-ibm-ttp-tagging:name: attacks[*].technique.name - -# All Categories [*] -x-oca-asset:name: - - dst_endpoint.name - - src_endpoint.name - - device.name -x-oca-asset:os_name: device.os.name -x-oca-asset:hostname: device.hostname -x-oca-asset:device_id: device.uid -x-oca-asset:ip_refs[*].value: device.network_interfaces[*].ip -x-oca-asset:mac_refs[*].value: device.network_interfaces[*].mac -x-oca-asset:os_ref: device.os -x-oca-asset:architecture: device.hw_info -x-oca-asset:host_type: device.type -x-oca-asset:ingress: device.network_interfaces -x-oca-asset:egress: device.network_interfaces -x-oca-asset:geo_ref: device.location - -# Base Event [0] -x-oca-event:action: - - activity - - activity_name -x-oca-event:category: category_name -x-oca-event:code: - - activity_id - - category_uid -x-oca-event:confidence: confidence -x-oca-event:created: time -x-oca-event:duration: duration -x-oca-event:module: class_name -x-oca-event:network_ref.dst_ref.value: dst_endpoint.ip -x-oca-event:network_ref.src_ref.value: src_endpoint.ip -x-oca-event:timezone: timezone_offset - -# Network Activity [4001], HTTP Activity 
[4002], DNS Activity [4003], FTP Activity [4008], SSH Activity [4007] -x509-certificate:hashes.SHA-256: file.hashes[?algorithm_id == 3].value -x509-certificate:hashes.SHA-1: file.hashes[?algorithm_id == 2].value -x509-certificate:hashes.MD5: file.hashes[?algorithm_id == 1].value -x509-certificate:version: tls.certificate.version -x509-certificate:serial_number: tls.certificate.serial_number -x509-certificate:issuer: tls.certificate.issuer -x509-certificate:validity_not_before: tls.certificate.created_time -x509-certificate:validity_not_after: tls.certificate.expiration_time -x509-certificate:subject: tls.certificate.subject -x509-certificate:x509_v3_extensions: tls.extension_list -x509-certificate:signature_algorithm: tls.certificate.fingerprints.algorithm - -# Registry Key Activity [201001] -windows-registry-key:key: win/registry_key.path - -# Additional mapping for STIX 2.1 -# File System Activity [1001] -directory:atime: file.accessed_time -directory:ctime: file.created_time -directory:mtime: file.modified_time -file:atime: file.accessed_time -file:ctime: file.created_time -file:mtime: file.modified_time - -# Process Activity [1007] -process:image_ref.name: file.name +# https://schema.ocsf.io/1.1.0/objects/file +file: + name: file:name + size: file:size + accessed_time: file:accessed + created_time: file:created + modified_time: file:modified + # This "hashes" notation comes from jmespath (filter projection) + # It's much easier to use the ECS notation in this case + hashes[?algorithm_id == 1]: + value: file:hashes.MD5 + hashes[?algorithm_id == 2]: + value: "file:hashes.'SHA-1'" + hashes[?algorithm_id == 3]: + value: "file:hashes.'SHA-256'" + hashes[?algorithm_id == 4]: + value: "file:hashes.'SHA-512'" + hashes[?algorithm_id == 5]: + value: file:hashes.SSDEEP + hashes[?algorithm_id == 6]: + value: file:hashes.TLSH + hashes[*]: + value: + - file:hashes.MD5 + - "file:hashes.'SHA-1'" + - "file:hashes.'SHA-256'" + - "file:hashes.'SHA-512'" + - file:hashes.SSDEEP + - 
file:hashes.TLSH + + +# https://schema.ocsf.io/1.1.0/objects/group +# group: +# domain: +# name: +# uid: + + +# https://schema.ocsf.io/1.1.0/objects/process +process: + cmd_line: process:command_line + name: process:name + pid: process:pid + uid: process:x_unique_id + file: + name: process:binary_ref.name + parent_folder: process:binary_ref.parent_directory_ref.path + # This "hashes" notation comes from jmespath (filter projection) + # It's much easier to use the ECS notation in this case + hashes[?algorithm_id == 1]: + value: process:binary_ref.hashes.MD5 + hashes[?algorithm_id == 2]: + value: process:binary_ref.hashes.'SHA-1' + hashes[?algorithm_id == 3]: + value: process:binary_ref.hashes.'SHA-256' + hashes[?algorithm_id == 4]: + value: process:binary_ref.hashes.'SHA-512' + hashes[?algorithm_id == 5]: + value: process:binary_ref.hashes.SSDEEP + hashes[?algorithm_id == 6]: + value: process:binary_ref.hashes.TLSH + hashes[*]: + value: + - process:binary_ref.hashes.MD5 + - process:binary_ref.hashes.'SHA-1' + - process:binary_ref.hashes.'SHA-256' + - process:binary_ref.hashes.'SHA-512' + - process:binary_ref.hashes.SSDEEP + - process:binary_ref.hashes.TLSH + parent_process: + cmd_line: process:parent_ref.command_line + name: process:parent_ref.name + pid: process:parent_ref.pid + uid: process:parent_ref.x_unique_id + file: + name: process:parent_ref.binary_ref.name + parent_folder: process:parent_ref.binary_ref.parent_directory_ref.path + + +# dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint +dst_endpoint: + ip: + - network-traffic:dst_ref.value + - ipv4-addr:value + port: network-traffic:dst_port + + +# src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint +src_endpoint: + ip: + - network-traffic:src_ref.value + - ipv4-addr:value + port: network-traffic:src_port + + +# https://schema.ocsf.io/1.1.0/objects/endpoint +endpoint: + ip: ipv4-addr:value + + +# https://schema.ocsf.io/1.1.0/objects/device +device: + ip: ipv4-addr:value + + +# 
https://schema.ocsf.io/1.1.0/objects/network_traffic +traffic: # should be `network_traffic`? + #TODO: bytes: sum of byte counts? + bytes_in: network-traffic:dst_byte_count + bytes_out: network-traffic:src_byte_count + #TODO: packets: sum of packet counts? + packets_in: network-traffic:dst_packets + packets_out: network-traffic:src_packets + + +# https://schema.ocsf.io/1.1.0/objects/network_connection_info +# connection_info: +# direction: +# protocol_num: +# protocol_name: +# protocol_ver: +# protocol_ver_id: + + +# https://schema.ocsf.io/1.1.0/objects/certificate +certificate: + expiration_time: x509-certificate:validity_not_after + created_time: x509-certificate:validity_not_before + serial_number: x509-certificate:serial_number + fingerprints[*]: + algorithm: x509-certificate:signature_algorithm + version: x509-certificate:version_number + issuer: x509-certificate:issuer + subject: x509-certificate:subject + #uid: + + +# https://schema.ocsf.io/1.1.0/objects/user +user: + full_name: user-account:display_name + name: user-account:account_login + type: user-account:account_type + uid: user-account:user_id diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py new file mode 100644 index 00000000..e4ce89ab --- /dev/null +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py @@ -0,0 +1,86 @@ +"""Kestrel Data Model Map value transformers""" + +from typing import Callable + + +# Dict of "registered" transformers +_transformers = {} + + +def transformer(func: Callable) -> Callable: + """A decorator for registering a transformer""" + _transformers[func.__name__] = func + return func + + +@transformer +def dirname(path: str) -> str: # TODO: rename to winpath_dirname? + """Get the directory part of `path`""" + path_dir, _, _ = path.rpartition("\\") + return path_dir + + +@transformer +def basename(path: str) -> str: # TODO: rename to winpath_dirname? 
+ """Get the filename part of `path`""" + _, _, path_file = path.rpartition("\\") + return path_file + + +@transformer +def startswith(value: str) -> str: # TODO: rename to winpath_startswith? + return f"{value}\\%" + + +@transformer +def endswith(value: str) -> str: # TODO: rename to winpath_endswith? + return f"%\\{value}" + + +@transformer +def to_int(value) -> int: + """Ensure `value` is an int""" + try: + return int(value) + except ValueError: + # Maybe it's a hexadecimal string? + return int(value, 16) + + +@transformer +def to_str(value) -> str: + """Ensure `value` is a str""" + return str(value) + + +@transformer +def ip_version_to_network_layer(value: int) -> str: + if value == 4: + return "ipv4" + elif value == 6: + return "ipv6" + elif value == 99: + return "other" + return "unknown" + + +@transformer +def network_layer_to_ip_version(val: str) -> int: + value = val.lower() + if value == "ipv4": + return 4 + elif value == "ipv6": + return 6 + elif value == "other": + return 99 + return 0 + + +def run_transformer(transformer_name: str, value): + """Run the registered transformer with name `transformer_name` on `value`""" + func = _transformers.get(transformer_name) + if func: + result = func(value) + else: + raise NameError(transformer_name) + return result diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py index 3e15b036..d06a5854 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py @@ -1,15 +1,21 @@ -from kestrel.exceptions import MappingParseError -from kestrel.utils import load_data_file, list_folder_files +import logging import os -from typeguard import typechecked from typing import ( Iterable, Union, ) + +from typeguard import typechecked import yaml +from kestrel.exceptions import MappingParseError +from kestrel.utils import load_data_file, list_folder_files + + +_logger = 
logging.getLogger(__name__) -# _entityname_mapping and _entityattr_mapping are dictionaries that contain + +# _entityname_mapping is dictionaries that contain # the info needed to translate: # a. queries between: # 1. STIX and OCSF @@ -18,25 +24,18 @@ # b. results between: # 1. ECS and OCSF _entityname_mapping = {} -_entityattr_mapping = {} @typechecked def load_standard_config(mapping_pkg: str): global _entityname_mapping - global entityattr_mapping - if len(_entityname_mapping) > 0 and len(_entityattr_mapping) > 0: + if len(_entityname_mapping) > 0: return entityname_mapping_files = list_folder_files( mapping_pkg, "entityname", suffix=".yaml" ) for f in entityname_mapping_files: parse_entityname_mapping_file(mapping_pkg, f.name) - entityattr_mapping_files = list_folder_files( - mapping_pkg, "entityattribute", suffix=".yaml" - ) - for f in entityattr_mapping_files: - parse_entityattr_mapping_file(mapping_pkg, f.name) @typechecked @@ -53,58 +52,11 @@ def parse_entityname_mapping_file(mapping_pkg: str, filename: str): mapping = yaml.safe_load(mapping_str) dst_dict.update(mapping) except Exception as ex: - raise MappingParseError() + raise MappingParseError() from ex src_dict[dst_lang] = dst_dict _entityname_mapping[src_lang] = src_dict -@typechecked -def expand_referenced_field(mapping: dict, key: str, value: dict) -> dict: - res = {} - ref = value.get("ref") - prefix = value.get("prefix") - target_entity = value.get("target_entity") - for k, v in mapping.items(): - if k.startswith(f"{ref}."): - k_no_ref = k[len(ref) + 1 :] - ref_key = ".".join([key, k_no_ref]) - if prefix is None: - ref_value = v - else: - prefix_tokens = prefix.split(".") - v_tokens = v.split(".") - if target_entity is not None: - v_tokens[0] = target_entity - ref_value = ".".join(prefix_tokens + v_tokens) - res[ref_key] = ref_value - return res - - -@typechecked -def parse_entityattr_mapping_file(mapping_pkg: str, filename: str): - global _entityattr_mapping - mapping_fpath = 
os.path.join("entityattribute", filename) - filename_no_ext, _ = filename.split(".") - src_lang = "stix" if filename_no_ext == "alias" else filename_no_ext - dst_lang = "ocsf" - src_dict = _entityattr_mapping.get(src_lang, {}) - dst_dict = src_dict.get(dst_lang, {}) - try: - mapping_str = load_data_file(mapping_pkg, mapping_fpath) - mapping = yaml.safe_load(mapping_str) - mapping_referenced_fields = mapping.pop("referenced_fields", {}) - expanded_refs = {} - for key, value in mapping_referenced_fields.items(): - expanded_ref = expand_referenced_field(mapping, key, value) - expanded_refs.update(expanded_ref) - mapping.update(expanded_refs) - dst_dict.update(mapping) - except Exception as ex: - raise MappingParseError() - src_dict[dst_lang] = dst_dict - _entityattr_mapping[src_lang] = src_dict - - def load_custom_config(): # ~/.config/kestrel/mapping/entity/*.yaml # ~/.config/kestrel/mapping/property/*.yaml @@ -120,53 +72,3 @@ def normalize_entity( .get(dst_lang, {}) .get(entityname, entityname) ) - - -@typechecked -def normalize_property( - entityattr: str, src_lang: str, dst_lang: str -) -> Union[str, Iterable[str]]: - return ( - _entityattr_mapping.get(src_lang, {}) - .get(dst_lang, {}) - .get(entityattr, entityattr) - ) - - -@typechecked -def from_ocsf_key_value_pair(from_ocsf_dict: dict, key: str, value: str): - values = from_ocsf_dict.get(key, []) - if value not in values: - values.append(value) - from_ocsf_dict[key] = values - - -@typechecked -def from_ocsf_dictionary(to_oscf_dict: dict) -> dict: - from_ocsf_dict = {} - for key, value in to_oscf_dict.items(): - if isinstance(value, list): - for val in value: - from_ocsf_key_value_pair(from_ocsf_dict, val, key) - else: - from_ocsf_key_value_pair(from_ocsf_dict, value, key) - return from_ocsf_dict - - -@typechecked -def generate_from_ocsf_dictionaries(source_schema_name: str) -> (dict, dict): - attr_map = _entityattr_mapping.get(source_schema_name, {}).get("ocsf", {}) - name_map = 
_entityname_mapping.get(source_schema_name, {}).get("ocsf", {}) - from_ocsf_names = from_ocsf_dictionary(name_map) - from_ocsf_attrs = from_ocsf_dictionary(attr_map) - return (from_ocsf_names, from_ocsf_attrs) - - -# if __name__ == "__main__": -# load_standard_config("kestrel.mapping") -# res = normalize_entity("ecs", "ocsf", "process") -# from_ocsf_names, from_ocsf_attrs = generate_from_ocsf_dictionaries("ecs") -# print("\n\n\n NAMES ") -# print(yaml.dump(from_ocsf_names)) -# print("\n\n\n ATTRIBUTES ") -# print(yaml.dump(from_ocsf_attrs)) diff --git a/packages-nextgen/kestrel_core/tests/test_mapping.py b/packages-nextgen/kestrel_core/tests/test_mapping.py index c0860c42..1f4a07df 100644 --- a/packages-nextgen/kestrel_core/tests/test_mapping.py +++ b/packages-nextgen/kestrel_core/tests/test_mapping.py @@ -8,11 +8,6 @@ def test_mapping_load_config(): assert "ocsf" in entity_name_map.get("stix", {}) assert "ecs" in entity_name_map assert "ocsf" in entity_name_map.get("ecs", {}) - entity_attr_map = mapping_utils._entityattr_mapping - assert "stix" in entity_attr_map - assert "ocsf" in entity_attr_map.get("stix", {}) - assert "ecs" in entity_attr_map - assert "ocsf" in entity_attr_map.get("ecs", {}) def test_mapping_entity_names(): @@ -22,57 +17,3 @@ def test_mapping_entity_names(): assert res == "i_dont_exist" res = mapping_utils.normalize_entity("network", "ecs", "ocsf") assert res == "network_activity" - - -def test_mapping_entity_attributes(): - res = mapping_utils.normalize_property("process.parent.executable", - "ecs", "ocsf") - assert res == "process.parent_process.file.path" - res = mapping_utils.normalize_property("process.hash.md5", "ecs", "ocsf") - assert res == "process.file.hashes[?algorithm_id == 1].value" - res = mapping_utils.normalize_property("process.group.id", "ecs", "ocsf") - assert res == "process.group.uid" - res = mapping_utils.normalize_property("processx.non.existent", - "ecs", "ocsf") - assert res == "processx.non.existent" - res = 
mapping_utils.normalize_property("file.hash.md5", "ecs", "ocsf") - assert res == "file.hashes[?algorithm_id == 1].value" - - -def test_from_ocsf_dicionaries(): - from_ocsf_names, from_ocsf_attrs = mapping_utils.generate_from_ocsf_dictionaries("ecs") - res = from_ocsf_names.get("process") - assert (len(res) == 1 and "process" in res) - res = from_ocsf_names.get("network_endpoint") - assert (len(res) == 4 and "client" in res and "destination" in res and - "server" in res and "source" in res) - res = from_ocsf_attrs.get("process.name") - assert (len(res) == 1 and "process.name" in res) - res = from_ocsf_attrs.get("process.cmd_line") - assert (len(res) == 1 and "process.command_line" in res) - res = from_ocsf_attrs.get("process.file.hashes[?algorithm_id == 1].value") - assert (len(res) == 1 and "process.hash.md5" in res) - res = from_ocsf_attrs.get("process.file.path") - assert (len(res) == 1 and "process.executable" in res) - res = from_ocsf_attrs.get("process.parent_process.file.path") - assert (len(res) == 1 and "process.parent.executable" in res) - res = from_ocsf_attrs.get("process.parent_process.tid") - assert (len(res) == 1 and "process.parent.thread.id" in res) - res = from_ocsf_attrs.get("src_endpoint.domain") - assert (len(res) == 2 and "client.domain" in res and - "source.domain" in res) - res = from_ocsf_attrs.get("src_endpoint.location.city") - assert (len(res) == 2 and "client.geo.city_name" in res and - "source.geo.city_name" in res) - res = from_ocsf_attrs.get("tls.certificate.created_time") - assert (len(res) == 1 and "file.x509.not_before" in res) - res = from_ocsf_attrs.get("tls.certificate.expiration_time") - assert (len(res) == 1 and "file.x509.not_after" in res) - res = from_ocsf_attrs.get("tls.certificate.fingerprints.algorithm") - assert (len(res) == 1 and "file.x509.signature_algorithm" in res) - res = from_ocsf_attrs.get("traffic.packets_in") - assert (len(res) == 2 and "destination.packets" in res and - "server.packets" in res) - res = 
from_ocsf_attrs.get("file.hashes[?algorithm_id == 4].value") - assert (len(res) == 2 and "hash.sha512" in res and - "file.hash.sha512" in res) diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py new file mode 100644 index 00000000..edfde272 --- /dev/null +++ b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py @@ -0,0 +1,175 @@ +import pytest + +from kestrel.mapping.data_model import ( + load_mapping, + reverse_mapping, + translate_comparison_to_native, + translate_comparison_to_ocsf, +) + + +# A "custom" mapping for an opensearch/elasticsearch datasource. +# This mapping works with data from Blue Team Village's 2023 DefCon CTF, for example. +WINLOGBEAT_MAPPING = { + "file": { + "path": "file.path", + "name": "file.name" + }, + "process": { + "cmd_line": "winlog.event_data.CommandLine", + "pid": { + "native_field": "process.pid", + "native_value": "to_str", + "ocsf_value": "to_int" + }, + "uid": "winlog.event_data.ProcessGuid", + "file": { + "path": "winlog.event_data.Image", + "name": [ + { + "native_field": "winlog.event_data.Image", + "native_op": "LIKE", + "native_value": "endswith", + "ocsf_value": "basename" + } + ], + "parent_folder": [ + { + "native_field": "winlog.event_data.Image", + "native_op": "LIKE", + "native_value": "startswith", + "ocsf_value": "dirname" + } + ] + }, + "parent_process": { + "cmd_line": "winlog.event_data.ParentCommandLine", + "pid": "winlog.event_data.ParentProcessId", + "uid": "winlog.event_data.ParentProcessGuid", + "file": { + "path": "winlog.event_data.ParentImage", + "name": [ + { + "native_field": "winlog.event_data.ParentImage", + "native_op": "LIKE", + "native_value": "endswith", + "ocsf_value": "basename" + } + ], + "parent_folder": [ + { + "native_field": "winlog.event_data.ParentImage", + "native_op": "LIKE", + "native_value": "startswith", + "ocsf_value": "dirname" + } + ] + } + } + }, + "dst_endpoint": { + "ip": 
"winlog.event_data.DestinationIp", + "port": "winlog.event_data.DestinationPort" + }, + "src_endpoint": { + "ip": "winlog.event_data.SourceIp", + "port": "winlog.event_data.SourcePort" + } +} + + +# Simplified subset of the standard mapping +STIX_MAPPING = { + "device": { + "ip": "ipv4-addr:value" + }, + "endpoint": { + "ip": "ipv4-addr:value" + }, +} + + +# This mapping is used in 2 places: +# - frontend comparison from ECS to OCSF +# - backend comparison from OCSF to ECS (datasource) +ECS_MAPPING = load_mapping("ecs") + + +def test_reverse_mapping_ipv4(): + reverse_map = reverse_mapping(STIX_MAPPING) + ipv4 = reverse_map["ipv4-addr:value"] + assert isinstance(ipv4, list) + assert set(ipv4) == {"device.ip", "endpoint.ip"} + + +def test_reverse_mapping_executable(): + reverse_map = reverse_mapping(ECS_MAPPING) + exe = reverse_map["process.executable"] + assert isinstance(exe, list) + assert "process.file.path" in exe + for item in exe: + if isinstance(item, dict): + assert "ocsf_field" in item + if item["ocsf_field"] == "process.file.name": + # Make sure all metadata from the mapping got reversed + assert item["native_value"] == "endswith" + assert item["native_op"] == "LIKE" + assert item["ocsf_value"] == "basename" + + + +@pytest.mark.parametrize( + "dmm, field, op, value, expected_result", + [ + (WINLOGBEAT_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe", + [("winlog.event_data.Image", "=", "C:\\TMP\\foo.exe")]), + (WINLOGBEAT_MAPPING, "process.file.name", "=", "foo.exe", + [("winlog.event_data.Image", "LIKE", "%\\foo.exe")]), + (ECS_MAPPING, "process.file.path", "=", "C:\\TMP\\foo.exe", + [("process.executable", "=", "C:\\TMP\\foo.exe")]), + (ECS_MAPPING, "process.file.name", "=", "foo.exe", + [("process.executable", "LIKE", "%\\foo.exe")]), + ], +) +def test_translate_comparison_to_native(dmm, field, op, value, expected_result): + assert translate_comparison_to_native(dmm, field, op, value) == expected_result + + +@pytest.mark.parametrize( + "dmm, field, 
op, value, expected_result", + [ + # (WINLOGBEAT_MAPPING, "winlog.event_data.Image", "=", "C:\\TMP\\foo.exe", #TODO: don't need this test case + # [ + # ("process.file.path", "=", "C:\\TMP\\foo.exe"), + # ("process.file.name", "=", "foo.exe"), + # ("process.file.parent_folder", "=", "C:\\TMP"), + # ]), + # (WINLOGBEAT_MAPPING, "winlog.event_data.Image", "LIKE", "%\\foo.exe", #TODO: don't need this test case + # [ + # ("process.file.path", "LIKE", "%\\foo.exe"), + # ("process.file.name", "LIKE", "foo.exe"), + # ("process.file.parent_folder", "LIKE", "%"), + # ]), + (ECS_MAPPING, "process.executable", "=", "C:\\TMP\\foo.exe", + [ + ("process.file.path", "=", "C:\\TMP\\foo.exe"), + ("process.file.name", "=", "foo.exe"), + ("process.file.parent_folder", "=", "C:\\TMP"), + ]), + (ECS_MAPPING, "process.executable", "LIKE", "%\\foo.exe", + [ + ("process.file.path", "LIKE", "%\\foo.exe"), + ("process.file.name", "LIKE", "foo.exe"), #TODO: could optimize this to "=" + ("process.file.parent_folder", "LIKE", "%"), #TODO: could eliminate this? + ]), + (STIX_MAPPING, "ipv4-addr:value", "=", "198.51.100.13", + [ + ("device.ip", "=", "198.51.100.13"), + ("endpoint.ip", "=", "198.51.100.13"), + ]), + ], +) +def test_translate_comparison_to_ocsf(dmm, field, op, value, expected_result): + """Test the translate function.""" + reverse_dmm = reverse_mapping(dmm) # Make the dmms fixtures? 
+ assert set(translate_comparison_to_ocsf(reverse_dmm, field, op, value)) == set(expected_result) diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py new file mode 100644 index 00000000..b8461b11 --- /dev/null +++ b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py @@ -0,0 +1,23 @@ +import pytest + +from kestrel.mapping.transformers import ( + run_transformer, +) + + +@pytest.mark.parametrize( + "transform, value, expected", [ + ("dirname", r"C:\Windows\System32\cmd.exe", r"C:\Windows\System32"), + ("basename", r"C:\Windows\System32\cmd.exe", r"cmd.exe"), + ("startswith", r"C:\Windows\System32", r"C:\Windows\System32\%"), + ("endswith", "cmd.exe", r"%\cmd.exe"), + ("to_int", 1234, 1234), + ("to_int", 1234.1234, 1234), # Maybe this should fail? + ("to_int", "1234", 1234), + ("to_int", "0x4d2", 1234), + ("to_str", "1234", "1234"), + ("to_str", 1234, "1234"), + ] +) +def test_run_transformer(transform, value, expected): + assert run_transformer(transform, value) == expected diff --git a/packages-nextgen/kestrel_core/tests/test_parser.py b/packages-nextgen/kestrel_core/tests/test_parser.py index 3e7310d5..1ca5d314 100644 --- a/packages-nextgen/kestrel_core/tests/test_parser.py +++ b/packages-nextgen/kestrel_core/tests/test_parser.py @@ -111,10 +111,10 @@ def test_parser_mapping_single_comparison_to_multiple_values(): stmt = "x = GET ipv4-addr FROM if://ds WHERE value = '192.168.22.3'" parse_filter = get_parsed_filter_exp(stmt) comps = parse_filter.comps - assert isinstance(comps, list) and len(comps) == 3 + assert isinstance(comps, list) and len(comps) == 4 fields = [x.field for x in comps] assert ("dst_endpoint.ip" in fields and "src_endpoint.ip" in fields and - "device.ip" in fields) + "device.ip" in fields and "endpoint.ip" in fields) def test_parser_mapping_multiple_comparison_to_multiple_values(): @@ -124,12 +124,9 @@ def 
test_parser_mapping_multiple_comparison_to_multiple_values(): field1 = parse_filter.lhs.field assert field1 == 'file.name' field2 = parse_filter.rhs.lhs.field - assert field2 == 'process.name' - comps3 = parse_filter.rhs.rhs.comps - assert isinstance(comps3, list) and len(comps3) == 2 - fields3 = [x.field for x in comps3] - assert ("actor.process.name" in fields3 and - "process.parent_process.name" in fields3) + assert field2 == 'name' # 'process.name' + field3 = parse_filter.rhs.rhs.field + assert field3 == "parent_process.name" def test_parser_new_json(): diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py index c7294bac..bb1d79ed 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py @@ -9,10 +9,7 @@ CONFIG_DIR_DEFAULT, load_user_config, ) -from kestrel.mapping.utils import ( - generate_from_ocsf_dictionaries, - load_standard_config, -) +from kestrel.mapping.data_model import load_mapping PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "opensearch.yaml" @@ -42,22 +39,16 @@ class Index(DataClassJSONMixin): connection: str timestamp: str timestamp_format: str - data_model_mapping: Optional[str] = None + data_model_mapping: Optional[str] = None # Filename for mapping data_model_map: Mapping = field(default_factory=dict) def __post_init__(self): if self.data_model_mapping: with open(self.data_model_mapping, "r") as fp: - data_model_map = yaml.safe_load(fp) - # Reverse it so it's ocsf -> native - self.data_model_map = { - v: k for k, v in data_model_map.items() if isinstance(v, str) - } + self.data_model_map = yaml.safe_load(fp) else: # Default to the built-in ECS mapping - load_standard_config("kestrel.mapping") - _, data_model_map = generate_from_ocsf_dictionaries("ecs") - 
self.data_model_map = {k: v[0] for k, v in data_model_map.items()} + self.data_model_map = load_mapping("ecs") @dataclass diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py index de68d270..481ecc2b 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py @@ -112,7 +112,8 @@ def evaluate_graph( for instruction in instructions_to_evaluate: translator = self._evaluate_instruction_in_graph(graph, instruction) # TODO: may catch error in case evaluation starts from incomplete SQL - _logger.debug("SQL query generated: %s", translator.result()) + sql = translator.result() + _logger.debug("SQL query generated: %s", sql) ds = self.config.indexes[translator.table] # table == datasource conn = self.config.connections[ds.connection] client = OpenSearch( @@ -120,7 +121,9 @@ def evaluate_graph( http_auth=(conn.auth.username, conn.auth.password), verify_certs=conn.verify_certs, ) - mapping[instruction.id] = read_sql(translator.result(), client) + mapping[instruction.id] = read_sql( + sql, client + ) # TODO: results data mapping! Need to create any columns that don't already exist. 
client.close() return mapping diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 8d04e7f5..817a9f88 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -26,6 +26,11 @@ Sort, SortDirection, ) +from kestrel.mapping.data_model import ( + flatten_mapping, + translate_comparison_to_native, + reverse_mapping, +) _logger = logging.getLogger(__name__) @@ -68,6 +73,16 @@ def _or(lhs: str, rhs: Value) -> str: } +def _format_value(value): + if isinstance(value, str): + # Need to quote string values + value = f"'{value}'" + elif isinstance(value, list): + # SQL uses parens for lists + value = tuple(value) + return value + + @typechecked class OpenSearchTranslator: def __init__( @@ -102,23 +117,21 @@ def __init__( @typechecked def _render_comp(self, comp: FComparison) -> str: - if isinstance(comp, StrComparison): - # Need to quote string values - value = f"'{comp.value}'" - elif isinstance(comp, ListComparison): - # SQL uses parens for lists - value = tuple(comp.value) - else: - value = comp.value - # Need to map OCSF filter field to native - prefix = f"{self.entity}." if self.entity else "" + prefix = ( + f"{self.entity}." 
if (self.entity and comp.field != self.timestamp) else "" + ) ocsf_field = f"{prefix}{comp.field}" - field = self.from_ocsf_map.get(ocsf_field, comp.field) - _logger.debug("Mapped field '%s' to '%s'", ocsf_field, field) + comps = translate_comparison_to_native( + self.from_ocsf_map, ocsf_field, comp.op, comp.value + ) try: - result = f"{field} {comp2func[comp.op]} {value}" + comps = [f"{f} {comp2func[o]} {_format_value(v)}" for f, o, v in comps] + conj = " OR ".join(comps) + result = conj if len(comps) == 1 else f"({conj})" except KeyError: - raise UnsupportedOperatorError(comp.op.value) + raise UnsupportedOperatorError( + comp.op.value + ) # FIXME: need to report the mapped op, not the original return result @typechecked @@ -177,24 +190,49 @@ def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: # Just save projection and compile it later self.project = proj - def _get_ocsf_cols(self): - prefix = f"{self.entity}." if self.entity else "" - if not self.project: - ocsf_cols = [k for k in self.from_ocsf_map.keys() if k.startswith(prefix)] - else: - ocsf_cols = [f"{prefix}{col}" for col in self.project.attrs] - _logger.debug("OCSF fields: %s", ocsf_cols) - return ocsf_cols + def _get_fields(self) -> dict: # TODO: rename + # prefix = f"{self.entity}." if self.entity else "" + entity_map = ( + self.from_ocsf_map[self.entity] if self.entity else self.from_ocsf_map + ) + flat_map = flatten_mapping(reverse_mapping(entity_map)) + fields = {} + for k, v in flat_map.items(): + # FIXME: ProjectAttrs in compile.py aren't mapped to OCSF, so if you use STIX it doesn't work at all + # Check for 1:N mappings + if isinstance(v, list): + one_to_ones = [i for i in v if isinstance(i, str)] + if len(one_to_ones) == 0: + _logger.warning("No suitable mapping for %s", k) + continue # FIXME: we need to do something here + if len(one_to_ones) > 1: + _logger.warning("Ambiguous mapping for %s", k) + v = one_to_ones[0] # TODO: how else can we choose? 
+ elif isinstance(v, str): + pass # Nothing to do? + if self.project and not ( + v in self.project.attrs or k in self.project.attrs + ): # FIXME: v might be dict!!! + # It's not in the projection, so skip it + _logger.debug("skipping %s -> %s since it's not in projection", k, v) + continue + fields[k] = v + + if not fields: + # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? + fields = {attr: attr for attr in self.project.attrs} + + _logger.debug("OCSF fields: %s", fields) + return fields def _render_proj(self): - fields = { - self.from_ocsf_map.get(col, col): col for col in self._get_ocsf_cols() - } - _logger.debug("Fields: %s", fields) - proj = [ - f"`{k}` AS `{v.partition('.')[2]}`" if "." in v else v - for k, v in fields.items() - ] + """Get a list of native cols to project with their OCSF equivalents as SQL aliases""" + # input is either (flat) OCSF, ECS, or STIX fields and we need to create (native, OCSF) alias mapping + # - this may be a common capability? Need a func to produce native:ocsf dict + # - how to handle collisions? + # Need access to schema to prune to fields that are actually available? 
+ fields = self._get_fields() + proj = [f"`{k}` AS `{v}`" if k != v else k for k, v in fields.items()] _logger.debug("Set projection to %s", proj) return proj diff --git a/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py index c213a963..67ac6f8c 100644 --- a/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py @@ -33,12 +33,28 @@ TIMEFMT = '%Y-%m-%dT%H:%M:%S.%fZ' +# A much-simplified test mapping data_model_map = { - "process.cmd_line": "CommandLine", - "process.file.path": "Image", - "process.pid": "ProcessId", - "actor.process.pid": "ParentProcessId", + "process": { + "cmd_line": "CommandLine", + "file": { + "path": "Image", + # "name": [ + # { + # "native_field": "Image", + # "native_value": "basename", + # "ocsf_op": "LIKE", + # "ocsf_value": "endswith" + # } + # ] + }, + "pid": "ProcessId", + "parent_process": { + "pid": "ParentProcessId", + }, + }, } + schema = { "CommandLine": "text", "Image": "text", @@ -86,9 +102,11 @@ def _remove_nl(s): ] ) def test_opensearch_translator(iseq, sql): - cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`' - if ProjectEntity not in {type(i) for i in iseq}: - cols += ', `ParentProcessId` AS `process.pid`' + cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`, `ParentProcessId` AS `parent_process.pid`' + if ProjectEntity in {type(i) for i in iseq}: + cols = '`CommandLine` AS `cmd_line`, `Image` AS `file.path`, `ProcessId` AS `pid`, `ParentProcessId` AS `parent_process.pid`' + else: + cols = '`CommandLine` AS `process.cmd_line`, `Image` AS `process.file.path`, `ProcessId` AS `process.pid`, `ParentProcessId` AS `process.parent_process.pid`' trans = OpenSearchTranslator(TIMEFMT, "timestamp", "my_table", data_model_map, schema) for i in iseq: trans.add_instruction(i) From 
334d0ecb71d859f22c14487f38ddebcb0fbd6298 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Fri, 15 Mar 2024 13:54:30 -0400 Subject: [PATCH 26/61] data model map cleanup --- .../src/kestrel/mapping/data_model.py | 24 ++++--------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py index c7ee086c..f15fba67 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -106,16 +106,11 @@ def get_simple_property_mapping( return attribute_map -def _map_op(op: str, mapped_op: str) -> str: - # TODO: does native_value matter? - return mapped_op if mapped_op else op - - def _get_map_triple(d: dict, prefix: str, op: str, value) -> tuple: - mapped_op = d.get(f"{prefix}_op") # to_native_op? - transform = d.get(f"{prefix}_value") # to_native_value? + mapped_op = d.get(f"{prefix}_op") + transform = d.get(f"{prefix}_value") new_value = run_transformer(transform, value) - new_op = _map_op(op, mapped_op) + new_op = mapped_op if mapped_op else op return (d[f"{prefix}_field"], new_op, new_value) @@ -161,18 +156,14 @@ def translate_comparison_to_native( tmp = dmm for part in parts: if isinstance(tmp, dict): - tmp = tmp.get(part, {}) # tmp[part] + tmp = tmp.get(part, {}) else: break if tmp: if isinstance(tmp, list): for i in tmp: if isinstance(i, dict): - mapped_op = i.get("native_op") # to_native_op? - transform = i.get("native_value") # to_native_value? 
- new_value = run_transformer(transform, value) - new_op = _map_op(op, mapped_op) - result.append((i["native_field"], new_op, new_value)) + result.append(_get_map_triple(i, "native", op, value)) else: result.append((i, op, value)) elif isinstance(tmp, dict): @@ -224,11 +215,6 @@ def translate_comparison_to_ocsf( elif isinstance(mapping, list): for i in mapping: if isinstance(i, dict): - # mapped_op = i.get("ocsf_op") # to_ocsf_op? - # transform = i.get("ocst_value") # to_ocsf_value? - # new_value = run_transformer(transform, value) if transform else value - # new_op = _map_op(op, mapped_op) - # result.append((i["ocsf_field"], new_op, new_value)) result.append(_get_map_triple(i, "ocsf", op, value)) else: result.append((i, op, value)) From d07ea862ed574239e8e842ff9049408eb0920998 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 19 Mar 2024 16:24:45 -0400 Subject: [PATCH 27/61] data_model: remove flatten_mapping, as it's not needed --- .../src/kestrel/mapping/data_model.py | 26 ------------------- .../src/kestrel_interface_opensearch/ossql.py | 3 +-- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py index f15fba67..feb9cd45 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -221,32 +221,6 @@ def translate_comparison_to_ocsf( return result -def flatten_mapping(dmm: dict, key: str = None) -> dict: - """Flatten the nested `dmm` so that the keys are dotted "paths", optionally starting at `key` - - Parameters: - dmm: A dictionary that maps fields from one data model to another. 
- key: starting point in `dmm` (optional) - - Returns: - A dict with dotted path keys - """ - root = dmm[key] if key else dmm - return _flatten(root) - - -def _flatten(root: dict, result: dict = None, prefix: str = "") -> dict: - if result is None: - result = {} - for k, v in root.items(): - key = prefix + "." + k if prefix else k - if isinstance(v, dict): - _flatten(v, result, key) - else: - result[key] = v - return result - - @typechecked def load_mapping( data_model_name: str, diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 817a9f88..6eca27d8 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -27,7 +27,6 @@ SortDirection, ) from kestrel.mapping.data_model import ( - flatten_mapping, translate_comparison_to_native, reverse_mapping, ) @@ -195,7 +194,7 @@ def _get_fields(self) -> dict: # TODO: rename entity_map = ( self.from_ocsf_map[self.entity] if self.entity else self.from_ocsf_map ) - flat_map = flatten_mapping(reverse_mapping(entity_map)) + flat_map = reverse_mapping(entity_map) fields = {} for k, v in flat_map.items(): # FIXME: ProjectAttrs in compile.py aren't mapped to OCSF, so if you use STIX it doesn't work at all From eea4c1d84426e25b312a7787941a0fe0c29d4bf2 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 19 Mar 2024 16:38:03 -0400 Subject: [PATCH 28/61] ignore unconfigured interfaces --- .../kestrel_core/src/kestrel/exceptions.py | 2 +- .../kestrel_core/src/kestrel/interface/manager.py | 10 +++++++--- .../src/kestrel_interface_opensearch/config.py | 6 +++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py index a0c94a07..cd088afe 
100644 --- a/packages-nextgen/kestrel_core/src/kestrel/exceptions.py +++ b/packages-nextgen/kestrel_core/src/kestrel/exceptions.py @@ -98,7 +98,7 @@ class IRGraphMissingNode(KestrelError): pass -class InterfaceNotFound(KestrelError): +class InterfaceNotConfigured(KestrelError): pass diff --git a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py index 3e155ded..b5fd0904 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py +++ b/packages-nextgen/kestrel_core/src/kestrel/interface/manager.py @@ -10,6 +10,7 @@ from typing import Mapping, Iterable, Type from kestrel.exceptions import ( + InterfaceNotConfigured, InterfaceNotFound, InvalidInterfaceImplementation, ConflictingInterfaceScheme, @@ -28,9 +29,12 @@ def __init__(self, init_interfaces: Iterable[AbstractInterface] = []): interface_classes = _load_interface_classes() self.interfaces = list(init_interfaces) # copy/recreate the list for iface_cls in interface_classes: - iface = iface_cls() - _logger.debug(f"Initialize interface {iface_cls.__name__}") - self.interfaces.append(iface) + try: + iface = iface_cls() + _logger.debug(f"Initialize interface {iface_cls.__name__}") + self.interfaces.append(iface) + except InterfaceNotConfigured as e: + _logger.debug(f"Interface {iface_cls.__name__} not configured; ignored") def __getitem__(self, scheme: str) -> AbstractInterface: for interface in self.interfaces: diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py index bb1d79ed..16eddf92 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py @@ -9,6 +9,7 @@ CONFIG_DIR_DEFAULT, load_user_config, ) +from kestrel.exceptions import InterfaceNotConfigured 
from kestrel.mapping.data_model import load_mapping @@ -62,4 +63,7 @@ def __post_init__(self): def load_config(): - return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + try: + return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + except TypeError: + raise InterfaceNotConfigured() From a98dfa3562a870e5e6ae5985b70bc6a98472a58a Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 19 Mar 2024 16:44:43 -0400 Subject: [PATCH 29/61] remove commented out test cases --- .../kestrel_core/tests/test_mapping_data_model.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py index edfde272..c74d76e4 100644 --- a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py +++ b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py @@ -138,18 +138,6 @@ def test_translate_comparison_to_native(dmm, field, op, value, expected_result): @pytest.mark.parametrize( "dmm, field, op, value, expected_result", [ - # (WINLOGBEAT_MAPPING, "winlog.event_data.Image", "=", "C:\\TMP\\foo.exe", #TODO: don't need this test case - # [ - # ("process.file.path", "=", "C:\\TMP\\foo.exe"), - # ("process.file.name", "=", "foo.exe"), - # ("process.file.parent_folder", "=", "C:\\TMP"), - # ]), - # (WINLOGBEAT_MAPPING, "winlog.event_data.Image", "LIKE", "%\\foo.exe", #TODO: don't need this test case - # [ - # ("process.file.path", "LIKE", "%\\foo.exe"), - # ("process.file.name", "LIKE", "foo.exe"), - # ("process.file.parent_folder", "LIKE", "%"), - # ]), (ECS_MAPPING, "process.executable", "=", "C:\\TMP\\foo.exe", [ ("process.file.path", "=", "C:\\TMP\\foo.exe"), From 2b4c817af3dfd04d2b13cb346a4c0fdafcd712fd Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 19 Mar 2024 17:03:02 -0400 Subject: [PATCH 30/61] data_model: remove unused get_simple_property_mapping --- .../src/kestrel/frontend/compile.py | 2 
+- .../src/kestrel/mapping/data_model.py | 32 ------------------- 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py index cb1f897f..cf15abb5 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py @@ -117,7 +117,7 @@ def _map_filter_exp( prefix = f"{entity_name}." if field.startswith(prefix): # Need to prune the entity name - field = field[len(prefix) :] + field = field[len(prefix):] filter_exp.field = field filter_exp.op = mapping[1] filter_exp.value = mapping[2] diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py index feb9cd45..05ad76b4 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -74,38 +74,6 @@ def reverse_mapping(obj: dict, prefix: str = None, result: dict = None) -> dict: return result -def get_simple_property_mapping( - obj: dict, prefix: str = None -) -> dict: # TODO: remove? 
Not really needed - """Parse the data model map `obj` and return a simple "native -> OCSF" attribute name map""" - attribute_map = {} - for k, v in obj.items(): - k = ".".join((prefix, k)) if prefix else k - # Recurse if necessary - if isinstance(v, str): - _add_attr(attribute_map, v, k) - elif isinstance(v, list): - for i in v: - if isinstance(i, str): - _add_attr(attribute_map, i, k) - else: - native_field = i.get("native_field") - if native_field: - _add_attr(attribute_map, native_field, k) - else: - # Need to "deep" merge with current results - attribute_map.update(get_simple_property_mapping(i, k)) - elif isinstance(v, dict): - # First determine if this is a complex mapping or just another level - native_field = v.get("native_field") - if native_field: - _add_attr(attribute_map, native_field, k) - else: - # Need to "deep" merge with current results - attribute_map.update(get_simple_property_mapping(v, k)) - return attribute_map - - def _get_map_triple(d: dict, prefix: str, op: str, value) -> tuple: mapped_op = d.get(f"{prefix}_op") transform = d.get(f"{prefix}_value") From e67e00e5267b646bc3805beb95fac141e79312fc Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Wed, 20 Mar 2024 16:12:13 -0400 Subject: [PATCH 31/61] basic results translation --- .../src/kestrel/mapping/transformers.py | 12 ++++ .../tests/test_mapping_transformers.py | 8 +++ .../pyproject.toml | 1 + .../kestrel_interface_opensearch/interface.py | 34 ++++++++-- .../src/kestrel_interface_opensearch/ossql.py | 65 ++++++++++++------- 5 files changed, 89 insertions(+), 31 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py index e4ce89ab..3d0a0885 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py @@ -2,6 +2,8 @@ from typing import Callable +from pandas import Series + # Dict of "registered" 
transformers _transformers = {} @@ -84,3 +86,13 @@ def run_transformer(transformer_name: str, value): else: raise NameError(transformer_name) return result + + +def run_transformer_on_series(transformer_name: str, value: Series): + """Run the registered transformer with name `transformer_name` on `value`""" + func = _transformers.get(transformer_name) + if func: + result = value.apply(func) + else: + raise NameError(transformer_name) + return result diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py index b8461b11..90db4d28 100644 --- a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py +++ b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py @@ -1,7 +1,9 @@ +import pandas as pd import pytest from kestrel.mapping.transformers import ( run_transformer, + run_transformer_on_series, ) @@ -21,3 +23,9 @@ ) def test_run_transformer(transform, value, expected): assert run_transformer(transform, value) == expected + + +def test_run_series_basename(): + data = pd.Series([r"C:\Windows\System32\cmd.exe", r"C:\TMP"]) + result = list(run_transformer_on_series("basename", data)) + assert result == ["cmd.exe", "TMP"] diff --git a/packages-nextgen/kestrel_interface_opensearch/pyproject.toml b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml index 6270f6d0..ad815d8b 100644 --- a/packages-nextgen/kestrel_interface_opensearch/pyproject.toml +++ b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ ] dependencies = [ + "dpath>=2.1.6", "kestrel_core>=2.0.0", "opensearch-py>=2.4.2", ] diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py index 481ecc2b..8b7de865 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py +++ 
b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py @@ -2,13 +2,15 @@ from typing import Iterable, Mapping, Optional from uuid import UUID +import dpath +import numpy as np from opensearchpy import OpenSearch from pandas import DataFrame, Series, concat +from kestrel.display import GraphletExplanation from kestrel.exceptions import DataSourceError from kestrel.interface import AbstractInterface from kestrel.ir.graph import IRGraphEvaluable -from kestrel.display import GraphletExplanation from kestrel.ir.instructions import ( DataSource, Instruction, @@ -19,6 +21,7 @@ TransformingInstruction, SolePredecessorTransformingInstruction, ) +from kestrel.mapping.transformers import run_transformer_on_series from kestrel_interface_opensearch.config import load_config from kestrel_interface_opensearch.ossql import OpenSearchTranslator @@ -33,11 +36,25 @@ def _jdbc2df(schema: dict, datarows: dict) -> DataFrame: return DataFrame(datarows, columns=columns) -def read_sql(sql: str, conn: OpenSearch) -> DataFrame: +def _translate_df(df: DataFrame, dmm: dict) -> DataFrame: + # Translate results into Kestrel OCSF data model + # The column names of df are already mapped + df = df.replace({np.nan: None}) + for col in df.columns: + mapping = dpath.get(dmm, col, separator=".") + if isinstance(mapping, dict): + transformer_name = mapping.get("ocsf_value") + df[col] = run_transformer_on_series(transformer_name, df[col]) + + return df + + +def read_sql(sql: str, conn: OpenSearch, dmm: Optional[dict] = None) -> DataFrame: """Execute `sql` and return the results as a DataFrame, a la pandas.read_sql""" # https://opensearch.org/docs/latest/search-plugins/sql/sql-ppl-api/#query-api body = { - "fetch_size": 10000, # Should we make this configurable? + # Temporarily comment out fetch_size due to https://github.com/opensearch-project/sql/issues/2579 + # FIXME: "fetch_size": 10000, # Should we make this configurable? 
"query": sql, } query_resp = conn.http.post("/_plugins/_sql?format=jdbc", body=body) @@ -57,7 +74,12 @@ def read_sql(sql: str, conn: OpenSearch) -> DataFrame: dfs = [] done = False while not done: - dfs.append(_jdbc2df(schema, query_resp["datarows"])) + df = _jdbc2df(schema, query_resp["datarows"]) + if dmm is not None: + # Need to use Data Model Map to do results translation + dfs.append(_translate_df(df, dmm)) + else: + dfs.append(df) cursor = query_resp.get("cursor") if not cursor: break @@ -122,8 +144,8 @@ def evaluate_graph( verify_certs=conn.verify_certs, ) mapping[instruction.id] = read_sql( - sql, client - ) # TODO: results data mapping! Need to create any columns that don't already exist. + sql, client, translator.from_ocsf_map[translator.entity] + ) client.close() return mapping diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 6eca27d8..fef36436 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -2,6 +2,7 @@ from functools import reduce from typing import Optional, Union +import dpath from typeguard import typechecked from kestrel.exceptions import UnsupportedOperatorError @@ -189,49 +190,63 @@ def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: # Just save projection and compile it later self.project = proj - def _get_fields(self) -> dict: # TODO: rename - # prefix = f"{self.entity}." if self.entity else "" + def _get_fields(self) -> list: + """This method produces a subset of that Data Model Map. That subset + is used to generate the SQL projection (SELECT columns), with + each native column aliased to its relative OCSF name. + + """ + prefix = f"{self.entity}." 
if self.entity else "" entity_map = ( self.from_ocsf_map[self.entity] if self.entity else self.from_ocsf_map ) + # Use the reverse map since we want native->ocsf. We could + # instead flatten the normal map, but reversed is naturally + # flat. flat_map = reverse_mapping(entity_map) - fields = {} + fields = [] # Collect the needed field mappings for k, v in flat_map.items(): # FIXME: ProjectAttrs in compile.py aren't mapped to OCSF, so if you use STIX it doesn't work at all # Check for 1:N mappings if isinstance(v, list): - one_to_ones = [i for i in v if isinstance(i, str)] - if len(one_to_ones) == 0: - _logger.warning("No suitable mapping for %s", k) - continue # FIXME: we need to do something here - if len(one_to_ones) > 1: - _logger.warning("Ambiguous mapping for %s", k) - v = one_to_ones[0] # TODO: how else can we choose? + for i in v: + if isinstance(i, str): + fields.append((k, i)) + elif isinstance(i, dict): + fields.append((k, i["ocsf_field"])) + else: + _logger.debug("Unhandled mapping: %s", i) + elif isinstance(v, dict): + fields.append((k, v["ocsf_field"])) elif isinstance(v, str): - pass # Nothing to do? - if self.project and not ( - v in self.project.attrs or k in self.project.attrs - ): # FIXME: v might be dict!!! - # It's not in the projection, so skip it - _logger.debug("skipping %s -> %s since it's not in projection", k, v) - continue - fields[k] = v + fields.append((k, v)) if not fields: # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? - fields = {attr: attr for attr in self.project.attrs} + fields = [(attr, attr) for attr in self.project.attrs] - _logger.debug("OCSF fields: %s", fields) + _logger.debug("Field mappings: %s", fields) return fields def _render_proj(self): """Get a list of native cols to project with their OCSF equivalents as SQL aliases""" - # input is either (flat) OCSF, ECS, or STIX fields and we need to create (native, OCSF) alias mapping - # - this may be a common capability? 
Need a func to produce native:ocsf dict - # - how to handle collisions? - # Need access to schema to prune to fields that are actually available? fields = self._get_fields() - proj = [f"`{k}` AS `{v}`" if k != v else k for k, v in fields.items()] + name_pairs = [] + for pair in fields: + native_field, tmp = pair + ocsf_field = tmp["ocsf_field"] if isinstance(tmp, dict) else tmp + if self.project and not ( + ocsf_field in self.project.attrs or native_field in self.project.attrs + ): + # It's not in the projection, so skip it + _logger.debug( + "skipping %s -> %s since it's not in projection", + native_field, + ocsf_field, + ) + continue + name_pairs.append((native_field, ocsf_field)) + proj = [f"`{k}` AS `{v}`" if k != v else k for k, v in name_pairs] _logger.debug("Set projection to %s", proj) return proj From a2bd20976594fb80eaa5e993a1ad27db7f50f0f3 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Wed, 20 Mar 2024 17:08:29 -0400 Subject: [PATCH 32/61] opensearch: fix pass-through for projection attrs --- .../src/kestrel_interface_opensearch/ossql.py | 7 +++---- .../kestrel_interface_opensearch/tests/test_ossql.py | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index fef36436..45d0428c 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -221,10 +221,6 @@ def _get_fields(self) -> list: elif isinstance(v, str): fields.append((k, v)) - if not fields: - # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? 
- fields = [(attr, attr) for attr in self.project.attrs] - _logger.debug("Field mappings: %s", fields) return fields @@ -247,6 +243,9 @@ def _render_proj(self): continue name_pairs.append((native_field, ocsf_field)) proj = [f"`{k}` AS `{v}`" if k != v else k for k, v in name_pairs] + if not proj: + # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? + proj = [f"`{attr}`" for attr in self.project.attrs] _logger.debug("Set projection to %s", proj) return proj diff --git a/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py index 67ac6f8c..838b57e2 100644 --- a/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/tests/test_ossql.py @@ -84,10 +84,10 @@ def _remove_nl(s): "SELECT {} FROM my_table WHERE foo >= 0 AND timestamp >= '2023-12-06T08:17:00.000000Z' AND timestamp < '2023-12-07T08:17:00.000000Z'"), # Add a limit and projection ([Limit(3), ProjectAttrs(['foo', 'bar', 'baz']), Filter(StrComparison('foo', StrCompOp.EQ, 'abc'))], - "SELECT foo, bar, baz FROM my_table WHERE foo = 'abc' LIMIT 3"), + "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"), # Same as above but reverse order ([Filter(StrComparison('foo', StrCompOp.EQ, 'abc')), ProjectAttrs(['foo', 'bar', 'baz']), Limit(3)], - "SELECT foo, bar, baz FROM my_table WHERE foo = 'abc' LIMIT 3"), + "SELECT `foo`, `bar`, `baz` FROM my_table WHERE foo = 'abc' LIMIT 3"), ([Filter(ListComparison('foo', ListOp.NIN, ['abc', 'def']))], "SELECT {} FROM my_table WHERE foo NOT IN ('abc', 'def')"), ([Filter(MultiComp(ExpOp.OR, [IntComparison('foo', NumCompOp.EQ, 1), IntComparison('bar', NumCompOp.EQ, 1)]))], From 56d4b851b63052822f040fd6d13b23c26c6ac232 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 26 Mar 2024 16:49:58 -0400 Subject: [PATCH 33/61] ecs: add endpoint mapping --- 
.../kestrel/mapping/entityattribute/ecs.yaml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml index 0abe4cec..e9050d80 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml @@ -121,6 +121,29 @@ src_endpoint: - client.mac - source.mac +# endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint +endpoint: + domain: + - client.domain + - source.domain + - server.domain + - destination.domain + hostname: + - client.domain + - source.domain + - server.domain + - destination.domain + ip: + - client.ip + - source.ip + - server.ip + - destination.ip + mac: + - client.mac + - source.mac + - server.mac + - destination.mac + # dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint dst_endpoint: domain: From c44703b297e58097d01c667149ba2a30a31524d8 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 26 Mar 2024 16:51:07 -0400 Subject: [PATCH 34/61] Add translate_projection_to_native function --- .../src/kestrel/frontend/compile.py | 2 +- .../src/kestrel/mapping/data_model.py | 57 +++++++++++++++++- .../tests/test_mapping_data_model.py | 28 ++++++++- .../src/kestrel_interface_opensearch/ossql.py | 59 +++---------------- 4 files changed, 91 insertions(+), 55 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py index cf15abb5..cb1f897f 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py +++ b/packages-nextgen/kestrel_core/src/kestrel/frontend/compile.py @@ -117,7 +117,7 @@ def _map_filter_exp( prefix = f"{entity_name}." 
if field.startswith(prefix): # Need to prune the entity name - field = field[len(prefix):] + field = field[len(prefix) :] filter_exp.field = field filter_exp.op = mapping[1] filter_exp.value = mapping[2] diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py index 05ad76b4..e1e72ee1 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -1,5 +1,5 @@ import logging -from typing import Union +from typing import Optional, Union import yaml from typeguard import typechecked @@ -203,3 +203,58 @@ def load_mapping( with open(f, "r") as fp: result.update(yaml.safe_load(fp)) return result + + +@typechecked +def _get_from_mapping(mapping: Union[str, list, dict], key) -> list[str]: + result = [] + if isinstance(mapping, list): + for i in mapping: + if isinstance(i, dict): + result.append(i[key]) + else: + result.append(i) + elif isinstance(mapping, dict): + result.append(mapping[key]) + elif isinstance(mapping, str): + result.append(mapping) + return result + + +@typechecked +def translate_projection_to_native( + dmm: dict, + entity_type: Optional[str], + attrs: Optional[list], + # TODO: optional str or callable for joining entity_type and attr? +) -> list: + result = [] + if entity_type: + dmm = dmm[entity_type] + if not attrs: + for native_field, mapping in reverse_mapping(dmm).items(): + result.extend( + [(native_field, i) for i in _get_from_mapping(mapping, "ocsf_field")] + ) + attrs = [] + for attr in attrs: + mapping = dmm.get(attr) + if not mapping: + parts = attr.split(".") + tmp = dmm + for part in parts: + if isinstance(tmp, dict): + tmp = tmp.get(part, {}) + else: + break + if tmp: + mapping = tmp + if mapping: + result.extend( + [(i, attr) for i in _get_from_mapping(mapping, "native_field")] + ) + else: + # Pass-through? + result.append((attr, attr)) # FIXME: raise exception instead? 
+ _logger.debug("proj_to_native: return %s", result) + return result diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py index c74d76e4..8fe177a6 100644 --- a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py +++ b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py @@ -5,6 +5,7 @@ reverse_mapping, translate_comparison_to_native, translate_comparison_to_ocsf, + translate_projection_to_native, ) @@ -18,7 +19,7 @@ "process": { "cmd_line": "winlog.event_data.CommandLine", "pid": { - "native_field": "process.pid", + "native_field": "winlog.event_data.ProcessId", "native_value": "to_str", "ocsf_value": "to_int" }, @@ -161,3 +162,28 @@ def test_translate_comparison_to_ocsf(dmm, field, op, value, expected_result): """Test the translate function.""" reverse_dmm = reverse_mapping(dmm) # Make the dmms fixtures? assert set(translate_comparison_to_ocsf(reverse_dmm, field, op, value)) == set(expected_result) + + +@pytest.mark.parametrize( + "dmm, entity, field, expected_result", + [ + (WINLOGBEAT_MAPPING, "process", ["file.name", "pid"], + [("winlog.event_data.Image", "file.name"), ("winlog.event_data.ProcessId", "pid")]), + (WINLOGBEAT_MAPPING, "process", None, + [("winlog.event_data.CommandLine", "cmd_line"), + ("winlog.event_data.ProcessId", "pid"), + ("winlog.event_data.ProcessGuid", "uid"), + ("winlog.event_data.Image", "file.path"), + ("winlog.event_data.Image", "file.name"), + ("winlog.event_data.Image", "file.parent_folder"), + ("winlog.event_data.ParentCommandLine", "parent_process.cmd_line"), + ("winlog.event_data.ParentProcessId", "parent_process.pid"), + ("winlog.event_data.ParentProcessGuid", "parent_process.uid"), + ("winlog.event_data.ParentImage", "parent_process.file.path"), + ("winlog.event_data.ParentImage", "parent_process.file.name"), + ("winlog.event_data.ParentImage", "parent_process.file.parent_folder"), + ]), + ], +) +def 
test_translate_projection_to_native(dmm, entity, field, expected_result): + assert translate_projection_to_native(dmm, entity, field) == expected_result diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 45d0428c..56fb5f57 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -28,8 +28,9 @@ SortDirection, ) from kestrel.mapping.data_model import ( - translate_comparison_to_native, reverse_mapping, + translate_comparison_to_native, + translate_projection_to_native, ) @@ -190,59 +191,13 @@ def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: # Just save projection and compile it later self.project = proj - def _get_fields(self) -> list: - """This method produces a subset of that Data Model Map. That subset - is used to generate the SQL projection (SELECT columns), with - each native column aliased to its relative OCSF name. - - """ - prefix = f"{self.entity}." if self.entity else "" - entity_map = ( - self.from_ocsf_map[self.entity] if self.entity else self.from_ocsf_map - ) - # Use the reverse map since we want native->ocsf. We could - # instead flatten the normal map, but reversed is naturally - # flat. 
- flat_map = reverse_mapping(entity_map) - fields = [] # Collect the needed field mappings - for k, v in flat_map.items(): - # FIXME: ProjectAttrs in compile.py aren't mapped to OCSF, so if you use STIX it doesn't work at all - # Check for 1:N mappings - if isinstance(v, list): - for i in v: - if isinstance(i, str): - fields.append((k, i)) - elif isinstance(i, dict): - fields.append((k, i["ocsf_field"])) - else: - _logger.debug("Unhandled mapping: %s", i) - elif isinstance(v, dict): - fields.append((k, v["ocsf_field"])) - elif isinstance(v, str): - fields.append((k, v)) - - _logger.debug("Field mappings: %s", fields) - return fields - def _render_proj(self): """Get a list of native cols to project with their OCSF equivalents as SQL aliases""" - fields = self._get_fields() - name_pairs = [] - for pair in fields: - native_field, tmp = pair - ocsf_field = tmp["ocsf_field"] if isinstance(tmp, dict) else tmp - if self.project and not ( - ocsf_field in self.project.attrs or native_field in self.project.attrs - ): - # It's not in the projection, so skip it - _logger.debug( - "skipping %s -> %s since it's not in projection", - native_field, - ocsf_field, - ) - continue - name_pairs.append((native_field, ocsf_field)) - proj = [f"`{k}` AS `{v}`" if k != v else k for k, v in name_pairs] + projection = self.project.attrs if self.project else None + name_pairs = translate_projection_to_native( + self.from_ocsf_map, self.entity, projection + ) + proj = [f"`{k}` AS `{v}`" if k != v else f"`{k}`" for k, v in name_pairs] if not proj: # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? 
proj = [f"`{attr}`" for attr in self.project.attrs] From 6479f7597f675b1865d3eee111ff895f2c0344bf Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 26 Mar 2024 17:07:41 -0400 Subject: [PATCH 35/61] Add kestrel_interface_sqlalchemy --- .../pyproject.toml | 35 +++ .../kestrel_interface_sqlalchemy/__init__.py | 1 + .../kestrel_interface_sqlalchemy/config.py | 58 ++++ .../kestrel_interface_sqlalchemy/interface.py | 294 ++++++++++++++++++ .../tests/test_config.py | 42 +++ 5 files changed, 430 insertions(+) create mode 100644 packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml create mode 100644 packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py create mode 100644 packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py create mode 100644 packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py create mode 100644 packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml new file mode 100644 index 00000000..c4309e70 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = ["setuptools >= 68.2.2", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "kestrel_interface_sqlalchemy" +version = "2.0.0" +description = "Kestrel SQLAlchemy Datasource Interface" +readme = "README.rst" +requires-python = ">=3.8" +license = {text = "Apache 2.0 License"} +maintainers = [ + {name = "Xiaokui Shu", email = "xiaokui.shu@ibm.com"}, + {name = "Paul Coccoli", email = "pcoccoli@us.ibm.com"}, +] +keywords = [ + "kestrel", + "cybersecurity", + "threat hunting", +] +classifiers = [ + "Topic :: Security", + "Operating System :: OS Independent", + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3", +] + +dependencies = [ + 
"kestrel_core>=2.0.0", +] + +[project.urls] +Homepage = "https://github.com/opencybersecurityalliance/kestrel-lang" +Documentation = "https://kestrel.readthedocs.io/" +Repository = "https://github.com/opencybersecurityalliance/kestrel-lang.git" diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py new file mode 100644 index 00000000..781df021 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/__init__.py @@ -0,0 +1 @@ +from kestrel_interface_sqlalchemy.interface import SQLAlchemyInterface diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py new file mode 100644 index 00000000..05c2557a --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py @@ -0,0 +1,58 @@ +import logging +from dataclasses import dataclass, field +from typing import Dict, Mapping, Optional + +import yaml +from mashumaro.mixins.json import DataClassJSONMixin + +from kestrel.config.utils import ( + CONFIG_DIR_DEFAULT, + load_user_config, +) +from kestrel.exceptions import InterfaceNotConfigured +from kestrel.mapping.data_model import load_mapping + + +PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "sqlalchemy.yaml" +PROFILE_PATH_ENV_VAR = "KESTREL_SQLALCHEMY_CONFIG" + +_logger = logging.getLogger(__name__) + + +@dataclass +class Connection(DataClassJSONMixin): + url: str # SQLAlchemy "connection URL" or "connection string" + + +@dataclass +class Table(DataClassJSONMixin): + connection: str + timestamp: str + timestamp_format: str + data_model_mapping: Optional[str] = None # Filename for mapping + data_model_map: Mapping = field(default_factory=dict) + + def __post_init__(self): + if self.data_model_mapping: + with 
open(self.data_model_mapping, "r") as fp: + self.data_model_map = yaml.safe_load(fp) + else: + # Default to the built-in ECS mapping + self.data_model_map = load_mapping("ecs") # FIXME: need a default? + + +@dataclass +class Config(DataClassJSONMixin): + connections: Dict[str, Connection] + tables: Dict[str, Table] + + def __post_init__(self): + self.connections = {k: Connection(**v) for k, v in self.connections.items()} + self.tables = {k: Table(**v) for k, v in self.tables.items()} + + +def load_config(): + try: + return Config(**load_user_config(PROFILE_PATH_ENV_VAR, PROFILE_PATH_DEFAULT)) + except TypeError: + raise InterfaceNotConfigured() diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py new file mode 100644 index 00000000..323eccb2 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py @@ -0,0 +1,294 @@ +import json +import logging +from functools import reduce +import re +from typing import Callable, Iterable, Mapping, Optional +from uuid import UUID + +# import numpy as np +import dpath +import numpy as np +from pandas import DataFrame, read_sql +import sqlalchemy +from sqlalchemy import and_, column, or_, select, FromClause, asc, desc +from sqlalchemy.engine import Compiled, default +from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList +from sqlalchemy.sql.expression import ColumnClause, ColumnOperators +from sqlalchemy.sql.selectable import Select +from typeguard import typechecked + +from kestrel.display import GraphletExplanation +from kestrel.exceptions import DataSourceError +from kestrel.interface import AbstractInterface +from kestrel.interface.codegen.sql import SqlTranslator, comp2func +from kestrel.ir.filter import ( + BoolExp, + ExpOp, + FComparison, + ListOp, + MultiComp, + NumCompOp, + StrComparison, + StrCompOp, +) +from 
kestrel.ir.graph import IRGraphEvaluable +from kestrel.ir.instructions import ( + DataSource, + Filter, + Instruction, + ProjectAttrs, + ProjectEntity, + Return, + SolePredecessorTransformingInstruction, + SourceInstruction, + TransformingInstruction, + Variable, +) +from kestrel.mapping.data_model import ( + reverse_mapping, + translate_comparison_to_native, + translate_projection_to_native, +) +from kestrel.mapping.transformers import run_transformer_on_series + +from kestrel_interface_sqlalchemy.config import load_config + + +_logger = logging.getLogger(__name__) + + +# TODO: move this someplace common? +def _translate_df(df: DataFrame, dmm: dict) -> DataFrame: + # Translate results into Kestrel OCSF data model + # The column names of df are already mapped + df = df.replace({np.nan: None}) + for col in df.columns: + mapping = dpath.get(dmm, col, separator=".") + if isinstance(mapping, dict): + transformer_name = mapping.get("ocsf_value") + df[col] = run_transformer_on_series(transformer_name, df[col]) + + return df + + +@typechecked +class SQLAlchemyTranslator(SqlTranslator): + def __init__( + self, + dialect: sqlalchemy.engine.default.DefaultDialect, + timefmt: Callable, + timestamp: str, + from_obj: sqlalchemy.FromClause, + dmm: dict, + ): + super().__init__(dialect, timefmt, timestamp, from_obj) + self.dmm = dmm + self.proj = None + self.entity_type = None + + @typechecked + def _render_comp(self, comp: FComparison): + prefix = ( + f"{self.entity_type}." 
+ if (self.entity_type and comp.field != self.timestamp) + else "" + ) + ocsf_field = f"{prefix}{comp.field}" + _logger.debug("PC: ent=%s ocsf_field=%s", self.entity_type, ocsf_field) + comps = translate_comparison_to_native( + self.dmm, ocsf_field, comp.op, comp.value + ) + translated_comps = [] + for comp in comps: + field, op, value = comp + col: ColumnClause = column(field) + if op == StrCompOp.NMATCHES: + tmp = ~comp2func[op](col, value) + else: + tmp = comp2func[op](col, value) + translated_comps.append(tmp) + return reduce(or_, translated_comps) + + @typechecked + def _render_multi_comp(self, comps: MultiComp): + op = _and if comps.op == ExpOp.AND else _or + return reduce(op, map(self._render_comp, comps.comps)) + + # This is copied verbatim from sql.py but we need to supply our own _render_comp + def _render_exp(self, exp: BoolExp) -> BooleanClauseList: + if isinstance(exp.lhs, BoolExp): + lhs = self._render_exp(exp.lhs) + elif isinstance(exp.lhs, MultiComp): + lhs = self._render_multi_comp(exp.lhs) + else: + lhs = self._render_comp(exp.lhs) + if isinstance(exp.rhs, BoolExp): + rhs = self._render_exp(exp.rhs) + elif isinstance(exp.rhs, MultiComp): + rhs = self._render_multi_comp(exp.rhs) + else: + rhs = self._render_comp(exp.rhs) + return and_(lhs, rhs) if exp.op == ExpOp.AND else or_(lhs, rhs) + + @typechecked + def _add_filter(self) -> Optional[str]: + if not self.filt: + return None + filt = self.filt + if filt.timerange.start: + # Convert the timerange to the appropriate pair of comparisons + start_comp = StrComparison( + self.timestamp, ">=", self.timefmt(filt.timerange.start) + ) + stop_comp = StrComparison( + self.timestamp, "<", self.timefmt(filt.timerange.stop) + ) + # AND them together + time_exp = BoolExp(start_comp, ExpOp.AND, stop_comp) + # AND that with any existing filter expression + exp = BoolExp(filt.exp, ExpOp.AND, time_exp) + else: + exp = filt.exp + if isinstance(exp, BoolExp): + comp = self._render_exp(exp) + elif isinstance(exp, 
MultiComp): + comp = self._render_multi_comp(exp) + else: + comp = self._render_comp(exp) + self.query = self.query.where(comp) + + def add_Filter(self, filt: Filter) -> None: + # Just save filter and compile it later + # Probably need the entity projection set first + self.filt = filt + + def add_ProjectAttrs(self, proj: ProjectAttrs) -> None: + self.proj = proj + + def add_ProjectEntity(self, proj: ProjectEntity) -> None: + self.entity_type = proj.entity_type + + def result(self) -> sqlalchemy.Compiled: + proj = self.proj.attrs if self.proj else None + pairs = translate_projection_to_native(self.dmm, self.entity_type, proj) + cols = [sqlalchemy.column(i).label(j) for i, j in pairs] + self._add_filter() + self.query = self.query.with_only_columns(*cols) # TODO: mapping? + return self.query.compile(dialect=self.dialect) + + +class SQLAlchemyInterface(AbstractInterface): + def __init__( + self, + serialized_cache_catalog: Optional[str] = None, + session_id: Optional[UUID] = None, + ): + _logger.debug("SQLAlchemyInterface: loading config") + super().__init__(serialized_cache_catalog, session_id) + self.config = load_config() + self.schemas: dict = {} # Schema per table (index) + self.engines: dict = {} # Map of conn name -> engine + self.conns: dict = {} # Map of conn name -> connection + for info in self.config.tables.values(): + name = info.connection + conn_info = self.config.connections[name] + if name not in self.engines: + self.engines[name] = sqlalchemy.create_engine(conn_info.url) + if name not in self.conns: + engine = self.engines[name] + self.conns[name] = engine.connect() + _logger.debug("SQLAlchemyInterface: configured %s", name) + + @staticmethod + def schemes() -> Iterable[str]: + return ["sqlalchemy"] + + def store( + self, + instruction_id: UUID, + data: DataFrame, + ): + raise NotImplementedError("SQLAlchemyInterface.store") # TEMP + + def evaluate_graph( + self, + graph: IRGraphEvaluable, + instructions_to_evaluate: Optional[Iterable[Instruction]] 
= None, + ) -> Mapping[UUID, DataFrame]: + mapping = {} + if not instructions_to_evaluate: + instructions_to_evaluate = graph.get_sink_nodes() + for instruction in instructions_to_evaluate: + translator = self._evaluate_instruction_in_graph(graph, instruction) + # TODO: may catch error in case evaluation starts from incomplete SQL + sql = translator.result() + _logger.debug("SQL query generated: %s", sql) + # Get the "from" table for this query + tables = translator.query.selectable.get_final_froms() + table = tables[0].name # TODO: what if there's more than 1? + # Get the data source's SQLAlchemy connection object + conn = self.conns[self.config.tables[table].connection] + df = read_sql(sql, conn) + dmm = translator.dmm[ + translator.entity_type + ] # TODO: need a method for this? + mapping[instruction.id] = _translate_df(df, dmm) + return mapping + + def explain_graph( + self, + graph: IRGraphEvaluable, + instructions_to_explain: Optional[Iterable[Instruction]] = None, + ) -> Mapping[UUID, GraphletExplanation]: + mapping = {} + if not instructions_to_explain: + instructions_to_explain = graph.get_sink_nodes() + for instruction in instructions_to_explain: + translator = self._evaluate_instruction_in_graph(graph, instruction) + dep_graph = graph.duplicate_dependent_subgraph_of_node(instruction) + graph_dict = dep_graph.to_dict() + query_stmt = translator.result() + mapping[instruction.id] = GraphletExplanation(graph_dict, query_stmt) + return mapping + + def _evaluate_instruction_in_graph( + self, + graph: IRGraphEvaluable, + instruction: Instruction, + ) -> SQLAlchemyTranslator: + _logger.debug("instruction: %s", str(instruction)) + translator = None + if isinstance(instruction, TransformingInstruction): + trunk, _r2n = graph.get_trunk_n_branches(instruction) + translator = self._evaluate_instruction_in_graph(graph, trunk) + + if isinstance(instruction, SolePredecessorTransformingInstruction): + if isinstance(instruction, Return): + pass + elif 
isinstance(instruction, Variable): + pass + else: + translator.add_instruction(instruction) + + elif isinstance(instruction, Filter): + translator.add_instruction(instruction) + + else: + raise NotImplementedError(f"Unknown instruction type: {instruction}") + + elif isinstance(instruction, SourceInstruction): + if isinstance(instruction, DataSource): + ds = self.config.tables[instruction.datasource] + connection = ds.connection + dialect = self.engines[connection].dialect + translator = SQLAlchemyTranslator( + dialect, + lambda dt: dt.strftime(ds.timestamp_format), + ds.timestamp, + sqlalchemy.table(instruction.datasource), + ds.data_model_map, + ) + else: + raise NotImplementedError(f"Unhandled instruction type: {instruction}") + + return translator diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py b/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py new file mode 100644 index 00000000..a19d97a6 --- /dev/null +++ b/packages-nextgen/kestrel_interface_sqlalchemy/tests/test_config.py @@ -0,0 +1,42 @@ +import os + +import yaml + +from kestrel_interface_sqlalchemy.config import ( + PROFILE_PATH_ENV_VAR, + Connection, + load_config, +) + + +def test_load_config(tmp_path): + config = { + "connections": { + "localhost": { + "url": "sqlite:////home/jdoe/test.db", + }, + "some-data-lake": { + "url": "presto://jdoe@example.com:8889/hive", + } + }, + "tables": { + "cloud_table": { + "connection": "some-data-lake", + "timestamp": "eventTime", + "timestamp_format": "%Y-%m-%d %H:%M:%S.%f", + "data_model_mapping": str(tmp_path / "mapping.yaml") + } + } + } + map_file = tmp_path / "mapping.yaml" + with open(map_file, 'w') as fp: + fp.write("some.field: other.field\n") + config_file = tmp_path / "sqlalchemy.yaml" + with open(config_file, 'w') as fp: + yaml.dump(config, fp) + os.environ[PROFILE_PATH_ENV_VAR] = str(config_file) + read_config = load_config() + conn: Connection = read_config.connections["localhost"] + assert conn.url == 
config["connections"]["localhost"]["url"] + assert read_config.connections["localhost"].url == config["connections"]["localhost"]["url"] + assert read_config.tables["cloud_table"].timestamp == config["tables"]["cloud_table"]["timestamp"] From 5c8462721b753091e490b796f985e71513e175eb Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 26 Mar 2024 17:11:08 -0400 Subject: [PATCH 36/61] fix typing error with Python 3.8 --- packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py index e1e72ee1..bf874be9 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -206,7 +206,7 @@ def load_mapping( @typechecked -def _get_from_mapping(mapping: Union[str, list, dict], key) -> list[str]: +def _get_from_mapping(mapping: Union[str, list, dict], key) -> list: result = [] if isinstance(mapping, list): for i in mapping: From b05b853372c26960f5bc26ae15347c4cb502a2d9 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Tue, 26 Mar 2024 17:12:25 -0400 Subject: [PATCH 37/61] sqlalchemy: add missing dpath dep --- packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml index c4309e70..7a42ede0 100644 --- a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml +++ b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml @@ -26,6 +26,7 @@ classifiers = [ ] dependencies = [ + "dpath>=2.1.6", "kestrel_core>=2.0.0", ] From a8332e9ee4d2ff5dc59b9baa0f6ca1d899aeaf84 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Wed, 27 Mar 2024 08:59:01 -0400 Subject: [PATCH 38/61] refactor _translate_df to common 
translate_dataframe function --- .../src/kestrel/mapping/data_model.py | 21 ++++++++++- .../tests/test_mapping_data_model.py | 11 ++++++ .../kestrel_interface_opensearch/interface.py | 19 ++-------- .../src/kestrel_interface_opensearch/ossql.py | 3 -- .../kestrel_interface_sqlalchemy/interface.py | 35 +++---------------- 5 files changed, 38 insertions(+), 51 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py index bf874be9..e3942104 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -1,10 +1,16 @@ import logging from typing import Optional, Union +import dpath +import numpy as np import yaml +from pandas import DataFrame from typeguard import typechecked -from kestrel.mapping.transformers import run_transformer +from kestrel.mapping.transformers import ( + run_transformer, + run_transformer_on_series, +) from kestrel.utils import list_folder_files _logger = logging.getLogger(__name__) @@ -258,3 +264,16 @@ def translate_projection_to_native( result.append((attr, attr)) # FIXME: raise exception instead? 
_logger.debug("proj_to_native: return %s", result) return result + + +@typechecked +def translate_dataframe(df: DataFrame, dmm: dict) -> DataFrame: + # Translate results into Kestrel OCSF data model + # The column names of df are already mapped + df = df.replace({np.nan: None}) + for col in df.columns: + mapping = dpath.get(dmm, col, separator=".") + if isinstance(mapping, dict): + transformer_name = mapping.get("ocsf_value") + df[col] = run_transformer_on_series(transformer_name, df[col]) + return df diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py index 8fe177a6..cfa2d9a8 100644 --- a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py +++ b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py @@ -1,10 +1,13 @@ import pytest +import pandas as pd + from kestrel.mapping.data_model import ( load_mapping, reverse_mapping, translate_comparison_to_native, translate_comparison_to_ocsf, + translate_dataframe, translate_projection_to_native, ) @@ -187,3 +190,11 @@ def test_translate_comparison_to_ocsf(dmm, field, op, value, expected_result): ) def test_translate_projection_to_native(dmm, entity, field, expected_result): assert translate_projection_to_native(dmm, entity, field) == expected_result + + +def test_translate_dataframe(): #TODO: more testing here + df = pd.DataFrame({"file.path": [r"C:\Windows\System32\cmd.exe", r"C:\TMP"], + "pid": [1, 2]}) + dmm = load_mapping("ecs") + df = translate_dataframe(df, dmm["process"]) + #TODO:assert df["file.name"].iloc[0] == "cmd.exe" diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py index 8b7de865..de1b5c40 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py +++ 
b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py @@ -2,8 +2,6 @@ from typing import Iterable, Mapping, Optional from uuid import UUID -import dpath -import numpy as np from opensearchpy import OpenSearch from pandas import DataFrame, Series, concat @@ -21,7 +19,7 @@ TransformingInstruction, SolePredecessorTransformingInstruction, ) -from kestrel.mapping.transformers import run_transformer_on_series +from kestrel.mapping.data_model import translate_dataframe from kestrel_interface_opensearch.config import load_config from kestrel_interface_opensearch.ossql import OpenSearchTranslator @@ -36,19 +34,6 @@ def _jdbc2df(schema: dict, datarows: dict) -> DataFrame: return DataFrame(datarows, columns=columns) -def _translate_df(df: DataFrame, dmm: dict) -> DataFrame: - # Translate results into Kestrel OCSF data model - # The column names of df are already mapped - df = df.replace({np.nan: None}) - for col in df.columns: - mapping = dpath.get(dmm, col, separator=".") - if isinstance(mapping, dict): - transformer_name = mapping.get("ocsf_value") - df[col] = run_transformer_on_series(transformer_name, df[col]) - - return df - - def read_sql(sql: str, conn: OpenSearch, dmm: Optional[dict] = None) -> DataFrame: """Execute `sql` and return the results as a DataFrame, a la pandas.read_sql""" # https://opensearch.org/docs/latest/search-plugins/sql/sql-ppl-api/#query-api @@ -77,7 +62,7 @@ def read_sql(sql: str, conn: OpenSearch, dmm: Optional[dict] = None) -> DataFram df = _jdbc2df(schema, query_resp["datarows"]) if dmm is not None: # Need to use Data Model Map to do results translation - dfs.append(_translate_df(df, dmm)) + dfs.append(translate_dataframe(df, dmm)) else: dfs.append(df) cursor = query_resp.get("cursor") diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 56fb5f57..8e5d7763 100644 
--- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -2,7 +2,6 @@ from functools import reduce from typing import Optional, Union -import dpath from typeguard import typechecked from kestrel.exceptions import UnsupportedOperatorError @@ -10,7 +9,6 @@ BoolExp, ExpOp, FComparison, - ListComparison, ListOp, MultiComp, NumCompOp, @@ -28,7 +26,6 @@ SortDirection, ) from kestrel.mapping.data_model import ( - reverse_mapping, translate_comparison_to_native, translate_projection_to_native, ) diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py index 323eccb2..9a379d34 100644 --- a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py @@ -1,33 +1,23 @@ -import json import logging from functools import reduce -import re from typing import Callable, Iterable, Mapping, Optional from uuid import UUID -# import numpy as np -import dpath -import numpy as np from pandas import DataFrame, read_sql import sqlalchemy -from sqlalchemy import and_, column, or_, select, FromClause, asc, desc -from sqlalchemy.engine import Compiled, default -from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList -from sqlalchemy.sql.expression import ColumnClause, ColumnOperators -from sqlalchemy.sql.selectable import Select +from sqlalchemy import and_, column, or_ +from sqlalchemy.sql.elements import BooleanClauseList +from sqlalchemy.sql.expression import ColumnClause from typeguard import typechecked from kestrel.display import GraphletExplanation -from kestrel.exceptions import DataSourceError from kestrel.interface import AbstractInterface from 
kestrel.interface.codegen.sql import SqlTranslator, comp2func from kestrel.ir.filter import ( BoolExp, ExpOp, FComparison, - ListOp, MultiComp, - NumCompOp, StrComparison, StrCompOp, ) @@ -45,11 +35,10 @@ Variable, ) from kestrel.mapping.data_model import ( - reverse_mapping, translate_comparison_to_native, + translate_dataframe, translate_projection_to_native, ) -from kestrel.mapping.transformers import run_transformer_on_series from kestrel_interface_sqlalchemy.config import load_config @@ -57,20 +46,6 @@ _logger = logging.getLogger(__name__) -# TODO: move this someplace common? -def _translate_df(df: DataFrame, dmm: dict) -> DataFrame: - # Translate results into Kestrel OCSF data model - # The column names of df are already mapped - df = df.replace({np.nan: None}) - for col in df.columns: - mapping = dpath.get(dmm, col, separator=".") - if isinstance(mapping, dict): - transformer_name = mapping.get("ocsf_value") - df[col] = run_transformer_on_series(transformer_name, df[col]) - - return df - - @typechecked class SQLAlchemyTranslator(SqlTranslator): def __init__( @@ -232,7 +207,7 @@ def evaluate_graph( dmm = translator.dmm[ translator.entity_type ] # TODO: need a method for this? 
- mapping[instruction.id] = _translate_df(df, dmm) + mapping[instruction.id] = translate_dataframe(df, dmm) return mapping def explain_graph( From 88dd030998a95ef11fe7ad035ba96155e67f801e Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Wed, 27 Mar 2024 09:18:20 -0400 Subject: [PATCH 39/61] Move dpath dependency to core --- packages-nextgen/kestrel_core/pyproject.toml | 1 + packages-nextgen/kestrel_interface_opensearch/pyproject.toml | 1 - packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/packages-nextgen/kestrel_core/pyproject.toml b/packages-nextgen/kestrel_core/pyproject.toml index 61f48941..e57a5bca 100644 --- a/packages-nextgen/kestrel_core/pyproject.toml +++ b/packages-nextgen/kestrel_core/pyproject.toml @@ -38,6 +38,7 @@ dependencies = [ "mashumaro>=3.10", "networkx>=3.1", # networkx==3.2.1 only for Python>=3.9 "SQLAlchemy>=2.0.23", + "dpath>=2.1.6", ] [project.optional-dependencies] diff --git a/packages-nextgen/kestrel_interface_opensearch/pyproject.toml b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml index ad815d8b..6270f6d0 100644 --- a/packages-nextgen/kestrel_interface_opensearch/pyproject.toml +++ b/packages-nextgen/kestrel_interface_opensearch/pyproject.toml @@ -26,7 +26,6 @@ classifiers = [ ] dependencies = [ - "dpath>=2.1.6", "kestrel_core>=2.0.0", "opensearch-py>=2.4.2", ] diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml index 7a42ede0..c4309e70 100644 --- a/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml +++ b/packages-nextgen/kestrel_interface_sqlalchemy/pyproject.toml @@ -26,7 +26,6 @@ classifiers = [ ] dependencies = [ - "dpath>=2.1.6", "kestrel_core>=2.0.0", ] From 739a93dd846ffba19b1dc08ba756499e08452bd2 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Thu, 28 Mar 2024 12:39:48 -0400 Subject: [PATCH 40/61] opensearch: prune native fields from 
projection if not in schema --- .../kestrel/mapping/entityattribute/ecs.yaml | 28 +++++++++++++++++-- .../kestrel_interface_opensearch/interface.py | 6 ++-- .../src/kestrel_interface_opensearch/ossql.py | 7 ++++- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml index e9050d80..40304462 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml @@ -107,7 +107,7 @@ process: # src_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint -src_endpoint: +src_endpoint: &src_ref domain: - client.domain - source.domain @@ -120,6 +120,10 @@ src_endpoint: mac: - client.mac - source.mac + port: + - client.port + - source.port + # endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint endpoint: @@ -143,9 +147,15 @@ endpoint: - source.mac - server.mac - destination.mac + port: + - client.port + - source.port + - server.port + - destination.port + # dst_endpoint: see https://schema.ocsf.io/1.1.0/objects/endpoint -dst_endpoint: +dst_endpoint: &dst_ref domain: - server.domain - destination.domain @@ -158,10 +168,14 @@ dst_endpoint: mac: - server.mac - destination.mac + port: + - server.port + - destination.port # https://schema.ocsf.io/1.1.0/objects/network_traffic -traffic: # should be `network_traffic`? +# should be `network_traffic`? 
+traffic: &traffic bytes: network.bytes bytes_in: - destination.bytes @@ -209,3 +223,11 @@ user: name: user-account:account_login type: user-account:account_type uid: user-account:user_id + + +# https://schema.ocsf.io/1.1.0/classes/network_activity +# Network Activity [4001] Class +network_activity: + src_endpoint: *src_ref + dst_endpoint: *dst_ref + traffic: *traffic diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py index de1b5c40..5b1dee62 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py @@ -202,7 +202,7 @@ def get_schema(self, index: str) -> dict: client = self._get_client_for_index(index) if index not in self.schemas: df = read_sql(f"DESCRIBE TABLES LIKE {index}", client) - self.schemas[index] = Series( - df["TYPE_NAME"], index=df["COLUMN_NAME"] - ).to_dict() + self.schemas[index] = (df[["TYPE_NAME", "COLUMN_NAME"]] + .set_index("COLUMN_NAME").T.to_dict("records")[0]) + _logger.debug("%s schema:\n%s", index, self.schemas[index]) return self.schemas[index] diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 8e5d7763..78b5eda3 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -194,7 +194,12 @@ def _render_proj(self): name_pairs = translate_projection_to_native( self.from_ocsf_map, self.entity, projection ) - proj = [f"`{k}` AS `{v}`" if k != v else f"`{k}`" for k, v in name_pairs] + proj = [ + f"`{k}` AS `{v}`" + if k != v else f"`{k}`" + for k, v in name_pairs + if k in 
self.schema # Ignore mapped attrs the index doesn't have + ] if not proj: # If this is still empty, then the attr projection must be for attrs "outside" to entity projection? proj = [f"`{attr}`" for attr in self.project.attrs] From 06b8034e0661dda5314b53d644b60655f8aaf1ed Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Thu, 28 Mar 2024 13:53:21 -0400 Subject: [PATCH 41/61] sqlalchemy: find and/or in multi_comp --- .../src/kestrel_interface_sqlalchemy/interface.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py index 9a379d34..6197ab5e 100644 --- a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/interface.py @@ -69,7 +69,6 @@ def _render_comp(self, comp: FComparison): else "" ) ocsf_field = f"{prefix}{comp.field}" - _logger.debug("PC: ent=%s ocsf_field=%s", self.entity_type, ocsf_field) comps = translate_comparison_to_native( self.dmm, ocsf_field, comp.op, comp.value ) @@ -86,7 +85,7 @@ def _render_comp(self, comp: FComparison): @typechecked def _render_multi_comp(self, comps: MultiComp): - op = _and if comps.op == ExpOp.AND else _or + op = and_ if comps.op == ExpOp.AND else or_ return reduce(op, map(self._render_comp, comps.comps)) # This is copied verbatim from sql.py but we need to supply our own _render_comp From e0e9acf38f65c57aad140b2f818529dcffb7be52 Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Fri, 29 Mar 2024 08:42:36 -0400 Subject: [PATCH 42/61] ecs: fix user mapping --- .../src/kestrel/mapping/entityattribute/ecs.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml 
b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml index 40304462..d4a1bf75 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/entityattribute/ecs.yaml @@ -219,10 +219,10 @@ certificate: # https://schema.ocsf.io/1.1.0/objects/user user: - full_name: user-account:display_name - name: user-account:account_login - type: user-account:account_type - uid: user-account:user_id + domain: user.domain + full_name: user.full_name + name: user.name + uid: user.id # https://schema.ocsf.io/1.1.0/classes/network_activity From a46cf656aac1c929ca94e726c592caeb1e0fd72f Mon Sep 17 00:00:00 2001 From: Paul Coccoli Date: Fri, 29 Mar 2024 09:03:16 -0400 Subject: [PATCH 43/61] transformers: add to_epoch_ms --- .../kestrel_core/src/kestrel/mapping/transformers.py | 12 ++++++++++++ .../kestrel_core/tests/test_mapping_transformers.py | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py index 3d0a0885..82202dcb 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/transformers.py @@ -1,5 +1,6 @@ """Kestrel Data Model Map value transformers""" +from datetime import datetime, timezone from typing import Callable from pandas import Series @@ -15,6 +16,17 @@ def transformer(func: Callable) -> Callable: return func +@transformer +def to_epoch_ms(value: str) -> int: + """Convert a time value to milliseconds since the epoch""" + if "." in value: + time_pattern = "%Y-%m-%dT%H:%M:%S.%fZ" + else: + time_pattern = "%Y-%m-%dT%H:%M:%SZ" + dt = datetime.strptime(value, time_pattern).replace(tzinfo=timezone.utc) + return int(dt.timestamp() * 1000) + + @transformer def dirname(path: str) -> str: # TODO: rename to winpath_dirname? 
"""Get the directory part of `path`""" diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py index 90db4d28..9e454925 100644 --- a/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py +++ b/packages-nextgen/kestrel_core/tests/test_mapping_transformers.py @@ -19,6 +19,10 @@ ("to_int", "0x4d2", 1234), ("to_str", "1234", "1234"), ("to_str", 1234, "1234"), + ("to_epoch_ms", "2024-03-29T12:57:56.926Z", 1711717076926), + ("to_epoch_ms", "2024-03-29T12:57:56.92Z", 1711717076920), + ("to_epoch_ms", "2024-03-29T12:57:56.9Z", 1711717076900), + ("to_epoch_ms", "2024-03-29T12:57:56Z", 1711717076000), ] ) def test_run_transformer(transform, value, expected): From fcefc07065121331ff76299eaec53fd08f9c2716 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 15 Apr 2024 10:49:24 -0400 Subject: [PATCH 44/61] type check K2 config module --- .../kestrel_core/src/kestrel/config/utils.py | 24 +++++++++++-------- .../kestrel_core/src/kestrel/utils.py | 10 ++++---- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py index 8911b8a7..b0acc001 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py +++ b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py @@ -1,24 +1,31 @@ import os import yaml -import pathlib +from pathlib import Path import logging +from typeguard import typechecked +from typing import Mapping, Union from kestrel.utils import update_nested_dict, load_data_file -CONFIG_DIR_DEFAULT = pathlib.Path.home() / ".config" / "kestrel" +CONFIG_DIR_DEFAULT = Path.home() / ".config" / "kestrel" CONFIG_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "kestrel.yaml" CONFIG_PATH_ENV_VAR = "KESTREL_CONFIG" # override CONFIG_PATH_DEFAULT if provided _logger = logging.getLogger(__name__) -def load_default_config(): +@typechecked +def 
load_default_config() -> Mapping: _logger.debug(f"Loading default config file...") default_config = load_data_file("kestrel.config", "kestrel.yaml") - return yaml.safe_load(os.path.expandvars(default_config)) + config_with_envvar_expanded = os.path.expandvars(default_config) + config_content = yaml.safe_load(config_with_envvar_expanded) + return config_content -def load_user_config(config_path_env_var, config_path_default): +@typechecked +def load_user_config(config_path_env_var: str, config_path_default: Union[str, Path]) -> Mapping: + config_path_default = config_path_default.absolute().as_posix() config_path = os.getenv(config_path_env_var, config_path_default) config_path = os.path.expanduser(config_path) config = {} @@ -32,13 +39,10 @@ def load_user_config(config_path_env_var, config_path_default): return config -def load_config(): +@typechecked +def load_config() -> Mapping: config_default = load_default_config() config_user = load_user_config(CONFIG_PATH_ENV_VAR, CONFIG_PATH_DEFAULT) _logger.debug(f"User configuration loaded: {config_user}") _logger.debug(f"Updating default config with user config...") return update_nested_dict(config_default, config_user) - - -if __name__ == "__main__": - ... 
diff --git a/packages-nextgen/kestrel_core/src/kestrel/utils.py b/packages-nextgen/kestrel_core/src/kestrel/utils.py index 70db2ae3..f843b408 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/utils.py +++ b/packages-nextgen/kestrel_core/src/kestrel/utils.py @@ -5,10 +5,11 @@ from pathlib import Path from pkgutil import get_data from typeguard import typechecked -from typing import Union, Mapping +from typing import Optional, Mapping, Iterable -def load_data_file(package_name, file_name): +@typechecked +def load_data_file(package_name: str, file_name: str) -> str: try: # resources.files() is introduced in Python 3.9 content = resources.files(package_name).joinpath(file_name).read_text() @@ -20,7 +21,8 @@ def load_data_file(package_name, file_name): return content -def list_folder_files(package_name, folder_name, prefix=None, suffix=None): +@typechecked +def list_folder_files(package_name:str , folder_name:str , prefix:Optional[str]=None, suffix:Optional[str]=None) -> Iterable[str]: try: file_paths = resources.files(package_name).joinpath(folder_name).iterdir() except AttributeError: @@ -57,7 +59,7 @@ def unescape_quoted_string(s: str) -> str: @typechecked -def update_nested_dict(dict_old: Mapping, dict_new: Union[Mapping, None]): +def update_nested_dict(dict_old: Mapping, dict_new: Optional[Mapping]) -> Mapping: if dict_new: for k, v in dict_new.items(): if isinstance(v, collections.abc.Mapping) and k in dict_old: From ca3a322be42bbfc0406ffa3a7c66d2495800e67f Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 15 Apr 2024 17:14:58 -0400 Subject: [PATCH 45/61] remove obsolete mapping utils code and black code --- .../kestrel_core/src/kestrel/config/utils.py | 4 +- .../src/kestrel/mapping/data_model.py | 4 +- .../kestrel_core/src/kestrel/mapping/utils.py | 74 ------------------- .../kestrel_core/src/kestrel/utils.py | 12 ++- .../kestrel_core/tests/test_mapping.py | 19 ----- .../tests/test_mapping_data_model.py | 6 +- .../kestrel_interface_opensearch/config.py | 
4 +- .../kestrel_interface_opensearch/interface.py | 7 +- .../src/kestrel_interface_opensearch/ossql.py | 3 +- .../kestrel_interface_sqlalchemy/config.py | 4 +- 10 files changed, 28 insertions(+), 109 deletions(-) delete mode 100644 packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py delete mode 100644 packages-nextgen/kestrel_core/tests/test_mapping.py diff --git a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py index b0acc001..0b912e7a 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/config/utils.py +++ b/packages-nextgen/kestrel_core/src/kestrel/config/utils.py @@ -24,7 +24,9 @@ def load_default_config() -> Mapping: @typechecked -def load_user_config(config_path_env_var: str, config_path_default: Union[str, Path]) -> Mapping: +def load_user_config( + config_path_env_var: str, config_path_default: Union[str, Path] +) -> Mapping: config_path_default = config_path_default.absolute().as_posix() config_path = os.getenv(config_path_env_var, config_path_default) config_path = os.path.expanduser(config_path) diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py index e3942104..d05bd943 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py +++ b/packages-nextgen/kestrel_core/src/kestrel/mapping/data_model.py @@ -196,14 +196,14 @@ def translate_comparison_to_ocsf( @typechecked -def load_mapping( +def load_default_mapping( data_model_name: str, mapping_pkg: str = "kestrel.mapping", submodule: str = "entityattribute", ): result = {} entityattr_mapping_files = list_folder_files( - mapping_pkg, submodule, prefix=data_model_name, suffix=".yaml" + mapping_pkg, submodule, prefix=data_model_name, extension="yaml" ) for f in entityattr_mapping_files: with open(f, "r") as fp: diff --git a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py 
b/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py deleted file mode 100644 index d06a5854..00000000 --- a/packages-nextgen/kestrel_core/src/kestrel/mapping/utils.py +++ /dev/null @@ -1,74 +0,0 @@ -import logging -import os -from typing import ( - Iterable, - Union, -) - -from typeguard import typechecked -import yaml - -from kestrel.exceptions import MappingParseError -from kestrel.utils import load_data_file, list_folder_files - - -_logger = logging.getLogger(__name__) - - -# _entityname_mapping is dictionaries that contain -# the info needed to translate: -# a. queries between: -# 1. STIX and OCSF -# 2. ECS and OCSF -# 3. OCSF and ECS -# b. results between: -# 1. ECS and OCSF -_entityname_mapping = {} - - -@typechecked -def load_standard_config(mapping_pkg: str): - global _entityname_mapping - if len(_entityname_mapping) > 0: - return - entityname_mapping_files = list_folder_files( - mapping_pkg, "entityname", suffix=".yaml" - ) - for f in entityname_mapping_files: - parse_entityname_mapping_file(mapping_pkg, f.name) - - -@typechecked -def parse_entityname_mapping_file(mapping_pkg: str, filename: str): - global _entityname_mapping - mapping_fpath = os.path.join("entityname", filename) - filename_no_ext, _ = filename.split(".") - src_lang = "stix" if filename_no_ext == "alias" else filename_no_ext - dst_lang = "ocsf" - src_dict = _entityname_mapping.get(src_lang, {}) - dst_dict = src_dict.get(dst_lang, {}) - try: - mapping_str = load_data_file(mapping_pkg, mapping_fpath) - mapping = yaml.safe_load(mapping_str) - dst_dict.update(mapping) - except Exception as ex: - raise MappingParseError() from ex - src_dict[dst_lang] = dst_dict - _entityname_mapping[src_lang] = src_dict - - -def load_custom_config(): - # ~/.config/kestrel/mapping/entity/*.yaml - # ~/.config/kestrel/mapping/property/*.yaml - return - - -@typechecked -def normalize_entity( - entityname: str, src_lang: str, dst_lang: str -) -> Union[str, Iterable[str]]: - return ( - 
_entityname_mapping.get(src_lang, {}) - .get(dst_lang, {}) - .get(entityname, entityname) - ) diff --git a/packages-nextgen/kestrel_core/src/kestrel/utils.py b/packages-nextgen/kestrel_core/src/kestrel/utils.py index f843b408..02cbb5b3 100644 --- a/packages-nextgen/kestrel_core/src/kestrel/utils.py +++ b/packages-nextgen/kestrel_core/src/kestrel/utils.py @@ -22,7 +22,15 @@ def load_data_file(package_name: str, file_name: str) -> str: @typechecked -def list_folder_files(package_name:str , folder_name:str , prefix:Optional[str]=None, suffix:Optional[str]=None) -> Iterable[str]: +def list_folder_files( + package_name: str, + folder_name: str, + prefix: Optional[str] = None, + extension: Optional[str] = None, +) -> Iterable[str]: + # preprocess extension to add a leading dot if not there + if extension and extension[0] != ".": + extension = "." + extension try: file_paths = resources.files(package_name).joinpath(folder_name).iterdir() except AttributeError: @@ -43,7 +51,7 @@ def list_folder_files(package_name:str , folder_name:str , prefix:Optional[str]= for f in file_paths if ( f.is_file() - and (f.name.endswith(suffix) if suffix else True) + and (f.name.endswith(extension) if extension else True) and (f.name.startswith(prefix) if prefix else True) ) ) diff --git a/packages-nextgen/kestrel_core/tests/test_mapping.py b/packages-nextgen/kestrel_core/tests/test_mapping.py deleted file mode 100644 index 1f4a07df..00000000 --- a/packages-nextgen/kestrel_core/tests/test_mapping.py +++ /dev/null @@ -1,19 +0,0 @@ -import kestrel.mapping.utils as mapping_utils - - -def test_mapping_load_config(): - mapping_utils.load_standard_config("kestrel.mapping") - entity_name_map = mapping_utils._entityname_mapping - assert "stix" in entity_name_map - assert "ocsf" in entity_name_map.get("stix", {}) - assert "ecs" in entity_name_map - assert "ocsf" in entity_name_map.get("ecs", {}) - - -def test_mapping_entity_names(): - res = mapping_utils.normalize_entity("process", "ecs", "ocsf") - assert res
== "process" - res = mapping_utils.normalize_entity("i_dont_exist", "ecs", "ocsf") - assert res == "i_dont_exist" - res = mapping_utils.normalize_entity("network", "ecs", "ocsf") - assert res == "network_activity" diff --git a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py index cfa2d9a8..93abe83e 100644 --- a/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py +++ b/packages-nextgen/kestrel_core/tests/test_mapping_data_model.py @@ -3,7 +3,7 @@ import pandas as pd from kestrel.mapping.data_model import ( - load_mapping, + load_default_mapping, reverse_mapping, translate_comparison_to_native, translate_comparison_to_ocsf, @@ -96,7 +96,7 @@ # This mapping is used in 2 places: # - frontend comparison from ECS to OCSF # - backend comparison from OCSF to ECS (datasource) -ECS_MAPPING = load_mapping("ecs") +ECS_MAPPING = load_default_mapping("ecs") def test_reverse_mapping_ipv4(): @@ -195,6 +195,6 @@ def test_translate_projection_to_native(dmm, entity, field, expected_result): def test_translate_dataframe(): #TODO: more testing here df = pd.DataFrame({"file.path": [r"C:\Windows\System32\cmd.exe", r"C:\TMP"], "pid": [1, 2]}) - dmm = load_mapping("ecs") + dmm = load_default_mapping("ecs") df = translate_dataframe(df, dmm["process"]) #TODO:assert df["file.name"].iloc[0] == "cmd.exe" diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py index 16eddf92..26d02ccf 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/config.py @@ -10,7 +10,7 @@ load_user_config, ) from kestrel.exceptions import InterfaceNotConfigured -from kestrel.mapping.data_model import load_mapping +from kestrel.mapping.data_model import 
load_default_mapping PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "opensearch.yaml" @@ -49,7 +49,7 @@ def __post_init__(self): self.data_model_map = yaml.safe_load(fp) else: # Default to the built-in ECS mapping - self.data_model_map = load_mapping("ecs") + self.data_model_map = load_default_mapping("ecs") @dataclass diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py index 5b1dee62..8c70eb95 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/interface.py @@ -202,7 +202,10 @@ def get_schema(self, index: str) -> dict: client = self._get_client_for_index(index) if index not in self.schemas: df = read_sql(f"DESCRIBE TABLES LIKE {index}", client) - self.schemas[index] = (df[["TYPE_NAME", "COLUMN_NAME"]] - .set_index("COLUMN_NAME").T.to_dict("records")[0]) + self.schemas[index] = ( + df[["TYPE_NAME", "COLUMN_NAME"]] + .set_index("COLUMN_NAME") + .T.to_dict("records")[0] + ) _logger.debug("%s schema:\n%s", index, self.schemas[index]) return self.schemas[index] diff --git a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py index 78b5eda3..018cd4c8 100644 --- a/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py +++ b/packages-nextgen/kestrel_interface_opensearch/src/kestrel_interface_opensearch/ossql.py @@ -195,8 +195,7 @@ def _render_proj(self): self.from_ocsf_map, self.entity, projection ) proj = [ - f"`{k}` AS `{v}`" - if k != v else f"`{k}`" + f"`{k}` AS `{v}`" if k != v else f"`{k}`" for k, v in name_pairs if k in self.schema # Ignore mapped attrs the index doesn't have ] diff --git 
a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py index 05c2557a..e9d148e4 100644 --- a/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py +++ b/packages-nextgen/kestrel_interface_sqlalchemy/src/kestrel_interface_sqlalchemy/config.py @@ -10,7 +10,7 @@ load_user_config, ) from kestrel.exceptions import InterfaceNotConfigured -from kestrel.mapping.data_model import load_mapping +from kestrel.mapping.data_model import load_default_mapping PROFILE_PATH_DEFAULT = CONFIG_DIR_DEFAULT / "sqlalchemy.yaml" @@ -38,7 +38,7 @@ def __post_init__(self): self.data_model_map = yaml.safe_load(fp) else: # Default to the built-in ECS mapping - self.data_model_map = load_mapping("ecs") # FIXME: need a default? + self.data_model_map = load_default_mapping("ecs") # FIXME: need a default? @dataclass From e9319f056449a5995d26b9c5fda3e2ac8fa5b023 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 16 Apr 2024 10:19:06 -0400 Subject: [PATCH 46/61] fix badge layout in README --- README.rst | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index 1edf91ad..cdcd4c06 100644 --- a/README.rst +++ b/README.rst @@ -2,31 +2,11 @@ :width: 460 :alt: Kestrel Threat Hunting Language -.. image:: https://readthedocs.org/projects/kestrel/badge/?version=latest - :target: https://kestrel.readthedocs.io/en/latest/?badge=latest - :alt: Documentation Status - -.. image:: https://img.shields.io/pypi/v/kestrel-jupyter - :target: https://pypi.python.org/pypi/kestrel-jupyter - :alt: Latest Version - -.. image:: https://img.shields.io/pypi/dm/kestrel-core - :target: https://pypistats.org/packages/kestrel-core - :alt: PyPI Downloads - -.. 
image:: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang/branch/develop/graph/badge.svg?token=HM4ax10IW3 - :target: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang - :alt: Code Coverage - -.. image:: https://img.shields.io/badge/code%20style-black-000000.svg - :target: https://github.com/psf/black - :alt: Code Style: Black - | -**[News]** Kestrel session at `Black Hat USA 2023`_ +|readthedocs| |pypi| |downloads| |codecoverage| |black| --------- +| Kestrel is a threat hunting language aiming to make cyber threat hunting *fast* by providing a layer of abstraction to build reusable, composable, and @@ -215,3 +195,24 @@ Connecting With The Community .. _contributing guideline: CONTRIBUTING.rst .. _governance documentation: GOVERNANCE.rst .. _Apache License 2.0: LICENSE.md + + +.. |readthedocs| image:: https://readthedocs.org/projects/kestrel/badge/?version=latest + :target: https://kestrel.readthedocs.io/en/latest/?badge=latest + :alt: Documentation Status + +.. |pypi| image:: https://img.shields.io/pypi/v/kestrel-jupyter + :target: https://pypi.python.org/pypi/kestrel-jupyter + :alt: Latest Version + +.. |downloads| image:: https://img.shields.io/pypi/dm/kestrel-core + :target: https://pypistats.org/packages/kestrel-core + :alt: PyPI Downloads + +.. |codecoverage| image:: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang/branch/develop/graph/badge.svg?token=HM4ax10IW3 + :target: https://codecov.io/gh/opencybersecurityalliance/kestrel-lang + :alt: Code Coverage + +.. 
|black| image:: https://img.shields.io/badge/code%20style-black-000000.svg + :target: https://github.com/psf/black + :alt: Code Style: Black From 8e9f5d8b076ca5202704caa83332c2597ba2cb8f Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Tue, 16 Apr 2024 22:49:09 -0400 Subject: [PATCH 47/61] fix PyPI verification issue --- .../src/kestrel_datasource_stixshifter/connector.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py index 370eecea..a4ad58ad 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py @@ -12,8 +12,8 @@ _logger = logging.getLogger(__name__) -XPATH_PYPI_PKG_HOME = "/html/body/main/div[4]/div/div/div[1]/div[2]/ul/li[1]/a/@href" -XPATH_PYPI_PKG_SOURCE = "/html/body/main/div[4]/div/div/div[1]/div[2]/ul/li[2]/a/@href" +XPATH_PYPI_PKG_HOME = [f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[1]/a/@href" for i in range(5)] +XPATH_PYPI_PKG_SOURCE = [f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[2]/a/@href" for i in range(5)] STIX_SHIFTER_HOMEPAGE = "https://github.com/opencybersecurityalliance/stix-shifter" @@ -39,8 +39,8 @@ def verify_package_origin(connector_name, stixshifter_version, requests_verify=T ) try: - p_homepage = pypi_etree.xpath(XPATH_PYPI_PKG_HOME)[0] - p_source = pypi_etree.xpath(XPATH_PYPI_PKG_SOURCE)[0] + p_homepage = [urls for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_HOME] if urls][0][0] + p_source = [urls for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_SOURCE] if urls][0][0] except: raise DataSourceError( f'STIX-shifter connector for "{connector_name}" is not installed ' From dcd0a1fbf0af39a1fceb85074d2c5d7753c849c9 Mon Sep 17 00:00:00 2001 From: Xiaokui 
Shu Date: Tue, 16 Apr 2024 22:52:47 -0400 Subject: [PATCH 48/61] style update --- .../connector.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py index a4ad58ad..d090f003 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/connector.py @@ -12,8 +12,12 @@ _logger = logging.getLogger(__name__) -XPATH_PYPI_PKG_HOME = [f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[1]/a/@href" for i in range(5)] -XPATH_PYPI_PKG_SOURCE = [f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[2]/a/@href" for i in range(5)] +XPATH_PYPI_PKG_HOME = [ + f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[1]/a/@href" for i in range(5) +] +XPATH_PYPI_PKG_SOURCE = [ + f"/html/body/main/div[4]/div/div/div[1]/div[{i}]/ul/li[2]/a/@href" for i in range(5) +] STIX_SHIFTER_HOMEPAGE = "https://github.com/opencybersecurityalliance/stix-shifter" @@ -39,8 +43,16 @@ def verify_package_origin(connector_name, stixshifter_version, requests_verify=T ) try: - p_homepage = [urls for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_HOME] if urls][0][0] - p_source = [urls for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_SOURCE] if urls][0][0] + p_homepage = [ + urls + for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_HOME] + if urls + ][0][0] + p_source = [ + urls + for urls in [pypi_etree.xpath(xpath) for xpath in XPATH_PYPI_PKG_SOURCE] + if urls + ][0][0] except: raise DataSourceError( f'STIX-shifter connector for "{connector_name}" is not installed ' From 4d6e443d37780f3c8c4a31430784225287b9705f Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 18 Apr 2024 16:30:37 -0400 Subject: [PATCH 49/61] upgrade to 
stix-shifter v7 and hack verify_cert --- .../pyproject.toml | 4 ++-- .../kestrel_datasource_stixshifter/config.py | 10 ++++++++++ .../diagnosis.py | 6 ++++++ .../interface.py | 2 +- .../multiproc.py | 2 ++ .../kestrel_datasource_stixshifter/query.py | 2 ++ .../worker/transmitter.py | 12 +++++++++++- .../worker/utils.py | 18 ++++++++++++++++++ .../tests/test_stixshifter.py | 7 +++++-- 9 files changed, 57 insertions(+), 6 deletions(-) diff --git a/packages/kestrel_datasource_stixshifter/pyproject.toml b/packages/kestrel_datasource_stixshifter/pyproject.toml index b4e4f830..0a632566 100644 --- a/packages/kestrel_datasource_stixshifter/pyproject.toml +++ b/packages/kestrel_datasource_stixshifter/pyproject.toml @@ -31,8 +31,8 @@ dependencies = [ "lxml>=4.9.3", "requests>=2.31.0", "nest-asyncio>=1.5.8", - "stix-shifter==6.2.2", - "stix-shifter-utils==6.2.2", + "stix-shifter==7.0.6", + "stix-shifter-utils==7.0.6", ] [project.optional-dependencies] diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py index 27df919a..73eb8ff8 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/config.py @@ -22,6 +22,7 @@ SINGLE_BATCH_TIMEOUT = 60 COOL_DOWN_AFTER_TRANSMISSION = 0 ALLOW_DEV_CONNECTOR = False +VERIFY_CERT = True FAST_TRANSLATE_CONNECTORS = [] # Suggested: ["qradar", "elastic_ecs"] @@ -175,6 +176,14 @@ def get_datasource_from_profiles(profile_name, profiles): profile_name, ) + verify_cert = _extract_param_from_connection_config( + "verify_cert", + bool, + VERIFY_CERT, + connection, + profile_name, + ) + return ( connector_name, connection, @@ -182,6 +191,7 @@ def get_datasource_from_profiles(profile_name, profiles): retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, + verify_cert, ) diff --git 
a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py index 487f7944..6b34ebf9 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py @@ -11,6 +11,7 @@ from kestrel_datasource_stixshifter.worker import STOP_SIGN from kestrel_datasource_stixshifter.query import translate_query from kestrel_datasource_stixshifter.worker.transmitter import Transmitter +from kestrel_datasource_stixshifter.worker.utils import disable_cert_verification_on_transmission from stix_shifter.stix_transmission import stix_transmission @@ -26,6 +27,7 @@ def __init__(self, datasource_name): self.retrieval_batch_size, self.cool_down_after_transmission, self.allow_dev_connector, + self.verify_cert, ) = get_datasource_from_profiles(datasource_name, self.profiles) self.if_fast_translation = ( self.connector_name in self.kestrel_options["fast_translate"] @@ -72,6 +74,9 @@ def diagnose_ping(self): self.configuration_dict, ) + if not self.verify_cert: + disable_cert_verification_on_transmission(transmission) + result = transmission.ping() print() @@ -125,6 +130,7 @@ def diagnose_run_query_and_retrieval_result(self, stix_patterns, max_batch_cnt): self.configuration_dict, self.retrieval_batch_size, self.cool_down_after_transmission, + self.verify_cert, query, result_queue, max_batch_cnt * self.retrieval_batch_size, diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py index a662ea4b..f783953e 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py @@ -26,9 +26,9 @@ 
connection: host: elastic.securitylog.company.com port: 9200 - selfSignedCert: false # this means do NOT check cert indices: host101 options: # use any of this section when needed + verify_cert: false # allow invalid/expired/self-signed certificate retrieval_batch_size: 10000 # set to 10000 to match default Elasticsearch page size; Kestrel default across connectors: 2000 single_batch_timeout: 120 # increase it if hit 60 seconds (Kestrel default) timeout error for each batch of retrieval cool_down_after_transmission: 2 # seconds to cool down between data source API calls, required by some API such as sentinelone; Kestrel default: 0 diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py index aeadfc83..cdb1a719 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/multiproc.py @@ -22,6 +22,7 @@ def transmit( retrieval_batch_size: int, translators_count: int, cool_down_after_transmission: int, + verify_cert: bool, queries: list, raw_records_queue: Queue, limit: Optional[int], @@ -34,6 +35,7 @@ def transmit( retrieval_batch_size, translators_count, cool_down_after_transmission, + verify_cert, queries, raw_records_queue, limit, diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py index fa0d61e5..46b07b7f 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/query.py @@ -83,6 +83,7 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None): retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, + verify_cert, ) = map( 
copy.deepcopy, get_datasource_from_profiles(profile, config["profiles"]) ) @@ -123,6 +124,7 @@ def query_datasource(uri, pattern, session_id, config, store, limit=None): retrieval_batch_size, config["options"]["translation_workers_count"], cool_down_after_transmission, + verify_cert, dsl["queries"], raw_records_queue, profile_limit, diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py index ca4cd1c0..794625d7 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py @@ -6,7 +6,7 @@ from stix_shifter.stix_transmission import stix_transmission from kestrel_datasource_stixshifter.worker import STOP_SIGN -from kestrel_datasource_stixshifter.worker.utils import TransmissionResult, WorkerLog +from kestrel_datasource_stixshifter.worker.utils import TransmissionResult, WorkerLog, disable_cert_verification_on_transmission @typechecked @@ -19,6 +19,7 @@ def __init__( retrieval_batch_size: int, number_of_translators: int, cool_down_after_transmission: int, + verify_cert: bool, queries: list, output_queue: Queue, limit: Optional[int], @@ -31,6 +32,7 @@ def __init__( self.retrieval_batch_size = retrieval_batch_size self.number_of_translators = number_of_translators self.cool_down_after_transmission = cool_down_after_transmission + self.verify_cert = verify_cert self.queries = queries self.queue = output_queue self.limit = limit @@ -43,6 +45,7 @@ def run(self): self.configuration_dict, self.retrieval_batch_size, self.cool_down_after_transmission, + self.verify_cert, query, self.queue, self.limit, @@ -65,6 +68,7 @@ def __init__( configuration_dict: dict, retrieval_batch_size: int, cool_down_after_transmission: int, + verify_cert: bool, query: str, output_queue: Queue, limit: 
Optional[int], @@ -76,6 +80,7 @@ def __init__( self.configuration_dict = configuration_dict self.retrieval_batch_size = retrieval_batch_size self.cool_down_after_transmission = cool_down_after_transmission + self.verify_cert = verify_cert self.query = query self.queue = output_queue self.limit = limit @@ -87,6 +92,11 @@ def run(self): self.connection_dict, self.configuration_dict, ) + + # hack stix-shifter v7 to support "disable certificate verification" + if not self.verify_cert: + disable_cert_verification_on_transmission(self.transmission) + search_meta_result = self.transmission.query(self.query) if search_meta_result["success"]: diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py index 9a8d00af..420487ab 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py @@ -1,6 +1,8 @@ +import ssl from typing import Optional, Union, List from dataclasses import dataclass from pandas import DataFrame +from stix_shifter.stix_transmission.stix_transmission import StixTransmission STOP_SIGN = "STOP" @@ -30,3 +32,19 @@ class TranslationResult: success: bool data: Union[None, dict, DataFrame] log: Optional[WorkerLog] + + +def disable_cert_verification_on_transmission(trans: StixTransmission): + ot = trans.entry_point.transmission() + + # currently all the following attributes point to the same object + # iterate through them in case stix-shifter code changes in the future + for eps in [ + "_BaseEntryPoint__ping_connector", + "_BaseEntryPoint__query_connector", + "_BaseEntryPoint__results_connector", + "_BaseEntryPoint__status_connector", + ]: + ep = getattr(ot, eps) + ep.api_client.client.ssl_context.check_hostname = False + ep.api_client.client.ssl_context.verify_mode = ssl.CERT_NONE diff 
--git a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py index 89b62efa..610a513c 100644 --- a/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py +++ b/packages/kestrel_datasource_stixshifter/tests/test_stixshifter.py @@ -78,6 +78,7 @@ def test_yaml_profiles_refresh(tmp_path): single_batch_timeout: 120 cool_down_after_transmission: 5 allow_dev_connector: True + verify_cert: false dialects: - beats config: @@ -106,7 +107,7 @@ def test_yaml_profiles_refresh(tmp_path): ss_config = s.config["datasources"]["kestrel_datasource_stixshifter"] ss_profiles = ss_config["profiles"] - connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector = get_datasource_from_profiles("host101", ss_profiles) + connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles) assert connector_name == "elastic_ecs" assert configuration["auth"]["id"] == "profileA" assert configuration["auth"]["api_key"] == "qwer" @@ -114,6 +115,7 @@ def test_yaml_profiles_refresh(tmp_path): assert connection["options"]["result_limit"] == 2000 * 2 assert retrieval_batch_size == 2000 assert cool_down_after_transmission == 0 + assert verify_cert == True with open(profile_file, "w") as pf: pf.write(profileB) @@ -122,7 +124,7 @@ def test_yaml_profiles_refresh(tmp_path): # need to refresh the pointers since the dict is updated ss_profiles = ss_config["profiles"] - connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector = get_datasource_from_profiles("host101", ss_profiles) + connector_name, connection, configuration, retrieval_batch_size, cool_down_after_transmission, allow_dev_connector, verify_cert = get_datasource_from_profiles("host101", ss_profiles) assert connector_name == 
"elastic_ecs" assert configuration["auth"]["id"] == "profileB" assert configuration["auth"]["api_key"] == "xxxxxx" @@ -131,5 +133,6 @@ def test_yaml_profiles_refresh(tmp_path): assert retrieval_batch_size == 10000 assert cool_down_after_transmission == 5 assert allow_dev_connector == True + assert verify_cert == False del os.environ["KESTREL_STIXSHIFTER_CONFIG"] From cbda884a46c9c809fc5cccbfab5c766646da3f32 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 18 Apr 2024 16:35:32 -0400 Subject: [PATCH 50/61] code style update --- .../src/kestrel_datasource_stixshifter/diagnosis.py | 4 +++- .../kestrel_datasource_stixshifter/worker/transmitter.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py index 6b34ebf9..c3631f7a 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/diagnosis.py @@ -11,7 +11,9 @@ from kestrel_datasource_stixshifter.worker import STOP_SIGN from kestrel_datasource_stixshifter.query import translate_query from kestrel_datasource_stixshifter.worker.transmitter import Transmitter -from kestrel_datasource_stixshifter.worker.utils import disable_cert_verification_on_transmission +from kestrel_datasource_stixshifter.worker.utils import ( + disable_cert_verification_on_transmission, +) from stix_shifter.stix_transmission import stix_transmission diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py index 794625d7..31534781 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py +++ 
b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/transmitter.py @@ -6,7 +6,11 @@ from stix_shifter.stix_transmission import stix_transmission from kestrel_datasource_stixshifter.worker import STOP_SIGN -from kestrel_datasource_stixshifter.worker.utils import TransmissionResult, WorkerLog, disable_cert_verification_on_transmission +from kestrel_datasource_stixshifter.worker.utils import ( + TransmissionResult, + WorkerLog, + disable_cert_verification_on_transmission, +) @typechecked From 8ac3f6505e499d9f456be15f38bbb1b4cb79d108 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Thu, 18 Apr 2024 17:22:20 -0400 Subject: [PATCH 51/61] trigger integration testing --- .../src/kestrel_datasource_stixshifter/interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py index f783953e..27a255ad 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py @@ -28,7 +28,7 @@ port: 9200 indices: host101 options: # use any of this section when needed - verify_cert: false # allow invalid/expired/self-signed certificate + verify_cert: false # allow invalid/expired/self-signed certificate retrieval_batch_size: 10000 # set to 10000 to match default Elasticsearch page size; Kestrel default across connectors: 2000 single_batch_timeout: 120 # increase it if hit 60 seconds (Kestrel default) timeout error for each batch of retrieval cool_down_after_transmission: 2 # seconds to cool down between data source API calls, required by some API such as sentinelone; Kestrel default: 0 From 1a60dcf99f1d3ca9c2a9e2c5f830f0b0e1cd5a66 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 10:02:39 -0400 Subject: [PATCH 52/61] 
generalize locating objects to patch in the hack --- .../worker/utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py index 420487ab..406b4570 100644 --- a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/worker/utils.py @@ -39,12 +39,11 @@ def disable_cert_verification_on_transmission(trans: StixTransmission): # currently all the following attributes point to the same object # iterate through them in case stix-shifter code changes in the future - for eps in [ - "_BaseEntryPoint__ping_connector", - "_BaseEntryPoint__query_connector", - "_BaseEntryPoint__results_connector", - "_BaseEntryPoint__status_connector", + for attr in [ + x + for x in dir(ot) + if x.startswith("_BaseEntryPoint__") and x.endswith("_connector") ]: - ep = getattr(ot, eps) - ep.api_client.client.ssl_context.check_hostname = False - ep.api_client.client.ssl_context.verify_mode = ssl.CERT_NONE + c = getattr(ot, attr) + c.api_client.client.ssl_context.check_hostname = False + c.api_client.client.ssl_context.verify_mode = ssl.CERT_NONE From f538fce13deae31a21f84006982680b55735ee0e Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 10:27:22 -0400 Subject: [PATCH 53/61] enable unit test for Python 3.12 --- .github/workflows/unit-testing.yml | 8 ++++---- packages/kestrel_core/pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 1733bfba..2c0f3a8e 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -23,7 +23,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', 
'3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: @@ -52,7 +52,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11.6'] + python-version: ['3.8', '3.9', '3.10', '3.11.6', '3.12'] runs-on: ${{ matrix.os }} defaults: run: @@ -78,7 +78,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: @@ -107,7 +107,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: diff --git a/packages/kestrel_core/pyproject.toml b/packages/kestrel_core/pyproject.toml index e8fcfa87..a73e75b3 100644 --- a/packages/kestrel_core/pyproject.toml +++ b/packages/kestrel_core/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "pandas>=2.0.3", "pyarrow>=13.0.0", "tabulate>=0.9.0", - "firepit>=2.3.32", + "firepit>=2.3.33", ] [project.optional-dependencies] From dee581a0ad3b83f39be0f68bc256a12dadb386ba Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 13:59:57 -0400 Subject: [PATCH 54/61] update python setup env --- .github/workflows/unit-testing.yml | 16 ++++++++-------- .../pyproject.toml | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index 2c0f3a8e..a7bb6d69 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -30,9 +30,9 @@ jobs: shell: bash working-directory: ./packages/kestrel_core steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -59,9 +59,9 
@@ jobs: shell: bash working-directory: ./packages/kestrel_datasource_stixshifter steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -85,9 +85,9 @@ jobs: shell: bash working-directory: ./packages/kestrel_analytics_python steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -114,9 +114,9 @@ jobs: shell: bash working-directory: ./packages/kestrel_jupyter steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools diff --git a/packages/kestrel_datasource_stixshifter/pyproject.toml b/packages/kestrel_datasource_stixshifter/pyproject.toml index 0a632566..c418eaf5 100644 --- a/packages/kestrel_datasource_stixshifter/pyproject.toml +++ b/packages/kestrel_datasource_stixshifter/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "kestrel_core>=1.8.1", "lxml>=4.9.3", "requests>=2.31.0", - "nest-asyncio>=1.5.8", + "nest-asyncio>=1.6.0", "stix-shifter==7.0.6", "stix-shifter-utils==7.0.6", ] From 29fc7a4ebd01b36dcb051b1efc58b3a65afbe761 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 14:29:28 -0400 Subject: [PATCH 55/61] try to fix Python 3.12 issue --- .../src/kestrel_datasource_stixshifter/interface.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py index 27a255ad..9435cebe 100644 --- 
a/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py +++ b/packages/kestrel_datasource_stixshifter/src/kestrel_datasource_stixshifter/interface.py @@ -127,11 +127,15 @@ """ +import multiprocessing from kestrel.datasource import AbstractDataSourceInterface from kestrel_datasource_stixshifter.config import load_profiles from kestrel_datasource_stixshifter.query import query_datasource +multiprocessing.set_start_method("spawn", force=True) + + class StixShifterInterface(AbstractDataSourceInterface): @staticmethod def schemes(): From f93077a7940654901517e241cf03d384ba9b8101 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 14:35:32 -0400 Subject: [PATCH 56/61] upgrade github action module version --- .github/workflows/code-coverage.yml | 4 ++-- .github/workflows/code-style.yml | 4 ++-- .github/workflows/kaas-docker-image.yml | 2 +- .github/workflows/publish-to-pypi.yml | 4 ++-- .github/workflows/stixshifter-module-verification.yml | 4 ++-- .github/workflows/unit-testing-kestrel2.yml | 8 ++++---- .github/workflows/unused-import.yml | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index bb769852..91a98060 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -22,9 +22,9 @@ jobs: codecov: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Python Tools diff --git a/.github/workflows/code-style.yml b/.github/workflows/code-style.yml index 5adbab58..6c4e59e7 100644 --- a/.github/workflows/code-style.yml +++ b/.github/workflows/code-style.yml @@ -22,9 +22,9 @@ jobs: codestyle: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + 
uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Kestrel package diff --git a/.github/workflows/kaas-docker-image.yml b/.github/workflows/kaas-docker-image.yml index 1d36879e..1738aa07 100644 --- a/.github/workflows/kaas-docker-image.yml +++ b/.github/workflows/kaas-docker-image.yml @@ -14,7 +14,7 @@ jobs: run: sleep 600s shell: bash - name: Checkout - uses: actions/checkout@v3.5.3 + uses: actions/checkout@v4 - name: Info run: echo "Parameters. ${{ github.event.base_ref }}, ${{ github.ref_type }}, ${{ github.ref }}" - name: Log in to Docker Hub diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 47c52fb2..343fcf29 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -22,8 +22,8 @@ jobs: shell: bash working-directory: ./packages/${{ matrix.package }} steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install building environment diff --git a/.github/workflows/stixshifter-module-verification.yml b/.github/workflows/stixshifter-module-verification.yml index f9b1265c..66949595 100644 --- a/.github/workflows/stixshifter-module-verification.yml +++ b/.github/workflows/stixshifter-module-verification.yml @@ -15,9 +15,9 @@ jobs: shell: bash working-directory: ./packages/kestrel_datasource_stixshifter steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Python Tools diff --git a/.github/workflows/unit-testing-kestrel2.yml b/.github/workflows/unit-testing-kestrel2.yml index 65c73044..4113a1e1 100644 --- a/.github/workflows/unit-testing-kestrel2.yml +++ b/.github/workflows/unit-testing-kestrel2.yml @@ -30,9 +30,9 @@ jobs: shell: bash working-directory: ./packages-nextgen/kestrel_core steps: - - uses: 
actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools @@ -53,9 +53,9 @@ jobs: shell: bash working-directory: ./packages-nextgen/kestrel_interface_opensearch steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install Python Tools diff --git a/.github/workflows/unused-import.yml b/.github/workflows/unused-import.yml index 150c9b34..e1174ba5 100644 --- a/.github/workflows/unused-import.yml +++ b/.github/workflows/unused-import.yml @@ -22,9 +22,9 @@ jobs: unusedimports: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: '3.10' - name: Install Kestrel package From 11422a91f94f7d48f1328a7a6837b8f63536e078 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 15:58:33 -0400 Subject: [PATCH 57/61] bump lib versions --- docs/installation/runtime.rst | 7 ++++++- packages/kestrel_analytics_docker/pyproject.toml | 2 +- packages/kestrel_core/pyproject.toml | 8 ++++---- packages/kestrel_datasource_stixshifter/pyproject.toml | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/installation/runtime.rst b/docs/installation/runtime.rst index c220f264..40842b80 100644 --- a/docs/installation/runtime.rst +++ b/docs/installation/runtime.rst @@ -8,7 +8,11 @@ please use Python inside Windows Subsystem for Linux (WSL). General Requirements ==================== -Python 3.8 is required. Follow the `Python installation guide`_ to install or upgrade Python. +Python 3 + +* End-of-life Python versions are not supported. Check `Python releases`_. 
+ +* Follow the `Python installation guide`_ to install or upgrade Python. OS-specific Requirements ======================== @@ -190,6 +194,7 @@ What's to Do Next - :doc:`../language/index` .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ +.. _Python releases: https://devguide.python.org/versions/ .. _Python virtual environment: https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/ .. _Xcode: https://developer.apple.com/xcode/ .. _kestrel-lang: http://github.com/opencybersecurityalliance/kestrel-lang diff --git a/packages/kestrel_analytics_docker/pyproject.toml b/packages/kestrel_analytics_docker/pyproject.toml index 1f668918..70422692 100644 --- a/packages/kestrel_analytics_docker/pyproject.toml +++ b/packages/kestrel_analytics_docker/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ dependencies = [ "kestrel_core>=1.8.0", - "docker>=6.1.3", + "docker>=7.0.0", ] [project.urls] diff --git a/packages/kestrel_core/pyproject.toml b/packages/kestrel_core/pyproject.toml index a73e75b3..31f330ac 100644 --- a/packages/kestrel_core/pyproject.toml +++ b/packages/kestrel_core/pyproject.toml @@ -30,11 +30,11 @@ classifiers = [ ] dependencies = [ - "typeguard>=4.1.5", + "typeguard>=4.2.1", "pyyaml>=6.0.1", - "lark>=1.1.7", - "pandas>=2.0.3", - "pyarrow>=13.0.0", + "lark>=1.1.9", + "pandas>=2.2.2", + "pyarrow>=15.0.2", "tabulate>=0.9.0", "firepit>=2.3.33", ] diff --git a/packages/kestrel_datasource_stixshifter/pyproject.toml b/packages/kestrel_datasource_stixshifter/pyproject.toml index c418eaf5..227d0db9 100644 --- a/packages/kestrel_datasource_stixshifter/pyproject.toml +++ b/packages/kestrel_datasource_stixshifter/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ dependencies = [ "kestrel_core>=1.8.1", - "lxml>=4.9.3", + "lxml>=5.2.1", "requests>=2.31.0", "nest-asyncio>=1.6.0", "stix-shifter==7.0.6", From acbf94b459d45b04632eab01736b7749b0f209fd Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 
2024 16:03:28 -0400 Subject: [PATCH 58/61] pandas dep version fix --- packages/kestrel_core/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/kestrel_core/pyproject.toml b/packages/kestrel_core/pyproject.toml index 31f330ac..f8d76414 100644 --- a/packages/kestrel_core/pyproject.toml +++ b/packages/kestrel_core/pyproject.toml @@ -33,7 +33,7 @@ dependencies = [ "typeguard>=4.2.1", "pyyaml>=6.0.1", "lark>=1.1.9", - "pandas>=2.2.2", + "pandas>=2.0.3", # last version supporting Python 3.8 "pyarrow>=15.0.2", "tabulate>=0.9.0", "firepit>=2.3.33", From 77c2532b6d2bc6c92455a8b06c8987a108449b15 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 16:26:06 -0400 Subject: [PATCH 59/61] update unit test Python version to default 3.11 --- .github/workflows/unit-testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-testing.yml b/.github/workflows/unit-testing.yml index a7bb6d69..8af6b843 100644 --- a/.github/workflows/unit-testing.yml +++ b/.github/workflows/unit-testing.yml @@ -52,7 +52,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - python-version: ['3.8', '3.9', '3.10', '3.11.6', '3.12'] + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] runs-on: ${{ matrix.os }} defaults: run: From 1175953ed4be8413e12f525b6e5f2a41f997dbe3 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Fri, 19 Apr 2024 16:28:29 -0400 Subject: [PATCH 60/61] trigger github CI/CD unit tests --- docs/installation/runtime.rst | 2 +- packages/kestrel_core/src/kestrel/config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/installation/runtime.rst b/docs/installation/runtime.rst index 40842b80..b70d4072 100644 --- a/docs/installation/runtime.rst +++ b/docs/installation/runtime.rst @@ -8,7 +8,7 @@ please use Python inside Windows Subsystem for Linux (WSL). General Requirements ==================== -Python 3 +Python 3 is required. 
* End-of-life Python versions are not supported. Check `Python releases`_. diff --git a/packages/kestrel_core/src/kestrel/config.yaml b/packages/kestrel_core/src/kestrel/config.yaml index 182ddfe9..2470f465 100644 --- a/packages/kestrel_core/src/kestrel/config.yaml +++ b/packages/kestrel_core/src/kestrel/config.yaml @@ -5,7 +5,7 @@ language: default_datasource_schema: "stixshifter" default_analytics_schema: "python" -# how a Kestrel session is executed +# Kestrel session execution session: cache_directory_prefix: "kestrel-session-" # under system temp directory local_database_path: "local.db" From 33955f72a5cdcb642c223bdd19d13989eeb0da60 Mon Sep 17 00:00:00 2001 From: Xiaokui Shu Date: Mon, 22 Apr 2024 10:53:32 -0400 Subject: [PATCH 61/61] v1.8.3 --- CHANGELOG.rst | 16 ++++++++++++++++ packages/kestrel_analytics_docker/pyproject.toml | 2 +- packages/kestrel_core/pyproject.toml | 2 +- .../pyproject.toml | 2 +- packages/kestrel_jupyter/pyproject.toml | 8 ++++---- 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 083ad89d..bf88f8dc 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,22 @@ The format is based on `Keep a Changelog`_. 
Unreleased ========== +1.8.3 (2024-04-22) +================== + +Added +----- + +- Support of disabling certificate verification of stix-shifter v7 with config option `verify_cert` +- Documentation on how to use the `verify_cert` option in the stix-shifter interface +- Python 3.12 support (multiprocessing library behavior steering to avoid a CPU-blocking issue) +- More generic HTML parsing of PyPI for stix-shifter connector verification + +Changed +------- + +- stix-shifter upgraded to v7 (v7.0.6), the first version abandoning invalid certificate support + 1.8.2 (2024-02-20) ================== diff --git a/packages/kestrel_analytics_docker/pyproject.toml b/packages/kestrel_analytics_docker/pyproject.toml index 70422692..3b9c9283 100644 --- a/packages/kestrel_analytics_docker/pyproject.toml +++ b/packages/kestrel_analytics_docker/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_analytics_docker" -version = "1.8.0" +version = "1.8.1" description = "Kestrel Docker Analytics Interface" readme = "README.rst" requires-python = ">=3.8" diff --git a/packages/kestrel_core/pyproject.toml b/packages/kestrel_core/pyproject.toml index f8d76414..6d38a007 100644 --- a/packages/kestrel_core/pyproject.toml +++ b/packages/kestrel_core/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_core" -version = "1.8.1" +version = "1.8.2" description = "Kestrel Threat Hunting Language" readme = "README.rst" requires-python = ">=3.8" diff --git a/packages/kestrel_datasource_stixshifter/pyproject.toml b/packages/kestrel_datasource_stixshifter/pyproject.toml index 227d0db9..05e831f7 100644 --- a/packages/kestrel_datasource_stixshifter/pyproject.toml +++ b/packages/kestrel_datasource_stixshifter/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_datasource_stixshifter" -version = "1.8.1" +version = "1.8.2" description = "Kestrel STIX-shifter Datasource
Interface" readme = "README.rst" requires-python = ">=3.8" diff --git a/packages/kestrel_jupyter/pyproject.toml b/packages/kestrel_jupyter/pyproject.toml index 70887889..888a3cac 100644 --- a/packages/kestrel_jupyter/pyproject.toml +++ b/packages/kestrel_jupyter/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "kestrel_jupyter" -version = "1.8.2" +version = "1.8.3" description = "Kestrel Jupyter Kernel" readme = "README.rst" requires-python = ">=3.8" @@ -26,11 +26,11 @@ classifiers = [ ] dependencies = [ - "kestrel_core==1.8.1", + "kestrel_core==1.8.2", "kestrel_datasource_stixbundle==1.8.0", - "kestrel_datasource_stixshifter==1.8.1", + "kestrel_datasource_stixshifter==1.8.2", "kestrel_analytics_python==1.8.0", - "kestrel_analytics_docker==1.8.0", + "kestrel_analytics_docker==1.8.1", "jupyterlab-server", "jupyterlab", "jupyter_client",