From 7c443d232bcee65535b1cd8ffdb95987d0a76a30 Mon Sep 17 00:00:00 2001 From: Ben Airey Date: Mon, 16 Sep 2024 16:19:32 -0500 Subject: [PATCH 1/7] added batching to the delete call --- .../backend/public_api_client.py | 95 +++++++++++++------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/panther_analysis_tool/backend/public_api_client.py b/panther_analysis_tool/backend/public_api_client.py index 7fd4c96c..ca906f0a 100644 --- a/panther_analysis_tool/backend/public_api_client.py +++ b/panther_analysis_tool/backend/public_api_client.py @@ -24,7 +24,7 @@ import time from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Generator, Iterable, List, Optional from urllib.parse import urlparse from gql import Client as GraphQLClient @@ -329,23 +329,35 @@ def transpile_filters( def delete_saved_queries( self, params: DeleteSavedQueriesParams ) -> BackendResponse[DeleteSavedQueriesResponse]: - query = self._requests.delete_saved_queries() - delete_params = { - "input": { - "dryRun": params.dry_run, - "includeDetections": params.include_detections, - "names": params.names, - } + data = { + "names": [], + "detectionIDs": [] } - res = self._execute(query, variable_values=delete_params) + # backend's delete function can only handle 100 IDs at a time, + # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters + # Separate ID list into batches of 100 + for name_batch in _batched(params.names, 100): + gql_params = { + "input": { + "dryRun": params.dry_run, + "includeDetections": params.include_detections, + "names": name_batch, + } + } + res = self._execute(self._requests.delete_saved_queries(), variable_values=gql_params) - if res.errors: - raise BackendError(res.errors) + if res.errors: + for err in res.errors: + logging.error(err.message) + + raise BackendError(res.errors) - if res.data is None: - raise BackendError("empty data") + if res.data is None: + raise BackendError("empty data") - data = res.data.get("deleteSavedQueriesByName", {}) + query_data = res.data.get("deleteSavedQueriesByName", {}) + for field in ("names", "detectionIDs"): + data[field] += query_data.get(field) or [] return BackendResponse( status_code=200, @@ -358,24 +370,35 @@ def delete_saved_queries( def delete_detections( self, params: DeleteDetectionsParams ) -> BackendResponse[DeleteDetectionsResponse]: - gql_params = { - "input": { - "dryRun": params.dry_run, - "includeSavedQueries": params.include_saved_queries, - "ids": params.ids, - } + data = { + "ids": [], + "savedQueryNames": [] } - res = self._execute(self._requests.delete_detections_query(), gql_params) - if res.errors: - for err in res.errors: - logging.error(err.message) + # backend's delete function can only handle 100 IDs at a time, + # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters + # Separate ID list into batches of 100 + for id_batch in _batched(params.id, 100): + gql_params = { + "input": { + "dryRun": params.dry_run, + "includeSavedQueries": params.include_saved_queries, + "ids": id_batch, + } + } + res = self._execute(self._requests.delete_detections_query(), gql_params) - raise BackendError(res.errors) + if res.errors: + for err in res.errors: + logging.error(err.message) - if res.data is None: - raise BackendError("empty data") + raise BackendError(res.errors) + + if res.data is None: + raise BackendError("empty data") - data = 
res.data.get("deleteDetections", {}) + query_data = res.data.get("deleteDetections", {}) + for field in ("ids", "savedQueryNames"): + data[field] += query_data.get(field) or [] return BackendResponse( status_code=200, @@ -693,3 +716,19 @@ def _build_api_url(host: str) -> str: def _get_graphql_content_filepath(name: str) -> str: work_dir = os.path.dirname(__file__) return os.path.join(work_dir, "graphql", f"{name}.graphql") + + +def _batched(iterable: Iterable, n: int = 1) -> Generator[Iterable, None, None]: + """ Batch data from 'iterable' into chunks of length 'n'. The last batch may be shorter than 'n'. + Inspired by itertools.batched in Python version 3.12+. + + Args: + iterable (any iterable): a sequence or other iterable to be batched + n (int, optional): the maximum size of each batch. default=1 + + Yields: + out (iterable): a batch of size 'n' or smaller + """ + length = len(iterable) + for idx in range(0, length, n): + yield iterable[idx:min(idx+n, length)] \ No newline at end of file From da22a1e18b31ad35653663d69e608dd90a38a000 Mon Sep 17 00:00:00 2001 From: Ben Airey Date: Mon, 16 Sep 2024 16:28:52 -0500 Subject: [PATCH 2/7] fixed small typo --- panther_analysis_tool/backend/public_api_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/panther_analysis_tool/backend/public_api_client.py b/panther_analysis_tool/backend/public_api_client.py index ca906f0a..feeabca4 100644 --- a/panther_analysis_tool/backend/public_api_client.py +++ b/panther_analysis_tool/backend/public_api_client.py @@ -377,7 +377,7 @@ def delete_detections( # backend's delete function can only handle 100 IDs at a time, # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters # Separate ID list into batches of 100 - for id_batch in _batched(params.id, 100): + for id_batch in _batched(params.ids, 100): gql_params = { "input": { "dryRun": params.dry_run, From e233c657bb604c4f6e238e0bc23fd7421fd2f450 Mon Sep 17 00:00:00 2001 From: Ben Airey Date: Mon, 16 Sep 2024 16:49:17 -0500 Subject: [PATCH 3/7] added unit test and did formatting --- .../backend/public_api_client.py | 30 +++++------- tests/unit/panther_analysis_tool/test_util.py | 48 +++++++++++++++++++ 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/panther_analysis_tool/backend/public_api_client.py b/panther_analysis_tool/backend/public_api_client.py index feeabca4..0069dfad 100644 --- a/panther_analysis_tool/backend/public_api_client.py +++ b/panther_analysis_tool/backend/public_api_client.py @@ -24,7 +24,7 @@ import time from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Generator, Iterable, List, Optional +from typing import Any, Dict, Generator, List, Optional, Sequence from urllib.parse import urlparse from gql import Client as GraphQLClient @@ -329,10 +329,7 @@ def transpile_filters( def delete_saved_queries( self, params: DeleteSavedQueriesParams ) -> BackendResponse[DeleteSavedQueriesResponse]: - data = { - "names": [], - "detectionIDs": [] - } + data: Dict = {"names": [], "detectionIDs": []} # backend's delete function can only handle 100 IDs at a time, # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters # Separate ID list into batches of 100 @@ -349,7 +346,7 @@ def delete_saved_queries( if res.errors: for err in res.errors: logging.error(err.message) - + raise BackendError(res.errors) if res.data is None: @@ -370,10 +367,7 @@ def delete_saved_queries( def 
delete_detections( self, params: DeleteDetectionsParams ) -> BackendResponse[DeleteDetectionsResponse]: - data = { - "ids": [], - "savedQueryNames": [] - } + data: Dict = {"ids": [], "savedQueryNames": []} # backend's delete function can only handle 100 IDs at a time, # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters # Separate ID list into batches of 100 @@ -718,17 +712,17 @@ def _get_graphql_content_filepath(name: str) -> str: return os.path.join(work_dir, "graphql", f"{name}.graphql") -def _batched(iterable: Iterable, n: int = 1) -> Generator[Iterable, None, None]: - """ Batch data from 'iterable' into chunks of length 'n'. The last batch may be shorter than 'n'. +def _batched(iterable: Sequence, size: int = 1) -> Generator[Sequence, None, None]: + """Batch data from 'iterable' into chunks of length 'size'. The last batch may be shorter than 'size'. Inspired by itertools.batched in Python version 3.12+. - + Args: iterable (any iterable): a sequence or other iterable to be batched - n (int, optional): the maximum size of each batch. default=1 - + size (int, optional): the maximum size of each batch. default=1 + Yields: - out (iterable): a batch of size 'n' or smaller + out (iterable): a batch of size 'size' or smaller """ length = len(iterable) - for idx in range(0, length, n): - yield iterable[idx:min(idx+n, length)] \ No newline at end of file + for idx in range(0, length, size): + yield iterable[idx : min(idx + size, length)] diff --git a/tests/unit/panther_analysis_tool/test_util.py b/tests/unit/panther_analysis_tool/test_util.py index 93fad87a..3edaec20 100644 --- a/tests/unit/panther_analysis_tool/test_util.py +++ b/tests/unit/panther_analysis_tool/test_util.py @@ -5,6 +5,7 @@ import panther_analysis_tool.constants from panther_analysis_tool import util as pat_utils +from panther_analysis_tool.backend.public_api_client import _batched from panther_analysis_tool.util import convert_unicode @@ -200,3 +201,50 @@ def test_is_policy(self): for case in test_cases: res = pat_utils.is_policy(case["analysis_type"]) self.assertEqual(case["expected"], res) + + +class TestBatched(unittest.TestCase): + def test_batched_with_remainder(self): + iterable = [1] * 12 + n = 5 + expected_batches = 3 + modulo = 2 # Size of last batch + + batches = list(_batched(iterable, n)) + # Ensure we recieved the expected number of batches + self.assertEqual(len(batches), expected_batches) + # Confirm all but the last batch have the same size + for batch in batches[:-1]: + self.assertEqual(len(list(batch)), n) + # Confirm the last batch has the expected number of entries + self.assertEqual(len(list(batches[-1])), modulo) + + def test_batched_with_no_remainder(self): + iterable = [1] * 100 + n = 10 + expected_batches = 10 + modulo = 10 # Size of last batch + + batches = list(_batched(iterable, n)) + # Ensure we recieved the expected number of batches + self.assertEqual(len(batches), expected_batches) + # Confirm all but the last batch have the same size + for batch in batches[:-1]: + self.assertEqual(len(list(batch)), n) + # Confirm the last batch has the expected number of entries + self.assertEqual(len(list(batches[-1])), modulo) + + def test_batched_with_no_full_batches(self): + iterable = [1] * 3 + n = 5 + expected_batches = 1 + modulo = 3 # Size of last batch + + batches = list(_batched(iterable, n)) + # Ensure we recieved the expected number of batches + self.assertEqual(len(batches), expected_batches) + # Confirm all but the last batch have the 
same size
+        for batch in batches[:-1]:
+            self.assertEqual(len(list(batch)), n)
+        # Confirm the last batch has the expected number of entries
+        self.assertEqual(len(list(batches[-1])), modulo)
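Taken together, patches 1-3 introduce one pattern: slice the incoming list of names or IDs into chunks no larger than the backend's per-request limit, issue one GraphQL request per chunk, and merge the per-request results into a single response object. The short sketch below illustrates that loop in isolation and assumes nothing about the real client; delete_in_batches, send_request, and BATCH_SIZE are hypothetical names used only for this example and are not part of the API shown in the diffs above.

from typing import Callable, Dict, Generator, List, Sequence

BATCH_SIZE = 100  # stand-in for the backend's per-request limit


def batched(seq: Sequence, size: int = 1) -> Generator[Sequence, None, None]:
    """Yield successive slices of at most `size` items, mirroring the _batched helper above."""
    for idx in range(0, len(seq), size):
        yield seq[idx : idx + size]


def delete_in_batches(
    ids: List[str], send_request: Callable[[List[str]], Dict[str, List[str]]]
) -> Dict[str, List[str]]:
    """Issue one request per batch and accumulate each response's fields into one result."""
    data: Dict[str, List[str]] = {"ids": [], "savedQueryNames": []}
    for id_batch in batched(ids, BATCH_SIZE):
        response = send_request(list(id_batch))
        for field in ("ids", "savedQueryNames"):
            data[field] += response.get(field) or []
    return data


if __name__ == "__main__":
    # Fake transport that simply echoes back the IDs it was asked to delete.
    def fake_send(batch: List[str]) -> Dict[str, List[str]]:
        return {"ids": batch, "savedQueryNames": []}

    merged = delete_in_batches([f"detection-{i}" for i in range(250)], fake_send)
    print(len(merged["ids"]))  # 250, even though no single request carried more than 100 IDs

Accumulating into a plain dict keeps the merged result shaped like the response of a single un-batched call, so callers of the client never need to know that the work was split across several requests.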
From 682094317cb038ddcd048908d837615cf6fb6d9f Mon Sep 17 00:00:00 2001
From: Ben Airey
Date: Thu, 19 Sep 2024 11:06:43 -0500
Subject: [PATCH 7/7] fix magic number

---
 panther_analysis_tool/backend/public_api_client.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/panther_analysis_tool/backend/public_api_client.py b/panther_analysis_tool/backend/public_api_client.py
index 0069dfad..f4d41dd9 100644
--- a/panther_analysis_tool/backend/public_api_client.py
+++ b/panther_analysis_tool/backend/public_api_client.py
@@ -166,6 +166,10 @@ class PublicAPIClient(Client):  # pylint: disable=too-many-public-methods
     _requests: PublicAPIRequests
     _gql_client: GraphQLClient
 
+    # backend's delete function can only handle 100 IDs at a time, due to DynamoDB restrictions
+    # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters
+    _DELETE_BATCH_SIZE = 100
+
     def __init__(self, opts: PublicAPIClientOptions):
         self._user_id = opts.user_id
         self._requests = PublicAPIRequests()
@@ -330,10 +334,7 @@ def delete_saved_queries(
         self, params: DeleteSavedQueriesParams
     ) -> BackendResponse[DeleteSavedQueriesResponse]:
         data: Dict = {"names": [], "detectionIDs": []}
-        # backend's delete function can only handle 100 IDs at a time,
-        # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters
-        # Separate ID list into batches of 100
-        for name_batch in _batched(params.names, 100):
+        for name_batch in _batched(params.names, self._DELETE_BATCH_SIZE):
             gql_params = {
                 "input": {
                     "dryRun": params.dry_run,
@@ -368,10 +369,7 @@ def delete_detections(
         self, params: DeleteDetectionsParams
     ) -> BackendResponse[DeleteDetectionsResponse]:
         data: Dict = {"ids": [], "savedQueryNames": []}
-        # backend's delete function can only handle 100 IDs at a time,
-        # https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/ServiceQuotas.html#limits-expression-parameters
-        # Separate ID list into batches of 100
-        for id_batch in _batched(params.ids, 100):
+        for id_batch in _batched(params.ids, self._DELETE_BATCH_SIZE):
             gql_params = {
                 "input": {
                     "dryRun": params.dry_run,
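Patch 7 replaces the repeated literal 100 with a single _DELETE_BATCH_SIZE class constant, so the DynamoDB-derived limit is documented once rather than at every call site. A closing aside on the helper itself: its docstring cites itertools.batched, which is available from Python 3.12 onwards, so on new enough interpreters the slicing loop could in principle be replaced by the standard-library function. The snippet below is only a sketch of that alternative, not part of the patch series; note that itertools.batched yields tuples rather than slices and rejects batch sizes below 1.

import itertools
import sys

ids = [f"detection-{i}" for i in range(250)]

if sys.version_info >= (3, 12):
    # Standard-library batching, available since Python 3.12.
    batches = list(itertools.batched(ids, 100))
else:
    # Equivalent slicing fallback, matching the _batched helper from patch 1.
    batches = [ids[i : i + 100] for i in range(0, len(ids), 100)]

print(len(batches))      # 3
print(len(batches[-1]))  # 50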