From ffdd46c363453abc8d5a2f3edbc54aedc7b44f73 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Wed, 30 Oct 2024 22:00:44 +0100 Subject: [PATCH 01/21] get offset and matched_length --- src/tasks.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/tasks.py b/src/tasks.py index 4a0a88bf..a9b6aa6b 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -96,7 +96,17 @@ def update_metadata( match = Match(file=orig_name, meta=metadata, matches=matches) self.db.add_match(job, match) + @staticmethod + def get_readable_string(data): + try: + # Próbujemy zdekodować dane jako UTF-8, jeśli to możliwe + return data.decode('utf-8') + except UnicodeDecodeError: + # Jeśli nie można zdekodować, zwracamy jako ciąg heksadecymalny + return data.hex() + def execute_yara(self, job: Job, files: List[str]) -> None: + logging.info("########################################################") rule = yara.compile(source=job.raw_yara) num_matches = 0 num_errors = 0 @@ -104,11 +114,28 @@ def execute_yara(self, job: Job, files: List[str]) -> None: self.db.job_start_work(job.id, num_files) for orig_name in files: + try: path = self.plugins.filter(orig_name) if not path: continue matches = rule.match(path) + logging.info(f"matches: {matches}") + + for rule in matches: + logging.info(f"Dopasowana reguła: {rule}") + for string_match in rule.strings: + logging.info(string_match.identifier) + logging.info(string_match.instances) + logging.info("_------------------------------") + for string in string_match.instances: + logging.info(string) + logging.info(string.matched_data) + logging.info(string.matched_length) + logging.info(string.offset) + logging.info(string.xor_key) + logging.info(string.plaintext()) + if matches: self.update_metadata( job.id, orig_name, path, [r.rule for r in matches] From 511a2551a117213002d5b57f30f528d0826f1d1c Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Wed, 13 Nov 2024 16:11:31 +0100 Subject: [PATCH 02/21] Draft code witch context example --- src/models/match.py | 1 + src/tasks.py | 37 ++++++++++++++++++++----------------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/models/match.py b/src/models/match.py index fde9af51..1a1816d5 100644 --- a/src/models/match.py +++ b/src/models/match.py @@ -22,3 +22,4 @@ class Match(SQLModel, table=True): ) ) job: Job = Relationship(back_populates="matches") + context: Dict[str, List[str]] = Field(sa_column=Column(JSON)) diff --git a/src/tasks.py b/src/tasks.py index a9b6aa6b..97506efd 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,4 +1,4 @@ -from typing import List, Optional, cast +from typing import List, Optional, cast, Dict import logging from rq import get_current_job, Queue # type: ignore from redis import Redis @@ -68,7 +68,7 @@ def get_datasets(self) -> List[str]: return list(result["result"]["datasets"].keys()) def update_metadata( - self, job: JobId, orig_name: str, path: str, matches: List[str] + self, job: JobId, orig_name: str, path: str, matches: List[str], context: Dict[str, List[str]] ) -> None: """Saves matches to the database, and runs appropriate metadata plugins. @@ -93,7 +93,7 @@ def update_metadata( del metadata["path"] # Update the database. - match = Match(file=orig_name, meta=metadata, matches=matches) + match = Match(file=orig_name, meta=metadata, matches=matches, context=context) self.db.add_match(job, match) @staticmethod @@ -120,25 +120,28 @@ def execute_yara(self, job: Job, files: List[str]) -> None: if not path: continue matches = rule.match(path) - logging.info(f"matches: {matches}") + context = {} for rule in matches: - logging.info(f"Dopasowana reguła: {rule}") + match_string_data = [] for string_match in rule.strings: - logging.info(string_match.identifier) - logging.info(string_match.instances) - logging.info("_------------------------------") - for string in string_match.instances: - logging.info(string) - logging.info(string.matched_data) - logging.info(string.matched_length) - logging.info(string.offset) - logging.info(string.xor_key) - logging.info(string.plaintext()) - + expression_keys = [] + for expression_key in string_match.instances: + if str(expression_key) not in expression_keys: + match_string_data.append( + f"{expression_key.offset}:{expression_key.matched_length}" + f":{string_match.identifier} {expression_key}" + ) + context.update( + { + rule: match_string_data, + } + ) + expression_keys.append(str(expression_key)) + logging.info(f"context {context}") if matches: self.update_metadata( - job.id, orig_name, path, [r.rule for r in matches] + job.id, orig_name, path, [r.rule for r in matches], context ) num_matches += 1 except yara.Error: From 106eff09799eee2dcd361ad0d3df7969d7325068 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Thu, 14 Nov 2024 12:00:40 +0100 Subject: [PATCH 03/21] Matches offest, len --- src/db.py | 2 ++ ...0_added_context_column_into_match_table.py | 27 +++++++++++++++++++ src/tasks.py | 7 ++--- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py diff --git a/src/db.py b/src/db.py index 4c85d1be..9a207be4 100644 --- a/src/db.py +++ b/src/db.py @@ -1,3 +1,5 @@ +import logging + from alembic.config import Config from alembic import command from pathlib import Path diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py new file mode 100644 index 00000000..8b42a6b4 --- /dev/null +++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py @@ -0,0 +1,27 @@ +"""Added context column into match table +Revision ID: f623e1057b00 +Revises: 6b495d5a4855 +Create Date: 2024-11-13 15:14:14.618258 +""" +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision = 'f623e1057b00' +down_revision = '6b495d5a4855' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('match', sa.Column('context', sa.JSON(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('match', 'context') + # ### end Alembic commands ### diff --git a/src/tasks.py b/src/tasks.py index 97506efd..c5a4816c 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,3 +1,4 @@ +import json from typing import List, Optional, cast, Dict import logging from rq import get_current_job, Queue # type: ignore @@ -134,14 +135,14 @@ def execute_yara(self, job: Job, files: List[str]) -> None: ) context.update( { - rule: match_string_data, + str(rule): match_string_data, } ) expression_keys.append(str(expression_key)) - logging.info(f"context {context}") if matches: + logging.info(f"context {context}") self.update_metadata( - job.id, orig_name, path, [r.rule for r in matches], context + job=job.id, orig_name=orig_name, path=path, matches=[r.rule for r in matches], context=context ) num_matches += 1 except yara.Error: From 8021242564edc87af33f31fae4f0a2e755331d58 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 00:12:52 +0100 Subject: [PATCH 04/21] Matching witch before and after ocntext. --- ...0_added_context_column_into_match_table.py | 8 +- src/tasks.py | 96 ++++++++++++------- 2 files changed, 68 insertions(+), 36 deletions(-) diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py index 8b42a6b4..48eb7df4 100644 --- a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py +++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py @@ -9,19 +9,19 @@ # revision identifiers, used by Alembic. -revision = 'f623e1057b00' -down_revision = '6b495d5a4855' +revision = "f623e1057b00" +down_revision = "6b495d5a4855" branch_labels = None depends_on = None def upgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - op.add_column('match', sa.Column('context', sa.JSON(), nullable=True)) + op.add_column("match", sa.Column("context", sa.JSON(), nullable=True)) # ### end Alembic commands ### def downgrade() -> None: # ### commands auto generated by Alembic - please adjust! ### - op.drop_column('match', 'context') + op.drop_column("match", "context") # ### end Alembic commands ### diff --git a/src/tasks.py b/src/tasks.py index c5a4816c..b0bd2364 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,5 +1,4 @@ -import json -from typing import List, Optional, cast, Dict +from typing import List, Optional, cast, Dict, Any import logging from rq import get_current_job, Queue # type: ignore from redis import Redis @@ -69,7 +68,12 @@ def get_datasets(self) -> List[str]: return list(result["result"]["datasets"].keys()) def update_metadata( - self, job: JobId, orig_name: str, path: str, matches: List[str], context: Dict[str, List[str]] + self, + job: JobId, + orig_name: str, + path: str, + matches: List[str], + context: Dict[str, List[Dict[str, bytes]]] ) -> None: """Saves matches to the database, and runs appropriate metadata plugins. @@ -94,20 +98,45 @@ def update_metadata( del metadata["path"] # Update the database. - match = Match(file=orig_name, meta=metadata, matches=matches, context=context) + match = Match( + file=orig_name, meta=metadata, matches=matches, context=context + ) self.db.add_match(job, match) @staticmethod - def get_readable_string(data): - try: - # Próbujemy zdekodować dane jako UTF-8, jeśli to możliwe - return data.decode('utf-8') - except UnicodeDecodeError: - # Jeśli nie można zdekodować, zwracamy jako ciąg heksadecymalny - return data.hex() + def read_file(file_path: str) -> bytes: + """Reads the entire file content. + + Returns: + bytes: The content of the file. + """ + with open(file_path, "rb") as file: + return file.read() + + @staticmethod + def read_bytes_from_offset( + data: bytes, matched_length: int, offset: int, byte_range: int = 32 + ) -> tuple[bytes, bytes, bytes]: + """Reads a specific range of bytes from the already loaded file content around a given offset. + + Args: + data (bytes): Data to read. + matched_length (int): Number of bytes to read. + offset (int): The offset in bytes from which to start reading. + byte_range (int): The range in bytes to read around the offset (default is 32). + + Returns: + bytes: A chunk of bytes from the file, starting from the given offset minus bit_range + and ending at offset plus matched_length and byte_range. + """ + + before = data[offset - byte_range: offset] + matching = data[offset: offset + matched_length] + after = data[offset + matched_length: offset + matched_length + byte_range] + + return before, matching, after def execute_yara(self, job: Job, files: List[str]) -> None: - logging.info("########################################################") rule = yara.compile(source=job.raw_yara) num_matches = 0 num_errors = 0 @@ -115,34 +144,37 @@ def execute_yara(self, job: Job, files: List[str]) -> None: self.db.job_start_work(job.id, num_files) for orig_name in files: - try: path = self.plugins.filter(orig_name) if not path: continue matches = rule.match(path) - context = {} - for rule in matches: - match_string_data = [] - for string_match in rule.strings: - expression_keys = [] - for expression_key in string_match.instances: - if str(expression_key) not in expression_keys: - match_string_data.append( - f"{expression_key.offset}:{expression_key.matched_length}" - f":{string_match.identifier} {expression_key}" - ) - context.update( - { - str(rule): match_string_data, - } - ) - expression_keys.append(str(expression_key)) if matches: - logging.info(f"context {context}") + data = self.read_file(path) + context = {} + + for rule in matches: + match_context = [] + for string_match in rule.strings: + expression_keys = [] + for expression_key in string_match.instances: + if expression_key not in expression_keys: + before, matching, after = self.read_bytes_from_offset( + data=data, + offset=expression_key.offset, + matched_length=expression_key.matched_length, + ) + match_context.append({"before": before, "matching": matching, "after": after}) + context.update({str(rule): match_context}) + expression_keys.append(expression_key) + self.update_metadata( - job=job.id, orig_name=orig_name, path=path, matches=[r.rule for r in matches], context=context + job=job.id, + orig_name=orig_name, + path=path, + matches=[r.rule for r in matches], + context=context, ) num_matches += 1 except yara.Error: From e041af2aa20a0a7f78063a4212e5225a240eb74d Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 00:17:04 +0100 Subject: [PATCH 05/21] lint --- src/tasks.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/tasks.py b/src/tasks.py index b0bd2364..d198396a 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -73,7 +73,7 @@ def update_metadata( orig_name: str, path: str, matches: List[str], - context: Dict[str, List[Dict[str, bytes]]] + context: Dict[str, List[Dict[str, bytes]]], ) -> None: """Saves matches to the database, and runs appropriate metadata plugins. @@ -130,9 +130,11 @@ def read_bytes_from_offset( and ending at offset plus matched_length and byte_range. """ - before = data[offset - byte_range: offset] + before = data[offset - byte_range : offset] matching = data[offset: offset + matched_length] - after = data[offset + matched_length: offset + matched_length + byte_range] + after = data[ + offset + matched_length : offset + matched_length + byte_range + ] return before, matching, after @@ -160,12 +162,22 @@ def execute_yara(self, job: Job, files: List[str]) -> None: expression_keys = [] for expression_key in string_match.instances: if expression_key not in expression_keys: - before, matching, after = self.read_bytes_from_offset( + ( + before, + matching, + after, + ) = self.read_bytes_from_offset( data=data, offset=expression_key.offset, matched_length=expression_key.matched_length, ) - match_context.append({"before": before, "matching": matching, "after": after}) + match_context.append( + { + "before": before, + "matching": matching, + "after": after, + } + ) context.update({str(rule): match_context}) expression_keys.append(expression_key) From 7b1b2e8404b9cc1eebbb735319fb2e67220f75f2 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 00:18:46 +0100 Subject: [PATCH 06/21] lint --- src/db.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/db.py b/src/db.py index 9a207be4..4c85d1be 100644 --- a/src/db.py +++ b/src/db.py @@ -1,5 +1,3 @@ -import logging - from alembic.config import Config from alembic import command from pathlib import Path From ebc9277a1e1e295dfd4ec4a9a3f3f1b790464184 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 00:20:32 +0100 Subject: [PATCH 07/21] lint --- .../f623e1057b00_added_context_column_into_match_table.py | 1 - src/tasks.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py index 48eb7df4..50e67f73 100644 --- a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py +++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py @@ -5,7 +5,6 @@ """ from alembic import op import sqlalchemy as sa -import sqlmodel # revision identifiers, used by Alembic. diff --git a/src/tasks.py b/src/tasks.py index d198396a..6f3dbdec 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,4 +1,4 @@ -from typing import List, Optional, cast, Dict, Any +from typing import List, Optional, cast, Dict import logging from rq import get_current_job, Queue # type: ignore from redis import Redis From a536f95fb255bbb1105d5181f7c292b0cb968fe3 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 00:22:03 +0100 Subject: [PATCH 08/21] lint --- src/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasks.py b/src/tasks.py index 6f3dbdec..1afc2938 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -131,7 +131,7 @@ def read_bytes_from_offset( """ before = data[offset - byte_range : offset] - matching = data[offset: offset + matched_length] + matching = data[offset : offset + matched_length] after = data[ offset + matched_length : offset + matched_length + byte_range ] From acd85875049df277afe37f580385b7076e33e133 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 10:33:53 +0100 Subject: [PATCH 09/21] bytes into base64 modified --- src/models/match.py | 2 +- src/tasks.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/models/match.py b/src/models/match.py index 1a1816d5..3fd47df7 100644 --- a/src/models/match.py +++ b/src/models/match.py @@ -22,4 +22,4 @@ class Match(SQLModel, table=True): ) ) job: Job = Relationship(back_populates="matches") - context: Dict[str, List[str]] = Field(sa_column=Column(JSON)) + context: Dict[str, List[Dict[str, str]]] = Field(sa_column=Column(JSON)) diff --git a/src/tasks.py b/src/tasks.py index 1afc2938..9e95ed62 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,3 +1,4 @@ +import base64 from typing import List, Optional, cast, Dict import logging from rq import get_current_job, Queue # type: ignore @@ -73,7 +74,7 @@ def update_metadata( orig_name: str, path: str, matches: List[str], - context: Dict[str, List[Dict[str, bytes]]], + context: Dict[str, List[Dict[str, str]]], ) -> None: """Saves matches to the database, and runs appropriate metadata plugins. @@ -173,9 +174,15 @@ def execute_yara(self, job: Job, files: List[str]) -> None: ) match_context.append( { - "before": before, - "matching": matching, - "after": after, + "before": base64.b64encode( + before + ).decode("utf-8"), + "matching": base64.b64encode( + matching + ).decode("utf-8"), + "after": base64.b64encode( + after + ).decode("utf-8"), } ) context.update({str(rule): match_context}) From c253dfad7c71de433300ca960f4760b3593af20f Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 12:17:22 +0100 Subject: [PATCH 10/21] . --- src/e2etests/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/e2etests/test_api.py b/src/e2etests/test_api.py index 6f49cf99..3aa3680a 100644 --- a/src/e2etests/test_api.py +++ b/src/e2etests/test_api.py @@ -120,6 +120,7 @@ def test_query_two_results(add_files_to_index): res = request_query(log, i) m = res.json()["matches"] + log.info(f"-------------------------> {m}") assert len(m) == 2 with open(m[0]["file"], "r") as file: text1 = file.read() From 12228bd4bac5745bbb1c7f62351c36fa5f4564d5 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 15:21:07 +0100 Subject: [PATCH 11/21] Name of rule in loop modified --- src/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tasks.py b/src/tasks.py index 9e95ed62..ec4f3f0e 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -157,9 +157,9 @@ def execute_yara(self, job: Job, files: List[str]) -> None: data = self.read_file(path) context = {} - for rule in matches: + for rule_name in matches: match_context = [] - for string_match in rule.strings: + for string_match in rule_name.strings: expression_keys = [] for expression_key in string_match.instances: if expression_key not in expression_keys: From c1ce7689448df96010dca9c3c5ea7ff0c6343674 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 16:56:25 +0100 Subject: [PATCH 12/21] bug fixed, refactoring --- src/tasks.py | 71 +++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/src/tasks.py b/src/tasks.py index ec4f3f0e..598576ca 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -131,10 +131,10 @@ def read_bytes_from_offset( and ending at offset plus matched_length and byte_range. """ - before = data[offset - byte_range : offset] + before = data[max(0, offset - byte_range) : offset] matching = data[offset : offset + matched_length] after = data[ - offset + matched_length : offset + matched_length + byte_range + offset + matched_length : min(len(data), offset + matched_length + byte_range) ] return before, matching, after @@ -155,38 +155,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None: if matches: data = self.read_file(path) - context = {} - - for rule_name in matches: - match_context = [] - for string_match in rule_name.strings: - expression_keys = [] - for expression_key in string_match.instances: - if expression_key not in expression_keys: - ( - before, - matching, - after, - ) = self.read_bytes_from_offset( - data=data, - offset=expression_key.offset, - matched_length=expression_key.matched_length, - ) - match_context.append( - { - "before": base64.b64encode( - before - ).decode("utf-8"), - "matching": base64.b64encode( - matching - ).decode("utf-8"), - "after": base64.b64encode( - after - ).decode("utf-8"), - } - ) - context.update({str(rule): match_context}) - expression_keys.append(expression_key) + context = self.get_match_context(data, matches) self.update_metadata( job=job.id, @@ -222,6 +191,40 @@ def execute_yara(self, job: Job, files: List[str]) -> None: f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.", ) + def get_match_context(self, data, matches): + context = {} + for rule_name in matches: + match_context = [] + for string_match in rule_name.strings: + expression_keys = [] + for expression_key in string_match.instances: + if expression_key not in expression_keys: + ( + before, + matching, + after, + ) = self.read_bytes_from_offset( + data=data, + offset=expression_key.offset, + matched_length=expression_key.matched_length, + ) + match_context.append( + { + "before": base64.b64encode( + before + ).decode("utf-8"), + "matching": base64.b64encode( + matching + ).decode("utf-8"), + "after": base64.b64encode( + after + ).decode("utf-8"), + } + ) + context.update({str(rule_name): match_context}) + expression_keys.append(expression_key) + return context + def init_search(self, job: Job, tasks: int) -> None: self.db.init_jobagent(job, self.db_id, tasks) From e2598075f0a5ab7d448560c80c4d77b02a7351fd Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 23:27:43 +0100 Subject: [PATCH 13/21] refactoring --- src/tasks.py | 59 ++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/src/tasks.py b/src/tasks.py index 598576ca..b54a2e15 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -134,7 +134,10 @@ def read_bytes_from_offset( before = data[max(0, offset - byte_range) : offset] matching = data[offset : offset + matched_length] after = data[ - offset + matched_length : min(len(data), offset + matched_length + byte_range) + offset + + matched_length : min( + len(data), offset + matched_length + byte_range + ) ] return before, matching, after @@ -151,8 +154,8 @@ def execute_yara(self, job: Job, files: List[str]) -> None: path = self.plugins.filter(orig_name) if not path: continue - matches = rule.match(path) + matches = rule.match(path) if matches: data = self.read_file(path) context = self.get_match_context(data, matches) @@ -191,38 +194,34 @@ def execute_yara(self, job: Job, files: List[str]) -> None: f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.", ) - def get_match_context(self, data, matches): + def get_match_context( + self, data: bytes, matches: List[yara.Match] + ) -> dict: context = {} - for rule_name in matches: + for yara_match in matches: match_context = [] - for string_match in rule_name.strings: + for string_match in yara_match.strings: expression_keys = [] for expression_key in string_match.instances: - if expression_key not in expression_keys: - ( - before, - matching, - after, - ) = self.read_bytes_from_offset( - data=data, - offset=expression_key.offset, - matched_length=expression_key.matched_length, - ) - match_context.append( - { - "before": base64.b64encode( - before - ).decode("utf-8"), - "matching": base64.b64encode( - matching - ).decode("utf-8"), - "after": base64.b64encode( - after - ).decode("utf-8"), - } - ) - context.update({str(rule_name): match_context}) - expression_keys.append(expression_key) + if expression_key in expression_keys: + continue + + (before, matching, after,) = self.read_bytes_from_offset( + data=data, + offset=expression_key.offset, + matched_length=expression_key.matched_length, + ) + match_context.append( + { + "before": base64.b64encode(before).decode("utf-8"), + "matching": base64.b64encode(matching).decode( + "utf-8" + ), + "after": base64.b64encode(after).decode("utf-8"), + } + ) + context.update({str(yara_match): match_context}) + expression_keys.append(expression_key) return context def init_search(self, job: Job, tasks: int) -> None: From fc5a2f614136d533d9cb19b027a46a9fe79ecd82 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Mon, 2 Dec 2024 23:36:40 +0100 Subject: [PATCH 14/21] log test deleted --- src/e2etests/test_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/e2etests/test_api.py b/src/e2etests/test_api.py index 3aa3680a..6f49cf99 100644 --- a/src/e2etests/test_api.py +++ b/src/e2etests/test_api.py @@ -120,7 +120,6 @@ def test_query_two_results(add_files_to_index): res = request_query(log, i) m = res.json()["matches"] - log.info(f"-------------------------> {m}") assert len(m) == 2 with open(m[0]["file"], "r") as file: text1 = file.read() From 7fdfe9e77c6136a0867e80bb8f67bb467ecf6630 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Fri, 13 Dec 2024 11:13:15 +0100 Subject: [PATCH 15/21] after review --- src/tasks.py | 107 +++++++++++++++++++-------------------------------- 1 file changed, 39 insertions(+), 68 deletions(-) diff --git a/src/tasks.py b/src/tasks.py index b54a2e15..cc000b9c 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -104,44 +104,6 @@ def update_metadata( ) self.db.add_match(job, match) - @staticmethod - def read_file(file_path: str) -> bytes: - """Reads the entire file content. - - Returns: - bytes: The content of the file. - """ - with open(file_path, "rb") as file: - return file.read() - - @staticmethod - def read_bytes_from_offset( - data: bytes, matched_length: int, offset: int, byte_range: int = 32 - ) -> tuple[bytes, bytes, bytes]: - """Reads a specific range of bytes from the already loaded file content around a given offset. - - Args: - data (bytes): Data to read. - matched_length (int): Number of bytes to read. - offset (int): The offset in bytes from which to start reading. - byte_range (int): The range in bytes to read around the offset (default is 32). - - Returns: - bytes: A chunk of bytes from the file, starting from the given offset minus bit_range - and ending at offset plus matched_length and byte_range. - """ - - before = data[max(0, offset - byte_range) : offset] - matching = data[offset : offset + matched_length] - after = data[ - offset - + matched_length : min( - len(data), offset + matched_length + byte_range - ) - ] - - return before, matching, after - def execute_yara(self, job: Job, files: List[str]) -> None: rule = yara.compile(source=job.raw_yara) num_matches = 0 @@ -157,15 +119,16 @@ def execute_yara(self, job: Job, files: List[str]) -> None: matches = rule.match(path) if matches: - data = self.read_file(path) + with open(path, "rb") as file: + data = file.read() context = self.get_match_context(data, matches) self.update_metadata( - job=job.id, - orig_name=orig_name, - path=path, - matches=[r.rule for r in matches], - context=context, + job.id, + orig_name, + path, + [r.rule for r in matches], + context, ) num_matches += 1 except yara.Error: @@ -194,34 +157,27 @@ def execute_yara(self, job: Job, files: List[str]) -> None: f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.", ) + @staticmethod def get_match_context( - self, data: bytes, matches: List[yara.Match] - ) -> dict: + data: bytes, matches: List[yara.Match] + ) -> Dict[str, Dict[str, Dict[str, base64.b64decode]]]: context = {} for yara_match in matches: - match_context = [] + match_context = {} for string_match in yara_match.strings: - expression_keys = [] - for expression_key in string_match.instances: - if expression_key in expression_keys: - continue - - (before, matching, after,) = self.read_bytes_from_offset( - data=data, - offset=expression_key.offset, - matched_length=expression_key.matched_length, - ) - match_context.append( - { - "before": base64.b64encode(before).decode("utf-8"), - "matching": base64.b64encode(matching).decode( - "utf-8" - ), - "after": base64.b64encode(after).decode("utf-8"), - } - ) - context.update({str(yara_match): match_context}) - expression_keys.append(expression_key) + expression_key = string_match.instances[0] + + (before, matching, after,) = read_bytes_with_context( + data, expression_key.matched_length, expression_key.offset + ) + match_context[expression_key] = { + "before": base64.b64encode(before).decode("utf-8"), + "matching": base64.b64encode(matching).decode("utf-8"), + "after": base64.b64encode(after).decode("utf-8"), + } + + context[yara_match.rule] = match_context + logging.error(f"Match context: {context}") return context def init_search(self, job: Job, tasks: int) -> None: @@ -374,3 +330,18 @@ def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None: agent.execute_yara(job, pop_result.files) agent.add_tasks_in_progress(job, -1) + + +def read_bytes_with_context( + data: bytes, matched_length: int, offset: int, byte_range: int = 32 +) -> tuple[bytes, bytes, bytes]: + """Return `matched_length` bytes from `offset`, along with `byte_range` bytes before and after the match.""" + + before = data[max(0, offset - byte_range) : offset] + matching = data[offset : offset + matched_length] + after = data[ + offset + + matched_length : min(len(data), offset + matched_length + byte_range) + ] + + return before, matching, after From 36aa90ecfccaafd579785d4c95975713a23b3b8d Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Fri, 13 Dec 2024 11:14:16 +0100 Subject: [PATCH 16/21] logging context deleted --- src/tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tasks.py b/src/tasks.py index cc000b9c..f47124da 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -177,7 +177,6 @@ def get_match_context( } context[yara_match.rule] = match_context - logging.error(f"Match context: {context}") return context def init_search(self, job: Job, tasks: int) -> None: From d0cdcd657b515faab357df420b3cf5ccb2b3913c Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Fri, 13 Dec 2024 11:22:14 +0100 Subject: [PATCH 17/21] lint --- src/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasks.py b/src/tasks.py index f47124da..dbfc4a26 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -160,7 +160,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None: @staticmethod def get_match_context( data: bytes, matches: List[yara.Match] - ) -> Dict[str, Dict[str, Dict[str, base64.b64decode]]]: + ) -> Dict[str, Dict[str, Dict[str, str]]]: context = {} for yara_match in matches: match_context = {} From 1af4050d2b861359ca82c3ad91271791fb90dbed Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Fri, 13 Dec 2024 11:25:51 +0100 Subject: [PATCH 18/21] lint --- src/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tasks.py b/src/tasks.py index dbfc4a26..e159705d 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,5 +1,5 @@ import base64 -from typing import List, Optional, cast, Dict +from typing import List, Optional, cast, Dict, Any import logging from rq import get_current_job, Queue # type: ignore from redis import Redis @@ -74,7 +74,7 @@ def update_metadata( orig_name: str, path: str, matches: List[str], - context: Dict[str, List[Dict[str, str]]], + context: Dict[str, Dict[str, Dict[str, str]]], ) -> None: """Saves matches to the database, and runs appropriate metadata plugins. @@ -128,7 +128,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None: orig_name, path, [r.rule for r in matches], - context, + context ) num_matches += 1 except yara.Error: From 7bdcef9692217bd6cf07e727d53bf030e8ec339b Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Fri, 13 Dec 2024 11:26:47 +0100 Subject: [PATCH 19/21] lint --- src/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasks.py b/src/tasks.py index e159705d..3c664222 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -1,5 +1,5 @@ import base64 -from typing import List, Optional, cast, Dict, Any +from typing import List, Optional, cast, Dict import logging from rq import get_current_job, Queue # type: ignore from redis import Redis From a5a30582d8277b51351159d4330337e061d2064a Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Fri, 13 Dec 2024 11:27:58 +0100 Subject: [PATCH 20/21] lint --- src/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tasks.py b/src/tasks.py index 3c664222..8456dbee 100644 --- a/src/tasks.py +++ b/src/tasks.py @@ -128,7 +128,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None: orig_name, path, [r.rule for r in matches], - context + context, ) num_matches += 1 except yara.Error: From 57fd6511021efbc04979995e2a54efb1ebbe2161 Mon Sep 17 00:00:00 2001 From: michalkrzem Date: Fri, 13 Dec 2024 12:46:13 +0100 Subject: [PATCH 21/21] fix migration --- .../f623e1057b00_added_context_column_into_match_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py index 50e67f73..7ac9bf61 100644 --- a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py +++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py @@ -9,7 +9,7 @@ # revision identifiers, used by Alembic. revision = "f623e1057b00" -down_revision = "6b495d5a4855" +down_revision = "702d19cfa063" branch_labels = None depends_on = None