From ffdd46c363453abc8d5a2f3edbc54aedc7b44f73 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Wed, 30 Oct 2024 22:00:44 +0100
Subject: [PATCH 01/21] get offset and matched_length

---
 src/tasks.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/tasks.py b/src/tasks.py
index 4a0a88bf..a9b6aa6b 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -96,7 +96,17 @@ def update_metadata(
         match = Match(file=orig_name, meta=metadata, matches=matches)
         self.db.add_match(job, match)
 
+    @staticmethod
+    def get_readable_string(data):
+        try:
+            # Próbujemy zdekodować dane jako UTF-8, jeśli to możliwe
+            return data.decode('utf-8')
+        except UnicodeDecodeError:
+            # Jeśli nie można zdekodować, zwracamy jako ciąg heksadecymalny
+            return data.hex()
+
     def execute_yara(self, job: Job, files: List[str]) -> None:
+        logging.info("########################################################")
         rule = yara.compile(source=job.raw_yara)
         num_matches = 0
         num_errors = 0
@@ -104,11 +114,28 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
         self.db.job_start_work(job.id, num_files)
 
         for orig_name in files:
+
             try:
                 path = self.plugins.filter(orig_name)
                 if not path:
                     continue
                 matches = rule.match(path)
+                logging.info(f"matches: {matches}")
+
+                for rule in matches:
+                    logging.info(f"Dopasowana reguła: {rule}")
+                    for string_match in rule.strings:
+                        logging.info(string_match.identifier)
+                        logging.info(string_match.instances)
+                        logging.info("_------------------------------")
+                        for string in string_match.instances:
+                            logging.info(string)
+                            logging.info(string.matched_data)
+                            logging.info(string.matched_length)
+                            logging.info(string.offset)
+                            logging.info(string.xor_key)
+                            logging.info(string.plaintext())
+
                 if matches:
                     self.update_metadata(
                         job.id, orig_name, path, [r.rule for r in matches]

From 511a2551a117213002d5b57f30f528d0826f1d1c Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Wed, 13 Nov 2024 16:11:31 +0100
Subject: [PATCH 02/21] Draft code with context example

---
 src/models/match.py |  1 +
 src/tasks.py        | 37 ++++++++++++++++++++-----------------
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/src/models/match.py b/src/models/match.py
index fde9af51..1a1816d5 100644
--- a/src/models/match.py
+++ b/src/models/match.py
@@ -22,3 +22,4 @@ class Match(SQLModel, table=True):
         )
     )
     job: Job = Relationship(back_populates="matches")
+    context: Dict[str, List[str]] = Field(sa_column=Column(JSON))
diff --git a/src/tasks.py b/src/tasks.py
index a9b6aa6b..97506efd 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, cast
+from typing import List, Optional, cast, Dict
 import logging
 from rq import get_current_job, Queue  # type: ignore
 from redis import Redis
@@ -68,7 +68,7 @@ def get_datasets(self) -> List[str]:
         return list(result["result"]["datasets"].keys())
 
     def update_metadata(
-        self, job: JobId, orig_name: str, path: str, matches: List[str]
+        self, job: JobId, orig_name: str, path: str, matches: List[str], context: Dict[str, List[str]]
     ) -> None:
         """Saves matches to the database, and runs appropriate metadata
         plugins.
@@ -93,7 +93,7 @@ def update_metadata(
         del metadata["path"]
 
         # Update the database.
-        match = Match(file=orig_name, meta=metadata, matches=matches)
+        match = Match(file=orig_name, meta=metadata, matches=matches, context=context)
         self.db.add_match(job, match)
 
     @staticmethod
@@ -120,25 +120,28 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                 if not path:
                     continue
                 matches = rule.match(path)
-                logging.info(f"matches: {matches}")
 
+                context = {}
                 for rule in matches:
-                    logging.info(f"Dopasowana reguła: {rule}")
+                    match_string_data = []
                     for string_match in rule.strings:
-                        logging.info(string_match.identifier)
-                        logging.info(string_match.instances)
-                        logging.info("_------------------------------")
-                        for string in string_match.instances:
-                            logging.info(string)
-                            logging.info(string.matched_data)
-                            logging.info(string.matched_length)
-                            logging.info(string.offset)
-                            logging.info(string.xor_key)
-                            logging.info(string.plaintext())
-
+                        expression_keys = []
+                        for expression_key in string_match.instances:
+                            if str(expression_key) not in expression_keys:
+                                match_string_data.append(
+                                    f"{expression_key.offset}:{expression_key.matched_length}"
+                                    f":{string_match.identifier} {expression_key}"
+                                )
+                                context.update(
+                                    {
+                                        rule: match_string_data,
+                                    }
+                                )
+                                expression_keys.append(str(expression_key))
+                logging.info(f"context {context}")
                 if matches:
                     self.update_metadata(
-                        job.id, orig_name, path, [r.rule for r in matches]
+                        job.id, orig_name, path, [r.rule for r in matches], context
                     )
                     num_matches += 1
             except yara.Error:

From 106eff09799eee2dcd361ad0d3df7969d7325068 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Thu, 14 Nov 2024 12:00:40 +0100
Subject: [PATCH 03/21] Matches offset, len

---
 src/db.py                                     |  2 ++
 ...0_added_context_column_into_match_table.py | 27 +++++++++++++++++++
 src/tasks.py                                  |  7 ++---
 3 files changed, 33 insertions(+), 3 deletions(-)
 create mode 100644 src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py

diff --git a/src/db.py b/src/db.py
index 4c85d1be..9a207be4 100644
--- a/src/db.py
+++ b/src/db.py
@@ -1,3 +1,5 @@
+import logging
+
 from alembic.config import Config
 from alembic import command
 from pathlib import Path
diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
new file mode 100644
index 00000000..8b42a6b4
--- /dev/null
+++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
@@ -0,0 +1,27 @@
+"""Added context column into match table
+Revision ID: f623e1057b00
+Revises: 6b495d5a4855
+Create Date: 2024-11-13 15:14:14.618258
+"""
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel
+
+
+# revision identifiers, used by Alembic.
+revision = 'f623e1057b00'
+down_revision = '6b495d5a4855'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('match', sa.Column('context', sa.JSON(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('match', 'context')
+    # ### end Alembic commands ###
diff --git a/src/tasks.py b/src/tasks.py
index 97506efd..c5a4816c 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -1,3 +1,4 @@
+import json
 from typing import List, Optional, cast, Dict
 import logging
 from rq import get_current_job, Queue  # type: ignore
@@ -134,14 +135,14 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                                 )
                                 context.update(
                                     {
-                                        rule: match_string_data,
+                                        str(rule): match_string_data,
                                     }
                                 )
                                 expression_keys.append(str(expression_key))
-                logging.info(f"context {context}")
                 if matches:
+                    logging.info(f"context {context}")
                     self.update_metadata(
-                        job.id, orig_name, path, [r.rule for r in matches], context
+                        job=job.id, orig_name=orig_name, path=path, matches=[r.rule for r in matches], context=context
                     )
                     num_matches += 1
             except yara.Error:

From 8021242564edc87af33f31fae4f0a2e755331d58 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 00:12:52 +0100
Subject: [PATCH 04/21] Matching with before and after context.

---
 ...0_added_context_column_into_match_table.py |  8 +-
 src/tasks.py                                  | 96 ++++++++++++-------
 2 files changed, 68 insertions(+), 36 deletions(-)

diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
index 8b42a6b4..48eb7df4 100644
--- a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
+++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
@@ -9,19 +9,19 @@
 
 
 # revision identifiers, used by Alembic.
-revision = 'f623e1057b00'
-down_revision = '6b495d5a4855'
+revision = "f623e1057b00"
+down_revision = "6b495d5a4855"
 branch_labels = None
 depends_on = None
 
 
 def upgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.add_column('match', sa.Column('context', sa.JSON(), nullable=True))
+    op.add_column("match", sa.Column("context", sa.JSON(), nullable=True))
     # ### end Alembic commands ###
 
 
 def downgrade() -> None:
     # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_column('match', 'context')
+    op.drop_column("match", "context")
     # ### end Alembic commands ###
diff --git a/src/tasks.py b/src/tasks.py
index c5a4816c..b0bd2364 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -1,5 +1,4 @@
-import json
-from typing import List, Optional, cast, Dict
+from typing import List, Optional, cast, Dict, Any
 import logging
 from rq import get_current_job, Queue  # type: ignore
 from redis import Redis
@@ -69,7 +68,12 @@ def get_datasets(self) -> List[str]:
         return list(result["result"]["datasets"].keys())
 
     def update_metadata(
-        self, job: JobId, orig_name: str, path: str, matches: List[str], context: Dict[str, List[str]]
+        self,
+        job: JobId,
+        orig_name: str,
+        path: str,
+        matches: List[str],
+        context: Dict[str, List[Dict[str, bytes]]]
     ) -> None:
         """Saves matches to the database, and runs appropriate metadata
         plugins.
@@ -94,20 +98,45 @@ def update_metadata(
         del metadata["path"]
 
         # Update the database.
-        match = Match(file=orig_name, meta=metadata, matches=matches, context=context)
+        match = Match(
+            file=orig_name, meta=metadata, matches=matches, context=context
+        )
         self.db.add_match(job, match)
 
     @staticmethod
-    def get_readable_string(data):
-        try:
-            # Próbujemy zdekodować dane jako UTF-8, jeśli to możliwe
-            return data.decode('utf-8')
-        except UnicodeDecodeError:
-            # Jeśli nie można zdekodować, zwracamy jako ciąg heksadecymalny
-            return data.hex()
+    def read_file(file_path: str) -> bytes:
+        """Reads the entire file content.
+
+        Returns:
+            bytes: The content of the file.
+        """
+        with open(file_path, "rb") as file:
+            return file.read()
+
+    @staticmethod
+    def read_bytes_from_offset(
+        data: bytes, matched_length: int, offset: int, byte_range: int = 32
+    ) -> tuple[bytes, bytes, bytes]:
+        """Reads a specific range of bytes from the already loaded file content around a given offset.
+
+        Args:
+            data (bytes): Data to read.
+            matched_length (int): Number of bytes to read.
+            offset (int): The offset in bytes from which to start reading.
+            byte_range (int): The range in bytes to read around the offset (default is 32).
+
+        Returns:
+            bytes: A chunk of bytes from the file, starting from the given offset minus bit_range
+                   and ending at offset plus matched_length and byte_range.
+        """
+
+        before = data[offset - byte_range: offset]
+        matching = data[offset: offset + matched_length]
+        after = data[offset + matched_length: offset + matched_length + byte_range]
+
+        return before, matching, after
 
     def execute_yara(self, job: Job, files: List[str]) -> None:
-        logging.info("########################################################")
         rule = yara.compile(source=job.raw_yara)
         num_matches = 0
         num_errors = 0
@@ -115,34 +144,37 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
         self.db.job_start_work(job.id, num_files)
 
         for orig_name in files:
-
             try:
                 path = self.plugins.filter(orig_name)
                 if not path:
                     continue
                 matches = rule.match(path)
 
-                context = {}
-                for rule in matches:
-                    match_string_data = []
-                    for string_match in rule.strings:
-                        expression_keys = []
-                        for expression_key in string_match.instances:
-                            if str(expression_key) not in expression_keys:
-                                match_string_data.append(
-                                    f"{expression_key.offset}:{expression_key.matched_length}"
-                                    f":{string_match.identifier} {expression_key}"
-                                )
-                                context.update(
-                                    {
-                                        str(rule): match_string_data,
-                                    }
-                                )
-                                expression_keys.append(str(expression_key))
                 if matches:
-                    logging.info(f"context {context}")
+                    data = self.read_file(path)
+                    context = {}
+
+                    for rule in matches:
+                        match_context = []
+                        for string_match in rule.strings:
+                            expression_keys = []
+                            for expression_key in string_match.instances:
+                                if expression_key not in expression_keys:
+                                    before, matching, after = self.read_bytes_from_offset(
+                                        data=data,
+                                        offset=expression_key.offset,
+                                        matched_length=expression_key.matched_length,
+                                    )
+                                    match_context.append({"before": before, "matching": matching, "after": after})
+                                    context.update({str(rule): match_context})
+                                    expression_keys.append(expression_key)
+
                     self.update_metadata(
-                        job=job.id, orig_name=orig_name, path=path, matches=[r.rule for r in matches], context=context
+                        job=job.id,
+                        orig_name=orig_name,
+                        path=path,
+                        matches=[r.rule for r in matches],
+                        context=context,
                     )
                     num_matches += 1
             except yara.Error:

From e041af2aa20a0a7f78063a4212e5225a240eb74d Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 00:17:04 +0100
Subject: [PATCH 05/21] lint

---
 src/tasks.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/tasks.py b/src/tasks.py
index b0bd2364..d198396a 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -73,7 +73,7 @@ def update_metadata(
         orig_name: str,
         path: str,
         matches: List[str],
-        context: Dict[str, List[Dict[str, bytes]]]
+        context: Dict[str, List[Dict[str, bytes]]],
     ) -> None:
         """Saves matches to the database, and runs appropriate metadata
         plugins.
@@ -130,9 +130,11 @@ def read_bytes_from_offset(
                    and ending at offset plus matched_length and byte_range.
         """
 
-        before = data[offset - byte_range: offset]
+        before = data[offset - byte_range : offset]
         matching = data[offset: offset + matched_length]
-        after = data[offset + matched_length: offset + matched_length + byte_range]
+        after = data[
+            offset + matched_length : offset + matched_length + byte_range
+        ]
 
         return before, matching, after
 
@@ -160,12 +162,22 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                             expression_keys = []
                             for expression_key in string_match.instances:
                                 if expression_key not in expression_keys:
-                                    before, matching, after = self.read_bytes_from_offset(
+                                    (
+                                        before,
+                                        matching,
+                                        after,
+                                    ) = self.read_bytes_from_offset(
                                         data=data,
                                         offset=expression_key.offset,
                                         matched_length=expression_key.matched_length,
                                     )
-                                    match_context.append({"before": before, "matching": matching, "after": after})
+                                    match_context.append(
+                                        {
+                                            "before": before,
+                                            "matching": matching,
+                                            "after": after,
+                                        }
+                                    )
                                     context.update({str(rule): match_context})
                                     expression_keys.append(expression_key)
 

From 7b1b2e8404b9cc1eebbb735319fb2e67220f75f2 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 00:18:46 +0100
Subject: [PATCH 06/21] lint

---
 src/db.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/db.py b/src/db.py
index 9a207be4..4c85d1be 100644
--- a/src/db.py
+++ b/src/db.py
@@ -1,5 +1,3 @@
-import logging
-
 from alembic.config import Config
 from alembic import command
 from pathlib import Path

From ebc9277a1e1e295dfd4ec4a9a3f3f1b790464184 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 00:20:32 +0100
Subject: [PATCH 07/21] lint

---
 .../f623e1057b00_added_context_column_into_match_table.py       | 1 -
 src/tasks.py                                                    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
index 48eb7df4..50e67f73 100644
--- a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
+++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
@@ -5,7 +5,6 @@
 """
 from alembic import op
 import sqlalchemy as sa
-import sqlmodel
 
 
 # revision identifiers, used by Alembic.
diff --git a/src/tasks.py b/src/tasks.py
index d198396a..6f3dbdec 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, cast, Dict, Any
+from typing import List, Optional, cast, Dict
 import logging
 from rq import get_current_job, Queue  # type: ignore
 from redis import Redis

From a536f95fb255bbb1105d5181f7c292b0cb968fe3 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 00:22:03 +0100
Subject: [PATCH 08/21] lint

---
 src/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tasks.py b/src/tasks.py
index 6f3dbdec..1afc2938 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -131,7 +131,7 @@ def read_bytes_from_offset(
         """
 
         before = data[offset - byte_range : offset]
-        matching = data[offset: offset + matched_length]
+        matching = data[offset : offset + matched_length]
         after = data[
             offset + matched_length : offset + matched_length + byte_range
         ]

From acd85875049df277afe37f580385b7076e33e133 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 10:33:53 +0100
Subject: [PATCH 09/21] bytes into base64 modified

---
 src/models/match.py |  2 +-
 src/tasks.py        | 15 +++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/models/match.py b/src/models/match.py
index 1a1816d5..3fd47df7 100644
--- a/src/models/match.py
+++ b/src/models/match.py
@@ -22,4 +22,4 @@ class Match(SQLModel, table=True):
         )
     )
     job: Job = Relationship(back_populates="matches")
-    context: Dict[str, List[str]] = Field(sa_column=Column(JSON))
+    context: Dict[str, List[Dict[str, str]]] = Field(sa_column=Column(JSON))
diff --git a/src/tasks.py b/src/tasks.py
index 1afc2938..9e95ed62 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -1,3 +1,4 @@
+import base64
 from typing import List, Optional, cast, Dict
 import logging
 from rq import get_current_job, Queue  # type: ignore
@@ -73,7 +74,7 @@ def update_metadata(
         orig_name: str,
         path: str,
         matches: List[str],
-        context: Dict[str, List[Dict[str, bytes]]],
+        context: Dict[str, List[Dict[str, str]]],
     ) -> None:
         """Saves matches to the database, and runs appropriate metadata
         plugins.
@@ -173,9 +174,15 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                                     )
                                     match_context.append(
                                         {
-                                            "before": before,
-                                            "matching": matching,
-                                            "after": after,
+                                            "before": base64.b64encode(
+                                                before
+                                            ).decode("utf-8"),
+                                            "matching": base64.b64encode(
+                                                matching
+                                            ).decode("utf-8"),
+                                            "after": base64.b64encode(
+                                                after
+                                            ).decode("utf-8"),
                                         }
                                     )
                                     context.update({str(rule): match_context})

From c253dfad7c71de433300ca960f4760b3593af20f Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 12:17:22 +0100
Subject: [PATCH 10/21] Log query matches in test_query_two_results

---
 src/e2etests/test_api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/e2etests/test_api.py b/src/e2etests/test_api.py
index 6f49cf99..3aa3680a 100644
--- a/src/e2etests/test_api.py
+++ b/src/e2etests/test_api.py
@@ -120,6 +120,7 @@ def test_query_two_results(add_files_to_index):
         res = request_query(log, i)
 
         m = res.json()["matches"]
+        log.info(f"-------------------------> {m}")
         assert len(m) == 2
         with open(m[0]["file"], "r") as file:
             text1 = file.read()

From 12228bd4bac5745bbb1c7f62351c36fa5f4564d5 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 15:21:07 +0100
Subject: [PATCH 11/21] Name of rule in loop modified

---
 src/tasks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tasks.py b/src/tasks.py
index 9e95ed62..ec4f3f0e 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -157,9 +157,9 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                     data = self.read_file(path)
                     context = {}
 
-                    for rule in matches:
+                    for rule_name in matches:
                         match_context = []
-                        for string_match in rule.strings:
+                        for string_match in rule_name.strings:
                             expression_keys = []
                             for expression_key in string_match.instances:
                                 if expression_key not in expression_keys:

From c1ce7689448df96010dca9c3c5ea7ff0c6343674 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 16:56:25 +0100
Subject: [PATCH 12/21] bug fixed, refactoring

---
 src/tasks.py | 71 +++++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/src/tasks.py b/src/tasks.py
index ec4f3f0e..598576ca 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -131,10 +131,10 @@ def read_bytes_from_offset(
                    and ending at offset plus matched_length and byte_range.
         """
 
-        before = data[offset - byte_range : offset]
+        before = data[max(0, offset - byte_range) : offset]
         matching = data[offset : offset + matched_length]
         after = data[
-            offset + matched_length : offset + matched_length + byte_range
+            offset + matched_length : min(len(data), offset + matched_length + byte_range)
         ]
 
         return before, matching, after
@@ -155,38 +155,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
 
                 if matches:
                     data = self.read_file(path)
-                    context = {}
-
-                    for rule_name in matches:
-                        match_context = []
-                        for string_match in rule_name.strings:
-                            expression_keys = []
-                            for expression_key in string_match.instances:
-                                if expression_key not in expression_keys:
-                                    (
-                                        before,
-                                        matching,
-                                        after,
-                                    ) = self.read_bytes_from_offset(
-                                        data=data,
-                                        offset=expression_key.offset,
-                                        matched_length=expression_key.matched_length,
-                                    )
-                                    match_context.append(
-                                        {
-                                            "before": base64.b64encode(
-                                                before
-                                            ).decode("utf-8"),
-                                            "matching": base64.b64encode(
-                                                matching
-                                            ).decode("utf-8"),
-                                            "after": base64.b64encode(
-                                                after
-                                            ).decode("utf-8"),
-                                        }
-                                    )
-                                    context.update({str(rule): match_context})
-                                    expression_keys.append(expression_key)
+                    context = self.get_match_context(data, matches)
 
                     self.update_metadata(
                         job=job.id,
@@ -222,6 +191,40 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                 f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.",
             )
 
+    def get_match_context(self, data, matches):
+        context = {}
+        for rule_name in matches:
+            match_context = []
+            for string_match in rule_name.strings:
+                expression_keys = []
+                for expression_key in string_match.instances:
+                    if expression_key not in expression_keys:
+                        (
+                            before,
+                            matching,
+                            after,
+                        ) = self.read_bytes_from_offset(
+                            data=data,
+                            offset=expression_key.offset,
+                            matched_length=expression_key.matched_length,
+                        )
+                        match_context.append(
+                            {
+                                "before": base64.b64encode(
+                                    before
+                                ).decode("utf-8"),
+                                "matching": base64.b64encode(
+                                    matching
+                                ).decode("utf-8"),
+                                "after": base64.b64encode(
+                                    after
+                                ).decode("utf-8"),
+                            }
+                        )
+                        context.update({str(rule_name): match_context})
+                        expression_keys.append(expression_key)
+        return context
+
     def init_search(self, job: Job, tasks: int) -> None:
         self.db.init_jobagent(job, self.db_id, tasks)
 

From e2598075f0a5ab7d448560c80c4d77b02a7351fd Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 23:27:43 +0100
Subject: [PATCH 13/21] refactoring

---
 src/tasks.py | 59 ++++++++++++++++++++++++++--------------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/src/tasks.py b/src/tasks.py
index 598576ca..b54a2e15 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -134,7 +134,10 @@ def read_bytes_from_offset(
         before = data[max(0, offset - byte_range) : offset]
         matching = data[offset : offset + matched_length]
         after = data[
-            offset + matched_length : min(len(data), offset + matched_length + byte_range)
+            offset
+            + matched_length : min(
+                len(data), offset + matched_length + byte_range
+            )
         ]
 
         return before, matching, after
@@ -151,8 +154,8 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                 path = self.plugins.filter(orig_name)
                 if not path:
                     continue
-                matches = rule.match(path)
 
+                matches = rule.match(path)
                 if matches:
                     data = self.read_file(path)
                     context = self.get_match_context(data, matches)
@@ -191,38 +194,34 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                 f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.",
             )
 
-    def get_match_context(self, data, matches):
+    def get_match_context(
+        self, data: bytes, matches: List[yara.Match]
+    ) -> dict:
         context = {}
-        for rule_name in matches:
+        for yara_match in matches:
             match_context = []
-            for string_match in rule_name.strings:
+            for string_match in yara_match.strings:
                 expression_keys = []
                 for expression_key in string_match.instances:
-                    if expression_key not in expression_keys:
-                        (
-                            before,
-                            matching,
-                            after,
-                        ) = self.read_bytes_from_offset(
-                            data=data,
-                            offset=expression_key.offset,
-                            matched_length=expression_key.matched_length,
-                        )
-                        match_context.append(
-                            {
-                                "before": base64.b64encode(
-                                    before
-                                ).decode("utf-8"),
-                                "matching": base64.b64encode(
-                                    matching
-                                ).decode("utf-8"),
-                                "after": base64.b64encode(
-                                    after
-                                ).decode("utf-8"),
-                            }
-                        )
-                        context.update({str(rule_name): match_context})
-                        expression_keys.append(expression_key)
+                    if expression_key in expression_keys:
+                        continue
+
+                    (before, matching, after,) = self.read_bytes_from_offset(
+                        data=data,
+                        offset=expression_key.offset,
+                        matched_length=expression_key.matched_length,
+                    )
+                    match_context.append(
+                        {
+                            "before": base64.b64encode(before).decode("utf-8"),
+                            "matching": base64.b64encode(matching).decode(
+                                "utf-8"
+                            ),
+                            "after": base64.b64encode(after).decode("utf-8"),
+                        }
+                    )
+                    context.update({str(yara_match): match_context})
+                    expression_keys.append(expression_key)
         return context
 
     def init_search(self, job: Job, tasks: int) -> None:

From fc5a2f614136d533d9cb19b027a46a9fe79ecd82 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Mon, 2 Dec 2024 23:36:40 +0100
Subject: [PATCH 14/21] log test deleted

---
 src/e2etests/test_api.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/e2etests/test_api.py b/src/e2etests/test_api.py
index 3aa3680a..6f49cf99 100644
--- a/src/e2etests/test_api.py
+++ b/src/e2etests/test_api.py
@@ -120,7 +120,6 @@ def test_query_two_results(add_files_to_index):
         res = request_query(log, i)
 
         m = res.json()["matches"]
-        log.info(f"-------------------------> {m}")
         assert len(m) == 2
         with open(m[0]["file"], "r") as file:
             text1 = file.read()

From 7fdfe9e77c6136a0867e80bb8f67bb467ecf6630 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Fri, 13 Dec 2024 11:13:15 +0100
Subject: [PATCH 15/21] after review

---
 src/tasks.py | 107 +++++++++++++++++++--------------------------------
 1 file changed, 39 insertions(+), 68 deletions(-)

diff --git a/src/tasks.py b/src/tasks.py
index b54a2e15..cc000b9c 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -104,44 +104,6 @@ def update_metadata(
         )
         self.db.add_match(job, match)
 
-    @staticmethod
-    def read_file(file_path: str) -> bytes:
-        """Reads the entire file content.
-
-        Returns:
-            bytes: The content of the file.
-        """
-        with open(file_path, "rb") as file:
-            return file.read()
-
-    @staticmethod
-    def read_bytes_from_offset(
-        data: bytes, matched_length: int, offset: int, byte_range: int = 32
-    ) -> tuple[bytes, bytes, bytes]:
-        """Reads a specific range of bytes from the already loaded file content around a given offset.
-
-        Args:
-            data (bytes): Data to read.
-            matched_length (int): Number of bytes to read.
-            offset (int): The offset in bytes from which to start reading.
-            byte_range (int): The range in bytes to read around the offset (default is 32).
-
-        Returns:
-            bytes: A chunk of bytes from the file, starting from the given offset minus bit_range
-                   and ending at offset plus matched_length and byte_range.
-        """
-
-        before = data[max(0, offset - byte_range) : offset]
-        matching = data[offset : offset + matched_length]
-        after = data[
-            offset
-            + matched_length : min(
-                len(data), offset + matched_length + byte_range
-            )
-        ]
-
-        return before, matching, after
-
     def execute_yara(self, job: Job, files: List[str]) -> None:
         rule = yara.compile(source=job.raw_yara)
         num_matches = 0
@@ -157,15 +119,16 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
 
                 matches = rule.match(path)
                 if matches:
-                    data = self.read_file(path)
+                    with open(path, "rb") as file:
+                        data = file.read()
                     context = self.get_match_context(data, matches)
 
                     self.update_metadata(
-                        job=job.id,
-                        orig_name=orig_name,
-                        path=path,
-                        matches=[r.rule for r in matches],
-                        context=context,
+                        job.id,
+                        orig_name,
+                        path,
+                        [r.rule for r in matches],
+                        context,
                     )
                     num_matches += 1
             except yara.Error:
@@ -194,34 +157,27 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                 f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.",
             )
 
+    @staticmethod
     def get_match_context(
-        self, data: bytes, matches: List[yara.Match]
-    ) -> dict:
+        data: bytes, matches: List[yara.Match]
+    ) -> Dict[str, Dict[str, Dict[str, base64.b64decode]]]:
         context = {}
         for yara_match in matches:
-            match_context = []
+            match_context = {}
             for string_match in yara_match.strings:
-                expression_keys = []
-                for expression_key in string_match.instances:
-                    if expression_key in expression_keys:
-                        continue
-
-                    (before, matching, after,) = self.read_bytes_from_offset(
-                        data=data,
-                        offset=expression_key.offset,
-                        matched_length=expression_key.matched_length,
-                    )
-                    match_context.append(
-                        {
-                            "before": base64.b64encode(before).decode("utf-8"),
-                            "matching": base64.b64encode(matching).decode(
-                                "utf-8"
-                            ),
-                            "after": base64.b64encode(after).decode("utf-8"),
-                        }
-                    )
-                    context.update({str(yara_match): match_context})
-                    expression_keys.append(expression_key)
+                expression_key = string_match.instances[0]
+
+                (before, matching, after,) = read_bytes_with_context(
+                    data, expression_key.matched_length, expression_key.offset
+                )
+                match_context[expression_key] = {
+                    "before": base64.b64encode(before).decode("utf-8"),
+                    "matching": base64.b64encode(matching).decode("utf-8"),
+                    "after": base64.b64encode(after).decode("utf-8"),
+                }
+
+                context[yara_match.rule] = match_context
+        logging.error(f"Match context: {context}")
         return context
 
     def init_search(self, job: Job, tasks: int) -> None:
@@ -374,3 +330,18 @@ def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None:
 
         agent.execute_yara(job, pop_result.files)
         agent.add_tasks_in_progress(job, -1)
+
+
+def read_bytes_with_context(
+    data: bytes, matched_length: int, offset: int, byte_range: int = 32
+) -> tuple[bytes, bytes, bytes]:
+    """Return the `matched_length` bytes at `offset`, plus up to `byte_range` bytes before and after the match."""
+
+    before = data[max(0, offset - byte_range) : offset]
+    matching = data[offset : offset + matched_length]
+    after = data[
+        offset
+        + matched_length : min(len(data), offset + matched_length + byte_range)
+    ]
+
+    return before, matching, after

From 36aa90ecfccaafd579785d4c95975713a23b3b8d Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Fri, 13 Dec 2024 11:14:16 +0100
Subject: [PATCH 16/21] logging context deleted

---
 src/tasks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/tasks.py b/src/tasks.py
index cc000b9c..f47124da 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -177,7 +177,6 @@ def get_match_context(
                 }
 
                 context[yara_match.rule] = match_context
-        logging.error(f"Match context: {context}")
         return context
 
     def init_search(self, job: Job, tasks: int) -> None:

From d0cdcd657b515faab357df420b3cf5ccb2b3913c Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Fri, 13 Dec 2024 11:22:14 +0100
Subject: [PATCH 17/21] lint

---
 src/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tasks.py b/src/tasks.py
index f47124da..dbfc4a26 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -160,7 +160,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
     @staticmethod
     def get_match_context(
         data: bytes, matches: List[yara.Match]
-    ) -> Dict[str, Dict[str, Dict[str, base64.b64decode]]]:
+    ) -> Dict[str, Dict[str, Dict[str, str]]]:
         context = {}
         for yara_match in matches:
             match_context = {}

From 1af4050d2b861359ca82c3ad91271791fb90dbed Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Fri, 13 Dec 2024 11:25:51 +0100
Subject: [PATCH 18/21] lint

---
 src/tasks.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/tasks.py b/src/tasks.py
index dbfc4a26..e159705d 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -1,5 +1,5 @@
 import base64
-from typing import List, Optional, cast, Dict
+from typing import List, Optional, cast, Dict, Any
 import logging
 from rq import get_current_job, Queue  # type: ignore
 from redis import Redis
@@ -74,7 +74,7 @@ def update_metadata(
         orig_name: str,
         path: str,
         matches: List[str],
-        context: Dict[str, List[Dict[str, str]]],
+        context: Dict[str, Dict[str, Dict[str, str]]],
     ) -> None:
         """Saves matches to the database, and runs appropriate metadata
         plugins.
@@ -128,7 +128,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                         orig_name,
                         path,
                         [r.rule for r in matches],
-                        context,
+                        context
                     )
                     num_matches += 1
             except yara.Error:

From 7bdcef9692217bd6cf07e727d53bf030e8ec339b Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Fri, 13 Dec 2024 11:26:47 +0100
Subject: [PATCH 19/21] lint

---
 src/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tasks.py b/src/tasks.py
index e159705d..3c664222 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -1,5 +1,5 @@
 import base64
-from typing import List, Optional, cast, Dict, Any
+from typing import List, Optional, cast, Dict
 import logging
 from rq import get_current_job, Queue  # type: ignore
 from redis import Redis

From a5a30582d8277b51351159d4330337e061d2064a Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Fri, 13 Dec 2024 11:27:58 +0100
Subject: [PATCH 20/21] lint

---
 src/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tasks.py b/src/tasks.py
index 3c664222..8456dbee 100644
--- a/src/tasks.py
+++ b/src/tasks.py
@@ -128,7 +128,7 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                         orig_name,
                         path,
                         [r.rule for r in matches],
-                        context
+                        context,
                     )
                     num_matches += 1
             except yara.Error:

From 57fd6511021efbc04979995e2a54efb1ebbe2161 Mon Sep 17 00:00:00 2001
From: michalkrzem <michal_krzem@wp.pl>
Date: Fri, 13 Dec 2024 12:46:13 +0100
Subject: [PATCH 21/21] fix migration

---
 .../f623e1057b00_added_context_column_into_match_table.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
index 50e67f73..7ac9bf61 100644
--- a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
+++ b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
@@ -9,7 +9,7 @@
 
 # revision identifiers, used by Alembic.
 revision = "f623e1057b00"
-down_revision = "6b495d5a4855"
+down_revision = "702d19cfa063"
 branch_labels = None
 depends_on = None