CERT-Polska · michalkrzem · Oct 30, 2024 · Nov 13, 2024 · Nov 14, 2024 · Dec 1, 2024
diff --git a/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py b/src/migrations/versions/f623e1057b00_added_context_column_into_match_table.py
@@ -0,0 +1,26 @@
+"""Added context column into match table
+Revision ID: f623e1057b00
+Revises: 6b495d5a4855
+Create Date: 2024-11-13 15:14:14.618258
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "f623e1057b00"
+down_revision = "6b495d5a4855"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    """Add the nullable JSON ``context`` column to the ``match`` table."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    context_column = sa.Column("context", sa.JSON(), nullable=True)
+    op.add_column("match", context_column)
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    """Drop the ``context`` column from the ``match`` table."""
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column(table_name="match", column_name="context")
+    # ### end Alembic commands ###
diff --git a/src/models/match.py b/src/models/match.py
@@ -22,3 +22,4 @@ class Match(SQLModel, table=True):
         )
     )
     job: Job = Relationship(back_populates="matches")
+    context: Dict[str, List[Dict[str, str]]] = Field(sa_column=Column(JSON))
diff --git a/src/tasks.py b/src/tasks.py
@@ -1,4 +1,5 @@
-from typing import List, Optional, cast
+import base64
+from typing import List, Optional, cast, Dict
 import logging
 from rq import get_current_job, Queue  # type: ignore
 from redis import Redis
@@ -68,7 +69,12 @@ def get_datasets(self) -> List[str]:
         return list(result["result"]["datasets"].keys())
 
     def update_metadata(
-        self, job: JobId, orig_name: str, path: str, matches: List[str]
+        self,
+        job: JobId,
+        orig_name: str,
+        path: str,
+        matches: List[str],
+        context: Dict[str, List[Dict[str, str]]],
     ) -> None:
         """Saves matches to the database, and runs appropriate metadata
         plugins.
@@ -93,9 +99,49 @@ def update_metadata(
         del metadata["path"]
 
         # Update the database.
-        match = Match(file=orig_name, meta=metadata, matches=matches)
+        match = Match(
+            file=orig_name, meta=metadata, matches=matches, context=context
+        )
         self.db.add_match(job, match)
 
+    @staticmethod
+    def read_file(file_path: str) -> bytes:
+        """Load the whole file at `file_path` into memory.
+
+        Args:
+            file_path (str): Path of the file to open in binary mode.
+
+        Returns:
+            bytes: Everything the file contains.
+        """
+        with open(file_path, "rb") as handle:
+            contents = handle.read()
+        return contents
+
+    @staticmethod
+    def read_bytes_from_offset(
+        data: bytes, matched_length: int, offset: int, byte_range: int = 32
+    ) -> tuple[bytes, bytes, bytes]:
+        """Return `matched_length` bytes from `offset`, along with `byte_range` bytes before and after the match.
+
+        Args:
+            data (bytes): Already loaded content to slice.
+            matched_length (int): Number of matched bytes starting at `offset`.
+            offset (int): Position in `data` where the match starts.
+            byte_range (int): Number of surrounding bytes to return on each
+                side of the match (default is 32).
+
+        Returns:
+            tuple[bytes, bytes, bytes]: The bytes before the match, the
+                matched bytes, and the bytes after the match. The surrounding
+                chunks are shorter than `byte_range` when the match lies near
+                the start or the end of `data`.
+        """
+        match_end = offset + matched_length
+
+        # Clamp the lower bound explicitly: a negative slice start would
+        # count from the end of `data` instead of stopping at its beginning.
+        before = data[max(0, offset - byte_range) : offset]
+        matching = data[offset:match_end]
+        # No upper clamp needed: a slice end past len(data) is truncated.
+        after = data[match_end : match_end + byte_range]
+
+        return before, matching, after
+
+
     def execute_yara(self, job: Job, files: List[str]) -> None:
         rule = yara.compile(source=job.raw_yara)
         num_matches = 0
@@ -108,10 +154,18 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                 path = self.plugins.filter(orig_name)
                 if not path:
                     continue
+
                 matches = rule.match(path)
                 if matches:
+                    data = self.read_file(path)
+                    context = self.get_match_context(data, matches)
+
                     self.update_metadata(
-                        job.id, orig_name, path, [r.rule for r in matches]
+                        job=job.id,
+                        orig_name=orig_name,
+                        path=path,
+                        matches=[r.rule for r in matches],
+                        context=context,
                     )
                     num_matches += 1
             except yara.Error:
@@ -140,6 +194,36 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
                 f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.",
             )
 
+    def get_match_context(
+        self, data: bytes, matches: List[yara.Match]
+    ) -> Dict[str, List[Dict[str, str]]]:
+        """Collect the bytes surrounding every string instance of each match.
+
+        Args:
+            data (bytes): Content of the matched file.
+            matches (List[yara.Match]): Matches returned for that file.
+
+        Returns:
+            Dict[str, List[Dict[str, str]]]: Maps `str(yara_match)` to a list
+                with one entry per string instance. Each entry holds the
+                base64-encoded "before", "matching" and "after" chunks.
+                Matches without any string instance get no key.
+        """
+        context: Dict[str, List[Dict[str, str]]] = {}
+        for yara_match in matches:
+            match_context = []
+            for string_match in yara_match.strings:
+                # Instances already handled for this string; reset per string.
+                expression_keys = []
+                for expression_key in string_match.instances:
+                    if expression_key in expression_keys:
+                        continue
+
+                    # Keyword arguments on purpose: the helper takes
+                    # `matched_length` before `offset`, so passing them
+                    # positionally in (offset, length) order would swap them.
+                    before, matching, after = self.read_bytes_from_offset(
+                        data=data,
+                        offset=expression_key.offset,
+                        matched_length=expression_key.matched_length,
+                    )
+                    # Raw bytes are base64-encoded into str values.
+                    match_context.append(
+                        {
+                            "before": base64.b64encode(before).decode("utf-8"),
+                            "matching": base64.b64encode(matching).decode(
+                                "utf-8"
+                            ),
+                            "after": base64.b64encode(after).decode("utf-8"),
+                        }
+                    )
+                    expression_keys.append(expression_key)
+
+            # Only register matches that produced at least one entry.
+            if match_context:
+                context[str(yara_match)] = match_context
+        return context
+
+
     def init_search(self, job: Job, tasks: int) -> None:
         self.db.init_jobagent(job, self.db_id, tasks)
-Original file line number
+Diff line change
@@ Expand Up / @@ -22,3 +22,4 @@ class Match(SQLModel, table=True): @@
             )
         )
         job: Job = Relationship(back_populates="matches")
+        context: Dict[str, List[Dict[str, str]]] = Field(sa_column=Column(JSON))