Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#395: Add information about match context to the database #439

Draft
wants to merge 22 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Added context column into match table

Revision ID: f623e1057b00
Revises: 702d19cfa063
Create Date: 2024-11-13 15:14:14.618258
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
# NOTE: the docstring "Revises" line above must stay in sync with
# `down_revision` below — Alembic orders migrations by the variable,
# the docstring is informational only.
revision = "f623e1057b00"
down_revision = "702d19cfa063"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Apply the migration: add a nullable JSON ``context`` column to ``match``."""
    context_column = sa.Column("context", sa.JSON(), nullable=True)
    op.add_column("match", context_column)


def downgrade() -> None:
    """Revert the migration: drop the ``context`` column from ``match``."""
    op.drop_column(table_name="match", column_name="context")
1 change: 1 addition & 0 deletions src/models/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ class Match(SQLModel, table=True):
)
)
job: Job = Relationship(back_populates="matches")
context: Dict[str, List[Dict[str, str]]] = Field(sa_column=Column(JSON))
62 changes: 58 additions & 4 deletions src/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Optional, cast
import base64
from typing import List, Optional, cast, Dict
import logging
from rq import get_current_job, Queue # type: ignore
from redis import Redis
Expand Down Expand Up @@ -68,7 +69,12 @@ def get_datasets(self) -> List[str]:
return list(result["result"]["datasets"].keys())

def update_metadata(
self, job: JobId, orig_name: str, path: str, matches: List[str]
self,
job: JobId,
orig_name: str,
path: str,
matches: List[str],
context: Dict[str, Dict[str, Dict[str, str]]],
) -> None:
"""Saves matches to the database, and runs appropriate metadata
plugins.
Expand All @@ -93,7 +99,9 @@ def update_metadata(
del metadata["path"]

# Update the database.
match = Match(file=orig_name, meta=metadata, matches=matches)
match = Match(
file=orig_name, meta=metadata, matches=matches, context=context
)
self.db.add_match(job, match)

def execute_yara(self, job: Job, files: List[str]) -> None:
Expand All @@ -108,10 +116,19 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
path = self.plugins.filter(orig_name)
if not path:
continue

matches = rule.match(path)
if matches:
with open(path, "rb") as file:
data = file.read()
context = self.get_match_context(data, matches)

self.update_metadata(
job.id, orig_name, path, [r.rule for r in matches]
job.id,
orig_name,
path,
[r.rule for r in matches],
context,
)
num_matches += 1
except yara.Error:
Expand Down Expand Up @@ -140,6 +157,28 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.",
)

@staticmethod
def get_match_context(
    data: bytes, matches: List[yara.Match]
) -> Dict[str, Dict[str, Dict[str, str]]]:
    """Build a JSON-serializable context map for every matched string.

    For each matched rule and each matched string within it, capture the
    matching bytes plus a window of surrounding bytes, base64-encoded so
    the structure can be stored in the ``match.context`` JSON column.

    :param data: Raw content of the scanned file.
    :param matches: Match objects returned by ``yara.Rules.match``.
    :return: ``{rule_name: {string_identifier:
              {"before" | "matching" | "after": base64_str}}}``
    """
    context: Dict[str, Dict[str, Dict[str, str]]] = {}
    for yara_match in matches:
        match_context: Dict[str, Dict[str, str]] = {}
        for string_match in yara_match.strings:
            # NOTE(review): only the first instance of each matched string
            # is recorded; any further occurrences are dropped — confirm
            # this is intended (the Match model annotates `context` with
            # List values, which suggests all instances may be expected).
            instance = string_match.instances[0]
            before, matching, after = read_bytes_with_context(
                data, instance.matched_length, instance.offset
            )
            # Key by the string identifier (e.g. "$a"), not by the
            # StringMatchInstance object itself: instance objects are not
            # strings and are not JSON-serializable, so using them as dict
            # keys would break persisting `context` to the JSON column and
            # violates the declared Dict[str, ...] type.
            match_context[string_match.identifier] = {
                "before": base64.b64encode(before).decode("utf-8"),
                "matching": base64.b64encode(matching).decode("utf-8"),
                "after": base64.b64encode(after).decode("utf-8"),
            }
        context[yara_match.rule] = match_context
    return context

def init_search(self, job: Job, tasks: int) -> None:
    # Register this agent for the job in the database, recording how many
    # tasks it will run (delegates to Database.init_jobagent).
    self.db.init_jobagent(job, self.db_id, tasks)

Expand Down Expand Up @@ -290,3 +329,18 @@ def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None:

agent.execute_yara(job, pop_result.files)
agent.add_tasks_in_progress(job, -1)


def read_bytes_with_context(
    data: bytes, matched_length: int, offset: int, byte_range: int = 32
) -> tuple[bytes, bytes, bytes]:
    """Slice a match and its surroundings out of *data*.

    Returns a ``(before, matching, after)`` triple: the ``matched_length``
    bytes starting at ``offset``, preceded and followed by at most
    ``byte_range`` bytes of surrounding data, clamped at the buffer edges.
    """
    match_end = offset + matched_length
    context_start = max(0, offset - byte_range)
    context_end = min(len(data), match_end + byte_range)
    return (
        data[context_start:offset],
        data[offset:match_end],
        data[match_end:context_end],
    )
Loading