Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#395: Add information about match context to the database #439

Draft
wants to merge 22 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Added context column into match table

Revision ID: f623e1057b00
Revises: 702d19cfa063
Create Date: 2024-11-13 15:14:14.618258
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
# NOTE: the docstring "Revises" line above must stay in sync with
# `down_revision` below — Alembic orders migrations by the variable,
# the docstring is informational only.
revision = "f623e1057b00"
down_revision = "702d19cfa063"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Apply the migration: add a nullable JSON ``context`` column to ``match``."""
    context_column = sa.Column("context", sa.JSON(), nullable=True)
    op.add_column("match", context_column)


def downgrade() -> None:
    """Revert the migration: drop the ``context`` column from ``match``."""
    op.drop_column(table_name="match", column_name="context")
1 change: 1 addition & 0 deletions src/models/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ class Match(SQLModel, table=True):
)
)
job: Job = Relationship(back_populates="matches")
context: Dict[str, List[Dict[str, str]]] = Field(sa_column=Column(JSON))
62 changes: 58 additions & 4 deletions src/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import List, Optional, cast
import base64
from typing import List, Optional, cast, Dict
import logging
from rq import get_current_job, Queue # type: ignore
from redis import Redis
Expand Down Expand Up @@ -68,7 +69,12 @@ def get_datasets(self) -> List[str]:
return list(result["result"]["datasets"].keys())

def update_metadata(
self, job: JobId, orig_name: str, path: str, matches: List[str]
self,
job: JobId,
orig_name: str,
path: str,
matches: List[str],
context: Dict[str, Dict[str, Dict[str, str]]],
) -> None:
"""Saves matches to the database, and runs appropriate metadata
plugins.
Expand All @@ -93,7 +99,9 @@ def update_metadata(
del metadata["path"]

# Update the database.
match = Match(file=orig_name, meta=metadata, matches=matches)
match = Match(
file=orig_name, meta=metadata, matches=matches, context=context
)
self.db.add_match(job, match)

def execute_yara(self, job: Job, files: List[str]) -> None:
Expand All @@ -108,10 +116,19 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
path = self.plugins.filter(orig_name)
if not path:
continue

matches = rule.match(path)
if matches:
with open(path, "rb") as file:
data = file.read()
context = self.get_match_context(data, matches)

self.update_metadata(
job.id, orig_name, path, [r.rule for r in matches]
job.id,
orig_name,
path,
[r.rule for r in matches],
context,
)
num_matches += 1
except yara.Error:
Expand Down Expand Up @@ -140,6 +157,28 @@ def execute_yara(self, job: Job, files: List[str]) -> None:
f"in {scanned_datasets}/{job.total_datasets} ({dataset_percent:.0%}) of datasets.",
)

@staticmethod
def get_match_context(
    data: bytes, matches: List[yara.Match]
) -> Dict[str, Dict[str, Dict[str, str]]]:
    """Build a JSON-serializable context map for every matched string.

    For each matched rule and each matched string within it, capture the
    matching bytes plus a window of surrounding bytes, base64-encoded so
    the structure can be stored in the ``match.context`` JSON column.

    :param data: Raw content of the scanned file.
    :param matches: Match objects returned by ``yara.Rules.match``.
    :return: ``{rule_name: {string_identifier:
              {"before" | "matching" | "after": base64_str}}}``
    """
    context: Dict[str, Dict[str, Dict[str, str]]] = {}
    for yara_match in matches:
        match_context: Dict[str, Dict[str, str]] = {}
        for string_match in yara_match.strings:
            # NOTE(review): only the first instance of each matched string
            # is recorded; any further occurrences are dropped — confirm
            # this is intended (the Match model annotates `context` with
            # List values, which suggests all instances may be expected).
            instance = string_match.instances[0]
            before, matching, after = read_bytes_with_context(
                data, instance.matched_length, instance.offset
            )
            # Key by the string identifier (e.g. "$a"), not by the
            # StringMatchInstance object itself: instance objects are not
            # strings and are not JSON-serializable, so using them as dict
            # keys would break persisting `context` to the JSON column and
            # violates the declared Dict[str, ...] type.
            match_context[string_match.identifier] = {
                "before": base64.b64encode(before).decode("utf-8"),
                "matching": base64.b64encode(matching).decode("utf-8"),
                "after": base64.b64encode(after).decode("utf-8"),
            }
        context[yara_match.rule] = match_context
    return context

def init_search(self, job: Job, tasks: int) -> None:
    # Register this agent for the job in the database, recording how many
    # tasks it will run (delegates to Database.init_jobagent).
    self.db.init_jobagent(job, self.db_id, tasks)

Expand Down Expand Up @@ -290,3 +329,18 @@ def run_yara_batch(job_id: JobId, iterator: str, batch_size: int) -> None:

agent.execute_yara(job, pop_result.files)
agent.add_tasks_in_progress(job, -1)


def read_bytes_with_context(
    data: bytes, matched_length: int, offset: int, byte_range: int = 32
) -> tuple[bytes, bytes, bytes]:
    """Slice a match and its surroundings out of *data*.

    Returns a ``(before, matching, after)`` triple: the ``matched_length``
    bytes starting at ``offset``, preceded and followed by at most
    ``byte_range`` bytes of surrounding data, clamped at the buffer edges.
    """
    match_end = offset + matched_length
    context_start = max(0, offset - byte_range)
    context_end = min(len(data), match_end + byte_range)
    return (
        data[context_start:offset],
        data[offset:match_end],
        data[match_end:context_end],
    )
Loading