From 893378c10e23fe3683847e41e8d87015900062fa Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 12 Dec 2024 09:42:25 +0000 Subject: [PATCH 1/5] record origin_monitor_id for more reliable process association tmp --- capa/features/extractors/vmray/__init__.py | 15 ++++++++++++++- capa/features/extractors/vmray/models.py | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index dc719211a..af59a3208 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -34,6 +34,7 @@ class VMRayMonitorProcess: pid: int # process ID assigned by OS ppid: int # parent process ID assigned by OS monitor_id: int # unique ID assigned to process by VMRay + origin_monitor_id: int # unique VMRay ID of parent process image_name: str filename: str cmd_line: str @@ -165,6 +166,7 @@ def _compute_monitor_processes(self): process.os_pid, ppid, process.monitor_id, + process.origin_monitor_id, process.image_name, process.filename, process.cmd_line, @@ -176,6 +178,7 @@ def _compute_monitor_processes(self): monitor_process.os_pid, monitor_process.os_parent_pid, monitor_process.process_id, + monitor_process.parent_id, monitor_process.image_name, monitor_process.filename, monitor_process.cmd_line, @@ -185,7 +188,17 @@ def _compute_monitor_processes(self): self.monitor_processes[monitor_process.process_id] = vmray_monitor_process else: # we expect monitor processes recorded in both SummaryV2.json and flog.xml to equal - assert self.monitor_processes[monitor_process.process_id] == vmray_monitor_process + # to ensure this, we compare the pid, monitor_id, and origin_monitor_id + # for the other fields we've observed cases with slight deviations, e.g., + # the ppid for a process in flog.xml is not set correctly, all other data is equal + sv2p = self.monitor_processes[monitor_process.process_id] + assert (sv2p.pid, sv2p.monitor_id, sv2p.origin_monitor_id) == ( 
+ vmray_monitor_process.pid, + vmray_monitor_process.monitor_id, + vmray_monitor_process.origin_monitor_id, + ) + if self.monitor_processes[monitor_process.process_id] != vmray_monitor_process: + logger.debug("processes differ: %s (sv2) vs. %s (flog)", sv2p, vmray_monitor_process) def _compute_monitor_threads(self): for monitor_thread in self.flog.analysis.monitor_threads: diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py index 755f494fe..027680586 100644 --- a/capa/features/extractors/vmray/models.py +++ b/capa/features/extractors/vmray/models.py @@ -314,6 +314,7 @@ class Process(BaseModel): # is_ioc: bool monitor_id: int # monitor_reason: str + origin_monitor_id: int # VMRay ID of parent process os_pid: int filename: SanitizedString image_name: str From 55720ddbfd067f2920fa7a31c1e5c8b0d3cec105 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 12 Dec 2024 09:43:45 +0000 Subject: [PATCH 2/5] make more fields optional for more flexible model tmp --- capa/features/extractors/vmray/__init__.py | 9 +++++---- capa/features/extractors/vmray/models.py | 6 +++--- tests/test_vmray_model.py | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index af59a3208..71d9afd10 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -36,8 +36,8 @@ class VMRayMonitorProcess: monitor_id: int # unique ID assigned to process by VMRay origin_monitor_id: int # unique VMRay ID of parent process image_name: str - filename: str - cmd_line: str + filename: Optional[str] = "" + cmd_line: Optional[str] = "" class VMRayAnalysis: @@ -151,8 +151,9 @@ def _compute_sections(self): for pefile_section in self.sample_file_static_data.pe.sections: self.sections[pefile_section.virtual_address] = pefile_section.name elif self.sample_file_static_data.elf: - for elffile_section in 
self.sample_file_static_data.elf.sections: - self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name + if self.sample_file_static_data.elf.sections: + for elffile_section in self.sample_file_static_data.elf.sections: + self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name def _compute_monitor_processes(self): for process in self.sv2.processes.values(): diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py index 027680586..36cd261e3 100644 --- a/capa/features/extractors/vmray/models.py +++ b/capa/features/extractors/vmray/models.py @@ -276,7 +276,7 @@ class ElfFileHeader(BaseModel): class ElfFile(BaseModel): # file_header: ElfFileHeader - sections: list[ElfFileSection] + sections: Optional[list[ElfFileSection]] = None class StaticData(BaseModel): @@ -316,9 +316,9 @@ class Process(BaseModel): # monitor_reason: str origin_monitor_id: int # VMRay ID of parent process os_pid: int - filename: SanitizedString + filename: Optional[SanitizedString] = "" image_name: str - cmd_line: SanitizedString + cmd_line: Optional[SanitizedString] = "" ref_parent_process: Optional[GenericReference] = None diff --git a/tests/test_vmray_model.py b/tests/test_vmray_model.py index c693b6631..58d8a9ccc 100644 --- a/tests/test_vmray_model.py +++ b/tests/test_vmray_model.py @@ -103,8 +103,8 @@ def test_vmray_model_elffile(): """ ) - assert elffile.sections[0].header.sh_name == "abcd1234" - assert elffile.sections[0].header.sh_addr == 2863311530 + assert elffile.sections and elffile.sections[0].header.sh_name == "abcd1234" + assert elffile.sections and elffile.sections[0].header.sh_addr == 2863311530 def test_vmray_model_pefile(): From 06f0012183daf7692aeff90d0f28ad599eb13712 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 12 Dec 2024 09:48:03 +0000 Subject: [PATCH 3/5] only check file limitations for static file formats --- capa/main.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) 
diff --git a/capa/main.py b/capa/main.py index 2e3a5900c..4e9dbd13a 100644 --- a/capa/main.py +++ b/capa/main.py @@ -748,15 +748,13 @@ def find_file_limitations_from_cli(args, rules: RuleSet, file_extractors: list[F args: args: The parsed command line arguments from `install_common_args`. + Dynamic feature extractors can handle packed samples and do not need to be considered here. + raises: ShouldExitError: if the program is invoked incorrectly and should exit. """ found_file_limitation = False for file_extractor in file_extractors: - if isinstance(file_extractor, DynamicFeatureExtractor): - # Dynamic feature extractors can handle packed samples - continue - try: pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {}) except PEFormatError as e: @@ -962,8 +960,11 @@ def main(argv: Optional[list[str]] = None): ensure_input_exists_from_cli(args) input_format = get_input_format_from_cli(args) rules = get_rules_from_cli(args) - file_extractors = get_file_extractors_from_cli(args, input_format) - found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) + found_file_limitation = False + if input_format in STATIC_FORMATS: + # only static extractors have file limitations + file_extractors = get_file_extractors_from_cli(args, input_format) + found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) except ShouldExitError as e: return e.status_code From 1f34795fce9ce784ad1424383b3546736f3abcd0 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Thu, 12 Dec 2024 10:12:14 +0000 Subject: [PATCH 4/5] vmray and dynamic updates --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0c53ad9f..ec3369aef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,9 @@ ### Bug Fixes +- vmray: load more analysis archives @mr-tz +- dynamic: only check file limitations for static file formats @mr-tz + ### capa Explorer Web ### capa Explorer IDA Pro plugin From 
51d606bc0d5c10f2df7b0e036b3b951c7c1cfbb5 Mon Sep 17 00:00:00 2001 From: mr-tz Date: Fri, 13 Dec 2024 11:51:47 +0000 Subject: [PATCH 5/5] use default empty list for ElfFileSection --- capa/features/extractors/vmray/__init__.py | 10 +++++----- capa/features/extractors/vmray/models.py | 2 +- tests/test_vmray_model.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index 71d9afd10..94226a5af 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -151,9 +151,8 @@ def _compute_sections(self): for pefile_section in self.sample_file_static_data.pe.sections: self.sections[pefile_section.virtual_address] = pefile_section.name elif self.sample_file_static_data.elf: - if self.sample_file_static_data.elf.sections: - for elffile_section in self.sample_file_static_data.elf.sections: - self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name + for elffile_section in self.sample_file_static_data.elf.sections: + self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name def _compute_monitor_processes(self): for process in self.sv2.processes.values(): @@ -193,13 +192,14 @@ def _compute_monitor_processes(self): # for the other fields we've observed cases with slight deviations, e.g., # the ppid for a process in flog.xml is not set correctly, all other data is equal sv2p = self.monitor_processes[monitor_process.process_id] + if self.monitor_processes[monitor_process.process_id] != vmray_monitor_process: + logger.debug("processes differ: %s (sv2) vs. %s (flog)", sv2p, vmray_monitor_process) + assert (sv2p.pid, sv2p.monitor_id, sv2p.origin_monitor_id) == ( vmray_monitor_process.pid, vmray_monitor_process.monitor_id, vmray_monitor_process.origin_monitor_id, ) - if self.monitor_processes[monitor_process.process_id] != vmray_monitor_process: - logger.debug("processes differ: %s (sv2) vs. 
%s (flog)", sv2p, vmray_monitor_process) def _compute_monitor_threads(self): for monitor_thread in self.flog.analysis.monitor_threads: diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py index 36cd261e3..761a879ba 100644 --- a/capa/features/extractors/vmray/models.py +++ b/capa/features/extractors/vmray/models.py @@ -276,7 +276,7 @@ class ElfFileHeader(BaseModel): class ElfFile(BaseModel): # file_header: ElfFileHeader - sections: Optional[list[ElfFileSection]] = None + sections: list[ElfFileSection] = [] class StaticData(BaseModel): diff --git a/tests/test_vmray_model.py b/tests/test_vmray_model.py index 58d8a9ccc..c693b6631 100644 --- a/tests/test_vmray_model.py +++ b/tests/test_vmray_model.py @@ -103,8 +103,8 @@ def test_vmray_model_elffile(): """ ) - assert elffile.sections and elffile.sections[0].header.sh_name == "abcd1234" - assert elffile.sections and elffile.sections[0].header.sh_addr == 2863311530 + assert elffile.sections[0].header.sh_name == "abcd1234" + assert elffile.sections[0].header.sh_addr == 2863311530 def test_vmray_model_pefile():