Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix VMRay missing process data #2396

Merged
merged 13 commits into from
Sep 26, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

- use Python 3.12 to build extra standalone build on Linux #2383 @williballenthin
- bump minimum Python version to 3.8.1 to satisfy uv #2387 @williballenthin
- collect more process information from flog.xml #2394 @mr-tz
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved

### capa explorer IDA Pro plugin

Expand Down
89 changes: 58 additions & 31 deletions capa/features/extractors/vmray/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pathlib import Path
from zipfile import ZipFile
from collections import defaultdict
from dataclasses import dataclass

from capa.exceptions import UnsupportedFormatError
from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall, xml_to_dict
Expand All @@ -21,6 +22,21 @@
SUPPORTED_FLOG_VERSIONS = ("2",)


@dataclass
class VMRayMonitorThread:
    """Thread record pairing the OS-assigned thread ID with the IDs VMRay assigns."""

    tid: int  # thread ID assigned by OS
    monitor_id: int  # unique ID assigned to thread by VMRay
    process_monitor_id: int  # unique ID assigned to containing process by VMRay
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved


@dataclass
class VMRayMonitorProcess:
    """Process record pairing OS-assigned process IDs with the ID VMRay assigns."""

    pid: int  # process ID assigned by OS
    ppid: int  # parent process ID assigned by OS (0 is stored when SummaryV2 records no parent)
    monitor_id: int  # unique ID assigned to process by VMRay
    image_name: str  # image name reported by VMRay for the process


class VMRayAnalysis:
def __init__(self, zipfile_path: Path):
self.zipfile = ZipFile(zipfile_path, "r")
Expand All @@ -45,9 +61,11 @@ def __init__(self, zipfile_path: Path):
self.exports: Dict[int, str] = {}
self.imports: Dict[int, Tuple[str, str]] = {}
self.sections: Dict[int, str] = {}
self.process_ids: Dict[int, int] = {}
self.process_threads: Dict[int, List[int]] = defaultdict(list)
self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
self.monitor_processes: Dict[int, VMRayMonitorProcess] = {}
self.monitor_threads: Dict[int, VMRayMonitorThread] = {}
self.monitor_threads_by_monitor_process: Dict[int, List[int]] = defaultdict(list)
self.monitor_process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list))
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved

self.base_address: int

self.sample_file_name: Optional[str] = None
Expand Down Expand Up @@ -79,13 +97,14 @@ def __init__(self, zipfile_path: Path):

self.sample_file_buf: bytes = self.zipfile.read(sample_file_path, pwd=DEFAULT_ARCHIVE_PASSWORD)

# do not change order, it matters
self._compute_base_address()
self._compute_imports()
self._compute_exports()
self._compute_sections()
self._compute_process_ids()
self._compute_process_threads()
self._compute_process_calls()
self._compute_monitor_processes()
self._compute_monitor_threads()
self._compute_monitor_process_calls()

def _find_sample_file(self):
for file_name, file_analysis in self.sv2.files.items():
Expand Down Expand Up @@ -128,34 +147,42 @@ def _compute_sections(self):
for elffile_section in self.sample_file_static_data.elf.sections:
self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name

def _compute_process_ids(self):
def _compute_monitor_processes(self):
for process in self.sv2.processes.values():
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved
# we expect VMRay's monitor IDs to be unique, but OS PIDs may be reused
assert process.monitor_id not in self.process_ids.keys()
self.process_ids[process.monitor_id] = process.os_pid

def _compute_process_threads(self):
# logs/flog.xml appears to be the only file that contains thread-related data
# so we use it here to map processes to threads
for function_call in self.flog.analysis.function_calls:
pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID
tid: int = function_call.thread_id
# we expect monitor IDs to be unique
assert process.monitor_id not in self.monitor_processes.keys()
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved

assert isinstance(pid, int)
assert isinstance(tid, int)

if tid not in self.process_threads[pid]:
self.process_threads[pid].append(tid)
ppid: int = (
self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0
)
self.monitor_processes[process.monitor_id] = VMRayMonitorProcess(
process.os_pid, ppid, process.monitor_id, process.image_name
)

def _compute_process_calls(self):
for function_call in self.flog.analysis.function_calls:
pid: int = self.get_process_os_pid(function_call.process_id) # flog.xml uses process monitor ID, not OS PID
tid: int = function_call.thread_id
# not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394
for monitor_process in self.flog.analysis.monitor_processes:
if monitor_process.process_id not in self.monitor_processes.keys():
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved
self.monitor_processes[monitor_process.process_id] = VMRayMonitorProcess(
monitor_process.os_pid,
monitor_process.os_parent_pid,
monitor_process.process_id,
monitor_process.image_name,
)

def _compute_monitor_threads(self):
    """Index the monitor threads recorded in flog.xml.

    Populates two lookups:
      - self.monitor_threads: monitor thread ID -> VMRayMonitorThread
      - self.monitor_threads_by_monitor_process: monitor process ID -> list of monitor thread IDs
    """
    for monitor_thread in self.flog.analysis.monitor_threads:
        # we expect monitor IDs to be unique
        assert monitor_thread.thread_id not in self.monitor_threads

        self.monitor_threads[monitor_thread.thread_id] = VMRayMonitorThread(
            monitor_thread.os_tid, monitor_thread.thread_id, monitor_thread.process_id
        )

        # a process may own many threads, but each monitor thread ID must be recorded only once
        # for its containing monitor process. The lookup must be keyed by the *process* ID:
        # keying by thread ID never detects duplicates and, because this is a defaultdict,
        # silently inserts an empty list under every thread ID.
        assert monitor_thread.thread_id not in self.monitor_threads_by_monitor_process[monitor_thread.process_id]

        self.monitor_threads_by_monitor_process[monitor_thread.process_id].append(monitor_thread.thread_id)

def get_process_os_pid(self, monitor_id: int) -> int:
return self.process_ids[monitor_id]
def _compute_monitor_process_calls(self):
for function_call in self.flog.analysis.function_calls:
self.monitor_process_calls[function_call.process_id][function_call.thread_id].append(function_call)
31 changes: 21 additions & 10 deletions capa/features/extractors/vmray/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,16 @@
import capa.features.extractors.vmray.file
import capa.features.extractors.vmray.global_
from capa.features.common import Feature, Characteristic
from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress
from capa.features.extractors.vmray import VMRayAnalysis
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, Process, ParamList, FunctionCall
from capa.features.address import (
NO_ADDRESS,
Address,
ThreadAddress,
ProcessAddress,
DynamicCallAddress,
AbsoluteVirtualAddress,
)
from capa.features.extractors.vmray import VMRayAnalysis, VMRayMonitorThread, VMRayMonitorProcess
from capa.features.extractors.vmray.models import PARAM_TYPE_STR, ParamList, FunctionCall
from capa.features.extractors.base_extractor import (
CallHandle,
SampleHashes,
Expand Down Expand Up @@ -69,20 +76,24 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
yield from self.global_features

def get_processes(self) -> Iterator[ProcessHandle]:
yield from capa.features.extractors.vmray.file.get_processes(self.analysis)
for monitor_process in self.analysis.monitor_processes.values():
address: ProcessAddress = ProcessAddress(pid=monitor_process.pid, ppid=monitor_process.ppid)
yield ProcessHandle(address, inner=monitor_process)

def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
# we have not identified process-specific features for VMRay yet
yield from []

def get_process_name(self, ph) -> str:
process: Process = ph.inner
return process.image_name
monitor_process: VMRayMonitorProcess = ph.inner
return monitor_process.image_name

def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
for thread in self.analysis.process_threads[ph.address.pid]:
address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
yield ThreadHandle(address=address, inner={})
for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]:
monitor_thread: VMRayMonitorThread = self.analysis.monitor_threads[monitor_thread_id]

address: ThreadAddress = ThreadAddress(process=ph.address, tid=monitor_thread.tid)
yield ThreadHandle(address=address, inner=monitor_thread)

def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
if False:
Expand All @@ -92,7 +103,7 @@ def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterat
return

def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]:
for function_call in self.analysis.monitor_process_calls[ph.inner.monitor_id][th.inner.monitor_id]:
addr = DynamicCallAddress(thread=th.address, id=function_call.fncall_id)
yield CallHandle(address=addr, inner=function_call)

Expand Down
23 changes: 2 additions & 21 deletions capa/features/extractors/vmray/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,37 +6,18 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import Dict, Tuple, Iterator
from typing import Tuple, Iterator

import capa.features.extractors.common
from capa.features.file import Export, Import, Section
from capa.features.common import String, Feature
from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress
from capa.features.extractors.vmray import VMRayAnalysis
from capa.features.extractors.helpers import generate_symbols
from capa.features.extractors.vmray.models import Process
from capa.features.extractors.base_extractor import ProcessHandle

logger = logging.getLogger(__name__)


def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]:
processes: Dict[str, Process] = analysis.sv2.processes

for process in processes.values():
# we map VMRay's monitor ID to the OS PID to make it easier for users
# to follow the processes in capa's output
pid: int = analysis.get_process_os_pid(process.monitor_id)
ppid: int = (
analysis.get_process_os_pid(processes[process.ref_parent_process.path[1]].monitor_id)
if process.ref_parent_process
else 0
)

addr: ProcessAddress = ProcessAddress(pid=pid, ppid=ppid)
yield ProcessHandle(address=addr, inner=process)


def extract_export_names(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
for addr, name in analysis.exports.items():
yield Export(name), AbsoluteVirtualAddress(addr)
Expand Down
40 changes: 37 additions & 3 deletions capa/features/extractors/vmray/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ class Param(BaseModel):
deref: Optional[ParamDeref] = None


def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
def validate_ensure_is_list(value: Union[List[Param], Param]) -> List[Param]:
if isinstance(value, list):
return value
else:
Expand All @@ -97,7 +97,7 @@ def validate_param_list(value: Union[List[Param], Param]) -> List[Param]:
# params may be stored as a list of Param or a single Param so we convert
# the input value to Python list type before the inner validation (List[Param])
# is called
ParamList = Annotated[List[Param], BeforeValidator(validate_param_list)]
ParamList = Annotated[List[Param], BeforeValidator(validate_ensure_is_list)]


class Params(BaseModel):
Expand Down Expand Up @@ -137,12 +137,46 @@ class FunctionReturn(BaseModel):
from_addr: HexInt = Field(alias="from")


class MonitorProcess(BaseModel):
    """A `monitor_process` entry parsed from the flog analysis.

    Commented-out fields are left unparsed.
    """

    ts: HexInt  # presumably a timestamp -- TODO confirm against flog.xml
    process_id: int  # used by VMRayAnalysis as the process's VMRay monitor ID
    image_name: str
    filename: str
    # page_root: HexInt
    os_pid: HexInt  # used by VMRayAnalysis as the OS process ID
    # os_integrity_level: HexInt
    # os_privileges: HexInt
    monitor_reason: str
    parent_id: int  # presumably the parent's VMRay monitor ID -- TODO confirm
    os_parent_pid: HexInt  # used by VMRayAnalysis as the OS parent process ID
    # cmd_line: str
    # cur_dir: str
    # os_username: str
    # bitness: int
    # os_groups: str


class MonitorThread(BaseModel):
    """A `monitor_thread` entry parsed from the flog analysis."""

    ts: HexInt  # presumably a timestamp -- TODO confirm against flog.xml
    thread_id: int  # used by VMRayAnalysis as the thread's VMRay monitor ID
    process_id: int  # used by VMRayAnalysis as the containing process's VMRay monitor ID
    os_tid: HexInt  # used by VMRayAnalysis as the OS thread ID


# handle the case where there is only a single entry but the model expects a list
MonitorProcessList = Annotated[List[MonitorProcess], BeforeValidator(validate_ensure_is_list)]
MonitorThreadList = Annotated[List[MonitorThread], BeforeValidator(validate_ensure_is_list)]
FunctionCallList = Annotated[List[FunctionCall], BeforeValidator(validate_ensure_is_list)]


class Analysis(BaseModel):
log_version: str # tested 2
analyzer_version: str # tested 2024.2.1
# analysis_date: str

function_calls: List[FunctionCall] = Field(alias="fncall", default=[])
monitor_processes: MonitorProcessList = Field(alias="monitor_process", default=[])
monitor_threads: MonitorThreadList = Field(alias="monitor_thread", default=[])
function_calls: FunctionCallList = Field(alias="fncall", default=[])
# function_returns: List[FunctionReturn] = Field(alias="fnret", default=[])


Expand Down
8 changes: 8 additions & 0 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,14 @@ def get_data_path_by_name(name) -> Path:
/ "vmray"
/ "93b2d1840566f45fab674ebc79a9d19c88993bcb645e0357f3cb584d16e7c795_min_archive.zip"
)
elif name.startswith("2f8a79-vmray"):
return (
CD
/ "data"
/ "dynamic"
/ "vmray"
/ "2f8a79b12a7a989ac7e5f6ec65050036588a92e65aeb6841e08dc228ff0e21b4_min_archive.zip"
)
elif name.startswith("ea2876"):
return CD / "data" / "ea2876e9175410b6f6719f80ee44b9553960758c7d0f7bed73c0fe9a78d8e669.dll_"
elif name.startswith("1038a2"):
Expand Down
39 changes: 23 additions & 16 deletions tests/test_vmray_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,21 @@
# file/imports
("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), True),
# thread/api calls
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), True),
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("DoesNotExist"), False),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), True),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("DoesNotExist"), False),
# call/api
("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), True),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), True),
# call/string argument
(
"93b2d1-vmray",
"process=(2176:0),thread=7,call=10323",
"process=(2176:0),thread=2420,call=10323",
capa.features.common.String("raw.githubusercontent.com"),
True,
),
# call/number argument
# VirtualAlloc(4096, 4)
("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4096), True),
("93b2d1-vmray", "process=(2176:0),thread=7,call=2358", capa.features.insn.Number(4), True),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4096), True),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2358", capa.features.insn.Number(4), True),
],
# order tests by (file, item)
# so that our LRU cache is most effective.
Expand All @@ -46,24 +46,24 @@
# file/imports
("93b2d1-vmray", "file", capa.features.file.Import("GetAddrInfoW"), 1),
# thread/api calls
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=7", capa.features.insn.API("GetAddrInfoW"), 5),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420", capa.features.insn.API("GetAddrInfoW"), 5),
# call/api
("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=7,call=2345", capa.features.insn.API("GetAddrInfoW"), 0),
("93b2d1-vmray", "process=(2176:0),thread=7,call=2361", capa.features.insn.API("GetAddrInfoW"), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("free"), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2345", capa.features.insn.API("GetAddrInfoW"), 0),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=2361", capa.features.insn.API("GetAddrInfoW"), 1),
# call/string argument
(
"93b2d1-vmray",
"process=(2176:0),thread=7,call=10323",
"process=(2176:0),thread=2420,call=10323",
capa.features.common.String("raw.githubusercontent.com"),
1,
),
("93b2d1-vmray", "process=(2176:0),thread=7,call=10323", capa.features.common.String("non_existant"), 0),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10323", capa.features.common.String("non_existant"), 0),
# call/number argument
("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4096), 1),
("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(4), 1),
("93b2d1-vmray", "process=(2176:0),thread=7,call=10315", capa.features.insn.Number(404), 0),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4096), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(4), 1),
("93b2d1-vmray", "process=(2176:0),thread=2420,call=10315", capa.features.insn.Number(404), 0),
],
# order tests by (file, item)
# so that our LRU cache is most effective.
Expand All @@ -87,3 +87,10 @@ def test_vmray_features(sample, scope, feature, expected):
)
def test_vmray_feature_counts(sample, scope, feature, expected):
fixtures.do_test_feature_count(fixtures.get_vmray_extractor, sample, scope, feature, expected)


def test_vmray_processes():
    # see #2394
    archive_path = fixtures.get_data_path_by_name("2f8a79-vmray")
    extractor = fixtures.get_vmray_extractor(archive_path)
    process_count = len(extractor.analysis.monitor_processes)
    assert process_count == 9