diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8b77d3524..262b600e8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: hooks: - id: isort name: isort - stages: [commit, push, manual] + stages: [pre-commit, pre-push, manual] language: system entry: isort args: @@ -46,7 +46,7 @@ repos: hooks: - id: black name: black - stages: [commit, push, manual] + stages: [pre-commit, pre-push, manual] language: system entry: black args: @@ -64,7 +64,7 @@ repos: hooks: - id: ruff name: ruff - stages: [commit, push, manual] + stages: [pre-commit, pre-push, manual] language: system entry: ruff args: @@ -82,7 +82,7 @@ repos: hooks: - id: flake8 name: flake8 - stages: [push, manual] + stages: [pre-push, manual] language: system entry: flake8 args: @@ -101,7 +101,7 @@ repos: hooks: - id: mypy name: mypy - stages: [push, manual] + stages: [pre-push, manual] language: system entry: mypy args: @@ -119,7 +119,7 @@ repos: hooks: - id: deptry name: deptry - stages: [push, manual] + stages: [pre-push, manual] language: system entry: deptry . always_run: true diff --git a/CHANGELOG.md b/CHANGELOG.md index d4aa64f14..5c82e2507 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,12 +6,13 @@ - allow call as valid subscope for call scoped rules @mr-tz - support loading and analyzing a Binary Ninja database #2496 @xusheng6 +- vmray: record process command line details @mr-tz ### Breaking Changes - remove support for Python 3.8 and use Python 3.10 as minimum now #1966 @mr-tz -### New Rules (10) +### New Rules (18) - nursery/get-shadow-password-file-entry-on-linux jonathanlepore@google.com - nursery/set-shadow-password-file-entry-on-linux jonathanlepore@google.com @@ -23,6 +24,14 @@ - nursery/persist-via-print-processors-registry-key j.j.vannielen@utwente.nl - linking/static/touchsocket/linked-against-touchsocket still@teamt5.org - runtime/dotnet/compiled-with-dotnet-aot still@teamt5.org +- nursery/persist-via-errorhandler-script j.j.vannielen@utwente.nl +- nursery/persist-via-get-variable-hijack j.j.vannielen@utwente.nl +- nursery/persist-via-iphlpapi-dll-hijack j.j.vannielen@utwente.nl +- nursery/persist-via-lnk-shortcut j.j.vannielen@utwente.nl +- nursery/persist-via-powershell-profile j.j.vannielen@utwente.nl +- nursery/persist-via-windows-accessibility-tools j.j.vannielen@utwente.nl +- nursery/persist-via-windows-terminal-profile j.j.vannielen@utwente.nl +- nursery/write-to-browser-extension-directory j.j.vannielen@utwente.nl - ### Bug Fixes @@ -33,6 +42,9 @@ - binja: support loading raw x86/x86_64 shellcode #2489 @xusheng6 - binja: fix crash when the IL of certain functions are not available. #2249 @xusheng6 - binja: major performance improvement on the binja extractor. #1414 @xusheng6 +- cape: make Process model flexible and procmemory optional to load newest reports #2466 @mr-tz +- binja: fix unit test failure by fixing up the analysis for file al-khaser_x64.exe_ #2507 @xusheng6 +- binja: move the stack string detection to function level #2516 @xusheng6 ### capa Explorer Web diff --git a/capa/features/extractors/binja/basicblock.py b/capa/features/extractors/binja/basicblock.py index 5cb8ca138..2e47770b5 100644 --- a/capa/features/extractors/binja/basicblock.py +++ b/capa/features/extractors/binja/basicblock.py @@ -5,111 +5,21 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. - -import string from typing import Iterator -from binaryninja import Function from binaryninja import BasicBlock as BinjaBasicBlock -from binaryninja import ( - BinaryView, - SymbolType, - RegisterValueType, - VariableSourceType, - MediumLevelILOperation, - MediumLevelILBasicBlock, - MediumLevelILInstruction, -) from capa.features.common import Feature, Characteristic from capa.features.address import Address from capa.features.basicblock import BasicBlock -from capa.features.extractors.helpers import MIN_STACKSTRING_LEN from capa.features.extractors.base_extractor import BBHandle, FunctionHandle -def get_printable_len_ascii(s: bytes) -> int: - """Return string length if all operand bytes are ascii or utf16-le printable""" - count = 0 - for c in s: - if c == 0: - return count - if c < 127 and chr(c) in string.printable: - count += 1 - return count - - -def get_printable_len_wide(s: bytes) -> int: - """Return string length if all operand bytes are ascii or utf16-le printable""" - if all(c == 0x00 for c in s[1::2]): - return get_printable_len_ascii(s[::2]) - return 0 - - -def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int: - bv: BinaryView = f.view - - if il.operation != MediumLevelILOperation.MLIL_CALL: - return 0 - - target = il.dest - if target.operation not in [MediumLevelILOperation.MLIL_CONST, MediumLevelILOperation.MLIL_CONST_PTR]: - return 0 - - addr = target.value.value - sym = bv.get_symbol_at(addr) - if not sym or sym.type not in [SymbolType.LibraryFunctionSymbol, SymbolType.SymbolicFunctionSymbol]: - return 0 - - if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]: - return 0 - - if len(il.params) < 2: - return 0 - - dest = il.params[0] - if dest.operation in [MediumLevelILOperation.MLIL_ADDRESS_OF, MediumLevelILOperation.MLIL_VAR]: - var = dest.src - else: - return 0 - - if var.source_type != VariableSourceType.StackVariableSourceType: - return 0 - - src = il.params[1] - if src.value.type != RegisterValueType.ConstantDataAggregateValue: - return 0 - - s = f.get_constant_data(RegisterValueType.ConstantDataAggregateValue, src.value.value) - return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s))) - - -def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool: - """check basic block for stackstring indicators - - true if basic block contains enough moves of constant bytes to the stack - """ - count = 0 - for il in bb: - count += get_stack_string_len(f, il) - if count > MIN_STACKSTRING_LEN: - return True - - return False - - -def extract_bb_stackstring(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]: - """extract stackstring indicators from basic block""" - bb: tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner - if bb[1] is not None and bb_contains_stackstring(fh.inner, bb[1]): - yield Characteristic("stack string"), bbh.address - - def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]: """extract tight loop indicators from a basic block""" - bb: tuple[BinjaBasicBlock, MediumLevelILBasicBlock] = bbh.inner - for edge in bb[0].outgoing_edges: - if edge.target.start == bb[0].start: + bb: BinjaBasicBlock = bbh.inner + for edge in bb.outgoing_edges: + if edge.target.start == bb.start: yield Characteristic("tight loop"), bbh.address @@ -121,7 +31,4 @@ def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Featur yield BasicBlock(), bbh.address -BASIC_BLOCK_HANDLERS = ( - extract_bb_tight_loop, - extract_bb_stackstring, -) +BASIC_BLOCK_HANDLERS = (extract_bb_tight_loop,) diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py index 1d4dd6bd7..953cde76e 100644 --- a/capa/features/extractors/binja/extractor.py +++ b/capa/features/extractors/binja/extractor.py @@ -8,7 +8,6 @@ from typing import Iterator import binaryninja as binja -from binaryninja import ILException import capa.features.extractors.elf import capa.features.extractors.binja.file @@ -54,23 +53,8 @@ def extract_function_features(self, fh: FunctionHandle) -> Iterator[tuple[Featur def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: f: binja.Function = fh.inner - # Set up a MLIL basic block dict look up to associate the disassembly basic block with its MLIL basic block - mlil_lookup = {} - try: - mlil = f.mlil - except ILException: - return - - if mlil is None: - return - - for mlil_bb in mlil.basic_blocks: - mlil_lookup[mlil_bb.source_block.start] = mlil_bb - for bb in f.basic_blocks: - mlil_bb = mlil_lookup.get(bb.start) - - yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=(bb, mlil_bb)) + yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=bb) def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[tuple[Feature, Address]]: yield from capa.features.extractors.binja.basicblock.extract_features(fh, bbh) @@ -78,10 +62,10 @@ def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Ite def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: import capa.features.extractors.binja.helpers as binja_helpers - bb: tuple[binja.BasicBlock, binja.MediumLevelILBasicBlock] = bbh.inner - addr = bb[0].start + bb: binja.BasicBlock = bbh.inner + addr = bb.start - for text, length in bb[0]: + for text, length in bb: insn = binja_helpers.DisassemblyInstruction(addr, length, text) yield InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn) addr += length diff --git a/capa/features/extractors/binja/function.py b/capa/features/extractors/binja/function.py index 18973539b..c7c017d1b 100644 --- a/capa/features/extractors/binja/function.py +++ b/capa/features/extractors/binja/function.py @@ -5,14 +5,27 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import string from typing import Iterator -from binaryninja import Function, BinaryView, SymbolType, LowLevelILOperation +from binaryninja import ( + Function, + BinaryView, + SymbolType, + ILException, + RegisterValueType, + VariableSourceType, + LowLevelILOperation, + MediumLevelILOperation, + MediumLevelILBasicBlock, + MediumLevelILInstruction, +) from capa.features.file import FunctionName from capa.features.common import Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors import loops +from capa.features.extractors.helpers import MIN_STACKSTRING_LEN from capa.features.extractors.binja.helpers import get_llil_instr_at_addr from capa.features.extractors.base_extractor import FunctionHandle @@ -95,10 +108,103 @@ def extract_function_name(fh: FunctionHandle): yield FunctionName(name[1:]), sym.address +def get_printable_len_ascii(s: bytes) -> int: + """Return string length if all operand bytes are ascii or utf16-le printable""" + count = 0 + for c in s: + if c == 0: + return count + if c < 127 and chr(c) in string.printable: + count += 1 + return count + + +def get_printable_len_wide(s: bytes) -> int: + """Return string length if all operand bytes are ascii or utf16-le printable""" + if all(c == 0x00 for c in s[1::2]): + return get_printable_len_ascii(s[::2]) + return 0 + + +def get_stack_string_len(f: Function, il: MediumLevelILInstruction) -> int: + bv: BinaryView = f.view + + if il.operation != MediumLevelILOperation.MLIL_CALL: + return 0 + + target = il.dest + if target.operation not in [MediumLevelILOperation.MLIL_CONST, MediumLevelILOperation.MLIL_CONST_PTR]: + return 0 + + addr = target.value.value + sym = bv.get_symbol_at(addr) + if not sym or sym.type not in [SymbolType.LibraryFunctionSymbol, SymbolType.SymbolicFunctionSymbol]: + return 0 + + if sym.name not in ["__builtin_strncpy", "__builtin_strcpy", "__builtin_wcscpy"]: + return 0 + + if len(il.params) < 2: + return 0 + + dest = il.params[0] + if dest.operation in [MediumLevelILOperation.MLIL_ADDRESS_OF, MediumLevelILOperation.MLIL_VAR]: + var = dest.src + else: + return 0 + + if var.source_type != VariableSourceType.StackVariableSourceType: + return 0 + + src = il.params[1] + if src.value.type != RegisterValueType.ConstantDataAggregateValue: + return 0 + + s = f.get_constant_data(RegisterValueType.ConstantDataAggregateValue, src.value.value) + return max(get_printable_len_ascii(bytes(s)), get_printable_len_wide(bytes(s))) + + +def bb_contains_stackstring(f: Function, bb: MediumLevelILBasicBlock) -> bool: + """check basic block for stackstring indicators + + true if basic block contains enough moves of constant bytes to the stack + """ + count = 0 + for il in bb: + count += get_stack_string_len(f, il) + if count > MIN_STACKSTRING_LEN: + return True + + return False + + +def extract_stackstring(fh: FunctionHandle): + """extract stackstring indicators""" + func: Function = fh.inner + bv: BinaryView = func.view + if bv is None: + return + + try: + mlil = func.mlil + except ILException: + return + + for block in mlil.basic_blocks: + if bb_contains_stackstring(func, block): + yield Characteristic("stack string"), block.source_block.start + + def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]: for func_handler in FUNCTION_HANDLERS: for feature, addr in func_handler(fh): yield feature, addr -FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call, extract_function_name) +FUNCTION_HANDLERS = ( + extract_function_calls_to, + extract_function_loop, + extract_recursive_call, + extract_function_name, + extract_stackstring, +) diff --git a/capa/features/extractors/binja/insn.py b/capa/features/extractors/binja/insn.py index 618ee7a13..90be9a55f 100644 --- a/capa/features/extractors/binja/insn.py +++ b/capa/features/extractors/binja/insn.py @@ -45,14 +45,15 @@ def is_stub_function(bv: BinaryView, addr: int) -> Optional[int]: ]: return None - if llil.dest.value.type not in [ - RegisterValueType.ImportedAddressValue, - RegisterValueType.ConstantValue, - RegisterValueType.ConstantPointerValue, + # The LLIL instruction retrieved by `get_llil_instr_at_addr` did not go through a full analysis, so we cannot check + # `llil.dest.value.type` here + if llil.dest.operation not in [ + LowLevelILOperation.LLIL_CONST, + LowLevelILOperation.LLIL_CONST_PTR, ]: return None - return llil.dest.value.value + return llil.dest.constant def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[tuple[Feature, Address]]: @@ -358,7 +359,7 @@ def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index # e.g., , (LLIL_SET_REG). So we do not need to check whether the two operands are the same. if il.operation == LowLevelILOperation.LLIL_XOR: # Exclude cases related to the stack cookie - if is_nzxor_stack_cookie(fh.inner, bbh.inner[0], il): + if is_nzxor_stack_cookie(fh.inner, bbh.inner, il): return False results.append((Characteristic("nzxor"), ih.address)) return False diff --git a/capa/features/extractors/cape/models.py b/capa/features/extractors/cape/models.py index 20bedec24..c37eddd2a 100644 --- a/capa/features/extractors/cape/models.py +++ b/capa/features/extractors/cape/models.py @@ -297,7 +297,10 @@ class Call(ExactModel): id: int -class Process(ExactModel): +# FlexibleModel to account for extended fields +# refs: https://github.com/mandiant/capa/issues/2466 +# https://github.com/kevoreilly/CAPEv2/pull/2199 +class Process(FlexibleModel): process_id: int process_name: str parent_id: int @@ -400,7 +403,7 @@ class CapeReport(FlexibleModel): CAPE: Optional[Union[Cape, list]] = None dropped: Optional[list[File]] = None procdump: Optional[list[ProcessFile]] = None - procmemory: ListTODO + procmemory: Optional[ListTODO] = None # ========================================================================= # information we won't use in capa diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index a8976cd8c..dc719211a 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -35,6 +35,8 @@ class VMRayMonitorProcess: ppid: int # parent process ID assigned by OS monitor_id: int # unique ID assigned to process by VMRay image_name: str + filename: str + cmd_line: str class VMRayAnalysis: @@ -160,7 +162,12 @@ def _compute_monitor_processes(self): self.sv2.processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0 ) self.monitor_processes[process.monitor_id] = VMRayMonitorProcess( - process.os_pid, ppid, process.monitor_id, process.image_name + process.os_pid, + ppid, + process.monitor_id, + process.image_name, + process.filename, + process.cmd_line, ) # not all processes are recorded in SummaryV2.json, get missing data from flog.xml, see #2394 @@ -170,6 +177,8 @@ def _compute_monitor_processes(self): monitor_process.os_parent_pid, monitor_process.process_id, monitor_process.image_name, + monitor_process.filename, + monitor_process.cmd_line, ) if monitor_process.process_id not in self.monitor_processes: diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py index a9f0491c9..7f40f25da 100644 --- a/capa/features/extractors/vmray/extractor.py +++ b/capa/features/extractors/vmray/extractor.py @@ -86,7 +86,7 @@ def extract_process_features(self, ph: ProcessHandle) -> Iterator[tuple[Feature, def get_process_name(self, ph) -> str: monitor_process: VMRayMonitorProcess = ph.inner - return monitor_process.image_name + return f"{monitor_process.image_name} ({monitor_process.cmd_line})" def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: for monitor_thread_id in self.analysis.monitor_threads_by_monitor_process[ph.inner.monitor_id]: diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py index c2d6551aa..755f494fe 100644 --- a/capa/features/extractors/vmray/models.py +++ b/capa/features/extractors/vmray/models.py @@ -136,11 +136,20 @@ class FunctionReturn(BaseModel): from_addr: HexInt = Field(alias="from") +def sanitize_string(value: str) -> str: + # e.g. "cmd_line": "\"C:\\Users\\38lTTV5Kii\\Desktop\\filename.exe\" ", + return value.replace("\\\\", "\\").strip(' "') + + +# unify representation +SanitizedString = Annotated[str, BeforeValidator(sanitize_string)] + + class MonitorProcess(BaseModel): ts: HexInt process_id: int image_name: str - filename: str + filename: SanitizedString # page_root: HexInt os_pid: HexInt # os_integrity_level: HexInt @@ -148,7 +157,7 @@ class MonitorProcess(BaseModel): monitor_reason: str parent_id: int os_parent_pid: HexInt - # cmd_line: str + cmd_line: SanitizedString # cur_dir: str # os_username: str # bitness: int @@ -306,8 +315,9 @@ class Process(BaseModel): monitor_id: int # monitor_reason: str os_pid: int - filename: str + filename: SanitizedString image_name: str + cmd_line: SanitizedString ref_parent_process: Optional[GenericReference] = None diff --git a/requirements.txt b/requirements.txt index eaee96623..67c792066 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ msgpack==1.0.8 networkx==3.4.2 pefile==2024.8.26 pip==24.3.1 -protobuf==5.28.2 +protobuf==5.29.0 pyasn1==0.5.1 pyasn1-modules==0.3.0 pycparser==2.22 diff --git a/rules b/rules index 16492182a..ed816a8e5 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit 16492182a3ce3f95bba56531cb81030e015edaba +Subproject commit ed816a8e53446cd21a9d634b7d0531df664ba1c1 diff --git a/tests/fixtures.py b/tests/fixtures.py index 110b7228e..ad70a1663 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -180,6 +180,12 @@ def get_binja_extractor(path: Path): if path.name.endswith("kernel32-64.dll_"): settings.set_bool("pdb.loadGlobalSymbols", old_pdb) + # TODO(xusheng6): Temporary fix for https://github.com/mandiant/capa/issues/2507. Remove this once it is fixed in + # binja + if "al-khaser_x64.exe_" in path.name: + bv.create_user_function(0x14004B4F0) + bv.update_analysis_and_wait() + extractor = capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv) # overload the extractor so that the fixture exposes `extractor.path`