From 446a500f18506472af68b18d277f0d2ebfdf558e Mon Sep 17 00:00:00 2001 From: Mike Hunhoff Date: Fri, 2 Aug 2024 16:13:57 -0600 Subject: [PATCH] binexport: use masking for Number features --- .../extractors/binexport2/__init__.py | 5 +- .../extractors/binexport2/extractor.py | 20 ++- capa/features/extractors/binexport2/insn.py | 120 +++++++++--------- tests/test_binexport_features.py | 4 +- 4 files changed, 86 insertions(+), 63 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index da7bbcaf3..76731e8ac 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -14,7 +14,7 @@ import hashlib import logging import contextlib -from typing import Dict, List, Tuple, Iterator +from typing import Set, Dict, List, Tuple, Iterator from pathlib import Path from collections import defaultdict from dataclasses import dataclass @@ -390,6 +390,9 @@ class AnalysisContext: class FunctionContext: ctx: AnalysisContext flow_graph_index: int + format: Set[str] + os: Set[str] + arch: Set[str] @dataclass diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 1c3c4d393..3ed3b5d07 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -6,7 +6,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging -from typing import List, Tuple, Iterator +from typing import Set, List, Tuple, Iterator import capa.features.extractors.elf import capa.features.extractors.common @@ -15,7 +15,7 @@ import capa.features.extractors.binexport2.helpers import capa.features.extractors.binexport2.function import capa.features.extractors.binexport2.basicblock -from capa.features.common import Feature +from capa.features.common import OS, Arch, Format, Feature from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( AddressSpace, @@ -53,6 +53,20 @@ def __init__(self, be2: BinExport2, buf: bytes): self.global_features.extend(list(capa.features.extractors.common.extract_os(self.buf))) self.global_features.extend(list(capa.features.extractors.common.extract_arch(self.buf))) + self.format: Set[str] = set() + self.os: Set[str] = set() + self.arch: Set[str] = set() + + for feature, _ in self.global_features: + assert isinstance(feature.value, str) + + if isinstance(feature, Format): + self.format.add(feature.value) + elif isinstance(feature, OS): + self.os.add(feature.value) + elif isinstance(feature, Arch): + self.arch.add(feature.value) + # TODO(mr): assert supported file formats, arches # and gradually relax restrictions as they're tested. # https://github.com/mandiant/capa/issues/1755 @@ -82,7 +96,7 @@ def get_functions(self) -> Iterator[FunctionHandle]: yield FunctionHandle( AbsoluteVirtualAddress(flow_graph_address), - inner=FunctionContext(self.ctx, flow_graph_index), + inner=FunctionContext(self.ctx, flow_graph_index, self.format, self.os, self.arch), ) def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 970306756..a8b84a930 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -12,7 +12,7 @@ import capa.features.extractors.strings import capa.features.extractors.binexport2.helpers from capa.features.insn import API, Number, Mnemonic, OperandNumber -from capa.features.common import Bytes, String, Feature, Characteristic +from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64, Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors.binexport2 import ( AddressSpace, @@ -34,15 +34,61 @@ SECURITY_COOKIE_BYTES_DELTA: int = 0x40 -def get_operand_expression_register(op_index: int, be2: BinExport2) -> Optional[str]: - op: BinExport2.Operand = be2.operand[op_index] +HAS_ARCH32 = {ARCH_I386} +HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64} + +HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64} +HAS_ARCH_ARM = {ARCH_AARCH64} + + +def get_operand_expression_register(op_index: int, fhi: FunctionContext) -> Optional[str]: + op: BinExport2.Operand = fhi.ctx.be2.operand[op_index] if len(op.expression_index) == 1: - exp: BinExport2.Expression = be2.expression[op.expression_index[0]] + exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] if exp.type == BinExport2.Expression.Type.REGISTER: return exp.symbol.lower() return None +def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Optional[int]: + op: BinExport2.Operand = fhi.ctx.be2.operand[op_index] + immediate: Optional[int] = None + + if len(op.expression_index) == 1: + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + + exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] + if BinExport2.Expression.Type.IMMEDIATE_INT == exp.type: + immediate = exp.immediate + + elif len(op.expression_index) == 2: + # from IDA, which provides a size hint for every operand, + # we get the following pattern for immediate constants: + # + # - type: SIZE_PREFIX + # symbol: "b8" + # - type: IMMEDIATE_INT + # immediate: 20588728364 + # parent_index: 0 + + exp0: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]] + exp1: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[1]] + + if BinExport2.Expression.Type.SIZE_PREFIX == exp0.type: + if BinExport2.Expression.Type.IMMEDIATE_INT == exp1.type: + immediate = exp1.immediate + + if immediate is not None: + if fhi.arch & HAS_ARCH64: + immediate &= 0xFFFFFFFFFFFFFFFF + elif fhi.arch & HAS_ARCH32: + immediate &= 0xFFFFFFFF + + return immediate + + def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: fhi: FunctionContext = fh.inner ii: InstructionContext = ih.inner @@ -114,65 +160,25 @@ def extract_insn_number_features( # .text:0040116e leave return - # x86 / amd64 - mnemonic = be2.mnemonic[instruction.mnemonic_index] - if mnemonic.name.lower().startswith("ret"): - # skip things like: - # .text:0042250E retn 8 - return + if fhi.arch & HAS_ARCH_INTEL: + mnemonic = be2.mnemonic[instruction.mnemonic_index] + if mnemonic.name.lower().startswith("ret"): + # skip things like: + # .text:0042250E retn 8 + return - register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], be2) - if register is not None: - # x86 / amd64 + if fhi.arch & HAS_ARCH_INTEL: if mnemonic.name.lower().startswith(("add", "sub")): - if register.endswith(("sp", "bp")): - return + register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], fhi) + if register is not None: + if register.endswith(("sp", "bp")): + return for i, operand_index in enumerate(instruction.operand_index): - operand = be2.operand[operand_index] - - if len(operand.expression_index) == 1: - # - type: IMMEDIATE_INT - # immediate: 20588728364 - # parent_index: 0 - - expression0 = be2.expression[operand.expression_index[0]] - - if BinExport2.Expression.Type.IMMEDIATE_INT != expression0.type: - continue - - value = expression0.immediate - - # handling continues below at label: has a value - - elif len(operand.expression_index) == 2: - # from IDA, which provides a size hint for every operand, - # we get the following pattern for immediate constants: - # - # - type: SIZE_PREFIX - # symbol: "b8" - # - type: IMMEDIATE_INT - # immediate: 20588728364 - # parent_index: 0 - - expression0 = be2.expression[operand.expression_index[0]] - expression1 = be2.expression[operand.expression_index[1]] - - if BinExport2.Expression.Type.SIZE_PREFIX != expression0.type: - continue - - if BinExport2.Expression.Type.IMMEDIATE_INT != expression1.type: - continue - - value = expression1.immediate - - # handling continues below at label: has a value - - else: + value: Optional[int] = get_operand_expression_immediate(operand_index, fhi) + if value is None: continue - # label: has a value - if analysis.base_address == 0x0: # When the image is mapped at 0x0, # then its hard to tell if numbers are pointers or numbers. diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 0d1fea392..4180baaf2 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -174,9 +174,9 @@ ( "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8", - capa.features.insn.Number(-1), + capa.features.insn.Number(0xFFFFFFFFFFFFFFFF), True, - ), # TODO(mr): this should be unsigned / use two's complement, https://github.com/mandiant/capa/issues/1755 + ), ( "687e79.ghidra.be2", "function=0x1057f8,bb=0x1057f8",