Skip to content

Commit

Permalink
binexport: use masking for Number features
Browse files Browse the repository at this point in the history
  • Loading branch information
mike-hunhoff committed Aug 2, 2024
1 parent 227fdeb commit 446a500
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 63 deletions.
5 changes: 4 additions & 1 deletion capa/features/extractors/binexport2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import hashlib
import logging
import contextlib
from typing import Dict, List, Tuple, Iterator
from typing import Set, Dict, List, Tuple, Iterator
from pathlib import Path
from collections import defaultdict
from dataclasses import dataclass
Expand Down Expand Up @@ -390,6 +390,9 @@ class AnalysisContext:
class FunctionContext:
# Per-function state handed to the feature extractors as the `inner` value of a FunctionHandle.
# the extractor passes the same analysis context object to every function it yields
ctx: AnalysisContext
# presumably the index of this function's flow graph within the BinExport2 data - TODO confirm
flow_graph_index: int
# values of the file-level Format features, collected once by the extractor
format: Set[str]
# values of the file-level OS features, collected once by the extractor
os: Set[str]
# values of the file-level Arch features; intersected with the HAS_ARCH* sets to pick arch-specific handling
arch: Set[str]


@dataclass
Expand Down
20 changes: 17 additions & 3 deletions capa/features/extractors/binexport2/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import List, Tuple, Iterator
from typing import Set, List, Tuple, Iterator

import capa.features.extractors.elf
import capa.features.extractors.common
Expand All @@ -15,7 +15,7 @@
import capa.features.extractors.binexport2.helpers
import capa.features.extractors.binexport2.function
import capa.features.extractors.binexport2.basicblock
from capa.features.common import Feature
from capa.features.common import OS, Arch, Format, Feature
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.binexport2 import (
AddressSpace,
Expand Down Expand Up @@ -53,6 +53,20 @@ def __init__(self, be2: BinExport2, buf: bytes):
self.global_features.extend(list(capa.features.extractors.common.extract_os(self.buf)))
self.global_features.extend(list(capa.features.extractors.common.extract_arch(self.buf)))

self.format: Set[str] = set()
self.os: Set[str] = set()
self.arch: Set[str] = set()

for feature, _ in self.global_features:
assert isinstance(feature.value, str)

if isinstance(feature, Format):
self.format.add(feature.value)
elif isinstance(feature, OS):
self.os.add(feature.value)
elif isinstance(feature, Arch):
self.arch.add(feature.value)

# TODO(mr): assert supported file formats, arches
# and gradually relax restrictions as they're tested.
# https://github.com/mandiant/capa/issues/1755
Expand Down Expand Up @@ -82,7 +96,7 @@ def get_functions(self) -> Iterator[FunctionHandle]:

yield FunctionHandle(
AbsoluteVirtualAddress(flow_graph_address),
inner=FunctionContext(self.ctx, flow_graph_index),
inner=FunctionContext(self.ctx, flow_graph_index, self.format, self.os, self.arch),
)

def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
Expand Down
120 changes: 63 additions & 57 deletions capa/features/extractors/binexport2/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import capa.features.extractors.strings
import capa.features.extractors.binexport2.helpers
from capa.features.insn import API, Number, Mnemonic, OperandNumber
from capa.features.common import Bytes, String, Feature, Characteristic
from capa.features.common import ARCH_I386, ARCH_AMD64, ARCH_AARCH64, Bytes, String, Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.binexport2 import (
AddressSpace,
Expand All @@ -34,15 +34,61 @@
SECURITY_COOKIE_BYTES_DELTA: int = 0x40


def get_operand_expression_register(op_index: int, be2: BinExport2) -> Optional[str]:
op: BinExport2.Operand = be2.operand[op_index]
# Architecture groupings. Each set is intersected with a function's `arch` set
# (e.g. `fhi.arch & HAS_ARCH64`) to decide which arch-specific handling applies.

# architectures whose immediate operands are masked to 32 bits
HAS_ARCH32 = {ARCH_I386}
# architectures whose immediate operands are masked to 64 bits
HAS_ARCH64 = {ARCH_AARCH64, ARCH_AMD64}

# x86 / amd64: gates the Intel-only number filters (`ret*` operands, `add`/`sub` on sp/bp registers)
HAS_ARCH_INTEL = {ARCH_I386, ARCH_AMD64}
# aarch64 only
HAS_ARCH_ARM = {ARCH_AARCH64}


def get_operand_expression_register(op_index: int, fhi: FunctionContext) -> Optional[str]:
op: BinExport2.Operand = fhi.ctx.be2.operand[op_index]
if len(op.expression_index) == 1:
exp: BinExport2.Expression = be2.expression[op.expression_index[0]]
exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]]
if exp.type == BinExport2.Expression.Type.REGISTER:
return exp.symbol.lower()
return None


def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Optional[int]:
    """Return the immediate constant carried by an operand, masked to the architecture width.

    Two operand shapes are recognized:
      - a single IMMEDIATE_INT expression, and
      - a SIZE_PREFIX expression followed by an IMMEDIATE_INT expression.

    Args:
        op_index: index of the operand within the BinExport2 operand table.
        fhi: function context providing the BinExport2 data and the set of architectures.

    Returns:
        The immediate value, ANDed with a 64-bit mask when `fhi.arch` intersects
        HAS_ARCH64, else with a 32-bit mask when it intersects HAS_ARCH32, else
        unmodified. None when the operand matches neither shape.
    """
    expression_indices = fhi.ctx.be2.operand[op_index].expression_index
    value: Optional[int] = None

    count = len(expression_indices)
    if count == 1:
        # - type: IMMEDIATE_INT
        #   immediate: 20588728364
        #   parent_index: 0
        only: BinExport2.Expression = fhi.ctx.be2.expression[expression_indices[0]]
        if only.type == BinExport2.Expression.Type.IMMEDIATE_INT:
            value = only.immediate

    elif count == 2:
        # from IDA, which provides a size hint for every operand,
        # we get the following pattern for immediate constants:
        #
        # - type: SIZE_PREFIX
        #   symbol: "b8"
        # - type: IMMEDIATE_INT
        #   immediate: 20588728364
        #   parent_index: 0
        prefix: BinExport2.Expression = fhi.ctx.be2.expression[expression_indices[0]]
        candidate: BinExport2.Expression = fhi.ctx.be2.expression[expression_indices[1]]

        is_size_prefix = prefix.type == BinExport2.Expression.Type.SIZE_PREFIX
        if is_size_prefix and candidate.type == BinExport2.Expression.Type.IMMEDIATE_INT:
            value = candidate.immediate

    if value is None:
        return None

    # masking folds negative Python ints into their unsigned two's complement form;
    # the 64-bit check comes first, matching the original precedence
    if fhi.arch & HAS_ARCH64:
        return value & 0xFFFFFFFFFFFFFFFF
    if fhi.arch & HAS_ARCH32:
        return value & 0xFFFFFFFF
    return value


def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
fhi: FunctionContext = fh.inner
ii: InstructionContext = ih.inner
Expand Down Expand Up @@ -114,65 +160,25 @@ def extract_insn_number_features(
# .text:0040116e leave
return

# x86 / amd64
mnemonic = be2.mnemonic[instruction.mnemonic_index]
if mnemonic.name.lower().startswith("ret"):
# skip things like:
# .text:0042250E retn 8
return
if fhi.arch & HAS_ARCH_INTEL:
mnemonic = be2.mnemonic[instruction.mnemonic_index]
if mnemonic.name.lower().startswith("ret"):
# skip things like:
# .text:0042250E retn 8
return

register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], be2)
if register is not None:
# x86 / amd64
if fhi.arch & HAS_ARCH_INTEL:
if mnemonic.name.lower().startswith(("add", "sub")):
if register.endswith(("sp", "bp")):
return
register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], fhi)
if register is not None:
if register.endswith(("sp", "bp")):
return

for i, operand_index in enumerate(instruction.operand_index):
operand = be2.operand[operand_index]

if len(operand.expression_index) == 1:
# - type: IMMEDIATE_INT
# immediate: 20588728364
# parent_index: 0

expression0 = be2.expression[operand.expression_index[0]]

if BinExport2.Expression.Type.IMMEDIATE_INT != expression0.type:
continue

value = expression0.immediate

# handling continues below at label: has a value

elif len(operand.expression_index) == 2:
# from IDA, which provides a size hint for every operand,
# we get the following pattern for immediate constants:
#
# - type: SIZE_PREFIX
# symbol: "b8"
# - type: IMMEDIATE_INT
# immediate: 20588728364
# parent_index: 0

expression0 = be2.expression[operand.expression_index[0]]
expression1 = be2.expression[operand.expression_index[1]]

if BinExport2.Expression.Type.SIZE_PREFIX != expression0.type:
continue

if BinExport2.Expression.Type.IMMEDIATE_INT != expression1.type:
continue

value = expression1.immediate

# handling continues below at label: has a value

else:
value: Optional[int] = get_operand_expression_immediate(operand_index, fhi)
if value is None:
continue

# label: has a value

if analysis.base_address == 0x0:
# When the image is mapped at 0x0,
# then it's hard to tell if numbers are pointers or numbers.
Expand Down
4 changes: 2 additions & 2 deletions tests/test_binexport_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,9 @@
(
"687e79.ghidra.be2",
"function=0x1057f8,bb=0x1057f8",
capa.features.insn.Number(-1),
capa.features.insn.Number(0xFFFFFFFFFFFFFFFF),
True,
), # TODO(mr): this should be unsigned / use two's complement, https://github.com/mandiant/capa/issues/1755
),
(
"687e79.ghidra.be2",
"function=0x1057f8,bb=0x1057f8",
Expand Down

0 comments on commit 446a500

Please sign in to comment.