Skip to content

Commit

Permalink
binexport: code refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
mike-hunhoff committed Aug 6, 2024
1 parent 21d2b99 commit 210f127
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 61 deletions.
59 changes: 48 additions & 11 deletions capa/features/extractors/binexport2/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@


@dataclass
class ExpressionPhraseInfo:
class OperandPhraseInfo:
scale: Optional[BinExport2.Expression] = None
index: Optional[BinExport2.Expression] = None
base: Optional[BinExport2.Expression] = None
Expand All @@ -23,17 +23,18 @@ def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGr
return vertex.HasField("type") and vertex.type == type_


def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[ExpressionPhraseInfo]:
def get_operand_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[OperandPhraseInfo]:
# assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory):
#
# Scale: A 2-bit constant factor
# Index: Any general purpose register
# Base: Any general purpose register
# Displacement: An integral offset

# skip first expression, assume BinExport2.Expression.DEREFERENCE
expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand)

# skip expressions up to and including BinExport2.Expression.DEREFERENCE; assume the caller
# has already checked that the operand contains a BinExport2.Expression.DEREFERENCE
for i, expression in enumerate(expressions):
if expression.type == BinExport2.Expression.DEREFERENCE:
expressions = expressions[i + 1 :]
Expand All @@ -55,10 +56,10 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper

if expression0.type == BinExport2.Expression.IMMEDIATE_INT:
# Displacement
return ExpressionPhraseInfo(displacement=expression0)
return OperandPhraseInfo(displacement=expression0)
elif expression0.type == BinExport2.Expression.REGISTER:
# Base
return ExpressionPhraseInfo(base=expression0)
return OperandPhraseInfo(base=expression0)

elif len(expressions) == 3:
expression0 = expressions[0]
Expand All @@ -74,10 +75,10 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper

if expression2.type == BinExport2.Expression.REGISTER:
# Base + Index
return ExpressionPhraseInfo(base=expression0, index=expression2)
return OperandPhraseInfo(base=expression0, index=expression2)
elif expression2.type == BinExport2.Expression.IMMEDIATE_INT:
# Base + Displacement
return ExpressionPhraseInfo(base=expression0, displacement=expression2)
return OperandPhraseInfo(base=expression0, displacement=expression2)

elif len(expressions) == 5:
expression0 = expressions[0]
Expand All @@ -97,13 +98,13 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper

if expression1.symbol == "+" and expression3.symbol == "+":
# Base + Index + Displacement
return ExpressionPhraseInfo(base=expression0, index=expression2, displacement=expression4)
return OperandPhraseInfo(base=expression0, index=expression2, displacement=expression4)
elif expression1.symbol == "+" and expression3.symbol == "*":
# Base + (Index * Scale)
return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression3)
return OperandPhraseInfo(base=expression0, index=expression2, scale=expression3)
elif expression1.symbol == "*" and expression3.symbol == "+":
# (Index * Scale) + Displacement
return ExpressionPhraseInfo(index=expression0, scale=expression2, displacement=expression3)
return OperandPhraseInfo(index=expression0, scale=expression2, displacement=expression3)
else:
raise NotImplementedError(expression1.symbol, expression3.symbol)

Expand All @@ -125,7 +126,7 @@ def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Oper
assert expression6.type == BinExport2.Expression.IMMEDIATE_INT

# Base + (Index * Scale) + Displacement
return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6)
return OperandPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6)

else:
raise NotImplementedError(len(expressions))
Expand Down Expand Up @@ -275,3 +276,39 @@ def get_operand_expressions(be2: BinExport2, op: BinExport2.Operand) -> List[Bin
_get_operand_expression_list(be2, op, exp_tree, 0, exp_list)

return exp_list


def get_operand_register_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]:
    """Return the operand's single expression if it is a register.

    Only operands composed of exactly one expression are considered.

    Args:
        be2: container whose `expression` table is indexed by the operand.
        operand: operand whose `expression_index` entries are inspected.

    Returns:
        The expression when the operand has exactly one expression and its
        type is REGISTER; otherwise None.
    """
    indices = operand.expression_index
    if len(indices) != 1:
        return None

    candidate: BinExport2.Expression = be2.expression[indices[0]]
    return candidate if candidate.type == BinExport2.Expression.REGISTER else None


def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]:
    """Return the immediate-integer expression of an operand, if any.

    Two operand shapes are recognized:

    * a single IMMEDIATE_INT expression
    * a SIZE_PREFIX expression followed by an IMMEDIATE_INT expression

    Args:
        be2: container whose `expression` table is indexed by the operand.
        operand: operand whose `expression_index` entries are inspected.

    Returns:
        The IMMEDIATE_INT expression when the operand matches one of the
        shapes above; otherwise None.
    """
    indices = operand.expression_index
    count = len(indices)

    if count == 1:
        # - type: IMMEDIATE_INT
        #   immediate: 20588728364
        #   parent_index: 0
        only: BinExport2.Expression = be2.expression[indices[0]]
        if only.type == BinExport2.Expression.IMMEDIATE_INT:
            return only
        return None

    if count == 2:
        # from IDA, which provides a size hint for every operand,
        # we get the following pattern for immediate constants:
        #
        # - type: SIZE_PREFIX
        #   symbol: "b8"
        # - type: IMMEDIATE_INT
        #   immediate: 20588728364
        #   parent_index: 0
        prefix: BinExport2.Expression = be2.expression[indices[0]]
        value: BinExport2.Expression = be2.expression[indices[1]]

        is_sized_immediate = (
            prefix.type == BinExport2.Expression.SIZE_PREFIX
            and value.type == BinExport2.Expression.IMMEDIATE_INT
        )
        if is_sized_immediate:
            return value

    return None
67 changes: 17 additions & 50 deletions capa/features/extractors/binexport2/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@
InstructionContext,
)
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
from capa.features.extractors.binexport2.helpers import ExpressionPhraseInfo
from capa.features.extractors.binexport2.helpers import (
OperandPhraseInfo,
get_operand_phrase_info,
get_operand_register_expression,
get_operand_immediate_expression,
)
from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2

logger = logging.getLogger(__name__)
Expand All @@ -50,43 +55,6 @@ def mask_immediate(fhi: FunctionContext, immediate: int) -> int:
return immediate


def get_operand_register(op_index: int, fhi: FunctionContext) -> Optional[str]:
    """Return the lower-cased register name of a single-expression operand.

    Args:
        op_index: index into the `operand` table of `fhi.ctx.be2`.
        fhi: function context that provides `ctx.be2`.

    Returns:
        The expression's symbol, lower-cased, when the operand consists of
        exactly one expression of type REGISTER; otherwise None.
    """
    be2 = fhi.ctx.be2
    operand: BinExport2.Operand = be2.operand[op_index]

    if len(operand.expression_index) != 1:
        return None

    expression: BinExport2.Expression = be2.expression[operand.expression_index[0]]
    if expression.type != BinExport2.Expression.Type.REGISTER:
        return None

    return expression.symbol.lower()


def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]:
    """Find the IMMEDIATE_INT expression carried by an operand.

    Args:
        be2: container whose `expression` table is indexed by the operand.
        operand: operand whose `expression_index` entries are inspected.

    Returns:
        The IMMEDIATE_INT expression when the operand is either a lone
        IMMEDIATE_INT or a SIZE_PREFIX followed by an IMMEDIATE_INT;
        otherwise None.
    """
    expression_indices = operand.expression_index

    if len(expression_indices) == 1:
        # - type: IMMEDIATE_INT
        #   immediate: 20588728364
        #   parent_index: 0
        lone: BinExport2.Expression = be2.expression[expression_indices[0]]
        return lone if lone.type == BinExport2.Expression.IMMEDIATE_INT else None

    if len(expression_indices) != 2:
        return None

    # from IDA, which provides a size hint for every operand,
    # we get the following pattern for immediate constants:
    #
    # - type: SIZE_PREFIX
    #   symbol: "b8"
    # - type: IMMEDIATE_INT
    #   immediate: 20588728364
    #   parent_index: 0
    first: BinExport2.Expression = be2.expression[expression_indices[0]]
    second: BinExport2.Expression = be2.expression[expression_indices[1]]

    if first.type != BinExport2.Expression.SIZE_PREFIX:
        return None
    if second.type != BinExport2.Expression.IMMEDIATE_INT:
        return None
    return second


def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
fhi: FunctionContext = fh.inner
ii: InstructionContext = ih.inner
Expand Down Expand Up @@ -167,21 +135,22 @@ def extract_insn_number_features(
return

if mnemonic.startswith(("add", "sub")):
register: Optional[str] = get_operand_register(instruction.operand_index[0], fhi)
if register is not None:
if register.endswith(("sp", "bp")):
# skip things like:
# 0x415bbc ADD ESP, 0xC
return
register_expression: Optional[BinExport2.Expression] = get_operand_register_expression(
be2, be2.operand[instruction.operand_index[0]]
)
if register_expression and register_expression.symbol.lower().endswith(("sp", "bp")):
# skip things like:
# 0x415bbc ADD ESP, 0xC
return

for i, operand_index in enumerate(instruction.operand_index):
operand: BinExport2.Operand = be2.operand[operand_index]

expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand)
if expression is None:
immediate_expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand)
if not immediate_expression:
continue

value: int = mask_immediate(fhi, expression.immediate)
value: int = mask_immediate(fhi, immediate_expression.immediate)
if is_address_mapped(be2, value):
continue

Expand Down Expand Up @@ -302,9 +271,7 @@ def extract_insn_offset_features(
continue

if fhi.arch & HAS_ARCH_INTEL:
phrase_info: Optional[ExpressionPhraseInfo] = (
capa.features.extractors.binexport2.helpers.get_operand_expression_phrase_info(be2, operand)
)
phrase_info: Optional[OperandPhraseInfo] = get_operand_phrase_info(be2, operand)
if not phrase_info:
continue

Expand Down

0 comments on commit 210f127

Please sign in to comment.