diff --git a/capa/features/extractors/binexport2/helpers.py b/capa/features/extractors/binexport2/helpers.py
index d06b5f93d..708dc446a 100644
--- a/capa/features/extractors/binexport2/helpers.py
+++ b/capa/features/extractors/binexport2/helpers.py
@@ -5,8 +5,290 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+from typing import List, Optional
+from dataclasses import dataclass
+
 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
 
 
+@dataclass
+class ExpressionPhraseInfo:
+    """Parts of a memory operand of the form base + (index * scale) + displacement; absent parts are None."""
+
+    scale: Optional[BinExport2.Expression] = None
+    index: Optional[BinExport2.Expression] = None
+    base: Optional[BinExport2.Expression] = None
+    displacement: Optional[BinExport2.Expression] = None
+
+
 def is_vertex_type(vertex: BinExport2.CallGraph.Vertex, type_: BinExport2.CallGraph.Vertex.Type.ValueType) -> bool:
     return vertex.HasField("type") and vertex.type == type_
+
+
+def get_operand_expression_phrase_info(be2: BinExport2, operand: BinExport2.Operand) -> Optional[ExpressionPhraseInfo]:
+    """Split a memory operand into its scale, index, base, and displacement expressions.
+
+    If the operand contains a DEREFERENCE expression, everything up to and including the
+    first one is ignored. The remaining flattened expressions must number 1, 3, 5, or 7.
+
+    Raises:
+        NotImplementedError: for any other expression count, or an unhandled operator pair.
+    """
+    # assume the following (see https://blog.yossarian.net/2020/06/13/How-x86_64-addresses-memory):
+    #
+    # Scale: A 2-bit constant factor
+    # Index: Any general purpose register
+    # Base: Any general purpose register
+    # Displacement: An integral offset
+
+    # skip first expression, assume BinExport2.Expression.DEREFERENCE
+    expressions: List[BinExport2.Expression] = get_operand_expressions(be2, operand)
+
+    for i, expression in enumerate(expressions):
+        if expression.type == BinExport2.Expression.DEREFERENCE:
+            expressions = expressions[i + 1 :]
+            break
+
+    expression0: BinExport2.Expression
+    expression1: BinExport2.Expression
+    expression2: BinExport2.Expression
+    expression3: BinExport2.Expression
+    expression4: BinExport2.Expression
+
+    if len(expressions) == 1:
+        expression0 = expressions[0]
+
+        assert (
+            expression0.type == BinExport2.Expression.IMMEDIATE_INT
+            or expression0.type == BinExport2.Expression.REGISTER
+        )
+
+        if expression0.type == BinExport2.Expression.IMMEDIATE_INT:
+            # Displacement
+            return ExpressionPhraseInfo(displacement=expression0)
+        elif expression0.type == BinExport2.Expression.REGISTER:
+            # Base
+            return ExpressionPhraseInfo(base=expression0)
+
+    elif len(expressions) == 3:
+        expression0 = expressions[0]
+        expression1 = expressions[1]
+        expression2 = expressions[2]
+
+        assert expression0.type == BinExport2.Expression.REGISTER
+        assert expression1.type == BinExport2.Expression.OPERATOR
+        assert (
+            expression2.type == BinExport2.Expression.IMMEDIATE_INT
+            or expression2.type == BinExport2.Expression.REGISTER
+        )
+
+        if expression2.type == BinExport2.Expression.REGISTER:
+            # Base + Index
+            return ExpressionPhraseInfo(base=expression0, index=expression2)
+        elif expression2.type == BinExport2.Expression.IMMEDIATE_INT:
+            # Base + Displacement
+            return ExpressionPhraseInfo(base=expression0, displacement=expression2)
+
+    elif len(expressions) == 5:
+        expression0 = expressions[0]
+        expression1 = expressions[1]
+        expression2 = expressions[2]
+        expression3 = expressions[3]
+        expression4 = expressions[4]
+
+        assert expression0.type == BinExport2.Expression.REGISTER
+        assert expression1.type == BinExport2.Expression.OPERATOR
+        assert (
+            expression2.type == BinExport2.Expression.REGISTER
+            or expression2.type == BinExport2.Expression.IMMEDIATE_INT
+        )
+        assert expression3.type == BinExport2.Expression.OPERATOR
+        assert expression4.type == BinExport2.Expression.IMMEDIATE_INT
+
+        # expression3 is only the operator; the trailing immediate (scale or displacement) is expression4.
+        if expression1.symbol == "+" and expression3.symbol == "+":
+            # Base + Index + Displacement
+            return ExpressionPhraseInfo(base=expression0, index=expression2, displacement=expression4)
+        elif expression1.symbol == "+" and expression3.symbol == "*":
+            # Base + (Index * Scale)
+            return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression4)
+        elif expression1.symbol == "*" and expression3.symbol == "+":
+            # (Index * Scale) + Displacement
+            return ExpressionPhraseInfo(index=expression0, scale=expression2, displacement=expression4)
+        else:
+            raise NotImplementedError(expression1.symbol, expression3.symbol)
+
+    elif len(expressions) == 7:
+        expression0 = expressions[0]
+        expression1 = expressions[1]
+        expression2 = expressions[2]
+        expression3 = expressions[3]
+        expression4 = expressions[4]
+        expression5 = expressions[5]
+        expression6 = expressions[6]
+
+        assert expression0.type == BinExport2.Expression.REGISTER
+        assert expression1.type == BinExport2.Expression.OPERATOR
+        assert expression2.type == BinExport2.Expression.REGISTER
+        assert expression3.type == BinExport2.Expression.OPERATOR
+        assert expression4.type == BinExport2.Expression.IMMEDIATE_INT
+        assert expression5.type == BinExport2.Expression.OPERATOR
+        assert expression6.type == BinExport2.Expression.IMMEDIATE_INT
+
+        # Base + (Index * Scale) + Displacement
+        return ExpressionPhraseInfo(base=expression0, index=expression2, scale=expression4, displacement=expression6)
+
+    else:
+        raise NotImplementedError(len(expressions))
+
+    return None
+
+
+def _get_operand_expression_list(
+    be2: BinExport2,
+    operand: BinExport2.Operand,
+    expression_tree: List[List[int]],
+    tree_index: int,
+    expression_list: List[BinExport2.Expression],
+):
+    """Append the expressions in the subtree at tree_index to expression_list, depth first.
+
+    Operators with two or three children are emitted between their operands;
+    SIZE_PREFIX expressions are skipped and only their child is visited.
+    """
+    exp_index = operand.expression_index[tree_index]
+    expression = be2.expression[exp_index]
+    children_tree_indexes: List[int] = expression_tree[tree_index]
+
+    if expression.type == BinExport2.Expression.REGISTER:
+        expression_list.append(expression)
+        assert len(children_tree_indexes) == 0
+        return
+
+    elif expression.type == BinExport2.Expression.SYMBOL:
+        expression_list.append(expression)
+        assert len(children_tree_indexes) <= 1
+
+        if len(children_tree_indexes) == 0:
+            return
+        elif len(children_tree_indexes) == 1:
+            # like: v
+            # from: mov v0.D[0x1], x9
+            #           |
+            #           0
+            #           .
+            #           |
+            #           D
+            child_index = children_tree_indexes[0]
+            _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list)
+            return
+        else:
+            raise NotImplementedError(len(children_tree_indexes))
+
+    elif expression.type == BinExport2.Expression.IMMEDIATE_INT:
+        expression_list.append(expression)
+        assert len(children_tree_indexes) == 0
+        return
+
+    elif expression.type == BinExport2.Expression.SIZE_PREFIX:
+        # like: b4
+        #
+        # We might want to use this occasionally, such as to disambiguate the
+        # size of MOVs into/out of memory. But I'm not sure when/where we need that yet.
+        #
+        # IDA spams this size prefix hint *everywhere*, so we can't rely on the exporter
+        # to provide it only when necessary.
+        assert len(children_tree_indexes) == 1
+        child_index = children_tree_indexes[0]
+        _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list)
+        return
+
+    elif expression.type == BinExport2.Expression.OPERATOR:
+
+        if len(children_tree_indexes) == 1:
+            # prefix operator, like "ds:"
+            expression_list.append(expression)
+            child_index = children_tree_indexes[0]
+            _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list)
+            return
+
+        elif len(children_tree_indexes) == 2:
+            # infix operator: like "+" in "ebp+10"
+            child_a = children_tree_indexes[0]
+            child_b = children_tree_indexes[1]
+            _get_operand_expression_list(be2, operand, expression_tree, child_a, expression_list)
+            expression_list.append(expression)
+            _get_operand_expression_list(be2, operand, expression_tree, child_b, expression_list)
+            return
+
+        elif len(children_tree_indexes) == 3:
+            # infix operator: like "+" in "ebp+ecx+10"
+            child_a = children_tree_indexes[0]
+            child_b = children_tree_indexes[1]
+            child_c = children_tree_indexes[2]
+            _get_operand_expression_list(be2, operand, expression_tree, child_a, expression_list)
+            expression_list.append(expression)
+            _get_operand_expression_list(be2, operand, expression_tree, child_b, expression_list)
+            expression_list.append(expression)
+            _get_operand_expression_list(be2, operand, expression_tree, child_c, expression_list)
+            return
+
+        else:
+            raise NotImplementedError(len(children_tree_indexes))
+
+    elif expression.type == BinExport2.Expression.DEREFERENCE:
+        expression_list.append(expression)
+
+        assert len(children_tree_indexes) == 1
+        child_index = children_tree_indexes[0]
+        _get_operand_expression_list(be2, operand, expression_tree, child_index, expression_list)
+        return
+
+    elif expression.type == BinExport2.Expression.IMMEDIATE_FLOAT:
+        raise NotImplementedError(expression.type)
+
+    else:
+        raise NotImplementedError(expression.type)
+
+
+def get_operand_expressions(be2: BinExport2, op: BinExport2.Operand) -> List[BinExport2.Expression]:
+    """Return the operand's expressions as a flat list, produced by walking its expression tree from the root."""
+    # The reconstructed expression tree layout, linking parent nodes to their children.
+    #
+    # There is one list of integers for each expression in the operand.
+    # These integers are indexes of other expressions in the same operand,
+    # which are the children of that expression.
+    #
+    # So:
+    #
+    #     [ [1, 3], [2], [], [4], [5], []]
+    #
+    # means the first expression has two children, at index 1 and 3,
+    # and the tree looks like:
+    #
+    #        0
+    #       / \
+    #      1   3
+    #      |   |
+    #      2   4
+    #          |
+    #          5
+    #
+    # Remember, these are the indices into the entries in operand.expression_index.
+    exp_tree: List[List[int]] = []
+    for i, exp_index in enumerate(op.expression_index):
+        children = []
+
+        # scan all subsequent expressions, looking for those that have parent_index == current.expression_index
+        for j, candidate_index in enumerate(op.expression_index[i + 1 :]):
+            candidate = be2.expression[candidate_index]
+
+            if candidate.parent_index == exp_index:
+                children.append(i + j + 1)
+
+        exp_tree.append(children)
+
+    exp_list: List[BinExport2.Expression] = []
+    _get_operand_expression_list(be2, op, exp_tree, 0, exp_list)
+
+    return exp_list
diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py
index 8fbfa5705..6bf15700d 100644
--- a/capa/features/extractors/binexport2/insn.py
+++ b/capa/features/extractors/binexport2/insn.py
@@ -25,6 +25,7 @@
     InstructionContext,
 )
 from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
+from capa.features.extractors.binexport2.helpers import ExpressionPhraseInfo
 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
 
 logger = logging.getLogger(__name__)
@@ -41,7 +42,16 @@
 HAS_ARCH_ARM = {ARCH_AARCH64}
 
 
-def get_operand_expression_register(op_index: int, fhi: FunctionContext) -> Optional[str]:
+def mask_immediate(fhi: FunctionContext, immediate: int) -> int:
+    """Mask immediate to 64 or 32 bits according to the function's architecture; otherwise return it as-is."""
+    if fhi.arch & HAS_ARCH64:
+        immediate &= 0xFFFFFFFFFFFFFFFF
+    elif fhi.arch & HAS_ARCH32:
+        immediate &= 0xFFFFFFFF
+    return immediate
+
+
+def get_operand_register(op_index: int, fhi: FunctionContext) -> Optional[str]:
     op: BinExport2.Operand = fhi.ctx.be2.operand[op_index]
     if len(op.expression_index) == 1:
         exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]]
@@ -50,20 +60,17 @@ def get_operand_expression_register(op_index: int, fhi: FunctionContext) -> Opti
     return None
 
 
-def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Optional[int]:
-    op: BinExport2.Operand = fhi.ctx.be2.operand[op_index]
-    immediate: Optional[int] = None
-
-    if len(op.expression_index) == 1:
+def get_operand_immediate_expression(be2: BinExport2, operand: BinExport2.Operand) -> Optional[BinExport2.Expression]:
+    """Return the operand's IMMEDIATE_INT expression (optionally wrapped in a SIZE_PREFIX), or None."""
+    if len(operand.expression_index) == 1:
         # - type: IMMEDIATE_INT
         #   immediate: 20588728364
         #   parent_index: 0
+        expression: BinExport2.Expression = be2.expression[operand.expression_index[0]]
+        if expression.type == BinExport2.Expression.IMMEDIATE_INT:
+            return expression
 
-        exp: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]]
-        if BinExport2.Expression.Type.IMMEDIATE_INT == exp.type:
-            immediate = exp.immediate
-
-    elif len(op.expression_index) == 2:
+    elif len(operand.expression_index) == 2:
         # from IDA, which provides a size hint for every operand,
         # we get the following pattern for immediate constants:
         #
@@ -72,29 +79,14 @@ def get_operand_expression_immediate(op_index: int, fhi: FunctionContext) -> Opt
         # - type: IMMEDIATE_INT
         #   immediate: 20588728364
         #   parent_index: 0
+        expression0: BinExport2.Expression = be2.expression[operand.expression_index[0]]
+        expression1: BinExport2.Expression = be2.expression[operand.expression_index[1]]
 
-        exp0: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[0]]
-        exp1: BinExport2.Expression = fhi.ctx.be2.expression[op.expression_index[1]]
+        if expression0.type == BinExport2.Expression.SIZE_PREFIX:
+            if expression1.type == BinExport2.Expression.IMMEDIATE_INT:
+                return expression1
 
-        if BinExport2.Expression.Type.SIZE_PREFIX == exp0.type:
-            if BinExport2.Expression.Type.IMMEDIATE_INT == exp1.type:
-                immediate = exp1.immediate
-
-    if immediate is not None:
-        if fhi.arch & HAS_ARCH64:
-            immediate &= 0xFFFFFFFFFFFFFFFF
-        elif fhi.arch & HAS_ARCH32:
-            immediate &= 0xFFFFFFFF
-
-    return immediate
-
-
-def get_immediate_twos_complement(immediate: int, fhi: FunctionContext) -> int:
-    if fhi.arch & HAS_ARCH64:
-        return capa.features.extractors.helpers.twos_complement(immediate, 64)
-    elif fhi.arch & HAS_ARCH32:
-        return capa.features.extractors.helpers.twos_complement(immediate, 32)
-    return immediate
+    return None
 
 
 def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
@@ -158,7 +150,6 @@ def extract_insn_number_features(
     ii: InstructionContext = ih.inner
 
     be2: BinExport2 = fhi.ctx.be2
-    analysis: BinExport2Analysis = fhi.ctx.analysis
 
     instruction_index: int = ii.instruction_index
     instruction: BinExport2.Instruction = be2.instruction[instruction_index]
@@ -168,17 +159,17 @@ def extract_insn_number_features(
         #   .text:0040116e    leave
         return
 
-    mnemonic = be2.mnemonic[instruction.mnemonic_index]
+    mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower()
 
     if fhi.arch & HAS_ARCH_INTEL:
         # short-circut checks for intel architecture
-        if mnemonic.name.lower().startswith("ret"):
+        if mnemonic.startswith("ret"):
             # skip things like:
             #   .text:0042250E retn    8
             return
 
-        if mnemonic.name.lower().startswith(("add", "sub")):
-            register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], fhi)
+        if mnemonic.startswith(("add", "sub")):
+            register: Optional[str] = get_operand_register(instruction.operand_index[0], fhi)
             if register is not None:
                 if register.endswith(("sp", "bp")):
                     # skip things like:
@@ -186,22 +177,13 @@ def extract_insn_number_features(
                     return
 
     for i, operand_index in enumerate(instruction.operand_index):
-        value: Optional[int] = get_operand_expression_immediate(operand_index, fhi)
-        if value is None:
-            continue
-
-        if analysis.base_address == 0x0:
-            # When the image is mapped at 0x0,
-            # then its hard to tell if numbers are pointers or numbers.
-            # TODO(mr): be a little less conservative otherwise?
-            # https://github.com/mandiant/capa/issues/1755
+        operand: BinExport2.Operand = be2.operand[operand_index]
 
-            # TODO(mr): this removes a lot of valid numbers, could check alignment and use additional heuristics
-            # https://github.com/mandiant/capa/issues/1755
-            # if is_address_mapped(be2, value):
-            #     continue
-            pass
+        expression: Optional[BinExport2.Expression] = get_operand_immediate_expression(be2, operand)
+        if expression is None:
+            continue
 
+        value: int = mask_immediate(fhi, expression.immediate)
         if is_address_mapped(be2, value):
             continue
 
@@ -209,7 +191,7 @@ def extract_insn_number_features(
         yield OperandNumber(i, value), ih.address
 
         if fhi.arch & HAS_ARCH_INTEL:
-            if mnemonic.name.lower().startswith("add"):
+            if mnemonic.startswith("add"):
                 if 0 < value < MAX_STRUCTURE_SIZE:
                     yield Offset(value), ih.address
                     yield OperandOffset(i, value), ih.address
@@ -296,9 +278,65 @@ def extract_insn_string_features(
 def extract_insn_offset_features(
     fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle
 ) -> Iterator[Tuple[Feature, Address]]:
-    # TODO(wb): complete
-    # https://github.com/mandiant/capa/issues/1755
-    yield from ()
+    """Yield Offset/OperandOffset (and, for a base-only `lea`, Number/OperandNumber) features for memory operands.
+
+    Only operands containing a DEREFERENCE expression are considered, and only on Intel architectures.
+    """
+    fhi: FunctionContext = fh.inner
+    ii: InstructionContext = ih.inner
+
+    be2: BinExport2 = fhi.ctx.be2
+    instruction: BinExport2.Instruction = be2.instruction[ii.instruction_index]
+
+    if len(instruction.operand_index) == 0:
+        # skip things like:
+        #   .text:0040116e    leave
+        return
+
+    mnemonic: str = be2.mnemonic[instruction.mnemonic_index].name.lower()
+
+    for i, operand_index in enumerate(instruction.operand_index):
+        operand: BinExport2.Operand = be2.operand[operand_index]
+
+        is_dereference = False
+        for expression_index in operand.expression_index:
+            if be2.expression[expression_index].type == BinExport2.Expression.DEREFERENCE:
+                is_dereference = True
+                break
+
+        if not is_dereference:
+            continue
+
+        if fhi.arch & HAS_ARCH_INTEL:
+            phrase_info: Optional[ExpressionPhraseInfo] = (
+                capa.features.extractors.binexport2.helpers.get_operand_expression_phrase_info(be2, operand)
+            )
+            if not phrase_info:
+                continue
+
+            if phrase_info.displacement:
+                if phrase_info.base and phrase_info.base.symbol.lower().endswith(("bp", "sp")):
+                    # skips things like:
+                    #   00401068 MOV dword ptr [EBP + local_8],EAX
+                    continue
+
+                value: int = mask_immediate(fhi, phrase_info.displacement.immediate)
+                if not is_address_mapped(be2, value):
+                    value = capa.features.extractors.helpers.twos_complement(value, 32)
+
+                    yield Offset(value), ih.address
+                    yield OperandOffset(i, value), ih.address
+
+                    if mnemonic == "lea" and i == 1:
+                        if phrase_info.base and not any((phrase_info.scale, phrase_info.index)):
+                            yield Number(value), ih.address
+                            yield OperandNumber(i, value), ih.address
+
+            elif phrase_info.base and not any((phrase_info.index, phrase_info.scale)):
+                # like:
+                #   00401062 MOVZX EAX,word ptr [EDI]
+                yield Offset(0), ih.address
+                yield OperandOffset(i, 0), ih.address
 
 
 def is_security_cookie(
diff --git a/scripts/inspect-binexport2.py b/scripts/inspect-binexport2.py
index 998741474..1a0abaffb 100644
--- a/scripts/inspect-binexport2.py
+++ b/scripts/inspect-binexport2.py
@@ -88,8 +88,23 @@ def _render_expression_tree(
 
     elif expression.type == BinExport2.Expression.SYMBOL:
         o.write(expression.symbol)
-        assert len(children_tree_indexes) == 0
-        return
+        assert len(children_tree_indexes) <= 1
+
+        if len(children_tree_indexes) == 0:
+            return
+        elif len(children_tree_indexes) == 1:
+            # like: v
+            # from: mov v0.D[0x1], x9
+            #           |
+            #           0
+            #           .
+            #           |
+            #           D
+            child_index = children_tree_indexes[0]
+            _render_expression_tree(be2, instruction, operand, expression_tree, child_index, o)
+            return
+        else:
+            raise NotImplementedError(len(children_tree_indexes))
 
     elif expression.type == BinExport2.Expression.IMMEDIATE_INT:
         o.write(f"0x{expression.immediate:X}")
diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py
index 4180baaf2..1cf8686f4 100644
--- a/tests/test_binexport_features.py
+++ b/tests/test_binexport_features.py
@@ -441,9 +441,6 @@ def test_binexport_features_pe_x86(sample, scope, feature, expected):
     if "mimikatz.exe_" not in sample.name:
         pytest.skip("for now only testing mimikatz.exe_ Ghidra BinExport file")
 
-    if isinstance(feature, (capa.features.insn.Offset, capa.features.insn.OperandOffset)):
-        pytest.xfail("Offset features not supported yet")
-
     sample = sample.parent / "binexport2" / (sample.name + ".ghidra.BinExport")
     assert sample.exists()
     fixtures.do_test_feature_presence(fixtures.get_binexport_extractor, sample, scope, feature, expected)