Skip to content

Commit

Permalink
binexport: remove Ghidra symbol madness and fix x86/amd64 stack offse…
Browse files Browse the repository at this point in the history
…t number tests
  • Loading branch information
mike-hunhoff committed Aug 2, 2024
1 parent f8b0f50 commit 227fdeb
Showing 1 changed file with 22 additions and 88 deletions.
110 changes: 22 additions & 88 deletions capa/features/extractors/binexport2/insn.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
from typing import List, Tuple, Iterator
from typing import List, Tuple, Iterator, Optional

import capa.features.extractors.helpers
import capa.features.extractors.strings
Expand Down Expand Up @@ -34,6 +34,15 @@
SECURITY_COOKIE_BYTES_DELTA: int = 0x40


def get_operand_expression_register(op_index: int, be2: BinExport2) -> Optional[str]:
op: BinExport2.Operand = be2.operand[op_index]
if len(op.expression_index) == 1:
exp: BinExport2.Expression = be2.expression[op.expression_index[0]]
if exp.type == BinExport2.Expression.Type.REGISTER:
return exp.symbol.lower()
return None


def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]:
fhi: FunctionContext = fh.inner
ii: InstructionContext = ih.inner
Expand Down Expand Up @@ -88,48 +97,6 @@ def is_address_mapped(be2: BinExport2, address: int) -> bool:
return any(section.address <= address < section.address + section.size for section in sections_with_perms)


###############################################################################
#
# begin Ghidra symbol madness ("gsm").
#
# This is a "temporary" section of code to deal with
# https://github.com/google/binexport/issues/78
# because Ghidra exports all operands as a single SYMBOL expression node.
#
# Use references to `_is_ghidra_symbol_madness` to remove all this up later.


def _is_ghidra_symbol_madness(be2: BinExport2, instruction_index: int) -> bool:
instruction = be2.instruction[instruction_index]
for operand_index in instruction.operand_index:
operand = be2.operand[operand_index]

if len(operand.expression_index) != 1:
return False

expression0 = be2.expression[operand.expression_index[0]]

if BinExport2.Expression.Type.SYMBOL != expression0.type:
return False

return True


def _gsm_get_instruction_operand(be2: BinExport2, instruction_index: int, operand_index: int) -> str:
"""since Ghidra represents all operands as a single string, just fetch that."""
instruction = be2.instruction[instruction_index]
operand = be2.operand[instruction.operand_index[operand_index]]
assert len(operand.expression_index) == 1
expression = be2.expression[operand.expression_index[0]]
assert expression.type == BinExport2.Expression.Type.SYMBOL
return expression.symbol


# end Ghidra symbol madness.
#
###############################################################################


def extract_insn_number_features(
fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle
) -> Iterator[Tuple[Feature, Address]]:
Expand All @@ -142,62 +109,29 @@ def extract_insn_number_features(
instruction_index: int = ii.instruction_index
instruction: BinExport2.Instruction = be2.instruction[instruction_index]

if len(instruction.operand_index) == 0:
# skip things like:
# .text:0040116e leave
return

# x86 / amd64
mnemonic = be2.mnemonic[instruction.mnemonic_index]
if mnemonic.name.lower().startswith("ret"):
# skip things like:
# .text:0042250E retn 8
return

_is_gsm = _is_ghidra_symbol_madness(be2, instruction_index)
register: Optional[str] = get_operand_expression_register(instruction.operand_index[0], be2)
if register is not None:
# x86 / amd64
if mnemonic.name.lower().startswith(("add", "sub")):
if register.endswith(("sp", "bp")):
return

for i, operand_index in enumerate(instruction.operand_index):
operand = be2.operand[operand_index]

if len(operand.expression_index) == 1 and _is_gsm:
# temporarily, we'll have to try to guess at the interpretation.
symbol = _gsm_get_instruction_operand(be2, instruction_index, i)

# x86 / amd64
if mnemonic.name.lower() == "add" and symbol.lower() == "esp":
# skip things like:
#
# .text:00401140 call sub_407E2B
# .text:00401145 add esp, 0Ch
return

if symbol.startswith(("#0x", "#-0x")):
# like:
# - type: SYMBOL
# symbol: "#0xffffffff"
# - type: SYMBOL
# symbol: "#-0x1"
try:
value = int(symbol[len("#") :], 0x10)
except ValueError:
# failed to parse as integer
continue

# handling continues below at label: has a value

elif symbol.startswith(("0x", "-0x")):
# like:
# - type: SYMBOL
# symbol: "0x1000"
# - type: SYMBOL
# symbol: "-0x1"
try:
value = int(symbol, 0x10)
except ValueError:
# failed to parse as integer
continue

# handling continues below at label: has a value

else:
continue

elif len(operand.expression_index) == 1:
if len(operand.expression_index) == 1:
# - type: IMMEDIATE_INT
# immediate: 20588728364
# parent_index: 0
Expand Down

0 comments on commit 227fdeb

Please sign in to comment.