diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5f953949c..f55ff643d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -94,7 +94,7 @@ jobs:
       if: matrix.os == 'ubuntu-20.04'
       run: sudo apt-get install -y libyaml-dev
     - name: Install capa
-      run: pip install -e .[dev]
+      run: pip install -e .[dev,scripts]
     - name: Run tests
       run: pytest -v tests/
 
@@ -125,7 +125,7 @@ jobs:
       run: sudo apt-get install -y libyaml-dev
     - name: Install capa
       if: ${{ env.BN_SERIAL != 0 }}
-      run: pip install -e .[dev]
+      run: pip install -e .[dev,scripts]
     - name: install Binary Ninja
       if: ${{ env.BN_SERIAL != 0 }}
       run: |
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c7de270e5..4dd2d4e38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## master (unreleased)
 
 ### New Features
+- add script to create code-based YARA based on CAPA match details called match-2-yar @jconnor0426
 - ghidra: add Ghidra feature extractor and supporting code #1770 @colton-gabertan
 - ghidra: add entry script helping users run capa against a loaded Ghidra database #1767 @mike-hunhoff
 
diff --git a/pyproject.toml b/pyproject.toml
index 2138abf62..78a85605f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,6 +95,10 @@ dev = [
     "types_requests==2.31.0.2",
     "types-protobuf==4.23.0.3",
 ]
+scripts = [
+    "yaramod==3.20.1",
+    "mkYARA==1.0.0",
+]
 build = [
     "pyinstaller==5.10.1",
     "setuptools==68.0.0",
diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py
new file mode 100644
index 000000000..2c14e0b24
--- /dev/null
+++ b/scripts/match-2-yar.py
@@ -0,0 +1,884 @@
+#!/usr/bin/env python3
+"""
+Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+You may obtain a copy of the License at: [package root]/LICENSE.txt
+Unless required by applicable law or agreed to in writing, software distributed under the License
+ is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and limitations under the License.
+
+match-2-yar
+
+Invoke capa to extract the capabilities of the given sample or list of samples,
+and emit the matches as yara rules.
+
+When providing multiple samples or directories the tool will attempt to create
+"super rules" based on overlapping signatures
+
+
+Example::
+
+    $ python scripts/match-2-yar.py /tmp/suspicious.dll_
+    ...
+
+Example::
+
+    $ python scripts/match-2-yar.py /tmp/suspicious.dll_ /tmp/suspicious2.dll_
+    ...
+
+"""
+import os
+import sys
+import json
+import logging
+import argparse
+import binascii
+import collections
+import multiprocessing
+import multiprocessing.pool
+from typing import Set, Dict, List
+from pathlib import Path
+from datetime import date
+
+import dnfile
+from envi.memcanvas import MemoryCanvas
+from dncil.clr.token import Token
+from vivisect.renderers import WorkspaceRenderer
+
+import capa.main
+import capa.rules
+import capa.engine
+import capa.helpers
+import capa.features
+import capa.exceptions
+import capa.render.utils as rutils
+import capa.render.verbose
+import capa.features.freeze
+import capa.render.result_document as rd
+from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor
+
+try:
+    import yaramod
+    from mkyara import YaraGenerator
+    from capstone import CS_MODE_32, CS_MODE_64, CS_ARCH_X86, CS_OPT_SYNTAX_INTEL, Cs
+except ImportError:
+    print(
+        """\nFailed to import a module try installing required Python libraries with the following command:
+pip install mkyara yaramod
+"""
+    )
+    sys.exit(1)
+
+
+logger = logging.getLogger("capa.match-2-yar")
+
+
+# Vivisect Related Classes and Functions
+
+
+class BufferCanvas(MemoryCanvas):
+    """Subclass of Vivisect Memory canvas that captures
+    disassembly output as a string rather than printing to std.out
+    """
+
+    output = ""
+
+    def addText(self, text, tag=None):
+        """Overwriting the method responsible for writing to std.out"""
+        self.output += text
+
+
+def get_disassembly_output(vw, va, size):
+    """Get Vivisect's disassembly view for a given virtual address and size
+
+    Args:
+        vw: Vivisect Workspace
+        va: Virtual Address to start disassembling from
+        size: size in bytes to disassemble
+
+    Returns:
+        str: String containing vivisect's disassembly output
+    """
+    rend = WorkspaceRenderer(vw)
+    mcav = BufferCanvas(vw)
+    mcav.renderMemory(va, size, rend=rend)
+    return mcav.output
+
+
+def get_disassembly_for_func(vw, funcva):
+    """Get vivisect disassembly for a function
+
+    This function gets the size of a function and
+    uses that to get a dump of the function disassembly
+    with get_disassembly_output
+
+    Args:
+        vw: Vivisect Workspace
+        funcva: Virtual Address of function to analyze
+
+    Returns:
+        str: String containing disassembly output for a function
+    """
+    funcsize = get_function_size(vw, funcva)
+    return get_disassembly_output(vw, funcva, funcsize)
+
+
+def get_disassembly_for_cb(vw, va):
+    """Get vivisect disassembly for a Code Block
+
+    This function gets the size of a code block and
+    uses that to get a dump of the code block disassembly
+    with get_disassembly_output
+
+    Args:
+        vw: Vivisect Workspace
+        va: Virtual Address of Codeblock to analyze
+
+    Returns:
+        str: String containing disassembly output for a code block
+    """
+    cb = vw.getCodeBlock(va)
+    cbva, cbsize, cbfunc = cb
+    return get_disassembly_output(vw, cbva, cbsize)
+
+
+def get_function_size(vw, funcva):
+    """Return the size of a function based on vivisect analysis
+
+    Args:
+        vw: Vivisect Workspace
+        funcva: Virtual Address of function to analyze
+
+    Returns:
+        int: size of the function
+    """
+    fsize = 0
+    # Get the effective function given a virtual address
+    effective_funcva = vw.getFunction(funcva)
+    if effective_funcva is None:
+        raise Exception("Given funcva not a function or within a known function")
+    # These should only disagree if the funcva provided
+    # wasn't the start of a function
+    if effective_funcva != funcva:
+        logger.debug("Requested function addr %s was contained in function %s", hex(funcva), hex(effective_funcva))
+
+    # Get the blocks of the effective function
+    func_blocks = [cbva for cbva, _, _ in vw.getFunctionBlocks(effective_funcva)]
+
+    # Figure out the size of the first linear chunk
+    # in this function.
+    # Note: if funcva isn't the start of the function (funcva != effective_funcva)
+    # Then we'll get everything from funcva and after
+    cb = vw.getCodeBlock(funcva)
+    if cb is None or cb[0] not in func_blocks:
+        raise Exception(
+            f"Provided funcva not in effective func [funcva={hex(funcva)}, effective_funcva={hex(effective_funcva)}]"
+        )
+    while cb is not None:
+        cbva, cbsize, cbfunc = cb
+        if cbfunc != effective_funcva:
+            break
+        fsize += cbsize
+        cb = vw.getCodeBlock(cbva + cbsize)
+
+    if fsize == 0:
+        raise Exception("0 length function??!?1")
+
+    return fsize
+
+
+def get_function_bytes(vw, funcva):
+    """Return the bytes from a function
+
+    Args:
+        vw: Vivisect Workspace
+        funcva: Virtual Address of function to analyze
+
+    Returns:
+        bytes: bytes of a function
+    """
+    fsize = get_function_size(vw, funcva)
+    return vw.readMemory(funcva, fsize)
+
+
+def get_cb_bytes(vw, va):
+    """Return the bytes from a code block
+
+    Args:
+        vw: Vivisect Workspace
+        va: Virtual Address to analyze
+
+    Returns:
+        bytes: bytes of the code block
+    """
+    cb = vw.getCodeBlock(va)
+    cbva, cbsize, cbfunc = cb
+    return vw.readMemory(cbva, cbsize)
+
+
+# Capstone Related Classes and Functions
+
+VIVI_ARCH_TO_CAPSTONE = {"i386": (CS_ARCH_X86, CS_MODE_32), "amd64": (CS_ARCH_X86, CS_MODE_64)}
+
+
+def mkyara_sig_generation(start_va, bytez, arch, mode):
+    """Mask x86/x64 instructions and generate a signature
+
+    This uses mkyara's logic for now, but an area for research to
+    build out the system to test resiliency.
+
+    Args:
+        start_va: virtual address of first instruction
+        bytez: byte string containing raw bytes of the function
+        arch: Capstone Architecture to use (CS_ARCH_X86 covers 32 and 64bit x86)
+        mode: Capstone mode to choose between 32 and 64 bit
+
+    Returns:
+        str: signature string in the form of "AA BB CC DD"
+    """
+    gen = YaraGenerator("normal", arch, mode)
+    gen.add_chunk(bytez, offset=start_va)
+
+    md = Cs(arch, mode)
+    md.detail = True
+    md.syntax = CS_OPT_SYNTAX_INTEL
+
+    sig = ""
+    disasm = md.disasm(bytez, start_va)
+    for ins in disasm:
+        rule_part, comment = gen._process_instruction(ins)
+        rule_part = gen.format_hex(rule_part)
+        sig += rule_part + " "
+
+    return sig
+
+
+def genSigAndMask(start_va, bytez, vivi_arch="i386"):
+    """Generate a signature and masked signature for a function virtual address
+
+    This function performs the translation from vivisect arch
+    to the mode and arch needed by capstone
+
+    Args:
+        start_va: virtual address of first instruction
+        bytez: byte string containing raw bytes of the function
+        vivi_arch: Vivisect architecture
+
+    Returns:
+        str: signature string in the form of "AA BB CC DD"
+    """
+
+    arch, mode = VIVI_ARCH_TO_CAPSTONE[vivi_arch]
+
+    # Other option for normal is loose, but we won't use those here
+    return mkyara_sig_generation(start_va, bytez, arch, mode)
+
+
+# .NET Related Classes and Functions
+
+
+def format_operand(pe, op):
+    """Return a string representation of a .NET operand
+
+    Use a dnfile object to reference .NET tables to understand
+    methods, classes, and strings
+
+    Args:
+        pe: dnfile object for a .NET PE
+        op: dncil operand from an instruction
+    Returns:
+        str: string representation of an operand
+    """
+    if isinstance(op, Token):
+        op = capa.features.extractors.dnfile.helpers.resolve_dotnet_token(pe, op)
+
+    if isinstance(op, str):
+        return f'"{op}"'
+    elif isinstance(op, int):
+        return hex(op)
+    elif isinstance(op, list):
+        return f"[{', '.join(['({:04X})'.format(x) for x in op])}]"
+    elif isinstance(op, dnfile.mdtable.MemberRefRow) and not isinstance(op.Class.row, dnfile.mdtable.TypeSpecRow):
+        retstr = getattr(op.Class.row, "TypeNamespace", "")
+        if retstr != "":
+            retstr += "."
+        retstr += getattr(op.Class.row, "TypeName", "")
+        if retstr != "":
+            retstr += "::"
+        retstr += op.Name
+        return retstr
+    elif isinstance(op, (dnfile.mdtable.FieldRow, dnfile.mdtable.MethodDefRow, dnfile.mdtable.MemberRefRow)):
+        return f"{op.Name}"
+    elif isinstance(op, (dnfile.mdtable.TypeDefRow, dnfile.mdtable.TypeRefRow)):
+        return f"{op.TypeNamespace}.{op.TypeName}"
+    elif isinstance(op, (dnfile.mdtable.TypeSpecRow, dnfile.mdtable.MethodSpecRow)):
+        return f"{str(op.struct)}"
+    else:
+        return "" if op is None else str(op)
+
+
+def get_sig_and_mask_for_dotnet_func(dnpe, body):
+    """Return the comment, sig, and bytes of a .NET Method
+
+    Iterate a method body to get IL bytes and mask the operand
+    values to create a more flexible signature
+
+    Args:
+        dnpe: dnfile object for a .NET PE
+        body: dncil method body
+    Returns:
+        str comment: Comment string with formatted .NET IL disassembly
+        str formatted_sig: signature as string with hex and wildcards
+        bytes func_bytes: raw IL bytes of a .NET method
+    """
+
+    comment = ""
+    sig = ""
+    func_bytes = b""
+    for insn in body.instructions:
+        comment += (
+            "{:04X}".format(insn.offset)
+            + " "
+            + f"{' '.join('{:02x}'.format(b) for b in insn.get_bytes()) : <20}"
+            + f"{str(insn.opcode) : <15}"
+            + format_operand(dnpe, insn.operand)
+            + "\n"
+        )
+
+        sig += insn.get_opcode_bytes().hex()
+        func_bytes += insn.get_opcode_bytes()
+
+        # test the encoded bytes, not the decoded value: an operand of 0 is falsy but still occupies bytes
+        if insn.get_operand_bytes():
+            sig += "??" * len(insn.get_operand_bytes())
+            func_bytes += insn.get_operand_bytes()
+
+    # Format the sig to be in the same style as the vivi portion (bytes separated by spaces)
+    formatted_sig = ""
+    for idx, val in enumerate(sig):
+        if idx > 0 and idx % 2 == 0:
+            formatted_sig += " "
+        formatted_sig += val
+
+    return comment, formatted_sig, func_bytes
+
+
+# CodeFeature Extractor Related Classes and Functions
+
+
+class CodeFeature:
+    """Basic object that will be used to create yara rules"""
+
+    def __init__(self, sig: str, comment: str, bytez: bytes, filemd5: str, addr, scope: str):
+        self.sig = sig.strip().upper()
+        self.comment = comment
+        self.bytez = bytez
+        self.addr = addr
+        self.filemd5 = filemd5
+        self.scope = scope
+
+    def json(self):
+        return {
+            "sig": self.sig,
+            "comment": self.comment,
+            "bytez": binascii.hexlify(self.bytez, " ", bytes_per_sep=1).decode("utf8").upper(),
+            "addr": self.addr,
+            "filemd5": self.filemd5,
+            "scope": self.scope,
+        }
+
+
+def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor):
+    """Returns a dictionary mapping a filemd5 to a list of CodeFeatures
+
+    This function operates on x86/x64 PE files and creates
+    CodeFeatures based on basic block and function CAPA matches
+
+    Args:
+        doc (rd.ResultDocument): CAPA result docs
+        extractor: CAPA analysis extractor object
+    Returns:
+        dict: dictionary with a key of the filemd5 mapped to a list of CodeFeatures
+    """
+    # Grab the vivisect workspace object
+    try:
+        file_vw = extractor.vw
+    except AttributeError:
+        logger.error("No extractor workspace")
+        file_vw = None
+        raise
+
+    # Get the filemd5
+    filemd5 = doc.meta.sample.md5
+
+    cb_matches = collections.defaultdict(set)
+    func_matches = collections.defaultdict(set)
+
+    for rule in rutils.capability_rules(doc):
+        if rule.meta.scope == capa.rules.FUNCTION_SCOPE:
+            for addr_object, _ in rule.matches:
+                func_matches[addr_object.value].add(rule.meta.name)
+        elif rule.meta.scope == capa.rules.BASIC_BLOCK_SCOPE:
+            for addr_object, _ in rule.matches:
+                cb_matches[addr_object.value].add(rule.meta.name)
+        else:
+            # file scope
+            pass
+
+    code_features = []
+
+    for addr, rules in cb_matches.items():
+        comment = f"Basic Block at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n"
+        for rule_name in sorted(rules):
+            comment += f" - {rule_name}\n"
+        comment += get_disassembly_for_cb(file_vw, addr)
+
+        bytez = get_cb_bytes(file_vw, addr)
+        sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch)
+        code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.BASIC_BLOCK_SCOPE))
+
+    for addr, rules in func_matches.items():
+        comment = f"function at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n"
+        for rule_name in sorted(rules):
+            comment += f" - {rule_name}\n"
+        comment += get_disassembly_for_func(file_vw, addr)
+
+        bytez = get_function_bytes(file_vw, addr)
+        sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch)
+        code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.FUNCTION_SCOPE))
+
+    if len(code_features) == 0:
+        logger.warning("No code features found for %s", filemd5)
+    return {filemd5: code_features}
+
+
+def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor):
+    """Returns a dictionary mapping a filemd5 to a list of CodeFeatures
+
+    This function operates on .NET PE files and creates
+    CodeFeatures based on .NET method CAPA matches
+
+    Args:
+        doc (rd.ResultDocument): CAPA result docs
+        extractor: CAPA analysis extractor object
+    Returns:
+        dict: dictionary with a key of the filemd5 mapped to a list of CodeFeatures
+    """
+    # Grab the dnfile PE object
+    try:
+        dnpe = extractor.pe
+    except AttributeError:
+        logger.error("No dnpe file found")
+        raise
+
+    filemd5 = doc.meta.sample.md5
+
+    func_matches = collections.defaultdict(set)
+
+    for rule in rutils.capability_rules(doc):
+        if rule.meta.scope == capa.rules.FUNCTION_SCOPE:
+            for addr_object, _ in rule.matches:
+                func_matches[addr_object.value].add(rule.meta.name)
+        else:
+            # file scope
+            pass
+
+    # Funcs is the cache of functions we need to reference to get
+    # the underlying dnfile object
+    funcs = list(extractor.get_functions())
+
+    # Return list of CodeFeature objects
+    code_features = []
+
+    logger.debug("Building CodeFeatures for %s functions in %s", len(func_matches.keys()), filemd5)
+    for addr, rules in func_matches.items():
+        func_name = extractor.token_cache.get_method(addr)
+        comment = f"function {func_name} 0x{addr:08x}@{filemd5} with {len(rules)} features:\n"
+        for rule_name in sorted(rules):
+            comment += f" - {rule_name}\n"
+
+        # Get the CILMethodBody object by working with the function
+        # collection we grabbed earlier
+        f = [x for x in funcs if x.address.real == addr][0]
+        func_comment, sig, bytez = get_sig_and_mask_for_dotnet_func(dnpe, f.inner)
+        comment += func_comment
+
+        code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.FUNCTION_SCOPE))
+
+    if len(code_features) == 0:
+        logger.warning("No code features found for %s", filemd5)
+    return {filemd5: code_features}
+
+
+# CAPA Entrypoints
+
+
+def run_capa_and_get_features(args):
+    """Main CAPA analysis entrypoint
+
+    This function kicks off CAPA analysis and builds CodeFeatures that
+    will be used to build yara rules in the main thread.
+
+    Args:
+        args: Tuple containing the following
+            - rules: CAPA rules loaded from a repo
+            - sig_paths: Path to signatures used for library identification
+            - format: Format for processing (dotnet or auto are the expected values)
+            - os_: Operating system specified
+            - path: Path to file for analysis
+    Returns:
+        dict: dictionary with the following keys
+            - path: Path to file that was analyzed
+            - status: Status of analysis (error or ok)
+            - error (Optional): Details of errors that occurred
+            - ok (Optional): Dictionary mapping the filemd5 to a list of CodeFeatures
+    """
+
+    rules, sig_paths, format, os_, path = args
+    should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None)
+
+    try:
+        extractor = capa.main.get_extractor(
+            path, format, os_, capa.main.BACKEND_VIV, sig_paths, should_save_workspace, disable_progress=True
+        )
+    except capa.main.UnsupportedFormatError:
+        # i'm not 100% sure if multiprocessing will reliably raise exceptions across process boundaries.
+        # so instead, return an object with explicit success/failure status.
+        #
+        # if success, then status=ok, and results found in property "ok"
+        # if error, then status=error, and human readable message in property "error"
+        return {
+            "path": path,
+            "status": "error",
+            "error": f"input file does not appear to be a PE file: {path}",
+        }
+    except capa.main.UnsupportedRuntimeError:
+        return {
+            "path": path,
+            "status": "error",
+            "error": "unsupported runtime or Python interpreter",
+        }
+    except Exception as e:
+        return {
+            "path": path,
+            "status": "error",
+            "error": f"unexpected error: {e}",
+        }
+
+    meta = capa.main.collect_metadata([], path, format, os_, [], extractor)
+    logger.info("Collecting capabilities for %s", path)
+    capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True)
+
+    meta.analysis.feature_counts = counts["feature_counts"]
+    meta.analysis.library_functions = counts["library_functions"]
+    meta.analysis.layout = capa.main.compute_layout(rules, extractor, capabilities)
+
+    if capa.main.has_file_limitation(rules, capabilities):
+        # bail if capa encountered file limitation e.g. a packed binary
+        # do show the output in verbose mode, though.
+        return {
+            "path": path,
+            "status": "error",
+            "error": "Encountered file limitation",
+        }
+
+    try:
+        doc = rd.ResultDocument.from_capa(meta, rules, capabilities)
+        logger.info("Building code features for %s", path)
+        if isinstance(extractor, DnfileFeatureExtractor):
+            # Handle .NET files
+            features = get_code_features_for_dotnet_doc(doc, extractor)
+        else:
+            # Handle other files
+            features = get_code_features_for_capa_doc(doc, extractor)
+    except Exception as e:
+        return {
+            "path": path,
+            "status": "error",
+            "error": f"unexpected error: {e}",
+        }
+    return {"path": path, "status": "ok", "ok": features}
+
+
+def multi_process_capa(argv=None):
+    """CAPA argument handler and multiprocessing manager
+
+    This function processes CLI arguments and kicks off capa analysis
+    and extracts CodeFeatures into a dictionary that maps filemd5s
+    to a list of CodeFeatures that will be used to build yara rules
+
+    Args:
+        argv: command line arguments, defaults to sys.argv[1:] when None
+    Returns:
+        dict: dictionary mapping filemd5's processed to a list of CodeFeatures
+        int: -1 when the rules or signatures cannot be loaded
+    """
+    if argv is None:
+        argv = sys.argv[1:]
+
+    parser = argparse.ArgumentParser(description="Build YARA rules for CAPA matches")
+    capa.main.install_common_args(parser, wanted={"rules", "signatures", "format", "os"})
+    parser.add_argument("input", type=str, nargs="+", help="Path to directory or files to analyze")
+    parser.add_argument("-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor")
+    parser.add_argument("--no-mp", action="store_true", help="disable subprocesses")
+    parser.add_argument("--dump-features", action="store_true", help="output feature dictionary as json")
+    args = parser.parse_args(args=argv)
+    capa.main.handle_common_args(args)
+
+    try:
+        rules = capa.main.get_rules(args.rules)
+        logger.info("successfully loaded %s rules", len(rules))
+    except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e:
+        logger.error("%s", str(e))
+        return -1
+
+    try:
+        sig_paths = capa.main.get_signatures(args.signatures)
+    except IOError as e:
+        logger.error("%s", str(e))
+        return -1
+
+    samples = []
+    for p in args.input:
+        path = Path(p)
+        if not path.exists():
+            raise ValueError(f"Invalid path {p}")
+        if path.is_dir():
+            # rglob() also yields sub-directories, which cannot be analyzed as samples
+            samples.extend(subpath for subpath in path.rglob("*") if subpath.is_file())
+        elif path.is_file():
+            samples.append(path)
+    logger.info("Starting to process %s files", len(samples))
+
+    cpu_count = multiprocessing.cpu_count()
+
+    def pmap(f, args, parallelism=cpu_count):
+        """apply the given function f to the given args using subprocesses"""
+        return multiprocessing.Pool(parallelism).imap(f, args)
+
+    def tmap(f, args, parallelism=cpu_count):
+        """apply the given function f to the given args using threads"""
+        return multiprocessing.pool.ThreadPool(parallelism).imap(f, args)
+
+    def map(f, args, parallelism=None):
+        """apply the given function f to the given args in the current thread"""
+        for arg in args:
+            yield f(arg)
+
+    if args.no_mp:
+        if args.parallelism == 1:
+            logger.debug("using current thread mapper")
+            mapper = map
+        else:
+            logger.debug("using threading mapper")
+            mapper = tmap
+    else:
+        logger.debug("using process mapper")
+        mapper = pmap
+
+    results = {}
+    for result in mapper(
+        run_capa_and_get_features,
+        [(rules, sig_paths, args.format, args.os, sample) for sample in samples],
+        parallelism=args.parallelism,
+    ):
+        if result["status"] == "error":
+            logger.warning("%s: %s", result["path"], result["error"])
+        elif result["status"] == "ok":
+            results.update(result["ok"])
+        else:
+            raise ValueError(f"unexpected status: {result['status']}")
+
+    logger.info("Done processing %s samples", len(samples))
+
+    if args.dump_features:
+        dump_file_features(results)
+        sys.exit(0)
+
+    return results
+
+
+# Output related functions
+
+
+def dump_file_features(result_dict: dict):
+    """Print out bytes for the code features extracted"""
+    output_dict = {filemd5: [x.json() for x in features] for filemd5, features in result_dict.items()}
+    print(json.dumps(output_dict, indent=4))
+
+
+# YARA related functions
+
+CODE_FEATURES_REFERENCED: List[CodeFeature] = []
+
+
+def build_rule_from_combo(combo_dict: dict, **kwargs):
+    """Build a yaramod yara rule using a combination dictionary
+
+    Args:
+        combo_dict: Dictionary of features that all matched on a group of files
+    Returns:
+        yaramod.Rule: yaramod representation of a yara rule generated for the file combination
+    """
+
+    # we're going to use this to create unique code features to insert the comment strings
+    global CODE_FEATURES_REFERENCED
+
+    # Build metadata for the rule
+    rule_name = "super_rule_" + "_".join([x[:5] for x in sorted(combo_dict["files"])])
+    metadict = {
+        "author": kwargs.get("author", "CAPA Matches"),
+        "date_created": kwargs.get("date_created", date.today().isoformat()),
+        "date_modified": kwargs.get("date_modified", date.today().isoformat()),
+        "description": kwargs.get("description", ""),
+    }
+
+    rule = yaramod.YaraRuleBuilder().with_name(rule_name)
+    for metakey, metavalue in metadict.items():
+        if metavalue is not None:
+            rule = rule.with_string_meta(metakey, metavalue)
+
+    # Add in hash meta
+    rule = rule.with_name(rule_name)
+    for hsh in combo_dict["files"]:
+        rule = rule.with_string_meta("md5", hsh)
+
+    conditions = [yaramod.of(yaramod.all(), yaramod.them())]
+    for codefeature in combo_dict["features"]:
+        idx = len(CODE_FEATURES_REFERENCED)
+        hexstr = yaramod.YaraHexStringBuilder()
+        for byte in codefeature.sig.split(" "):
+            if byte == "??":
+                hexstr = hexstr.add(yaramod.wildcard())
+            elif byte == "":
+                continue
+            else:
+                hexstr = hexstr.add(yaramod.YaraHexStringBuilder(int(byte, 16)))
+        rule = rule.with_hex_string(f"$c{idx}", hexstr.get())
+        CODE_FEATURES_REFERENCED.append(codefeature)
+
+    if len(conditions) == 1:
+        # No fancy expression needed
+        rule = rule.with_condition(conditions[0].get())
+    else:
+        rule = rule.with_condition(yaramod.conjunction(conditions, linebreaks=True).get())
+    return rule.get()
+
+
+TAB_CHAR = " " * 4
+
+
+def replace_tabs_with_spaces(yara_text):
+    """Replacing tabs with spaces in yara rule
+
+    Args:
+        yara_text: string of full yara rules text
+    Returns:
+        str: formatted yara rules text
+    """
+    return yara_text.replace("\t", TAB_CHAR)
+
+
+def add_comments_to_yara_file(yara_text):
+    """Add comments to yara file text
+
+    Args:
+        yara_text: string of full yara rules text
+    Returns:
+        str: formatted yara rules text
+    """
+
+    for idx, feature in enumerate(CODE_FEATURES_REFERENCED):
+        # Find the str in yara_text
+        # replace it with the comment
+        search_str = f"$c{idx} ="
+        comment_str = "/*\n"
+        comment_str += ("\n" + 2 * TAB_CHAR).join(feature.comment.split("\n"))
+        comment_str += "*/\n" + 2 * TAB_CHAR + search_str
+        yara_text = yara_text.replace(search_str, comment_str)
+    return yara_text
+
+
+class SimilarityDictEntry:
+    """Simple object to hold information about a feature in a similarity dictionary"""
+
+    values: List[CodeFeature]
+    files: Set[str]
+
+    def __init__(self) -> None:
+        self.values = []
+        self.files = set()
+
+
+def build_yara_ruleset(files_dict, **kwargs):
+    """Build a YARA ruleset string based on CodeFeatures
+
+    Args:
+        files_dict: dictionary mapping filemd5s to list of CodeFeatures
+    Returns:
+        str: YARA ruleset
+    """
+
+    # First we'll build a dict with a key based on the masked bytes from each
+    # Code feature
+    similarity_dict = {}
+    for filemd5, features in files_dict.items():
+        for value in features:
+            if value.sig not in similarity_dict:
+                similarity_dict[value.sig] = SimilarityDictEntry()
+            similarity_dict[value.sig].values.append(value)
+            similarity_dict[value.sig].files.add(filemd5)
+
+    # Next we build out a combodict and track which files have which combos of features
+    file_combinations: Dict[str, dict] = {}
+    for feature, result_dict in similarity_dict.items():
+        logger.debug("Processing feature: %s", feature)
+        sample_combo_key = ":".join(sorted(result_dict.files))
+        # logger.debug("Combo Key: %s", sample_combo_key)
+        if sample_combo_key not in file_combinations:
+            file_combinations[sample_combo_key] = {}
+            file_combinations[sample_combo_key]["files"] = sorted(result_dict.files)
+            file_combinations[sample_combo_key]["feature_count"] = 0
+            file_combinations[sample_combo_key]["features"] = []
+
+        chosen_code_version = sorted(result_dict.values, key=lambda x: x.filemd5)[0]
+        file_combinations[sample_combo_key]["features"].append(chosen_code_version)
+        file_combinations[sample_combo_key]["feature_count"] += 1
+
+    # Create a list of combo keys and sort them so we get deterministic output
+    combo_keys = sorted(file_combinations.keys(), key=lambda x: (len(x), x))
+
+    # Build the YARA rule set based on the grouping
+    yara_file = yaramod.YaraFileBuilder()
+    observed_files = []
+
+    for key in combo_keys:
+        combo_dict = file_combinations[key]
+        rule = build_rule_from_combo(combo_dict, **kwargs)
+        if rule is not None:
+            observed_files.extend(combo_dict["files"])
+            yara_file = yara_file.with_rule(rule)
+
+    # Turn the yaramod "file" into a string
+    yara_text = yara_file.get().text_formatted
+
+    yara_text = replace_tabs_with_spaces(yara_text)
+
+    # Add our comments to the file
+    yara_text = add_comments_to_yara_file(yara_text)
+
+    return yara_text
+
+
+def main(argv=None):
+    all_features = multi_process_capa(argv)
+    if not isinstance(all_features, dict):
+        # rules or signatures failed to load; propagate the error status as the exit code
+        return all_features
+    print(build_yara_ruleset(all_features))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/data b/tests/data
index faf741a53..5bfe79607 160000
--- a/tests/data
+++ b/tests/data
@@ -1 +1 @@
-Subproject commit faf741a538224f52d4412468f910d52a70911662
+Subproject commit 5bfe7960707b94e1b4f3066e66cc0f1b8136ae9e
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 7c91bc573..7aac70a07 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -7,13 +7,18 @@
 # See the License for the specific language governing permissions and limitations under the License.
import sys +import json import logging import textwrap import subprocess from pathlib import Path +from datetime import date +from functools import lru_cache import pytest +import capa.rules + logger = logging.getLogger(__name__) CD = Path(__file__).resolve().parent @@ -27,6 +32,10 @@ def get_file_path(): return str(CD / "data" / "9324d1a8ae37a36ae560c37448c9705a.exe_") +def get_data_path(p: str): + return str(CD / "data" / p) + + def get_rules_path(): return str(CD / ".." / "rules") @@ -69,12 +78,92 @@ def test_bulk_process(tmp_path): assert p.returncode == 0 +@pytest.mark.parametrize( + "script,args,expected_output_path", + [ + # Test match-2-yar x86 EXE + pytest.param( + "match-2-yar.py", + [get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_")], + "yara/expected_9324d1a8ae37a36ae560c37448c9705a.yar", + ), + # Test match-2-yar x64 EXE + pytest.param( + "match-2-yar.py", + [get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_")], + "yara/expected_c2bb17c12975e.yar", + ), + # Test match-2-yar x86 .NET EXE + pytest.param( + "match-2-yar.py", + [ + "-f", + "dotnet", + get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), + ], + "yara/expected_1c444ebeba24dcba8628b7dfe5fec7c6.yar", + ), + # Test match-2-yar files with multiple X86 PEs + pytest.param( + "match-2-yar.py", + [ + get_data_path("Practical Malware Analysis Lab 03-04.exe_"), + get_data_path("Practical Malware Analysis Lab 11-03.exe_"), + get_data_path("Practical Malware Analysis Lab 16-01.exe_"), + ], + "yara/expected_pma_03-04_11-03_16-01.yar", + ), + # Test match-2-yar files with CAPA file limitations are filtered out of multi sample + pytest.param( + "match-2-yar.py", + [ + get_data_path("Practical Malware Analysis Lab 01-01.exe_"), + get_data_path("Practical Malware Analysis Lab 01-02.exe_"), + ], + "yara/expected_pma_01-01_01-02.yar", + ), + # Test match-2-yar multiple x86 .NET PE + pytest.param( + "match-2-yar.py", + [ + "-f", + "dotnet", + 
get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), + get_data_path("dotnet/692f7fd6d198e804d6af98eb9e390d61.exe_"), + ], + "yara/expected_1c444ebe_692f7fd6.yar", + ), + ], +) +def test_script_expected_output(script, args, expected_output_path): + script_path = get_script_path(script) + + expected_output = Path(get_data_path(expected_output_path)).read_bytes() + # Update dates in expected output to be todays date + expected_output = expected_output.replace(b"EXPECTED_DATE", date.today().isoformat().encode("utf8")) + + p = run_program(script_path, args) + + assert p.returncode == 0 + assert p.stdout.decode("utf8") == expected_output.decode("utf8") + + def run_program(script_path, args): args = [sys.executable] + [script_path] + args logger.debug("running: %r", args) return subprocess.run(args, stdout=subprocess.PIPE) +@lru_cache(maxsize=1) +def get_match_2_yar_features(path, is_dotnet): + script_path = get_script_path("match-2-yar.py") + args = ["--dump-features", path] + if is_dotnet: + args.extend(["-f", "dotnet"]) + p = run_program(script_path, args) + return p.stdout + + def test_proto_conversion(tmp_path): t = tmp_path / "proto-test" t.mkdir() @@ -199,3 +288,79 @@ def test_detect_duplicate_features(tmpdir): args = [rule_dir.strpath, rule_path] overlaps_found = run_program(script_path, args) assert overlaps_found.returncode == expected_overlaps + + +@pytest.mark.parametrize( + "path,is_dotnet,filemd5,addr,scope,expected_bytestring,expected_sig", + [ + # Test match-2-yar x86 EXE - Basic Block Extraction + pytest.param( + get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_"), + False, + "9324d1a8ae37a36ae560c37448c9705a", + 0x004031A0, + capa.rules.BASIC_BLOCK_SCOPE, + "83 EC 10 B0 6C 8B 15 24 A0 40 00 88 44 24 01 88 44 24 02 B0 6F 8D 4C 24 00 88 44 24 04 88 44 24 0B 8B 44 24 14 C6 44 24 00 44 50 51 52 6A 00 C6 44 24 13 53 C6 44 24 15 72 C6 44 24 16 74 C6 44 24 17 57 C6 44 24 18 69 C6 44 24 19 6E C6 44 24 1A 64 C6 44 24 1C 77 C6 44 24 1D 00 E8 EF F7 FF 
FF A3 C4 A9 40 00 33 C0 83 C4 20 C2 04 00", + "83 EC 10 B0 6C 8B 15 ?? ?? ?? ?? 88 44 24 ?? 88 44 24 ?? B0 6F 8D 4C 24 ?? 88 44 24 ?? 88 44 24 ?? 8B 44 24 ?? C6 44 24 ?? 44 50 51 52 6A 00 C6 44 24 ?? 53 C6 44 24 ?? 72 C6 44 24 ?? 74 C6 44 24 ?? 57 C6 44 24 ?? 69 C6 44 24 ?? 6E C6 44 24 ?? 64 C6 44 24 ?? 77 C6 44 24 ?? 00 E8 ?? ?? ?? ?? A3 ?? ?? ?? ?? 33 C0 83 C4 20 C2 04 00", + ), + # Test match-2-yar x86 EXE - Function Extraction + pytest.param( + get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_"), + False, + "9324d1a8ae37a36ae560c37448c9705a", + 0x004019C0, + capa.rules.FUNCTION_SCOPE, + "81 EC 7C 04 00 00 53 55 8B 2D 14 92 40 00 56 8B F1 57 6A 00 8D 44 24 14 8B 8E A8 00 00 00 6A 04 B3 02 50 51 C7 44 24 28 03 00 00 00 C7 44 24 2C 00 00 00 00 C6 44 24 20 05 88 5C 24 21 C6 44 24 22 00 88 5C 24 23 FF D5 B9 96 00 00 00 33 C0 8D BC 24 34 02 00 00 8B 96 A8 00 00 00 F3 AB 8D 44 24 18 8D 4C 24 2C 50 6A 00 6A 00 51 6A 00 89 54 24 44 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 7F 0C 8B 96 A8 00 00 00 52 E9 5D 02 00 00 8B 8E A8 00 00 00 6A 00 8D 84 24 38 02 00 00 68 58 02 00 00 50 51 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 0F 85 2C 02 00 00 8A 84 24 35 02 00 00 84 C0 74 0A 3A C3 0F 85 19 02 00 00 EB 08 3A C3 0F 85 30 01 00 00 8B 0D D4 AA 40 00 68 A0 A5 40 00 E8 49 2C 00 00 85 C0 0F 86 18 01 00 00 8B 0D D4 AA 40 00 68 A0 A5 40 00 E8 31 2C 00 00 8B 0D D4 AA 40 00 68 A0 A6 40 00 8B D8 E8 1F 2C 00 00 89 44 24 14 B9 40 00 00 00 33 C0 8D BC 24 30 01 00 00 F3 AB 66 AB 8B 3D 94 90 40 00 8D 94 24 32 01 00 00 68 A0 A5 40 00 52 C6 84 24 38 01 00 00 05 88 9C 24 39 01 00 00 FF D7 8D 44 24 14 6A 04 8D 8C 1C 36 01 00 00 50 51 8B 0D D4 AA 40 00 E8 3B 2A 00 00 8D 94 1C 33 01 00 00 68 A0 A6 40 00 52 FF D7 8B 44 24 14 6A 00 8D 94 24 34 01 00 00 8D 4C 18 03 8B 86 A8 00 00 00 51 52 50 FF D5 8D 54 24 18 33 C0 B9 96 00 00 00 8D BC 24 34 02 00 00 52 50 F3 AB 8B 8E A8 00 00 00 50 8D 44 24 38 89 4C 24 3C 50 6A 00 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 0F 8E 18 01 00 00 8B 86 A8 00 
00 00 6A 00 8D 94 24 38 02 00 00 68 58 02 00 00 52 50 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 0F 85 EE 00 00 00 8A 84 24 35 02 00 00 84 C0 0F 85 DF 00 00 00 8B 94 24 90 04 00 00 52 FF 15 FC 91 40 00 85 C0 0F 84 D6 00 00 00 C6 44 24 20 05 C6 44 24 21 01 C6 44 24 22 00 C6 44 24 23 01 8B 40 0C 8B 08 8B 84 24 94 04 00 00 50 8B 11 89 54 24 28 FF 15 08 92 40 00 8B 96 A8 00 00 00 6A 00 8D 4C 24 24 6A 0A 51 52 66 89 44 24 38 FF D5 B9 96 00 00 00 33 C0 8D BC 24 34 02 00 00 8D 54 24 2C F3 AB 8B 86 A8 00 00 00 8D 4C 24 18 51 6A 00 6A 00 52 6A 00 89 44 24 44 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 7F 09 8B 86 A8 00 00 00 50 EB 47 8B 96 A8 00 00 00 6A 00 8D 8C 24 38 02 00 00 68 58 02 00 00 51 52 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 75 D1 8A 84 24 35 02 00 00 84 C0 75 C6 5F 5E 5D B0 01 5B 81 C4 7C 04 00 00 C2 08 00 8B 8E A8 00 00 00 51 FF 15 04 92 40 00 5F 5E 5D 32 C0 5B 81 C4 7C 04 00 00 C2 08 00", + "81 EC 7C 04 00 00 53 55 8B 2D ?? ?? ?? ?? 56 8B F1 57 6A 00 8D 44 24 ?? 8B 8E ?? ?? ?? ?? 6A 04 B3 02 50 51 C7 44 24 ?? 03 00 00 00 C7 44 24 ?? 00 00 00 00 C6 44 24 ?? 05 88 5C 24 ?? C6 44 24 ?? 00 88 5C 24 ?? FF D5 B9 96 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? 8B 96 ?? ?? ?? ?? F3 AB 8D 44 24 ?? 8D 4C 24 ?? 50 6A 00 6A 00 51 6A 00 89 54 24 ?? C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 7F ?? 8B 96 ?? ?? ?? ?? 52 E9 ?? ?? ?? ?? 8B 8E ?? ?? ?? ?? 6A 00 8D 84 24 ?? ?? ?? ?? 68 58 02 00 00 50 51 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 0F 85 ?? ?? ?? ?? 8A 84 24 ?? ?? ?? ?? 84 C0 74 ?? 3A C3 0F 85 ?? ?? ?? ?? EB ?? 3A C3 0F 85 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A5 40 00 E8 ?? ?? ?? ?? 85 C0 0F 86 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A5 40 00 E8 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A6 40 00 8B D8 E8 ?? ?? ?? ?? 89 44 24 ?? B9 40 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? F3 AB 66 AB 8B 3D ?? ?? ?? ?? 8D 94 24 ?? ?? ?? ?? 68 A0 A5 40 00 52 C6 84 24 ?? ?? ?? ?? 05 88 9C 24 ?? ?? ?? ?? FF D7 8D 44 24 ?? 6A 04 8D 8C 1C ?? ?? ?? ?? 50 51 8B 0D ?? ?? ?? ?? E8 ?? ?? ?? ?? 8D 94 1C ?? 
?? ?? ?? 68 A0 A6 40 00 52 FF D7 8B 44 24 ?? 6A 00 8D 94 24 ?? ?? ?? ?? 8D 4C 18 ?? 8B 86 ?? ?? ?? ?? 51 52 50 FF D5 8D 54 24 ?? 33 C0 B9 96 00 00 00 8D BC 24 ?? ?? ?? ?? 52 50 F3 AB 8B 8E ?? ?? ?? ?? 50 8D 44 24 ?? 89 4C 24 ?? 50 6A 00 C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 0F 8E ?? ?? ?? ?? 8B 86 ?? ?? ?? ?? 6A 00 8D 94 24 ?? ?? ?? ?? 68 58 02 00 00 52 50 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 0F 85 ?? ?? ?? ?? 8A 84 24 ?? ?? ?? ?? 84 C0 0F 85 ?? ?? ?? ?? 8B 94 24 ?? ?? ?? ?? 52 FF 15 ?? ?? ?? ?? 85 C0 0F 84 ?? ?? ?? ?? C6 44 24 ?? 05 C6 44 24 ?? 01 C6 44 24 ?? 00 C6 44 24 ?? 01 8B 40 ?? 8B 08 8B 84 24 ?? ?? ?? ?? 50 8B 11 89 54 24 ?? FF 15 ?? ?? ?? ?? 8B 96 ?? ?? ?? ?? 6A 00 8D 4C 24 ?? 6A 0A 51 52 66 89 44 24 ?? FF D5 B9 96 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? 8D 54 24 ?? F3 AB 8B 86 ?? ?? ?? ?? 8D 4C 24 ?? 51 6A 00 6A 00 52 6A 00 89 44 24 ?? C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 7F ?? 8B 86 ?? ?? ?? ?? 50 EB ?? 8B 96 ?? ?? ?? ?? 6A 00 8D 8C 24 ?? ?? ?? ?? 68 58 02 00 00 51 52 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 75 ?? 8A 84 24 ?? ?? ?? ?? 84 C0 75 ?? 5F 5E 5D B0 01 5B 81 C4 7C 04 00 00 C2 08 00 8B 8E ?? ?? ?? ?? 51 FF 15 ?? ?? ?? ?? 5F 5E 5D 32 C0 5B 81 C4 7C 04 00 00 C2 08 00", + ), + # Test match-2-yar x64 EXE - Basic Block Extraction + pytest.param( + get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_"), + False, + "50580ef0b882905316c4569162ea07d9", + 0x14000109F, + capa.rules.BASIC_BLOCK_SCOPE, + "33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 40 FF 15 4A 0F 00 00 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 65 0D 00 00 48 8D 0D 7F 11 00 00 C7 44 24 20 20 00 00 00 C7 44 24 24 01 00 00 00 48 C7 44 24 28 00 00 00 00 48 89 5C 24 30 48 C7 44 24 38 00 00 00 00 FF 15 0A 0F 00 00 4C 8D 44 24 20 48 8D 15 46 11 00 00 48 8D 0D 77 11 00 00 FF 15 F9 0E 00 00 33 C0 48 8B 4C 24 40 48 33 CC E8 2A 00 00 00 48 8B 5C 24 60 48 83 C4 50 5F C3", + "33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 ?? FF 15 ?? ?? ?? 
?? 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? C7 44 24 ?? 20 00 00 00 C7 44 24 ?? 01 00 00 00 48 C7 44 24 ?? 00 00 00 00 48 89 5C 24 ?? 48 C7 44 24 ?? 00 00 00 00 FF 15 ?? ?? ?? ?? 4C 8D 44 24 ?? 48 8D 15 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? FF 15 ?? ?? ?? ?? 33 C0 48 8B 4C 24 ?? 48 33 CC E8 ?? ?? ?? ?? 48 8B 5C 24 ?? 48 83 C4 50 5F C3", + ), + # Test match-2-yar x64 EXE - Function Extraction + pytest.param( + get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_"), + False, + "50580ef0b882905316c4569162ea07d9", + 0x140001010, + capa.rules.FUNCTION_SCOPE, + "48 89 5C 24 08 57 48 83 EC 50 48 8B 05 DF 1F 00 00 48 33 C4 48 89 44 24 40 66 0F 6F 15 8F 12 00 00 48 8D 3D 08 20 00 00 33 C9 B8 00 03 00 00 90 F3 0F 6F 04 39 66 0F EF C2 F3 0F 7F 04 39 F3 0F 6F 4C 39 10 66 0F EF CA F3 0F 7F 4C 39 10 F3 0F 6F 44 39 20 66 0F EF C2 F3 0F 7F 44 39 20 F3 0F 6F 44 39 30 66 0F EF C2 F3 0F 7F 44 39 30 48 83 C1 40 48 3B C8 7C B9 66 0F 1F 84 00 00 00 00 00 80 34 38 62 48 FF C0 48 3D 1F 03 00 00 7C F1 33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 40 FF 15 4A 0F 00 00 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 65 0D 00 00 48 8D 0D 7F 11 00 00 C7 44 24 20 20 00 00 00 C7 44 24 24 01 00 00 00 48 C7 44 24 28 00 00 00 00 48 89 5C 24 30 48 C7 44 24 38 00 00 00 00 FF 15 0A 0F 00 00 4C 8D 44 24 20 48 8D 15 46 11 00 00 48 8D 0D 77 11 00 00 FF 15 F9 0E 00 00 33 C0 48 8B 4C 24 40 48 33 CC E8 2A 00 00 00 48 8B 5C 24 60 48 83 C4 50 5F C3", + "48 89 5C 24 ?? 57 48 83 EC 50 48 8B 05 ?? ?? ?? ?? 48 33 C4 48 89 44 24 ?? 66 0F 6F 15 ?? ?? 00 00 48 8D 3D ?? ?? ?? ?? 33 C9 B8 00 03 00 00 90 F3 0F 6F 04 39 66 0F EF C2 F3 0F 7F 04 39 F3 0F 6F 4C 39 ?? 66 0F EF CA F3 0F 7F 4C 39 ?? F3 0F 6F 44 39 ?? 66 0F EF C2 F3 0F 7F 44 39 ?? F3 0F 6F 44 39 ?? 66 0F EF C2 F3 0F 7F 44 39 ?? 48 83 C1 40 48 3B C8 7C ?? 66 0F 1F 84 00 ?? ?? 00 00 80 34 38 62 48 FF C0 48 3D 1F 03 00 00 7C ?? 33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 ?? FF 15 ?? ?? ?? ?? 
41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? C7 44 24 ?? 20 00 00 00 C7 44 24 ?? 01 00 00 00 48 C7 44 24 ?? 00 00 00 00 48 89 5C 24 ?? 48 C7 44 24 ?? 00 00 00 00 FF 15 ?? ?? ?? ?? 4C 8D 44 24 ?? 48 8D 15 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? FF 15 ?? ?? ?? ?? 33 C0 48 8B 4C 24 ?? 48 33 CC E8 ?? ?? ?? ?? 48 8B 5C 24 ?? 48 83 C4 50 5F C3", + ), + # Test match-2-yar .NET EXE - Function Extraction + pytest.param( + get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), + True, + "1c444ebeba24dcba8628b7dfe5fec7c6", + 0x06000073, + capa.rules.FUNCTION_SCOPE, + "03 28 7D 00 00 06 0A 12 01 FE 15 0A 00 00 02 03 12 01 28 7F 00 00 06 26 12 01 7B 7B 00 00 04 12 01 7B 79 00 00 04 59 0C 12 01 7B 7C 00 00 04 12 01 7B 7A 00 00 04 59 0D 06 28 77 00 00 06 13 04 06 08 09 28 76 00 00 06 13 05 11 04 11 05 28 7A 00 00 06 13 06 11 04 16 16 08 09 06 16 16 20 20 00 CC 00 28 75 00 00 06 26 11 04 11 06 28 7A 00 00 06 26 11 04 28 78 00 00 06 26 03 06 28 7E 00 00 06 26 11 05 28 65 00 00 0A 13 07 11 05 28 79 00 00 06 26 11 07 2A", + "03 28 ?? ?? ?? ?? 0A 12 ?? FE 15 ?? ?? ?? ?? 03 12 ?? 28 ?? ?? ?? ?? 26 12 ?? 7B ?? ?? ?? ?? 12 ?? 7B ?? ?? ?? ?? 59 0C 12 ?? 7B ?? ?? ?? ?? 12 ?? 7B ?? ?? ?? ?? 59 0D 06 28 ?? ?? ?? ?? 13 ?? 06 08 09 28 ?? ?? ?? ?? 13 ?? 11 ?? 11 ?? 28 ?? ?? ?? ?? 13 ?? 11 ?? 16 16 08 09 06 16 16 20 ?? ?? ?? ?? 28 ?? ?? ?? ?? 26 11 ?? 11 ?? 28 ?? ?? ?? ?? 26 11 ?? 28 ?? ?? ?? ?? 26 03 06 28 ?? ?? ?? ?? 26 11 ?? 28 ?? ?? ?? ?? 13 ?? 11 ?? 28 ?? ?? ?? ?? 26 11 ?? 
2A", + ), + ], +) +def test_match2yar_feature_extraction(path, is_dotnet, filemd5, addr, scope, expected_bytestring, expected_sig): + """Test extracting and masking bytes based on matches using match-2-yar script""" + output = get_match_2_yar_features(path, is_dotnet) + + output = output.decode("utf8") + output_data = json.loads(output) + + # Get data for filemd5: + file_features = output_data[filemd5] + + # Filter for addr with correct scope + addr_features = [x for x in file_features if x["addr"] == addr and x["scope"] == scope] + + # This should be unique + assert len(addr_features) == 1 + + # Check extraction and masking + assert addr_features[0]["bytez"] == expected_bytestring + assert addr_features[0]["sig"] == expected_sig