From a1db2b960b9865272e54c6bd9e0c9c8aae271c62 Mon Sep 17 00:00:00 2001 From: jconnor0426 Date: Tue, 22 Aug 2023 17:24:58 -0400 Subject: [PATCH] Add test to validate match-2-yar feature extraction --- scripts/match-2-yar.py | 48 ++++++++++++++++++----- tests/test_scripts.py | 88 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 9 deletions(-) diff --git a/scripts/match-2-yar.py b/scripts/match-2-yar.py index 08a095e8f6..17d02f646d 100644 --- a/scripts/match-2-yar.py +++ b/scripts/match-2-yar.py @@ -30,12 +30,14 @@ """ import os import sys +import json import logging import argparse +import binascii import collections import multiprocessing import multiprocessing.pool -from typing import Set, Dict, List +from typing import Set, Dict, List, Union from pathlib import Path from datetime import date @@ -323,7 +325,7 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): comment = "" sig = "" - func_bytes = "" + func_bytes = b"" for insn in body.instructions: comment += ( "{:04X}".format(insn.offset) @@ -335,11 +337,11 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): ) sig += insn.get_opcode_bytes().hex() - func_bytes += insn.get_opcode_bytes().hex() + func_bytes += insn.get_opcode_bytes() if insn.operand: sig += "??" * len(insn.get_operand_bytes()) - func_bytes += insn.get_operand_bytes().hex() + func_bytes += insn.get_operand_bytes() # Format the sig to be in the same style as the vivi portion (bytes seperated by spaces) formatted_sig = "" @@ -357,11 +359,25 @@ def get_sig_and_mask_for_dotnet_func(dnpe, body): class CodeFeature: """Basic object that that will be used to create yara rules""" - def __init__(self, sig: str, comment: str, bytez: bytes, filemd5: str): - self.sig = sig + def __init__( + self, sig: str, comment: str, bytez: bytes, filemd5: str, addr: Union[int, tuple[int, int], None], scope: str + ): + self.sig = sig.strip().upper() self.comment = comment self.bytez = bytez + self.addr = addr self.filemd5 = filemd5 + self.scope = scope + + def json(self): + return { + "sig": self.sig, + "comment": self.comment, + "bytez": binascii.hexlify(self.bytez, " ", bytes_per_sep=1).decode("utf8").upper(), + "addr": self.addr, + "filemd5": self.filemd5, + "scope": self.scope, + } def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): @@ -411,7 +427,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): bytez = get_cb_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) - code_features.append(CodeFeature(sig, comment, bytez, filemd5)) + code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.BASIC_BLOCK_SCOPE)) for addr, rules in func_matches.items(): comment = f"function at 0x{addr:08x}@{filemd5} with {len(rules)} features:\n" @@ -421,7 +437,7 @@ def get_code_features_for_capa_doc(doc: rd.ResultDocument, extractor): bytez = get_function_bytes(file_vw, addr) sig = genSigAndMask(addr, bytez, doc.meta.analysis.arch) - code_features.append(CodeFeature(sig, comment, bytez, filemd5)) + code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.FUNCTION_SCOPE)) if len(code_features) == 0: logger.warning("No code features found for %s", filemd5) @@ -479,7 +495,7 @@ def get_code_features_for_dotnet_doc(doc: rd.ResultDocument, extractor): func_comment, sig, bytez = get_sig_and_mask_for_dotnet_func(dnpe, f.inner) comment += func_comment - code_features.append(CodeFeature(sig, comment, bytez, filemd5)) + code_features.append(CodeFeature(sig, comment, bytez, filemd5, addr, capa.rules.FUNCTION_SCOPE)) if len(code_features) == 0: logger.warning("No code features found for %s", filemd5) @@ -596,6 +612,7 @@ def multi_process_capa(argv=None): parser.add_argument("input", type=str, nargs="+", help="Path to directory or files to analyze") parser.add_argument("-n", "--parallelism", type=int, default=multiprocessing.cpu_count(), help="parallelism factor") parser.add_argument("--no-mp", action="store_true", help="disable subprocesses") + parser.add_argument("--dump-features", action="store_true", help="output feature dictionary as json") args = parser.parse_args(args=argv) capa.main.handle_common_args(args) @@ -665,9 +682,22 @@ def map(f, args, parallelism=None): logger.info("Done processing %s samples", len(samples)) + if args.dump_features: + dump_file_features(results) + sys.exit(0) + return results +# Output related functions + + +def dump_file_features(result_dict: dict): + """Print out bytes for the code features extracted""" + output_dict = {filemd5: [x.json() for x in features] for filemd5, features in result_dict.items()} + print(json.dumps(output_dict, indent=4)) + + # YARA related functions CODE_FEATURES_REFERENCED: List[CodeFeature] = [] diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 4ef8ce1b28..1dca780f4a 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -7,14 +7,18 @@ # See the License for the specific language governing permissions and limitations under the License. import sys +import json import logging import textwrap import subprocess from pathlib import Path from datetime import date +from functools import lru_cache import pytest +import capa.rules + logger = logging.getLogger(__name__) CD = Path(__file__).resolve().parent @@ -153,6 +157,16 @@ def run_program(script_path, args): return subprocess.run(args, stdout=subprocess.PIPE) +@lru_cache(maxsize=1) +def get_match_2_yar_features(path, is_dotnet): + script_path = get_script_path("match-2-yar.py") + args = ["--dump-features", path] + if is_dotnet: + args.extend(["-f", "dotnet"]) + p = run_program(script_path, args) + return p.stdout + + def test_proto_conversion(tmp_path): t = tmp_path / "proto-test" t.mkdir() @@ -283,3 +297,77 @@ def test_detect_duplicate_features(tmpdir): # Use importlib to import the script # Use fixtures vivisect to get a vivisect workspace for a given path # We can use known functions from the yara matches to extract out length, bytes, and masked sig +@pytest.mark.parametrize( + "path,is_dotnet,filemd5,addr,scope,expected_bytestring,expected_sig", + [ + # Test match-2-yar x86 EXE - Basic Block Extraction + pytest.param( + get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_"), + False, + "9324d1a8ae37a36ae560c37448c9705a", + 0x004031A0, + capa.rules.BASIC_BLOCK_SCOPE, + "83 EC 10 B0 6C 8B 15 24 A0 40 00 88 44 24 01 88 44 24 02 B0 6F 8D 4C 24 00 88 44 24 04 88 44 24 0B 8B 44 24 14 C6 44 24 00 44 50 51 52 6A 00 C6 44 24 13 53 C6 44 24 15 72 C6 44 24 16 74 C6 44 24 17 57 C6 44 24 18 69 C6 44 24 19 6E C6 44 24 1A 64 C6 44 24 1C 77 C6 44 24 1D 00 E8 EF F7 FF FF A3 C4 A9 40 00 33 C0 83 C4 20 C2 04 00", + "83 EC 10 B0 6C 8B 15 ?? ?? ?? ?? 88 44 24 ?? 88 44 24 ?? B0 6F 8D 4C 24 ?? 88 44 24 ?? 88 44 24 ?? 8B 44 24 ?? C6 44 24 ?? 44 50 51 52 6A 00 C6 44 24 ?? 53 C6 44 24 ?? 72 C6 44 24 ?? 74 C6 44 24 ?? 57 C6 44 24 ?? 69 C6 44 24 ?? 6E C6 44 24 ?? 64 C6 44 24 ?? 77 C6 44 24 ?? 00 E8 ?? ?? ?? ?? A3 ?? ?? ?? ?? 33 C0 83 C4 20 C2 04 00", + ), + # Test match-2-yar x86 EXE - Function Extraction + pytest.param( + get_data_path("9324d1a8ae37a36ae560c37448c9705a.exe_"), + False, + "9324d1a8ae37a36ae560c37448c9705a", + 0x004019C0, + capa.rules.FUNCTION_SCOPE, + "81 EC 7C 04 00 00 53 55 8B 2D 14 92 40 00 56 8B F1 57 6A 00 8D 44 24 14 8B 8E A8 00 00 00 6A 04 B3 02 50 51 C7 44 24 28 03 00 00 00 C7 44 24 2C 00 00 00 00 C6 44 24 20 05 88 5C 24 21 C6 44 24 22 00 88 5C 24 23 FF D5 B9 96 00 00 00 33 C0 8D BC 24 34 02 00 00 8B 96 A8 00 00 00 F3 AB 8D 44 24 18 8D 4C 24 2C 50 6A 00 6A 00 51 6A 00 89 54 24 44 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 7F 0C 8B 96 A8 00 00 00 52 E9 5D 02 00 00 8B 8E A8 00 00 00 6A 00 8D 84 24 38 02 00 00 68 58 02 00 00 50 51 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 0F 85 2C 02 00 00 8A 84 24 35 02 00 00 84 C0 74 0A 3A C3 0F 85 19 02 00 00 EB 08 3A C3 0F 85 30 01 00 00 8B 0D D4 AA 40 00 68 A0 A5 40 00 E8 49 2C 00 00 85 C0 0F 86 18 01 00 00 8B 0D D4 AA 40 00 68 A0 A5 40 00 E8 31 2C 00 00 8B 0D D4 AA 40 00 68 A0 A6 40 00 8B D8 E8 1F 2C 00 00 89 44 24 14 B9 40 00 00 00 33 C0 8D BC 24 30 01 00 00 F3 AB 66 AB 8B 3D 94 90 40 00 8D 94 24 32 01 00 00 68 A0 A5 40 00 52 C6 84 24 38 01 00 00 05 88 9C 24 39 01 00 00 FF D7 8D 44 24 14 6A 04 8D 8C 1C 36 01 00 00 50 51 8B 0D D4 AA 40 00 E8 3B 2A 00 00 8D 94 1C 33 01 00 00 68 A0 A6 40 00 52 FF D7 8B 44 24 14 6A 00 8D 94 24 34 01 00 00 8D 4C 18 03 8B 86 A8 00 00 00 51 52 50 FF D5 8D 54 24 18 33 C0 B9 96 00 00 00 8D BC 24 34 02 00 00 52 50 F3 AB 8B 8E A8 00 00 00 50 8D 44 24 38 89 4C 24 3C 50 6A 00 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 0F 8E 18 01 00 00 8B 86 A8 00 00 00 6A 00 8D 94 24 38 02 00 00 68 58 02 00 00 52 50 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 0F 85 EE 00 00 00 8A 84 24 35 02 00 00 84 C0 0F 85 DF 00 00 00 8B 94 24 90 04 00 00 52 FF 15 FC 91 40 00 85 C0 0F 84 D6 00 00 00 C6 44 24 20 05 C6 44 24 21 01 C6 44 24 22 00 C6 44 24 23 01 8B 40 0C 8B 08 8B 84 24 94 04 00 00 50 8B 11 89 54 24 28 FF 15 08 92 40 00 8B 96 A8 00 00 00 6A 00 8D 4C 24 24 6A 0A 51 52 66 89 44 24 38 FF D5 B9 96 00 00 00 33 C0 8D BC 24 34 02 00 00 8D 54 24 2C F3 AB 8B 86 A8 00 00 00 8D 4C 24 18 51 6A 00 6A 00 52 6A 00 89 44 24 44 C7 44 24 40 01 00 00 00 FF 15 10 92 40 00 85 C0 7F 09 8B 86 A8 00 00 00 50 EB 47 8B 96 A8 00 00 00 6A 00 8D 8C 24 38 02 00 00 68 58 02 00 00 51 52 FF 15 0C 92 40 00 80 BC 24 34 02 00 00 05 75 D1 8A 84 24 35 02 00 00 84 C0 75 C6 5F 5E 5D B0 01 5B 81 C4 7C 04 00 00 C2 08 00 8B 8E A8 00 00 00 51 FF 15 04 92 40 00 5F 5E 5D 32 C0 5B 81 C4 7C 04 00 00 C2 08 00", + "81 EC 7C 04 00 00 53 55 8B 2D ?? ?? ?? ?? 56 8B F1 57 6A 00 8D 44 24 ?? 8B 8E ?? ?? ?? ?? 6A 04 B3 02 50 51 C7 44 24 ?? 03 00 00 00 C7 44 24 ?? 00 00 00 00 C6 44 24 ?? 05 88 5C 24 ?? C6 44 24 ?? 00 88 5C 24 ?? FF D5 B9 96 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? 8B 96 ?? ?? ?? ?? F3 AB 8D 44 24 ?? 8D 4C 24 ?? 50 6A 00 6A 00 51 6A 00 89 54 24 ?? C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 7F ?? 8B 96 ?? ?? ?? ?? 52 E9 ?? ?? ?? ?? 8B 8E ?? ?? ?? ?? 6A 00 8D 84 24 ?? ?? ?? ?? 68 58 02 00 00 50 51 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 0F 85 ?? ?? ?? ?? 8A 84 24 ?? ?? ?? ?? 84 C0 74 ?? 3A C3 0F 85 ?? ?? ?? ?? EB ?? 3A C3 0F 85 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A5 40 00 E8 ?? ?? ?? ?? 85 C0 0F 86 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A5 40 00 E8 ?? ?? ?? ?? 8B 0D ?? ?? ?? ?? 68 A0 A6 40 00 8B D8 E8 ?? ?? ?? ?? 89 44 24 ?? B9 40 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? F3 AB 66 AB 8B 3D ?? ?? ?? ?? 8D 94 24 ?? ?? ?? ?? 68 A0 A5 40 00 52 C6 84 24 ?? ?? ?? ?? 05 88 9C 24 ?? ?? ?? ?? FF D7 8D 44 24 ?? 6A 04 8D 8C 1C ?? ?? ?? ?? 50 51 8B 0D ?? ?? ?? ?? E8 ?? ?? ?? ?? 8D 94 1C ?? ?? ?? ?? 68 A0 A6 40 00 52 FF D7 8B 44 24 ?? 6A 00 8D 94 24 ?? ?? ?? ?? 8D 4C 18 ?? 8B 86 ?? ?? ?? ?? 51 52 50 FF D5 8D 54 24 ?? 33 C0 B9 96 00 00 00 8D BC 24 ?? ?? ?? ?? 52 50 F3 AB 8B 8E ?? ?? ?? ?? 50 8D 44 24 ?? 89 4C 24 ?? 50 6A 00 C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 0F 8E ?? ?? ?? ?? 8B 86 ?? ?? ?? ?? 6A 00 8D 94 24 ?? ?? ?? ?? 68 58 02 00 00 52 50 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 0F 85 ?? ?? ?? ?? 8A 84 24 ?? ?? ?? ?? 84 C0 0F 85 ?? ?? ?? ?? 8B 94 24 ?? ?? ?? ?? 52 FF 15 ?? ?? ?? ?? 85 C0 0F 84 ?? ?? ?? ?? C6 44 24 ?? 05 C6 44 24 ?? 01 C6 44 24 ?? 00 C6 44 24 ?? 01 8B 40 ?? 8B 08 8B 84 24 ?? ?? ?? ?? 50 8B 11 89 54 24 ?? FF 15 ?? ?? ?? ?? 8B 96 ?? ?? ?? ?? 6A 00 8D 4C 24 ?? 6A 0A 51 52 66 89 44 24 ?? FF D5 B9 96 00 00 00 33 C0 8D BC 24 ?? ?? ?? ?? 8D 54 24 ?? F3 AB 8B 86 ?? ?? ?? ?? 8D 4C 24 ?? 51 6A 00 6A 00 52 6A 00 89 44 24 ?? C7 44 24 ?? 01 00 00 00 FF 15 ?? ?? ?? ?? 85 C0 7F ?? 8B 86 ?? ?? ?? ?? 50 EB ?? 8B 96 ?? ?? ?? ?? 6A 00 8D 8C 24 ?? ?? ?? ?? 68 58 02 00 00 51 52 FF 15 ?? ?? ?? ?? 80 BC 24 ?? ?? ?? ?? 05 75 ?? 8A 84 24 ?? ?? ?? ?? 84 C0 75 ?? 5F 5E 5D B0 01 5B 81 C4 7C 04 00 00 C2 08 00 8B 8E ?? ?? ?? ?? 51 FF 15 ?? ?? ?? ?? 5F 5E 5D 32 C0 5B 81 C4 7C 04 00 00 C2 08 00", + ), + # Test match-2-yar x64 EXE - Basic Block Extraction + pytest.param( + get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_"), + False, + "50580ef0b882905316c4569162ea07d9", + 0x14000109F, + capa.rules.BASIC_BLOCK_SCOPE, + "33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 40 FF 15 4A 0F 00 00 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 65 0D 00 00 48 8D 0D 7F 11 00 00 C7 44 24 20 20 00 00 00 C7 44 24 24 01 00 00 00 48 C7 44 24 28 00 00 00 00 48 89 5C 24 30 48 C7 44 24 38 00 00 00 00 FF 15 0A 0F 00 00 4C 8D 44 24 20 48 8D 15 46 11 00 00 48 8D 0D 77 11 00 00 FF 15 F9 0E 00 00 33 C0 48 8B 4C 24 40 48 33 CC E8 2A 00 00 00 48 8B 5C 24 60 48 83 C4 50 5F C3", + "33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 ?? FF 15 ?? ?? ?? ?? 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? C7 44 24 ?? 20 00 00 00 C7 44 24 ?? 01 00 00 00 48 C7 44 24 ?? 00 00 00 00 48 89 5C 24 ?? 48 C7 44 24 ?? 00 00 00 00 FF 15 ?? ?? ?? ?? 4C 8D 44 24 ?? 48 8D 15 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? FF 15 ?? ?? ?? ?? 33 C0 48 8B 4C 24 ?? 48 33 CC E8 ?? ?? ?? ?? 48 8B 5C 24 ?? 48 83 C4 50 5F C3", + ), + # Test match-2-yar x64 EXE - Function Extraction + pytest.param( + get_data_path("c2bb17c12975ea61ff43a71afd9c3ff111d018af161859abae0bdb0b3dae98f9.exe_"), + False, + "50580ef0b882905316c4569162ea07d9", + 0x140001010, + capa.rules.FUNCTION_SCOPE, + "48 89 5C 24 08 57 48 83 EC 50 48 8B 05 DF 1F 00 00 48 33 C4 48 89 44 24 40 66 0F 6F 15 8F 12 00 00 48 8D 3D 08 20 00 00 33 C9 B8 00 03 00 00 90 F3 0F 6F 04 39 66 0F EF C2 F3 0F 7F 04 39 F3 0F 6F 4C 39 10 66 0F EF CA F3 0F 7F 4C 39 10 F3 0F 6F 44 39 20 66 0F EF C2 F3 0F 7F 44 39 20 F3 0F 6F 44 39 30 66 0F EF C2 F3 0F 7F 44 39 30 48 83 C1 40 48 3B C8 7C B9 66 0F 1F 84 00 00 00 00 00 80 34 38 62 48 FF C0 48 3D 1F 03 00 00 7C F1 33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 40 FF 15 4A 0F 00 00 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 65 0D 00 00 48 8D 0D 7F 11 00 00 C7 44 24 20 20 00 00 00 C7 44 24 24 01 00 00 00 48 C7 44 24 28 00 00 00 00 48 89 5C 24 30 48 C7 44 24 38 00 00 00 00 FF 15 0A 0F 00 00 4C 8D 44 24 20 48 8D 15 46 11 00 00 48 8D 0D 77 11 00 00 FF 15 F9 0E 00 00 33 C0 48 8B 4C 24 40 48 33 CC E8 2A 00 00 00 48 8B 5C 24 60 48 83 C4 50 5F C3", + "48 89 5C 24 ?? 57 48 83 EC 50 48 8B 05 ?? ?? ?? ?? 48 33 C4 48 89 44 24 ?? 66 0F 6F 15 ?? ?? 00 00 48 8D 3D ?? ?? ?? ?? 33 C9 B8 00 03 00 00 90 F3 0F 6F 04 39 66 0F EF C2 F3 0F 7F 04 39 F3 0F 6F 4C 39 ?? 66 0F EF CA F3 0F 7F 4C 39 ?? F3 0F 6F 44 39 ?? 66 0F EF C2 F3 0F 7F 44 39 ?? F3 0F 6F 44 39 ?? 66 0F EF C2 F3 0F 7F 44 39 ?? 48 83 C1 40 48 3B C8 7C ?? 66 0F 1F 84 00 ?? ?? 00 00 80 34 38 62 48 FF C0 48 3D 1F 03 00 00 7C ?? 33 C9 BA 1F 03 00 00 41 B8 00 10 00 00 44 8D 49 ?? FF 15 ?? ?? ?? ?? 41 B8 1F 03 00 00 48 8B D7 48 8B C8 48 8B D8 E8 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? C7 44 24 ?? 20 00 00 00 C7 44 24 ?? 01 00 00 00 48 C7 44 24 ?? 00 00 00 00 48 89 5C 24 ?? 48 C7 44 24 ?? 00 00 00 00 FF 15 ?? ?? ?? ?? 4C 8D 44 24 ?? 48 8D 15 ?? ?? ?? ?? 48 8D 0D ?? ?? ?? ?? FF 15 ?? ?? ?? ?? 33 C0 48 8B 4C 24 ?? 48 33 CC E8 ?? ?? ?? ?? 48 8B 5C 24 ?? 48 83 C4 50 5F C3", + ), + # Test match-2-yar .NET EXE - Function Extraction + pytest.param( + get_data_path("dotnet/1c444ebeba24dcba8628b7dfe5fec7c6.exe_"), + True, + "1c444ebeba24dcba8628b7dfe5fec7c6", + 0x06000073, + capa.rules.FUNCTION_SCOPE, + "03 28 7D 00 00 06 0A 12 01 FE 15 0A 00 00 02 03 12 01 28 7F 00 00 06 26 12 01 7B 7B 00 00 04 12 01 7B 79 00 00 04 59 0C 12 01 7B 7C 00 00 04 12 01 7B 7A 00 00 04 59 0D 06 28 77 00 00 06 13 04 06 08 09 28 76 00 00 06 13 05 11 04 11 05 28 7A 00 00 06 13 06 11 04 16 16 08 09 06 16 16 20 20 00 CC 00 28 75 00 00 06 26 11 04 11 06 28 7A 00 00 06 26 11 04 28 78 00 00 06 26 03 06 28 7E 00 00 06 26 11 05 28 65 00 00 0A 13 07 11 05 28 79 00 00 06 26 11 07 2A", + "03 28 ?? ?? ?? ?? 0A 12 ?? FE 15 ?? ?? ?? ?? 03 12 ?? 28 ?? ?? ?? ?? 26 12 ?? 7B ?? ?? ?? ?? 12 ?? 7B ?? ?? ?? ?? 59 0C 12 ?? 7B ?? ?? ?? ?? 12 ?? 7B ?? ?? ?? ?? 59 0D 06 28 ?? ?? ?? ?? 13 ?? 06 08 09 28 ?? ?? ?? ?? 13 ?? 11 ?? 11 ?? 28 ?? ?? ?? ?? 13 ?? 11 ?? 16 16 08 09 06 16 16 20 ?? ?? ?? ?? 28 ?? ?? ?? ?? 26 11 ?? 11 ?? 28 ?? ?? ?? ?? 26 11 ?? 28 ?? ?? ?? ?? 26 03 06 28 ?? ?? ?? ?? 26 11 ?? 28 ?? ?? ?? ?? 13 ?? 11 ?? 28 ?? ?? ?? ?? 26 11 ?? 2A", + ), + ], +) +def test_match2yar_feature_extraction(path, is_dotnet, filemd5, addr, scope, expected_bytestring, expected_sig): + """Test extracting a function byte string using vivisect workspaces""" + output = get_match_2_yar_features(path, is_dotnet) + + output = output.decode("utf8") + output_data = json.loads(output) + + # Get data for filemd5: + file_features = output_data[filemd5] + + # Filter for addr with correct scope + addr_features = [x for x in file_features if x["addr"] == addr and x["scope"] == scope] + + # This should be unique + assert len(addr_features) == 1 + + # Check extraction and masking + assert addr_features[0]["bytez"] == expected_bytestring + assert addr_features[0]["sig"] == expected_sig