From 38c6623e25860bcd8a639846998a2cdccaef1065 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 01:47:22 +0100 Subject: [PATCH 01/44] initial commit --- capa/capabilities/common.py | 10 ++++++--- capa/capabilities/dynamic.py | 16 +++++++++++++- capa/capabilities/static.py | 14 +++++++++++- capa/exceptions.py | 12 +++++++++++ capa/helpers.py | 9 ++++++++ capa/main.py | 42 +++++++++++++++++++++++++++++++++--- 6 files changed, 95 insertions(+), 8 deletions(-) diff --git a/capa/capabilities/common.py b/capa/capabilities/common.py index a73f40afe..abeba9e01 100644 --- a/capa/capabilities/common.py +++ b/capa/capabilities/common.py @@ -63,7 +63,7 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon def find_capabilities( - ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None, **kwargs + ruleset: RuleSet, extractor: FeatureExtractor, target_elements=None, disable_progress=None, **kwargs ) -> Tuple[MatchResults, Any]: from capa.capabilities.static import find_static_capabilities from capa.capabilities.dynamic import find_dynamic_capabilities @@ -72,8 +72,12 @@ def find_capabilities( # for the time being, extractors are either static or dynamic. 
# Remove this assertion once that has changed assert not isinstance(extractor, DynamicFeatureExtractor) - return find_static_capabilities(ruleset, extractor, disable_progress=disable_progress, **kwargs) + return find_static_capabilities( + ruleset, extractor, target_functions=target_elements, disable_progress=disable_progress, **kwargs + ) if isinstance(extractor, DynamicFeatureExtractor): - return find_dynamic_capabilities(ruleset, extractor, disable_progress=disable_progress, **kwargs) + return find_dynamic_capabilities( + ruleset, extractor, target_processes=target_elements, disable_progress=disable_progress, **kwargs + ) raise ValueError(f"unexpected extractor type: {extractor.__class__.__name__}") diff --git a/capa/capabilities/dynamic.py b/capa/capabilities/dynamic.py index 98858c581..10e4396ef 100644 --- a/capa/capabilities/dynamic.py +++ b/capa/capabilities/dynamic.py @@ -20,6 +20,7 @@ from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import redirecting_print_to_tqdm +from capa.exceptions import NonExistantProcessError from capa.capabilities.common import find_file_capabilities from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor @@ -130,12 +131,15 @@ def find_process_capabilities( def find_dynamic_capabilities( - ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None + ruleset: RuleSet, extractor: DynamicFeatureExtractor, target_processes=None, disable_progress=None ) -> Tuple[MatchResults, Any]: all_process_matches: MatchResults = collections.defaultdict(list) all_thread_matches: MatchResults = collections.defaultdict(list) all_call_matches: MatchResults = collections.defaultdict(list) + if target_processes is None: + target_processes = set() + feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=()) assert isinstance(extractor, DynamicFeatureExtractor) @@ -155,6 +159,16 @@ def pbar(s, *args, **kwargs): 
processes = list(extractor.get_processes()) + if target_processes: + # analyze only the processes that the user required. + # if none were provided, analyze all processes. + processes_set = set(map(lambda h: h.address.pid, processes)) + if not (target_processes <= processes_set): + raise NonExistantProcessError( + f"The following process ids were not found in the report: {target_processes - processes_set}" + ) + processes = list(filter(lambda h: h.address.pid in target_processes, processes)) + pb = pbar(processes, desc="matching", unit=" processes", leave=False) for p in pb: process_matches, thread_matches, call_matches, feature_count = find_process_capabilities( diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index 4f3b3b6a1..5da612db3 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -21,6 +21,7 @@ from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import redirecting_print_to_tqdm +from capa.exceptions import NonExistantFunctionError from capa.capabilities.common import find_file_capabilities from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor @@ -133,7 +134,7 @@ def find_code_capabilities( def find_static_capabilities( - ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None + ruleset: RuleSet, extractor: StaticFeatureExtractor, target_functions=None, disable_progress=None ) -> Tuple[MatchResults, Any]: all_function_matches: MatchResults = collections.defaultdict(list) all_bb_matches: MatchResults = collections.defaultdict(list) @@ -163,6 +164,17 @@ def pbar(s, *args, **kwargs): return s functions = list(extractor.get_functions()) + + if target_functions: + # analyze only the functions that the user required. + # if none were provided, analyze all functions. 
+ functions_set = set(map(lambda h: h.address, functions)) + if not target_functions <= functions_set: + raise NonExistantFunctionError( + f"The following function addresses were not found in the sample: {target_functions - functions_set}" + ) + functions = list(filter(lambda h: h.address in target_functions, functions)) + n_funcs = len(functions) pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions", leave=False) diff --git a/capa/exceptions.py b/capa/exceptions.py index 0c900d72c..882c07181 100644 --- a/capa/exceptions.py +++ b/capa/exceptions.py @@ -23,3 +23,15 @@ class UnsupportedOSError(ValueError): class EmptyReportError(ValueError): pass + + +class InvalidArgument(ValueError): + pass + + +class NonExistantFunctionError(ValueError): + pass + + +class NonExistantProcessError(ValueError): + pass diff --git a/capa/helpers.py b/capa/helpers.py index 77380c7ed..0e9887a86 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -46,6 +46,15 @@ def hex(n: int) -> str: return f"0x{(n):X}" +def str_to_number(s: str) -> int: + if s.isdecimal(): + return int(s) + try: + return int(s, 16) + except ValueError: + raise ValueError(f"{s} is not a valid number.") + + def get_file_taste(sample_path: Path) -> bytes: if not sample_path.exists(): raise IOError(f"sample path {sample_path} does not exist or cannot be accessed") diff --git a/capa/main.py b/capa/main.py index eb43769d2..c573e1cb9 100644 --- a/capa/main.py +++ b/capa/main.py @@ -17,7 +17,7 @@ import textwrap import contextlib from types import TracebackType -from typing import Any, Dict, List, Optional +from typing import Any, Set, Dict, List, Union, Optional from pathlib import Path import colorama @@ -44,6 +44,7 @@ from capa.engine import MatchResults from capa.loader import BACKEND_VIV, BACKEND_CAPE, BACKEND_BINJA, BACKEND_DOTNET, BACKEND_FREEZE, BACKEND_PEFILE from capa.helpers import ( + str_to_number, get_file_taste, get_auto_format, log_unsupported_os_error, @@ -53,6 +54,7 
@@ log_unsupported_cape_report_error, ) from capa.exceptions import ( + InvalidArgument, EmptyReportError, UnsupportedOSError, UnsupportedArchError, @@ -264,6 +266,22 @@ def install_common_args(parser, wanted=None): help=f"select backend, {backend_help}", ) + if "functions" in wanted: + parser.add_argument( + "--functions", + type=lambda s: s.replace(" ", "").split(","), + default=[], + help=f"provide a list of comma-separated functions to analyze (static analysis).", + ) + + if "processes" in wanted: + parser.add_argument( + "--processes", + type=lambda s: s.replace(" ", "").split(","), + default=[], + help=f"provide a list of comma-separaed processes to analyze (dynamic analysis).", + ) + if "os" in wanted: oses = [ (OS_AUTO, "detect OS automatically - default"), @@ -755,6 +773,19 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr raise ShouldExitError(E_INVALID_FILE_OS) from e +def get_target_elements_from_cli(args, file_extractor) -> Union[None, Set]: + if isinstance(file_extractor, StaticFeatureExtractor): + if args.processes: + raise InvalidArgument("Cannot provide process ids with static analysis.") + return set(map(str_to_number, args.functions)) + elif isinstance(file_extractor, DynamicFeatureExtractor): + if args.functions: + raise InvalidArgument("Cannot provide function addresses with dynamic analysis.") + return set(map(str_to_number, args.processes)) + else: + return ShouldExitError("Invalid file extractor is neither static nor dynamic.") + + def main(argv: Optional[List[str]] = None): if sys.version_info < (3, 8): raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.8+") @@ -794,7 +825,9 @@ def main(argv: Optional[List[str]] = None): parser = argparse.ArgumentParser( description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter ) - install_common_args(parser, {"input_file", "format", "backend", "os", "signatures", "rules", "tag"}) + install_common_args( + parser, 
{"input_file", "format", "backend", "os", "signatures", "rules", "tag", "functions", "processes"} + ) parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text") args = parser.parse_args(args=argv) @@ -805,6 +838,7 @@ def main(argv: Optional[List[str]] = None): rules = get_rules_from_cli(args) file_extractors = get_file_extractors_from_cli(args, input_format) found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) + target_elements = get_target_elements_from_cli(args, file_extractors[0]) except ShouldExitError as e: return e.status_code @@ -832,7 +866,9 @@ def main(argv: Optional[List[str]] = None): except ShouldExitError as e: return e.status_code - capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) + capabilities, counts = find_capabilities( + rules, extractor, target_elements=target_elements, disable_progress=args.quiet + ) meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) From 154afe1886e47f5b09916bdf976f3c8065748f0c Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 02:23:18 +0100 Subject: [PATCH 02/44] test_capabilities.py: add tests --- tests/test_capabilities.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/test_capabilities.py b/tests/test_capabilities.py index ddc7f6c3f..d2b23f031 100644 --- a/tests/test_capabilities.py +++ b/tests/test_capabilities.py @@ -174,6 +174,36 @@ def test_subscope_bb_rules(z9324d_extractor): assert "test rule" in capabilities +def test_match_specific_functions(z9324d_extractor): + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: receive data + scopes: + static: function + dynamic: call + examples: + - 9324d1a8ae37a36ae560c37448c9705a:0x401CD0 + features: + - or: + - api: recv 
+ """ + ) + ) + ] + ) + capabilities, meta = capa.capabilities.common.find_capabilities(rules, z9324d_extractor, target_elements={0x4019C0}) + matches = capabilities["receive data"] + # test that we received only one match + assert len(matches) == 1 + # and that this match is from the specified function + assert matches[0][0] == 0x4019C0 + + def test_byte_matching(z9324d_extractor): rules = capa.rules.RuleSet( [ From 1ae174bf72060cbb81b813f91368c3ea0b81bbda Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 02:24:53 +0100 Subject: [PATCH 03/44] CHANGELOG.md: update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fab4d23a5..c673f0a5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master (unreleased) ### New Features +- Add the ability to select which specific functions or processes to analyze @yelhamer ### Breaking Changes From acd69a31d13b5e96ad46f94a0a9de216917d054f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 02:31:24 +0100 Subject: [PATCH 04/44] usage.md: updated documentation --- doc/usage.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/usage.md b/doc/usage.md index 74b163f4a..58b35a722 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -9,6 +9,15 @@ Use the `-t` option to run rules with the given metadata value (see the rule fie For example, `capa -t william.ballenthin@mandiant.com` runs rules that reference Willi's email address (probably as the author), or `capa -t communication` runs rules with the namespace `communication`. +### only analyze selected functions +Use the `--functions` option to extract capabilities from only a selected set of functions. +For example, `capa sample.exe --functions 0x4019C0,0x401CD0` will only extract the capabilities in the functions found at +addresses 0x401CD0 and 0x4019C0. 
+ +### only analyze selected processes +Use the `--processes` option to extract capabilities from only a selected set of processes. +For example, `capa report.log --processes 3888,3214,4299` will extract capabilities only from the processes 3888, 3214, and 4299. + ### IDA Pro plugin: capa explorer Please check out the [capa explorer documentation](/capa/ida/plugin/README.md). @@ -16,4 +25,4 @@ Please check out the [capa explorer documentation](/capa/ida/plugin/README.md). Set the environment variable `CAPA_SAVE_WORKSPACE` to instruct the underlying analysis engine to cache its intermediate results to the file system. For example, vivisect will create `.viv` files. Subsequently, capa may run faster when reprocessing the same input file. -This is particularly useful during rule development as you repeatedly test a rule against a known sample. \ No newline at end of file +This is particularly useful during rule development as you repeatedly test a rule against a known sample. From 3aaae2e2be0788b7836136211396b1795dfb8ef7 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 02:55:44 +0100 Subject: [PATCH 05/44] main.py: use input_format instead of file_extractors to determine analysis flavor --- capa/main.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/capa/main.py b/capa/main.py index c573e1cb9..cfc7abdeb 100644 --- a/capa/main.py +++ b/capa/main.py @@ -75,6 +75,8 @@ FORMAT_DOTNET, FORMAT_FREEZE, FORMAT_RESULT, + STATIC_FORMATS, + DYNAMIC_FORMATS, ) from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor @@ -773,17 +775,17 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr raise ShouldExitError(E_INVALID_FILE_OS) from e -def get_target_elements_from_cli(args, file_extractor) -> Union[None, Set]: - if isinstance(file_extractor, 
StaticFeatureExtractor): +def get_target_elements_from_cli(args, input_format) -> Union[None, Set]: + if input_format in STATIC_FORMATS: if args.processes: raise InvalidArgument("Cannot provide process ids with static analysis.") return set(map(str_to_number, args.functions)) - elif isinstance(file_extractor, DynamicFeatureExtractor): + elif input_format in DYNAMIC_FORMATS: if args.functions: raise InvalidArgument("Cannot provide function addresses with dynamic analysis.") return set(map(str_to_number, args.processes)) else: - return ShouldExitError("Invalid file extractor is neither static nor dynamic.") + return ShouldExitError(f"format {input_format} is neither static nor dynamic.") def main(argv: Optional[List[str]] = None): @@ -838,7 +840,7 @@ def main(argv: Optional[List[str]] = None): rules = get_rules_from_cli(args) file_extractors = get_file_extractors_from_cli(args, input_format) found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) - target_elements = get_target_elements_from_cli(args, file_extractors[0]) + target_elements = get_target_elements_from_cli(args, input_format) except ShouldExitError as e: return e.status_code From f7c43e950f698d59a28a2c513dbf7da5bf49102a Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 02:58:08 +0100 Subject: [PATCH 06/44] fix linting --- capa/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index cfc7abdeb..ed98975e1 100644 --- a/capa/main.py +++ b/capa/main.py @@ -273,7 +273,7 @@ def install_common_args(parser, wanted=None): "--functions", type=lambda s: s.replace(" ", "").split(","), default=[], - help=f"provide a list of comma-separated functions to analyze (static analysis).", + help="provide a list of comma-separated functions to analyze (static analysis).", ) if "processes" in wanted: @@ -281,7 +281,7 @@ def install_common_args(parser, wanted=None): "--processes", type=lambda s: s.replace(" ", "").split(","), 
default=[], - help=f"provide a list of comma-separaed processes to analyze (dynamic analysis).", + help="provide a list of comma-separaed processes to analyze (dynamic analysis).", ) if "os" in wanted: From b7e345d83caf748088dbc9aaacb3dac59894aaea Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 03:09:35 +0100 Subject: [PATCH 07/44] apply flake8 suggestions --- capa/capabilities/dynamic.py | 2 +- capa/capabilities/static.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/capabilities/dynamic.py b/capa/capabilities/dynamic.py index 10e4396ef..952866c0f 100644 --- a/capa/capabilities/dynamic.py +++ b/capa/capabilities/dynamic.py @@ -162,7 +162,7 @@ def pbar(s, *args, **kwargs): if target_processes: # analyze only the processes that the user required. # if none were provided, analyze all processes. - processes_set = set(map(lambda h: h.address.pid, processes)) + processes_set = {ph.address.pid for ph in processes} if not (target_processes <= processes_set): raise NonExistantProcessError( f"The following process ids were not found in the report: {target_processes - processes_set}" diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index 5da612db3..ae954d778 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -168,7 +168,7 @@ def pbar(s, *args, **kwargs): if target_functions: # analyze only the functions that the user required. # if none were provided, analyze all functions. 
- functions_set = set(map(lambda h: h.address, functions)) + functions_set = {fh.address for fh in functions} if not target_functions <= functions_set: raise NonExistantFunctionError( f"The following function addresses were not found in the sample: {target_functions - functions_set}" From 8c8321b7e77fe916b5e9374636042ddb79c69cac Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 03:12:24 +0100 Subject: [PATCH 08/44] main.py: Use Optional typehint --- capa/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index ed98975e1..d41dddab0 100644 --- a/capa/main.py +++ b/capa/main.py @@ -17,7 +17,7 @@ import textwrap import contextlib from types import TracebackType -from typing import Any, Set, Dict, List, Union, Optional +from typing import Any, Set, Dict, List, Optional from pathlib import Path import colorama @@ -775,7 +775,7 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr raise ShouldExitError(E_INVALID_FILE_OS) from e -def get_target_elements_from_cli(args, input_format) -> Union[None, Set]: +def get_target_elements_from_cli(args, input_format) -> Optional[Set]: if input_format in STATIC_FORMATS: if args.processes: raise InvalidArgument("Cannot provide process ids with static analysis.") From 1642e7e8f9af6107e9e62df86378cb9bb8608a8c Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 03:28:40 +0100 Subject: [PATCH 09/44] main.py: bugfix for return instead of raise --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index d41dddab0..66204ab83 100644 --- a/capa/main.py +++ b/capa/main.py @@ -785,7 +785,7 @@ def get_target_elements_from_cli(args, input_format) -> Optional[Set]: raise InvalidArgument("Cannot provide function addresses with dynamic analysis.") return set(map(str_to_number, args.processes)) else: - return ShouldExitError(f"format {input_format} is neither static nor 
dynamic.") + raise ShouldExitError(f"format {input_format} is neither static nor dynamic.") def main(argv: Optional[List[str]] = None): From 090ade587e93161f02cd90e08fdf55b0453ee1be Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Wed, 19 Jun 2024 03:33:31 +0100 Subject: [PATCH 10/44] main.py: add errorcode for invalid input format --- capa/main.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 66204ab83..0c815f5e7 100644 --- a/capa/main.py +++ b/capa/main.py @@ -100,6 +100,7 @@ E_MISSING_CAPE_DYNAMIC_ANALYSIS = 22 E_EMPTY_REPORT = 23 E_UNSUPPORTED_GHIDRA_EXECUTION_MODE = 24 +E_INVALID_INPUT_FORMAT = 25 logger = logging.getLogger("capa") @@ -785,7 +786,7 @@ def get_target_elements_from_cli(args, input_format) -> Optional[Set]: raise InvalidArgument("Cannot provide function addresses with dynamic analysis.") return set(map(str_to_number, args.processes)) else: - raise ShouldExitError(f"format {input_format} is neither static nor dynamic.") + raise ShouldExitError(E_INVALID_INPUT_FORMAT) def main(argv: Optional[List[str]] = None): From 8e8e0ecccf1917f0bf09329597da1fd7b3127a20 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jun 2024 08:40:31 +0100 Subject: [PATCH 11/44] Function/Process filtering: use a function to filter --- capa/capabilities/common.py | 10 ++----- capa/capabilities/dynamic.py | 16 +---------- capa/capabilities/static.py | 14 +--------- capa/features/extractors/base_extractor.py | 23 +++++++++++++++- capa/main.py | 32 +++++++++++++++++----- tests/test_capabilities.py | 4 ++- 6 files changed, 55 insertions(+), 44 deletions(-) diff --git a/capa/capabilities/common.py b/capa/capabilities/common.py index abeba9e01..a73f40afe 100644 --- a/capa/capabilities/common.py +++ b/capa/capabilities/common.py @@ -63,7 +63,7 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon def find_capabilities( - ruleset: RuleSet, extractor: FeatureExtractor, 
target_elements=None, disable_progress=None, **kwargs + ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None, **kwargs ) -> Tuple[MatchResults, Any]: from capa.capabilities.static import find_static_capabilities from capa.capabilities.dynamic import find_dynamic_capabilities @@ -72,12 +72,8 @@ def find_capabilities( # for the time being, extractors are either static or dynamic. # Remove this assertion once that has changed assert not isinstance(extractor, DynamicFeatureExtractor) - return find_static_capabilities( - ruleset, extractor, target_functions=target_elements, disable_progress=disable_progress, **kwargs - ) + return find_static_capabilities(ruleset, extractor, disable_progress=disable_progress, **kwargs) if isinstance(extractor, DynamicFeatureExtractor): - return find_dynamic_capabilities( - ruleset, extractor, target_processes=target_elements, disable_progress=disable_progress, **kwargs - ) + return find_dynamic_capabilities(ruleset, extractor, disable_progress=disable_progress, **kwargs) raise ValueError(f"unexpected extractor type: {extractor.__class__.__name__}") diff --git a/capa/capabilities/dynamic.py b/capa/capabilities/dynamic.py index 952866c0f..98858c581 100644 --- a/capa/capabilities/dynamic.py +++ b/capa/capabilities/dynamic.py @@ -20,7 +20,6 @@ from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import redirecting_print_to_tqdm -from capa.exceptions import NonExistantProcessError from capa.capabilities.common import find_file_capabilities from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor @@ -131,15 +130,12 @@ def find_process_capabilities( def find_dynamic_capabilities( - ruleset: RuleSet, extractor: DynamicFeatureExtractor, target_processes=None, disable_progress=None + ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None ) -> Tuple[MatchResults, Any]: all_process_matches: MatchResults 
= collections.defaultdict(list) all_thread_matches: MatchResults = collections.defaultdict(list) all_call_matches: MatchResults = collections.defaultdict(list) - if target_processes is None: - target_processes = set() - feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=()) assert isinstance(extractor, DynamicFeatureExtractor) @@ -159,16 +155,6 @@ def pbar(s, *args, **kwargs): processes = list(extractor.get_processes()) - if target_processes: - # analyze only the processes that the user required. - # if none were provided, analyze all processes. - processes_set = {ph.address.pid for ph in processes} - if not (target_processes <= processes_set): - raise NonExistantProcessError( - f"The following process ids were not found in the report: {target_processes - processes_set}" - ) - processes = list(filter(lambda h: h.address.pid in target_processes, processes)) - pb = pbar(processes, desc="matching", unit=" processes", leave=False) for p in pb: process_matches, thread_matches, call_matches, feature_count = find_process_capabilities( diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index ae954d778..4f3b3b6a1 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -21,7 +21,6 @@ from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import redirecting_print_to_tqdm -from capa.exceptions import NonExistantFunctionError from capa.capabilities.common import find_file_capabilities from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor @@ -134,7 +133,7 @@ def find_code_capabilities( def find_static_capabilities( - ruleset: RuleSet, extractor: StaticFeatureExtractor, target_functions=None, disable_progress=None + ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None ) -> Tuple[MatchResults, Any]: all_function_matches: MatchResults = collections.defaultdict(list) all_bb_matches: MatchResults = 
collections.defaultdict(list) @@ -164,17 +163,6 @@ def pbar(s, *args, **kwargs): return s functions = list(extractor.get_functions()) - - if target_functions: - # analyze only the functions that the user required. - # if none were provided, analyze all functions. - functions_set = {fh.address for fh in functions} - if not target_functions <= functions_set: - raise NonExistantFunctionError( - f"The following function addresses were not found in the sample: {target_functions - functions_set}" - ) - functions = list(filter(lambda h: h.address in target_functions, functions)) - n_funcs = len(functions) pb = pbar(functions, desc="matching", unit=" functions", postfix="skipped 0 library functions", leave=False) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 002117fc6..08b62a7d7 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -9,7 +9,8 @@ import abc import hashlib import dataclasses -from typing import Any, Dict, Tuple, Union, Iterator +from types import MethodType +from typing import Any, Set, Dict, Tuple, Union, Iterator from dataclasses import dataclass # TODO(williballenthin): use typing.TypeAlias directly when Python 3.9 is deprecated @@ -296,6 +297,16 @@ def extract_insn_features( raise NotImplementedError() +def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticFeatureExtractor: + get_functions = extractor.get_functions # fetch original get_functions() + + def filtered_get_functions(self): + yield from (f for f in get_functions() if f.address in functions) + + extractor.get_functions = MethodType(filtered_get_functions, extractor) + return extractor + + @dataclass class ProcessHandle: """ @@ -467,4 +478,14 @@ def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> raise NotImplementedError() +def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> DynamicFeatureExtractor: + 
get_processes = extractor.get_processes # fetch original get_functions() + + def filtered_get_processes(self): + yield from (f for f in get_processes() if f.address.pid in processes) + + extractor.get_processes = MethodType(filtered_get_processes, extractor) + return extractor + + FeatureExtractor: TypeAlias = Union[StaticFeatureExtractor, DynamicFeatureExtractor] diff --git a/capa/main.py b/capa/main.py index 0c815f5e7..f00d00a4c 100644 --- a/capa/main.py +++ b/capa/main.py @@ -79,7 +79,13 @@ DYNAMIC_FORMATS, ) from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities -from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor +from capa.features.extractors.base_extractor import ( + ProcessFilter, + FunctionFilter, + FeatureExtractor, + StaticFeatureExtractor, + DynamicFeatureExtractor, +) RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" @@ -101,6 +107,7 @@ E_EMPTY_REPORT = 23 E_UNSUPPORTED_GHIDRA_EXECUTION_MODE = 24 E_INVALID_INPUT_FORMAT = 25 +E_INVALID_FEATURE_EXTRACTOR = 26 logger = logging.getLogger("capa") @@ -779,16 +786,25 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr def get_target_elements_from_cli(args, input_format) -> Optional[Set]: if input_format in STATIC_FORMATS: if args.processes: - raise InvalidArgument("Cannot provide process ids with static analysis.") + raise InvalidArgument("Cannot filter processes with static analysis.") return set(map(str_to_number, args.functions)) elif input_format in DYNAMIC_FORMATS: if args.functions: - raise InvalidArgument("Cannot provide function addresses with dynamic analysis.") + raise InvalidArgument("Cannot filter functions with dynamic analysis.") return set(map(str_to_number, args.processes)) else: raise ShouldExitError(E_INVALID_INPUT_FORMAT) +def apply_extractor_filters(extractor: FeatureExtractor, elements: Set) -> 
FeatureExtractor: + if isinstance(extractor, StaticFeatureExtractor): + return FunctionFilter(extractor, elements) + elif isinstance(extractor, DynamicFeatureExtractor): + return ProcessFilter(extractor, elements) + else: + raise ShouldExitError(E_INVALID_FEATURE_EXTRACTOR) + + def main(argv: Optional[List[str]] = None): if sys.version_info < (3, 8): raise UnsupportedRuntimeError("This version of capa can only be used with Python 3.8+") @@ -838,10 +854,10 @@ def main(argv: Optional[List[str]] = None): handle_common_args(args) ensure_input_exists_from_cli(args) input_format = get_input_format_from_cli(args) + target_elements = get_target_elements_from_cli(args, input_format) rules = get_rules_from_cli(args) file_extractors = get_file_extractors_from_cli(args, input_format) found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) - target_elements = get_target_elements_from_cli(args, input_format) except ShouldExitError as e: return e.status_code @@ -869,9 +885,11 @@ def main(argv: Optional[List[str]] = None): except ShouldExitError as e: return e.status_code - capabilities, counts = find_capabilities( - rules, extractor, target_elements=target_elements, disable_progress=args.quiet - ) + if target_elements: + # if the user specified function/process filters, apply them here. 
+ extractor = apply_extractor_filters(extractor, target_elements) + + capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) diff --git a/tests/test_capabilities.py b/tests/test_capabilities.py index d2b23f031..5c6de51b4 100644 --- a/tests/test_capabilities.py +++ b/tests/test_capabilities.py @@ -9,6 +9,7 @@ import textwrap import capa.capabilities.common +from capa.features.extractors.base_extractor import FunctionFilter def test_match_across_scopes_file_function(z9324d_extractor): @@ -196,7 +197,8 @@ def test_match_specific_functions(z9324d_extractor): ) ] ) - capabilities, meta = capa.capabilities.common.find_capabilities(rules, z9324d_extractor, target_elements={0x4019C0}) + extractor = FunctionFilter(z9324d_extractor, {0x4019C0}) + capabilities, meta = capa.capabilities.common.find_capabilities(rules, extractor) matches = capabilities["receive data"] # test that we received only one match assert len(matches) == 1 From 1d526008fc79ca8343dd385e3d75ddedf170b510 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jun 2024 08:55:30 +0100 Subject: [PATCH 12/44] Function/Process filtering: ignore mypy errors for method reassignment --- capa/features/extractors/base_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 08b62a7d7..9dc1b7f20 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -303,7 +303,7 @@ def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticF def filtered_get_functions(self): yield from (f for f in get_functions() if f.address in functions) - extractor.get_functions = MethodType(filtered_get_functions, extractor) + 
extractor.get_functions = MethodType(filtered_get_functions, extractor) # type: ignore return extractor @@ -484,7 +484,7 @@ def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> Dynamic def filtered_get_processes(self): yield from (f for f in get_processes() if f.address.pid in processes) - extractor.get_processes = MethodType(filtered_get_processes, extractor) + extractor.get_processes = MethodType(filtered_get_processes, extractor) # type: ignore return extractor From d78272fb176f027290e0706f648ac22f5bf8699d Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 20 Jun 2024 09:06:18 +0100 Subject: [PATCH 13/44] function/proc filtering tests: use a copy of the extractor in order to not interfer with following tests --- tests/test_capabilities.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_capabilities.py b/tests/test_capabilities.py index 5c6de51b4..ea9479b6f 100644 --- a/tests/test_capabilities.py +++ b/tests/test_capabilities.py @@ -7,6 +7,7 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import textwrap +from copy import copy import capa.capabilities.common from capa.features.extractors.base_extractor import FunctionFilter @@ -197,7 +198,7 @@ def test_match_specific_functions(z9324d_extractor): ) ] ) - extractor = FunctionFilter(z9324d_extractor, {0x4019C0}) + extractor = FunctionFilter(copy(z9324d_extractor), {0x4019C0}) capabilities, meta = capa.capabilities.common.find_capabilities(rules, extractor) matches = capabilities["receive data"] # test that we received only one match From e3071f8065ce8b1eaa9ef42456772a5221b4154f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jun 2024 06:47:28 +0100 Subject: [PATCH 14/44] Extractor Filters: wrap classes and overwrite __class__ instead of using function factories --- capa/features/extractors/base_extractor.py | 59 +++++++++++++++++----- capa/main.py | 8 +-- tests/test_capabilities.py | 3 +- 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 9dc1b7f20..12164e529 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -9,7 +9,6 @@ import abc import hashlib import dataclasses -from types import MethodType from typing import Any, Set, Dict, Tuple, Union, Iterator from dataclasses import dataclass @@ -297,14 +296,31 @@ def extract_insn_features( raise NotImplementedError() -def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticFeatureExtractor: - get_functions = extractor.get_functions # fetch original get_functions() +class StaticFeatureExtractorFilter: + def __init__(self, inner: StaticFeatureExtractor): + self.inner = inner - def filtered_get_functions(self): - yield from (f for f in get_functions() if f.address in functions) + def __getattr__(self, attr): + if attr in self.__dict__: + return getattr(self, attr) + return getattr(self.inner, attr) - extractor.get_functions = MethodType(filtered_get_functions, 
extractor) # type: ignore - return extractor + @property + def __class__(self): + return self.inner.__class__ + + @__class__.setter + def __class__(self, value) -> None: + self.inner.__class__ = value + + +class FunctionFilter(StaticFeatureExtractorFilter): + def __init__(self, inner: StaticFeatureExtractor, functions: Set[Address]): + super().__init__(inner) + self.functions = functions + + def get_functions(self): + yield from (f for f in self.inner.get_functions() if f.address in self.functions) @dataclass @@ -478,14 +494,31 @@ def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> raise NotImplementedError() -def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> DynamicFeatureExtractor: - get_processes = extractor.get_processes # fetch original get_functions() +class DynamicFeatureExtractorFilter: + def __init__(self, inner: DynamicFeatureExtractor): + self.inner = inner + + def __getattr__(self, attr): + if attr in self.__dict__: + return getattr(self, attr) + return getattr(self.inner, attr) + + @property + def __class__(self): + return self.inner.__class__ + + @__class__.setter + def __class__(self, value) -> None: + self.inner.__class__ = value + - def filtered_get_processes(self): - yield from (f for f in get_processes() if f.address.pid in processes) +class ProcessFilter(DynamicFeatureExtractorFilter): + def __init__(self, inner: DynamicFeatureExtractor, processes: Set[Address]): + super().__init__(inner) + self.processes = processes - extractor.get_processes = MethodType(filtered_get_processes, extractor) # type: ignore - return extractor + def get_processes(self): + yield from (p for p in self.inner.get_processes() if p.address.pid in self.processes) FeatureExtractor: TypeAlias = Union[StaticFeatureExtractor, DynamicFeatureExtractor] diff --git a/capa/main.py b/capa/main.py index f00d00a4c..8b7644881 100644 --- a/capa/main.py +++ b/capa/main.py @@ -783,7 +783,7 @@ def get_extractor_from_cli(args, input_format: 
str, backend: str) -> FeatureExtr raise ShouldExitError(E_INVALID_FILE_OS) from e -def get_target_elements_from_cli(args, input_format) -> Optional[Set]: +def get_extractor_filters_from_cli(args, input_format) -> Optional[Set]: if input_format in STATIC_FORMATS: if args.processes: raise InvalidArgument("Cannot filter processes with static analysis.") @@ -854,7 +854,7 @@ def main(argv: Optional[List[str]] = None): handle_common_args(args) ensure_input_exists_from_cli(args) input_format = get_input_format_from_cli(args) - target_elements = get_target_elements_from_cli(args, input_format) + extractor_filters = get_extractor_filters_from_cli(args, input_format) rules = get_rules_from_cli(args) file_extractors = get_file_extractors_from_cli(args, input_format) found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) @@ -885,9 +885,9 @@ def main(argv: Optional[List[str]] = None): except ShouldExitError as e: return e.status_code - if target_elements: + if extractor_filters: # if the user specified function/process filters, apply them here. - extractor = apply_extractor_filters(extractor, target_elements) + extractor = apply_extractor_filters(extractor, extractor_filters) capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) diff --git a/tests/test_capabilities.py b/tests/test_capabilities.py index ea9479b6f..5c6de51b4 100644 --- a/tests/test_capabilities.py +++ b/tests/test_capabilities.py @@ -7,7 +7,6 @@ # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. 
import textwrap -from copy import copy import capa.capabilities.common from capa.features.extractors.base_extractor import FunctionFilter @@ -198,7 +197,7 @@ def test_match_specific_functions(z9324d_extractor): ) ] ) - extractor = FunctionFilter(copy(z9324d_extractor), {0x4019C0}) + extractor = FunctionFilter(z9324d_extractor, {0x4019C0}) capabilities, meta = capa.capabilities.common.find_capabilities(rules, extractor) matches = capabilities["receive data"] # test that we received only one match From c2058bf7289adfd1f6ee338d195144d24acee183 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jun 2024 06:53:17 +0100 Subject: [PATCH 15/44] Extractor Filters: fix mypy errors --- capa/features/extractors/base_extractor.py | 4 ++-- capa/main.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 12164e529..8fe606147 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -315,7 +315,7 @@ def __class__(self, value) -> None: class FunctionFilter(StaticFeatureExtractorFilter): - def __init__(self, inner: StaticFeatureExtractor, functions: Set[Address]): + def __init__(self, inner: StaticFeatureExtractor, functions: Set[int]): super().__init__(inner) self.functions = functions @@ -513,7 +513,7 @@ def __class__(self, value) -> None: class ProcessFilter(DynamicFeatureExtractorFilter): - def __init__(self, inner: DynamicFeatureExtractor, processes: Set[Address]): + def __init__(self, inner: DynamicFeatureExtractor, processes: Set[int]): super().__init__(inner) self.processes = processes diff --git a/capa/main.py b/capa/main.py index 8b7644881..5658dddc8 100644 --- a/capa/main.py +++ b/capa/main.py @@ -796,7 +796,7 @@ def get_extractor_filters_from_cli(args, input_format) -> Optional[Set]: raise ShouldExitError(E_INVALID_INPUT_FORMAT) -def apply_extractor_filters(extractor: FeatureExtractor, elements: Set) -> 
FeatureExtractor: +def apply_extractor_filters(extractor: FeatureExtractor, elements: Set): if isinstance(extractor, StaticFeatureExtractor): return FunctionFilter(extractor, elements) elif isinstance(extractor, DynamicFeatureExtractor): From c54bafcee8f1a3a512eb4444509e3a5149b49ca4 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jun 2024 07:04:04 +0100 Subject: [PATCH 16/44] function/proc filtering: overwrite __instancecheck__() for extractor filters --- capa/features/extractors/base_extractor.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 8fe606147..2df0593d6 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -313,6 +313,10 @@ def __class__(self): def __class__(self, value) -> None: self.inner.__class__ = value + @classmethod + def __instancecheck__(cls, instance): + return isinstance(instance, StaticFeatureExtractor) + class FunctionFilter(StaticFeatureExtractorFilter): def __init__(self, inner: StaticFeatureExtractor, functions: Set[int]): @@ -511,6 +515,10 @@ def __class__(self): def __class__(self, value) -> None: self.inner.__class__ = value + @classmethod + def __instancecheck__(cls, instance): + return isinstance(instance, DynamicFeatureExtractor) + class ProcessFilter(DynamicFeatureExtractorFilter): def __init__(self, inner: DynamicFeatureExtractor, processes: Set[int]): From fe9f3329a044d841c133f4a66430cde09e6ec62f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jun 2024 07:07:28 +0100 Subject: [PATCH 17/44] base_extractor: update FeatureExtractor type to include filters --- capa/features/extractors/base_extractor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 2df0593d6..ff0801049 100644 --- a/capa/features/extractors/base_extractor.py +++ 
b/capa/features/extractors/base_extractor.py @@ -529,4 +529,6 @@ def get_processes(self): yield from (p for p in self.inner.get_processes() if p.address.pid in self.processes) -FeatureExtractor: TypeAlias = Union[StaticFeatureExtractor, DynamicFeatureExtractor] +FeatureExtractor: TypeAlias = Union[ + StaticFeatureExtractor, DynamicFeatureExtractor, StaticFeatureExtractorFilter, DynamicFeatureExtractorFilter +] From b329f3f26eb0ebfc729992873f840ec03d3af784 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jun 2024 08:00:40 +0100 Subject: [PATCH 18/44] capa/loader.py: update assert_never() for mypy --- capa/loader.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/capa/loader.py b/capa/loader.py index e741175e7..8da39f201 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -52,6 +52,8 @@ FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor, + StaticFeatureExtractorFilter, + DynamicFeatureExtractorFilter, ) logger = logging.getLogger(__name__) @@ -405,9 +407,9 @@ def collect_metadata( arch = str(extractor_arch[0]) if extractor_arch else "unknown" os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ - if isinstance(extractor, StaticFeatureExtractor): + if isinstance(extractor, StaticFeatureExtractor) or isinstance(extractor, StaticFeatureExtractorFilter): meta_class: type = rdoc.StaticMetadata - elif isinstance(extractor, DynamicFeatureExtractor): + elif isinstance(extractor, DynamicFeatureExtractor) or isinstance(extractor, DynamicFeatureExtractorFilter): meta_class = rdoc.DynamicMetadata else: assert_never(extractor) From 1a79591eb36dc8c54614d3f0668e87e3ac18ef46 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Fri, 21 Jun 2024 08:03:21 +0100 Subject: [PATCH 19/44] capa/loader.py: use tuple in isinstance() for flake8 --- capa/loader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/loader.py b/capa/loader.py index 8da39f201..fe5f1fc95 100644 --- 
a/capa/loader.py +++ b/capa/loader.py @@ -407,9 +407,9 @@ def collect_metadata( arch = str(extractor_arch[0]) if extractor_arch else "unknown" os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ - if isinstance(extractor, StaticFeatureExtractor) or isinstance(extractor, StaticFeatureExtractorFilter): + if isinstance(extractor, (StaticFeatureExtractor, StaticFeatureExtractorFilter)): meta_class: type = rdoc.StaticMetadata - elif isinstance(extractor, DynamicFeatureExtractor) or isinstance(extractor, DynamicFeatureExtractorFilter): + elif isinstance(extractor, (DynamicFeatureExtractor, DynamicFeatureExtractorFilter)): meta_class = rdoc.DynamicMetadata else: assert_never(extractor) From d2c19cdcf770bf1d62de2b3b904d53b8bbcbe404 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Tue, 16 Jul 2024 13:45:52 +0100 Subject: [PATCH 20/44] Update capa/main.py Co-authored-by: Willi Ballenthin --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 5658dddc8..e42da5dab 100644 --- a/capa/main.py +++ b/capa/main.py @@ -289,7 +289,7 @@ def install_common_args(parser, wanted=None): "--processes", type=lambda s: s.replace(" ", "").split(","), default=[], - help="provide a list of comma-separaed processes to analyze (dynamic analysis).", + help="provide a list of comma-separated processes to analyze (dynamic analysis).", ) if "os" in wanted: From 02ce318ac47b07e905fc8fb5c366e67ad635175f Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Aug 2024 04:58:15 +0100 Subject: [PATCH 21/44] process/function filtering: override extractor object method --- capa/features/extractors/base_extractor.py | 74 ++++++---------------- capa/loader.py | 6 +- capa/main.py | 2 +- 3 files changed, 22 insertions(+), 60 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index ff0801049..bd1eb06d7 100644 --- 
a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -9,6 +9,8 @@ import abc import hashlib import dataclasses +from copy import copy +from types import MethodType from typing import Any, Set, Dict, Tuple, Union, Iterator from dataclasses import dataclass @@ -296,35 +298,17 @@ def extract_insn_features( raise NotImplementedError() -class StaticFeatureExtractorFilter: - def __init__(self, inner: StaticFeatureExtractor): - self.inner = inner +def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticFeatureExtractor: + get_functions = extractor.get_functions # fetch original get_functions() - def __getattr__(self, attr): - if attr in self.__dict__: - return getattr(self, attr) - return getattr(self.inner, attr) + def filtered_get_functions(self): + yield from (f for f in get_functions() if f.address in functions) - @property - def __class__(self): - return self.inner.__class__ + # make a copy of the extractor before decorating the get_functions() method + new_extractor = copy(extractor) + new_extractor.get_functions = MethodType(filtered_get_functions, extractor) - @__class__.setter - def __class__(self, value) -> None: - self.inner.__class__ = value - - @classmethod - def __instancecheck__(cls, instance): - return isinstance(instance, StaticFeatureExtractor) - - -class FunctionFilter(StaticFeatureExtractorFilter): - def __init__(self, inner: StaticFeatureExtractor, functions: Set[int]): - super().__init__(inner) - self.functions = functions - - def get_functions(self): - yield from (f for f in self.inner.get_functions() if f.address in self.functions) + return new_extractor @dataclass @@ -498,37 +482,17 @@ def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> raise NotImplementedError() -class DynamicFeatureExtractorFilter: - def __init__(self, inner: DynamicFeatureExtractor): - self.inner = inner - - def __getattr__(self, attr): - if attr in self.__dict__: - return 
getattr(self, attr) - return getattr(self.inner, attr) - - @property - def __class__(self): - return self.inner.__class__ - - @__class__.setter - def __class__(self, value) -> None: - self.inner.__class__ = value - - @classmethod - def __instancecheck__(cls, instance): - return isinstance(instance, DynamicFeatureExtractor) +def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> DynamicFeatureExtractor: + get_processes = extractor.get_processes # fetch original get_functions() + def filtered_get_processes(self): + yield from (f for f in get_processes() if f.address.pid in processes) -class ProcessFilter(DynamicFeatureExtractorFilter): - def __init__(self, inner: DynamicFeatureExtractor, processes: Set[int]): - super().__init__(inner) - self.processes = processes + # make a copy of the extractor before decorating the get_processes() method + new_extractor = copy(extractor) + new_extractor.get_processes = MethodType(filtered_get_processes, extractor) - def get_processes(self): - yield from (p for p in self.inner.get_processes() if p.address.pid in self.processes) + return new_extractor -FeatureExtractor: TypeAlias = Union[ - StaticFeatureExtractor, DynamicFeatureExtractor, StaticFeatureExtractorFilter, DynamicFeatureExtractorFilter -] +FeatureExtractor: TypeAlias = Union[StaticFeatureExtractor, DynamicFeatureExtractor] diff --git a/capa/loader.py b/capa/loader.py index f5ffb709e..bc69ffb3a 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -53,8 +53,6 @@ FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor, - StaticFeatureExtractorFilter, - DynamicFeatureExtractorFilter, ) logger = logging.getLogger(__name__) @@ -433,9 +431,9 @@ def collect_metadata( arch = str(extractor_arch[0]) if extractor_arch else "unknown" os_ = str(extractor_os[0]) if extractor_os else "unknown" if os_ == OS_AUTO else os_ - if isinstance(extractor, (StaticFeatureExtractor, StaticFeatureExtractorFilter)): + if isinstance(extractor, StaticFeatureExtractor): 
meta_class: type = rdoc.StaticMetadata - elif isinstance(extractor, (DynamicFeatureExtractor, DynamicFeatureExtractorFilter)): + elif isinstance(extractor, DynamicFeatureExtractor): meta_class = rdoc.DynamicMetadata else: assert_never(extractor) diff --git a/capa/main.py b/capa/main.py index d927f6f98..6859422e3 100644 --- a/capa/main.py +++ b/capa/main.py @@ -84,9 +84,9 @@ FORMAT_DOTNET, FORMAT_FREEZE, FORMAT_RESULT, + FORMAT_DRAKVUF, STATIC_FORMATS, DYNAMIC_FORMATS, - FORMAT_DRAKVUF, ) from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities from capa.features.extractors.base_extractor import ( From 38e3ab14f9047ae2b931e94aee8b6725e21800bf Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Aug 2024 05:06:44 +0100 Subject: [PATCH 22/44] function/process filtering: ignore method reassignment type errors --- capa/features/extractors/base_extractor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index bd1eb06d7..361d4d8f0 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -306,7 +306,7 @@ def filtered_get_functions(self): # make a copy of the extractor before decorating the get_functions() method new_extractor = copy(extractor) - new_extractor.get_functions = MethodType(filtered_get_functions, extractor) + new_extractor.get_functions = MethodType(filtered_get_functions, extractor) # type: ignore return new_extractor @@ -490,7 +490,7 @@ def filtered_get_processes(self): # make a copy of the extractor before decorating the get_processes() method new_extractor = copy(extractor) - new_extractor.get_processes = MethodType(filtered_get_processes, extractor) + new_extractor.get_processes = MethodType(filtered_get_processes, extractor) # type: ignore return new_extractor From c91580bd72ded683f94b5e83144fb636b908860e Mon Sep 17 00:00:00 2001 From: Yacine 
Elhamer Date: Thu, 15 Aug 2024 07:03:50 +0100 Subject: [PATCH 23/44] process/function filtering: use --restrict-to-{processes/functions} for argument --- capa/main.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/capa/main.py b/capa/main.py index 6859422e3..f3677f6ce 100644 --- a/capa/main.py +++ b/capa/main.py @@ -288,17 +288,17 @@ def install_common_args(parser, wanted=None): help=f"select backend, {backend_help}", ) - if "functions" in wanted: + if "restrict-to-functions" in wanted: parser.add_argument( - "--functions", + "--restrict-to-functions", type=lambda s: s.replace(" ", "").split(","), default=[], help="provide a list of comma-separated functions to analyze (static analysis).", ) - if "processes" in wanted: + if "restrict-to-processes" in wanted: parser.add_argument( - "--processes", + "--restrict-to-processes", type=lambda s: s.replace(" ", "").split(","), default=[], help="provide a list of comma-separated processes to analyze (dynamic analysis).", @@ -810,13 +810,13 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr def get_extractor_filters_from_cli(args, input_format) -> Optional[Set]: if input_format in STATIC_FORMATS: - if args.processes: + if args.restrict_to_processes: raise InvalidArgument("Cannot filter processes with static analysis.") - return set(map(str_to_number, args.functions)) + return set(map(str_to_number, args.restrict_to_functions)) elif input_format in DYNAMIC_FORMATS: - if args.functions: + if args.restrict_to_functions: raise InvalidArgument("Cannot filter functions with dynamic analysis.") - return set(map(str_to_number, args.processes)) + return set(map(str_to_number, args.restrict_to_processes)) else: raise ShouldExitError(E_INVALID_INPUT_FORMAT) @@ -870,7 +870,18 @@ def main(argv: Optional[List[str]] = None): description=desc, epilog=epilog, formatter_class=argparse.RawDescriptionHelpFormatter ) install_common_args( - parser, {"input_file", 
"format", "backend", "os", "signatures", "rules", "tag", "functions", "processes"} + parser, + { + "input_file", + "format", + "backend", + "os", + "signatures", + "rules", + "tag", + "restrict-to-functions", + "restrict-to-processes", + }, ) parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text") args = parser.parse_args(args=argv) From 5dc562d111d1362d615c31f627b552e58e5bc0d9 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Aug 2024 08:29:53 +0100 Subject: [PATCH 24/44] process/functions filtering: make `apply_extractor_filters()` extensible --- capa/helpers.py | 9 --------- capa/main.py | 15 +++++++-------- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/capa/helpers.py b/capa/helpers.py index 8cb12e62b..92ed967ec 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -49,15 +49,6 @@ def hex(n: int) -> str: return f"0x{(n):X}" -def str_to_number(s: str) -> int: - if s.isdecimal(): - return int(s) - try: - return int(s, 16) - except ValueError: - raise ValueError(f"{s} is not a valid number.") - - def get_file_taste(sample_path: Path) -> bytes: if not sample_path.exists(): raise IOError(f"sample path {sample_path} does not exist or cannot be accessed") diff --git a/capa/main.py b/capa/main.py index f3677f6ce..b36e008e1 100644 --- a/capa/main.py +++ b/capa/main.py @@ -52,7 +52,6 @@ BACKEND_DRAKVUF, ) from capa.helpers import ( - str_to_number, get_file_taste, get_auto_format, log_unsupported_os_error, @@ -808,24 +807,24 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr raise ShouldExitError(E_CORRUPT_FILE) from e -def get_extractor_filters_from_cli(args, input_format) -> Optional[Set]: +def get_extractor_filters_from_cli(args, input_format) -> Dict[str, Set]: if input_format in STATIC_FORMATS: if args.restrict_to_processes: raise InvalidArgument("Cannot filter processes with static analysis.") - return set(map(str_to_number, args.restrict_to_functions)) + return 
{"functions": set(map(lambda x: int(x, 0), args.restrict_to_functions))} elif input_format in DYNAMIC_FORMATS: if args.restrict_to_functions: raise InvalidArgument("Cannot filter functions with dynamic analysis.") - return set(map(str_to_number, args.restrict_to_processes)) + return {"processes": set(map(lambda x: int(x, 0), args.restrict_to_processes))} else: raise ShouldExitError(E_INVALID_INPUT_FORMAT) -def apply_extractor_filters(extractor: FeatureExtractor, elements: Set): +def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: Set): if isinstance(extractor, StaticFeatureExtractor): - return FunctionFilter(extractor, elements) + return FunctionFilter(extractor, extractor_filters["functions"]) elif isinstance(extractor, DynamicFeatureExtractor): - return ProcessFilter(extractor, elements) + return ProcessFilter(extractor, extractor_filters["processes"]) else: raise ShouldExitError(E_INVALID_FEATURE_EXTRACTOR) @@ -921,7 +920,7 @@ def main(argv: Optional[List[str]] = None): except ShouldExitError as e: return e.status_code - if extractor_filters: + if any(extractor_filters.values()): # if the user specified function/process filters, apply them here. 
extractor = apply_extractor_filters(extractor, extractor_filters) From 5300f4a66c71ce66937c5c0678d81f2059543cb2 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Aug 2024 08:35:13 +0100 Subject: [PATCH 25/44] process/functions filtering: use list comprehension instead of map --- capa/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/capa/main.py b/capa/main.py index b36e008e1..aa16347f0 100644 --- a/capa/main.py +++ b/capa/main.py @@ -811,11 +811,11 @@ def get_extractor_filters_from_cli(args, input_format) -> Dict[str, Set]: if input_format in STATIC_FORMATS: if args.restrict_to_processes: raise InvalidArgument("Cannot filter processes with static analysis.") - return {"functions": set(map(lambda x: int(x, 0), args.restrict_to_functions))} + return {"functions": set(int(addr, 0) for addr in args.restrict_to_functions)} elif input_format in DYNAMIC_FORMATS: if args.restrict_to_functions: raise InvalidArgument("Cannot filter functions with dynamic analysis.") - return {"processes": set(map(lambda x: int(x, 0), args.restrict_to_processes))} + return {"processes": set(int(pid, 0) for pid in args.restrict_to_processes)} else: raise ShouldExitError(E_INVALID_INPUT_FORMAT) @@ -921,7 +921,7 @@ def main(argv: Optional[List[str]] = None): return e.status_code if any(extractor_filters.values()): - # if the user specified function/process filters, apply them here. + # if the user specified any extractor filters, apply them here. 
extractor = apply_extractor_filters(extractor, extractor_filters) capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) From d6cf34a343c17044a64ec08e5d79beb57b4016cb Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Aug 2024 08:40:46 +0100 Subject: [PATCH 26/44] process/functions filtering: use set comprehension instead of set() --- capa/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/main.py b/capa/main.py index aa16347f0..d7ca5fcf3 100644 --- a/capa/main.py +++ b/capa/main.py @@ -811,11 +811,11 @@ def get_extractor_filters_from_cli(args, input_format) -> Dict[str, Set]: if input_format in STATIC_FORMATS: if args.restrict_to_processes: raise InvalidArgument("Cannot filter processes with static analysis.") - return {"functions": set(int(addr, 0) for addr in args.restrict_to_functions)} + return {"functions": {int(addr, 0) for addr in args.restrict_to_functions}} elif input_format in DYNAMIC_FORMATS: if args.restrict_to_functions: raise InvalidArgument("Cannot filter functions with dynamic analysis.") - return {"processes": set(int(pid, 0) for pid in args.restrict_to_processes)} + return {"processes": {int(pid, 0) for pid in args.restrict_to_processes}} else: raise ShouldExitError(E_INVALID_INPUT_FORMAT) From e4836e599c4b4e10d9ff8d4baefbb5c2cc03c4de Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Thu, 15 Aug 2024 08:44:56 +0100 Subject: [PATCH 27/44] capa/main.py: fix mypy issues --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index d7ca5fcf3..41d6c331f 100644 --- a/capa/main.py +++ b/capa/main.py @@ -820,7 +820,7 @@ def get_extractor_filters_from_cli(args, input_format) -> Dict[str, Set]: raise ShouldExitError(E_INVALID_INPUT_FORMAT) -def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: Set): +def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: Dict[str, Set]): if 
isinstance(extractor, StaticFeatureExtractor): return FunctionFilter(extractor, extractor_filters["functions"]) elif isinstance(extractor, DynamicFeatureExtractor): From 116899603da9e3140e4bb012a548067efe09d6a6 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:17:39 +0100 Subject: [PATCH 28/44] Update CHANGELOG.md: typo Co-authored-by: Willi Ballenthin --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 218856172..4edb6fbd3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## master (unreleased) ### New Features -- Add the ability to select which specific functions or processes to analyze @yelhamer +- add the ability to select which specific functions or processes to analyze @yelhamer - webui: explore capa analysis results in a web-based UI online and offline #2224 @s-ff - support analyzing DRAKVUF traces #2143 @yelhamer From 2f00b7f320ea3e2c6852426fc1145792733cc965 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:18:15 +0100 Subject: [PATCH 29/44] Update capa/main.py Co-authored-by: Willi Ballenthin --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 41d6c331f..6311f1eb2 100644 --- a/capa/main.py +++ b/capa/main.py @@ -292,7 +292,7 @@ def install_common_args(parser, wanted=None): "--restrict-to-functions", type=lambda s: s.replace(" ", "").split(","), default=[], - help="provide a list of comma-separated functions to analyze (static analysis).", + help="provide a list of comma-separated function virtual addresses to analyze (static analysis).", ) if "restrict-to-processes" in wanted: From 79f30974aada7e2341520786cc6dddfcbfb9e142 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:18:38 +0100 Subject: [PATCH 30/44] Update capa/main.py Co-authored-by: 
Willi Ballenthin --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 6311f1eb2..9a2fc2189 100644 --- a/capa/main.py +++ b/capa/main.py @@ -300,7 +300,7 @@ def install_common_args(parser, wanted=None): "--restrict-to-processes", type=lambda s: s.replace(" ", "").split(","), default=[], - help="provide a list of comma-separated processes to analyze (dynamic analysis).", + help="provide a list of comma-separated process IDs to analyze (dynamic analysis).", ) if "os" in wanted: From 9ce2a3cccabbd80863c70a5563d3acb398d2d081 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:20:41 +0100 Subject: [PATCH 31/44] Update doc/usage.md Co-authored-by: Willi Ballenthin --- doc/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 58b35a722..03357ced8 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -10,8 +10,8 @@ For example, `capa -t william.ballenthin@mandiant.com` runs rules that reference `capa -t communication` runs rules with the namespace `communication`. ### only analyze selected functions -Use the `--functions` option to extract capabilities from only a selected set of functions. -For example, `capa sample.exe --functions 0x4019C0,0x401CD0` will only extract the capabilities in the functions found at +Use the `--restrict-to-functions` option to extract capabilities from only a selected set of functions. +For example, `capa sample.exe --restrict-to-functions 0x4019C0,0x401CD0` will only extract the capabilities in the functions found at addresses 0x401CD0 and 0x4019C0. 
### only analyze selected processes From 28e274fac6a4e505208644969ce672dde6dafa7f Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:20:52 +0100 Subject: [PATCH 32/44] Update doc/usage.md Co-authored-by: Willi Ballenthin --- doc/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 03357ced8..654dc6c2d 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -15,8 +15,8 @@ For example, `capa sample.exe --restrict-to-functions 0x4019C0,0x401CD0` will on addresses 0x401CD0 and 0x4019C0. ### only analyze selected processes -Use the `--processes` option to extract capabilities from only a selected set of processes. -For example, `capa report.log --processes 3888,3214,4299` will extract capabilities only from the processes 3888, 3214, and 4299. +Use the `--restrict-to-processes` option to extract capabilities from only a selected set of processes. +For example, `capa report.log --restrict-to-processes 3888,3214,4299` will extract capabilities only from the processes 3888, 3214, and 4299. ### IDA Pro plugin: capa explorer Please check out the [capa explorer documentation](/capa/ida/plugin/README.md). 
From fa612735e092fba8ae8e4a52fdc315be9bdc4f3e Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 03:13:52 +0100 Subject: [PATCH 33/44] update changelog --- CHANGELOG.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4edb6fbd3..66336e92d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,7 @@ ## master (unreleased) ### New Features -- add the ability to select which specific functions or processes to analyze @yelhamer - +- cli: add the ability to select which specific functions or processes to analyze @yelhamer - webui: explore capa analysis results in a web-based UI online and offline #2224 @s-ff - support analyzing DRAKVUF traces #2143 @yelhamer - IDA extractor: extract names from dynamically resolved APIs stored in renamed global variables #2201 @Ana06 From 0640ba95a64581c22581979e3edd8a35335e64f4 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Aug 2024 03:15:25 +0100 Subject: [PATCH 34/44] Update capa/features/extractors/base_extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 361d4d8f0..b52cf8798 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -299,7 +299,7 @@ def extract_insn_features( def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticFeatureExtractor: - get_functions = extractor.get_functions # fetch original get_functions() + original_get_functions = extractor.get_functions # fetch original get_functions() def filtered_get_functions(self): yield from (f for f in get_functions() if f.address in functions) From b693aa023f9af898eabac3fece486679903f0701 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 03:16:11 +0100 Subject: [PATCH 
35/44] base_extractor.py: rename variable --- capa/features/extractors/base_extractor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index b52cf8798..f3c26fed2 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -302,7 +302,7 @@ def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticF original_get_functions = extractor.get_functions # fetch original get_functions() def filtered_get_functions(self): - yield from (f for f in get_functions() if f.address in functions) + yield from (f for f in original_get_functions() if f.address in functions) # make a copy of the extractor before decorating the get_functions() method new_extractor = copy(extractor) @@ -483,10 +483,10 @@ def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> DynamicFeatureExtractor: - get_processes = extractor.get_processes # fetch original get_functions() + original_get_processes = extractor.get_processes # fetch original get_functions() def filtered_get_processes(self): - yield from (f for f in get_processes() if f.address.pid in processes) + yield from (f for f in original_get_processes() if f.address.pid in processes) # make a copy of the extractor before decorating the get_processes() method new_extractor = copy(extractor) From 10a26a84bc0ca6a572f77f92c88de8f4543a7f95 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 03:33:47 +0100 Subject: [PATCH 36/44] base_extractor.py: update comments --- capa/features/extractors/base_extractor.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index f3c26fed2..2b8fc3dfa 100644 --- a/capa/features/extractors/base_extractor.py +++ 
b/capa/features/extractors/base_extractor.py @@ -304,7 +304,10 @@ def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticF def filtered_get_functions(self): yield from (f for f in original_get_functions() if f.address in functions) - # make a copy of the extractor before decorating the get_functions() method + # we make a copy of the original extractor object and then update its get_functions() method with the decorated filter one. + # this is in order to preserve the original extractor object's get_functions() method, in case it is used elsewhere in the code. + # an example where this is important is in our testfiles where we may use the same extractor object with different tests, + # with some of these tests needing to install a functions filter on the extractor object. new_extractor = copy(extractor) new_extractor.get_functions = MethodType(filtered_get_functions, extractor) # type: ignore @@ -483,12 +486,15 @@ def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> DynamicFeatureExtractor: - original_get_processes = extractor.get_processes # fetch original get_functions() + original_get_processes = extractor.get_processes # fetch original get_processes() def filtered_get_processes(self): yield from (f for f in original_get_processes() if f.address.pid in processes) - # make a copy of the extractor before decorating the get_processes() method + # we make a copy of the original extractor object and then update its get_processes() method with the decorated filter one. + # this is in order to preserve the original extractor object's get_processes() method, in case it is used elsewhere in the code. + # an example where this is important is in our testfiles where we may use the same extractor object with different tests, + # with some of these tests needing to install a processes filter on the extractor object. 
new_extractor = copy(extractor) new_extractor.get_processes = MethodType(filtered_get_processes, extractor) # type: ignore From b0d8071988a8ff7af92a06d77b7e3468eb9f2c6a Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 03:43:09 +0100 Subject: [PATCH 37/44] main.py: add FilterConfig type --- capa/main.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/capa/main.py b/capa/main.py index 9a2fc2189..a8e70456e 100644 --- a/capa/main.py +++ b/capa/main.py @@ -17,7 +17,7 @@ import textwrap import contextlib from types import TracebackType -from typing import Any, Set, Dict, List, Optional +from typing import Any, Set, Dict, List, Optional, TypedDict from pathlib import Path import colorama @@ -121,6 +121,11 @@ logger = logging.getLogger("capa") +class FilterConfig(TypedDict, total=False): + processes: set[int] + functions: set[int] + + @contextlib.contextmanager def timing(msg: str): t0 = time.time() @@ -807,7 +812,7 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr raise ShouldExitError(E_CORRUPT_FILE) from e -def get_extractor_filters_from_cli(args, input_format) -> Dict[str, Set]: +def get_extractor_filters_from_cli(args, input_format) -> FilterConfig: if input_format in STATIC_FORMATS: if args.restrict_to_processes: raise InvalidArgument("Cannot filter processes with static analysis.") @@ -820,7 +825,7 @@ def get_extractor_filters_from_cli(args, input_format) -> Dict[str, Set]: raise ShouldExitError(E_INVALID_INPUT_FORMAT) -def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: Dict[str, Set]): +def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: FilterConfig): if isinstance(extractor, StaticFeatureExtractor): return FunctionFilter(extractor, extractor_filters["functions"]) elif isinstance(extractor, DynamicFeatureExtractor): From ac50103754f456a2a463a1d2a8e88bc057a4a3e1 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 
03:46:02 +0100 Subject: [PATCH 38/44] main.py: add asserts for checking filters are not empty --- capa/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/main.py b/capa/main.py index a8e70456e..4ad1eb4b3 100644 --- a/capa/main.py +++ b/capa/main.py @@ -827,8 +827,10 @@ def get_extractor_filters_from_cli(args, input_format) -> FilterConfig: def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: FilterConfig): if isinstance(extractor, StaticFeatureExtractor): + assert extractor_filters["functions"] return FunctionFilter(extractor, extractor_filters["functions"]) elif isinstance(extractor, DynamicFeatureExtractor): + assert extractor_filters["processes"] return ProcessFilter(extractor, extractor_filters["processes"]) else: raise ShouldExitError(E_INVALID_FEATURE_EXTRACTOR) From a194a13a9b43bb12430b6263f7be33c565b4eea1 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 04:03:29 +0100 Subject: [PATCH 39/44] main.py: remove unused Set import --- capa/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/main.py b/capa/main.py index 4ad1eb4b3..00197c13d 100644 --- a/capa/main.py +++ b/capa/main.py @@ -17,7 +17,7 @@ import textwrap import contextlib from types import TracebackType -from typing import Any, Set, Dict, List, Optional, TypedDict +from typing import Any, Dict, List, Optional, TypedDict from pathlib import Path import colorama From e80f4747fc479600a03092247ac73d0898565464 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 05:00:53 +0100 Subject: [PATCH 40/44] main.py: move filters extractor into get_extractor_from_cli() routine --- capa/main.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/capa/main.py b/capa/main.py index 00197c13d..6f09ccdac 100644 --- a/capa/main.py +++ b/capa/main.py @@ -17,7 +17,7 @@ import textwrap import contextlib from types import TracebackType -from typing import Any, Dict, List, Optional, 
TypedDict +from typing import Any, Set, Dict, List, Optional, TypedDict from pathlib import Path import colorama @@ -122,8 +122,8 @@ class FilterConfig(TypedDict, total=False): - processes: set[int] - functions: set[int] + processes: Set[int] + functions: Set[int] @contextlib.contextmanager @@ -781,9 +781,10 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr os_ = get_os_from_cli(args, backend) sample_path = get_sample_path_from_cli(args, backend) + extractor_filters = get_extractor_filters_from_cli(args, input_format) try: - return capa.loader.get_extractor( + extractor = capa.loader.get_extractor( args.input_file, input_format, os_, @@ -793,6 +794,7 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr disable_progress=args.quiet or args.debug, sample_path=sample_path, ) + return apply_extractor_filters(extractor, extractor_filters) except UnsupportedFormatError as e: if input_format == FORMAT_CAPE: log_unsupported_cape_report_error(str(e)) @@ -813,6 +815,10 @@ def get_extractor_from_cli(args, input_format: str, backend: str) -> FeatureExtr def get_extractor_filters_from_cli(args, input_format) -> FilterConfig: + if not hasattr(args, "restrict_to_processes") and not hasattr(args, "restrict_to_functions"): + # no processes or function filters were installed in the args + return {} + if input_format in STATIC_FORMATS: if args.restrict_to_processes: raise InvalidArgument("Cannot filter processes with static analysis.") @@ -826,6 +832,10 @@ def get_extractor_filters_from_cli(args, input_format) -> FilterConfig: def apply_extractor_filters(extractor: FeatureExtractor, extractor_filters: FilterConfig): + if not any(extractor_filters.values()): + return extractor + + # if the user specified extractor filters, then apply them here if isinstance(extractor, StaticFeatureExtractor): assert extractor_filters["functions"] return FunctionFilter(extractor, extractor_filters["functions"]) @@ -896,7 +906,6 @@ def 
main(argv: Optional[List[str]] = None): handle_common_args(args) ensure_input_exists_from_cli(args) input_format = get_input_format_from_cli(args) - extractor_filters = get_extractor_filters_from_cli(args, input_format) rules = get_rules_from_cli(args) file_extractors = get_file_extractors_from_cli(args, input_format) found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) @@ -927,10 +936,6 @@ def main(argv: Optional[List[str]] = None): except ShouldExitError as e: return e.status_code - if any(extractor_filters.values()): - # if the user specified any extractor filters, apply them here. - extractor = apply_extractor_filters(extractor, extractor_filters) - capabilities, counts = find_capabilities(rules, extractor, disable_progress=args.quiet) meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) From 88d9d67f776f2b9fe1328276c48aaa5d1a809139 Mon Sep 17 00:00:00 2001 From: Yacine Elhamer Date: Tue, 20 Aug 2024 06:04:21 +0100 Subject: [PATCH 41/44] doc/usage.md: update usage according to reviews --- doc/usage.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/doc/usage.md b/doc/usage.md index 654dc6c2d..457e6a4a7 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -10,13 +10,20 @@ For example, `capa -t william.ballenthin@mandiant.com` runs rules that reference `capa -t communication` runs rules with the namespace `communication`. ### only analyze selected functions -Use the `--restrict-to-functions` option to extract capabilities from only a selected set of functions. -For example, `capa sample.exe --restrict-to-functions 0x4019C0,0x401CD0` will only extract the capabilities in the functions found at -addresses 0x401CD0 and 0x4019C0. +Use the `--restrict-to-functions` option to extract capabilities from only a selected set of functions. 
This is useful for analyzing +large functions and figuring out their capabilities and their address of occurrence; for example: PEB access, RC4 encryption, etc. + +To use this, you can copy the virtual addresses from your favorite disassembler and pass them to capa as follows: +`capa sample.exe --restrict-to-functions 0x4019C0,0x401CD0`. If you add the `-v` option then capa will extract the interesting parts of a function for you. ### only analyze selected processes -Use the `--restrict-to-processes` option to extract capabilities from only a selected set of processes. -For example, `capa report.log --restrict-to-processes 3888,3214,4299` will extract capabilities only from the processes 3888, 3214, and 4299. +Use the `--restrict-to-processes` option to extract capabilities from only a selected set of processes. This is useful for filtering the noise +generated from analyzing non-malicious malware processes that can be reported by some sandboxes, as well as reduce the execution time +by not analyzing such processes in the first place. + +To use this, you can pick the PIDs of the processes you are interested in from the sandbox-generated process tree (or from the sandbox-reported malware PID) +and pass that to capa as follows: `capa report.log --restrict-to-processes 3888,3214,4299`. If you add the `-v` option then capa will tell you +which threads perform what actions (encrypt/decrypt data, initiate a connection, etc.). ### IDA Pro plugin: capa explorer Please check out the [capa explorer documentation](/capa/ida/plugin/README.md). 
From c30a10abb4ea994eafa01234823d88617dace9b4 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:20:19 +0100 Subject: [PATCH 42/44] Update capa/features/extractors/base_extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index 2b8fc3dfa..b68a5b218 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -299,7 +299,7 @@ def extract_insn_features( def FunctionFilter(extractor: StaticFeatureExtractor, functions: Set) -> StaticFeatureExtractor: - original_get_functions = extractor.get_functions # fetch original get_functions() + original_get_functions = extractor.get_functions def filtered_get_functions(self): yield from (f for f in original_get_functions() if f.address in functions) From 150d6f00e7e421f5ff2eec38fc07fab0cd2fdad2 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:20:28 +0100 Subject: [PATCH 43/44] Update capa/features/extractors/base_extractor.py Co-authored-by: Willi Ballenthin --- capa/features/extractors/base_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/base_extractor.py b/capa/features/extractors/base_extractor.py index b68a5b218..a58016bcc 100644 --- a/capa/features/extractors/base_extractor.py +++ b/capa/features/extractors/base_extractor.py @@ -486,7 +486,7 @@ def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> def ProcessFilter(extractor: DynamicFeatureExtractor, processes: Set) -> DynamicFeatureExtractor: - original_get_processes = extractor.get_processes # fetch original get_processes() + original_get_processes = extractor.get_processes def filtered_get_processes(self): yield from (f for f in 
original_get_processes() if f.address.pid in processes) From 3aefa76a40b2c630d0d26a0e5f5d0ba34ec7aff1 Mon Sep 17 00:00:00 2001 From: Yacine <16624109+yelhamer@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:23:42 +0100 Subject: [PATCH 44/44] Update doc/usage.md Co-authored-by: Willi Ballenthin --- doc/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/usage.md b/doc/usage.md index 457e6a4a7..949e03e14 100644 --- a/doc/usage.md +++ b/doc/usage.md @@ -18,7 +18,7 @@ To use this, you can copy the virtual addresses from your favorite disassembler ### only analyze selected processes Use the `--restrict-to-processes` option to extract capabilities from only a selected set of processes. This is useful for filtering the noise -generated from analyzing non-malicious malware processes that can be reported by some sandboxes, as well as reduce the execution time +generated from analyzing non-malicious processes that can be reported by some sandboxes, as well as reduce the execution time by not analyzing such processes in the first place. To use this, you can pick the PIDs of the processes you are interested in from the sandbox-generated process tree (or from the sandbox-reported malware PID)