From 1361e68c81bb37ec04a550cef202b1fa7eb83654 Mon Sep 17 00:00:00 2001 From: Dani Alcala <112832187+clavedeluna@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:44:11 -0300 Subject: [PATCH] Enable running codemodder as a library import (#879) * First steps towards refactoring as a library * Make sure no threads are used for maxworkers=1 case * Fix up some types and defaults * organize run args * run returns output * make output a Path * make dry run required * fix sast only filtering --------- Co-authored-by: Daniel D'Avella --- README.md | 19 ++- src/codemodder/__init__.py | 4 + src/codemodder/cli.py | 1 + src/codemodder/codemodder.py | 162 ++++++++++++++++-------- src/codemodder/codemods/base_codemod.py | 13 +- src/codemodder/codetf.py | 8 +- src/codemodder/context.py | 30 +++-- tests/test_codemodder.py | 66 +++++++--- 8 files changed, 207 insertions(+), 96 deletions(-) diff --git a/README.md b/README.md index 16c63c67..2cffc8c5 100644 --- a/README.md +++ b/README.md @@ -30,9 +30,11 @@ To install the package from source, use `pip`: $ pip install /path/to/codemodder-python ``` -## Running Locally +## Running `codemodder` -The codemodder package provides an executable called `codemodder`. This should be available on your path by default after installation. +### CLI + +Codemodder can be run as a CLI. The codemodder package provides an executable called `codemodder`. This should be available on your path by default after installation. For basic usage, run the `codemodder` command with a target directory path: @@ -55,6 +57,19 @@ For a full list of options, use the `--help` flag: $ codemodder --help ``` +### Library + +You can also run `codemodder` as a library by importing the module and running `run`. For basic usage, pass a target directory path and the `dry_run` argument: + +```python +import codemodder + +output, exit_code = codemodder.run("/path/to/my-project", dry_run=True) +``` + +Unlike the CLI which has a default `dry_run` of `False`, when calling `codemodder` as a library you must indicate if you want `codemodder` to make changes to your files. + + ## Architecture Codemods are composed of the following key components: diff --git a/src/codemodder/__init__.py b/src/codemodder/__init__.py index e3389fa7..bd824baf 100644 --- a/src/codemodder/__init__.py +++ b/src/codemodder/__init__.py @@ -2,3 +2,7 @@ from ._version import __version__ except ImportError: # pragma: no cover __version__ = "unknown" + +from codemodder.codemodder import run + +__all__ = ["run", "__version__"] diff --git a/src/codemodder/cli.py b/src/codemodder/cli.py index c3b93612..5b3f47dc 100644 --- a/src/codemodder/cli.py +++ b/src/codemodder/cli.py @@ -121,6 +121,7 @@ def parse_args(argv, codemod_registry: CodemodRegistry): parser.add_argument( "--dry-run", action=argparse.BooleanOptionalAction, + default=False, help="do everything except make changes to files", ) parser.add_argument( diff --git a/src/codemodder/codemodder.py b/src/codemodder/codemodder.py index d3d98428..2094f405 100644 --- a/src/codemodder/codemodder.py +++ b/src/codemodder/codemodder.py @@ -3,6 +3,7 @@ import logging import os import sys +from collections import defaultdict from pathlib import Path from typing import DefaultDict, Sequence @@ -14,7 +15,13 @@ from codemodder.context import CodemodExecutionContext from codemodder.dependency import Dependency from codemodder.llm import MisconfiguredAIClient -from codemodder.logging import configure_logger, log_list, log_section, logger +from codemodder.logging import ( + OutputFormat, + configure_logger, + log_list, + log_section, + logger, +) from codemodder.project_analysis.file_parsers.package_store import PackageStore from codemodder.project_analysis.python_repo_manager import PythonRepoManager from codemodder.result import ResultSet @@ -45,7 +52,7 @@ def find_semgrep_results( return run_semgrep(context, yaml_files, files_to_analyze) -def log_report(context, argv, elapsed_ms, files_to_analyze): +def log_report(context, output, elapsed_ms, files_to_analyze): log_section("report") logger.info("scanned: %s files", len(files_to_analyze)) all_failures = context.get_failed_files() @@ -60,7 +67,7 @@ def log_report(context, argv, elapsed_ms, files_to_analyze): len(all_changes), len(set(all_changes)), ) - logger.info("report file: %s", argv.output) + logger.info("report file: %s", output) logger.info("total elapsed: %s ms", elapsed_ms) logger.info(" semgrep: %s ms", context.timer.get_time_ms("semgrep")) logger.info(" parse: %s ms", context.timer.get_time_ms("parse")) @@ -111,79 +118,79 @@ def record_dependency_update(dependency_results: dict[Dependency, PackageStore | logger.debug("The following dependencies could not be added: %s", str_list) -def run(original_args) -> int: +def run( + directory: Path | str, + dry_run: bool, + output: Path | str | None = None, + output_format: str = "codetf", + verbose: bool = False, + log_format: OutputFormat = OutputFormat.JSON, + project_name: str | None = None, + tool_result_files_map: DefaultDict[str, list[str]] = defaultdict(list), + path_include: list[str] | None = None, + path_exclude: list[str] | None = None, + codemod_include: list[str] | None = None, + codemod_exclude: list[str] | None = None, + max_workers: int = 1, + original_cli_args: list[str] | None = None, + codemod_registry: registry.CodemodRegistry | None = None, + sast_only: bool = False, +) -> tuple[CodeTF | None, int]: start = datetime.datetime.now() - codemod_registry = registry.load_registered_codemods() - provider_registry = providers.load_providers() + codemod_registry = codemod_registry or registry.load_registered_codemods() - # A little awkward, but we need the codemod registry in order to validate potential arguments - argv = parse_args(original_args, codemod_registry) - if not os.path.exists(argv.directory): - logger.error( - "given directory '%s' doesn't exist or can’t be read", - argv.directory, - ) - return 1 + path_include = path_include or [] + path_exclude = path_exclude or [] + codemod_include = codemod_include or [] + codemod_exclude = codemod_exclude or [] + + provider_registry = providers.load_providers() - configure_logger(argv.verbose, argv.log_format, argv.project_name) + configure_logger(verbose, log_format, project_name) log_section("startup") logger.info("codemodder: python/%s", __version__) - logger.info("command: %s %s", Path(sys.argv[0]).name, " ".join(original_args)) - - try: - # TODO: this should be dict[str, list[Path]] - tool_result_files_map: DefaultDict[str, list[str]] = detect_sarif_tools( - [Path(name) for name in argv.sarif or []] - ) - except (DuplicateToolError, FileNotFoundError) as err: - logger.error(err) - return 1 - - tool_result_files_map["sonar"].extend(argv.sonar_issues_json or []) - tool_result_files_map["sonar"].extend(argv.sonar_hotspots_json or []) - tool_result_files_map["defectdojo"] = argv.defectdojo_findings_json or [] for file_name in itertools.chain(*tool_result_files_map.values()): if not os.path.exists(file_name): logger.error( f"FileNotFoundError: [Errno 2] No such file or directory: '{file_name}'" ) - return 1 + return None, 1 - repo_manager = PythonRepoManager(Path(argv.directory)) + repo_manager = PythonRepoManager(Path(directory)) try: context = CodemodExecutionContext( - Path(argv.directory), - argv.dry_run, - argv.verbose, + Path(directory), + dry_run, + verbose, codemod_registry, provider_registry, repo_manager, - argv.path_include, - argv.path_exclude, + path_include, + path_exclude, tool_result_files_map, - argv.max_workers, + max_workers, ) except MisconfiguredAIClient as e: logger.error(e) - return 3 # Codemodder instructions conflicted (according to spec) + return None, 3 # Codemodder instructions conflicted (according to spec) - repo_manager.parse_project() + context.repo_manager.parse_project() # TODO: this should be a method of CodemodExecutionContext codemods_to_run = codemod_registry.match_codemods( - argv.codemod_include, - argv.codemod_exclude, - sast_only=argv.sonar_issues_json or argv.sarif, + codemod_include, + codemod_exclude, + sast_only=sast_only, ) log_section("setup") log_list(logging.INFO, "running", codemods_to_run, predicate=lambda c: c.id) log_list(logging.INFO, "including paths", context.included_paths) - log_list(logging.INFO, "excluding paths", argv.path_exclude) + log_list(logging.INFO, "excluding paths", path_exclude) log_list( logging.DEBUG, "matched files", (str(path) for path in context.files_to_analyze) @@ -203,24 +210,71 @@ def run(original_args) -> int: elapsed = datetime.datetime.now() - start elapsed_ms = int(elapsed.total_seconds() * 1000) - if argv.output: - codetf = CodeTF.build( - context, - elapsed_ms, - original_args, - context.compile_results(codemods_to_run), - ) - codetf.write_report(argv.output) + logger.debug("Output format %s", output_format) + codetf = CodeTF.build( + context, + elapsed_ms, + original_cli_args or [], + context.compile_results(codemods_to_run), + ) + if output: + codetf.write_report(output) log_report( context, - argv, + output, elapsed_ms, [] if not codemods_to_run else context.files_to_analyze, ) - return 0 + return codetf, 0 + + +def _run_cli(original_args) -> int: + codemod_registry = registry.load_registered_codemods() + argv = parse_args(original_args, codemod_registry) + if not os.path.exists(argv.directory): + logger.error( + "given directory '%s' doesn't exist or can’t be read", + argv.directory, + ) + return 1 + + try: + # TODO: this should be dict[str, list[Path]] + tool_result_files_map: DefaultDict[str, list[str]] = detect_sarif_tools( + [Path(name) for name in argv.sarif or []] + ) + except (DuplicateToolError, FileNotFoundError) as err: + logger.error(err) + return 1 + + tool_result_files_map["sonar"].extend(argv.sonar_issues_json or []) + tool_result_files_map["sonar"].extend(argv.sonar_hotspots_json or []) + tool_result_files_map["defectdojo"].extend(argv.defectdojo_findings_json or []) + + logger.info("command: %s %s", Path(sys.argv[0]).name, " ".join(original_args)) + + _, status = run( + argv.directory, + argv.dry_run, + argv.output, + argv.output_format, + argv.verbose, + argv.log_format, + argv.project_name, + tool_result_files_map, + argv.path_include, + argv.path_exclude, + argv.codemod_include, + argv.codemod_exclude, + max_workers=argv.max_workers, + original_cli_args=original_args, + codemod_registry=codemod_registry, + sast_only=argv.sonar_issues_json or argv.sarif, + ) + return status def main(): sys_argv = sys.argv[1:] - sys.exit(run(sys_argv)) + sys.exit(_run_cli(sys_argv)) diff --git a/src/codemodder/codemods/base_codemod.py b/src/codemodder/codemods/base_codemod.py index c85870ba..7a458c27 100644 --- a/src/codemodder/codemods/base_codemod.py +++ b/src/codemodder/codemods/base_codemod.py @@ -229,10 +229,15 @@ def _apply( self._process_file, context=context, results=results, rules=rules ) - with ThreadPoolExecutor() as executor: - logger.debug("using executor with %s workers", context.max_workers) - contexts = executor.map(process_file, files_to_analyze) - executor.shutdown(wait=True) + contexts = [] + if context.max_workers == 1: + logger.debug("processing files serially") + contexts.extend([process_file(file) for file in files_to_analyze]) + else: + with ThreadPoolExecutor() as executor: + logger.debug("using executor with %s workers", context.max_workers) + contexts.extend(executor.map(process_file, files_to_analyze)) + executor.shutdown(wait=True) context.process_results(self.id, contexts) diff --git a/src/codemodder/codetf.py b/src/codemodder/codetf.py index 579accc4..c67acbd5 100644 --- a/src/codemodder/codetf.py +++ b/src/codemodder/codetf.py @@ -9,6 +9,7 @@ import os import sys from enum import Enum +from pathlib import Path from typing import TYPE_CHECKING, Optional from pydantic import BaseModel, model_validator @@ -165,7 +166,7 @@ def build( cls, context: CodemodExecutionContext, elapsed_ms, - original_args, + original_args: list, results: list[Result], ): command_name = os.path.basename(sys.argv[0]) @@ -183,10 +184,9 @@ def build( ) return cls(run=run, results=results) - def write_report(self, outfile): + def write_report(self, outfile: Path | str): try: - with open(outfile, "w", encoding="utf-8") as f: - f.write(self.model_dump_json(exclude_none=True)) + Path(outfile).write_text(self.model_dump_json(exclude_none=True)) except Exception: logger.exception("failed to write report file.") # Any issues with writing the output file should exit status 2. diff --git a/src/codemodder/context.py b/src/codemodder/context.py index 98fa9aac..11ab540c 100644 --- a/src/codemodder/context.py +++ b/src/codemodder/context.py @@ -21,8 +21,8 @@ from codemodder.logging import log_list, logger from codemodder.project_analysis.file_parsers.package_store import PackageStore from codemodder.project_analysis.python_repo_manager import PythonRepoManager -from codemodder.providers import ProviderRegistry -from codemodder.registry import CodemodRegistry +from codemodder.providers import ProviderRegistry, load_providers +from codemodder.registry import CodemodRegistry, load_registered_codemods from codemodder.result import ResultSet from codemodder.utils.timer import Timer from codemodder.utils.update_finding_metadata import update_finding_metadata @@ -58,12 +58,12 @@ def __init__( self, directory: Path, dry_run: bool, - verbose: bool, - registry: CodemodRegistry, - providers: ProviderRegistry, - repo_manager: PythonRepoManager, - path_include: list[str], - path_exclude: list[str], + verbose: bool = False, + registry: CodemodRegistry | None = None, + providers: ProviderRegistry | None = None, + repo_manager: PythonRepoManager | None = None, + path_include: list[str] | None = None, + path_exclude: list[str] | None = None, tool_result_files_map: dict[str, list[str]] | None = None, max_workers: int = 1, ): @@ -75,12 +75,12 @@ def __init__( self._dependency_update_by_codemod = {} self._unfixed_findings_by_codemod = {} self.dependencies = {} - self.registry = registry - self.providers = providers - self.repo_manager = repo_manager + self.registry = registry or load_registered_codemods() + self.providers = providers or load_providers() + self.repo_manager = repo_manager or PythonRepoManager(directory) self.timer = Timer() - self.path_include = path_include - self.path_exclude = path_exclude + self.path_include = path_include or [] + self.path_exclude = path_exclude or [] self.max_workers = max_workers self.tool_result_files_map = tool_result_files_map or {} self.semgrep_prefilter_results = None @@ -173,7 +173,9 @@ def add_unfixed_findings( unfixed_findings ) - def process_results(self, codemod_id: str, results: Iterator[FileContext]): + def process_results( + self, codemod_id: str, results: Iterator[FileContext] | list[FileContext] + ): for file_context in results: self.add_changesets(codemod_id, file_context.changesets) self.add_failures(codemod_id, file_context.failures) diff --git a/tests/test_codemodder.py b/tests/test_codemodder.py index f7438f15..d7fa54f2 100644 --- a/tests/test_codemodder.py +++ b/tests/test_codemodder.py @@ -4,7 +4,8 @@ import mock import pytest -from codemodder.codemodder import find_semgrep_results, run +from codemodder import run +from codemodder.codemodder import _run_cli, find_semgrep_results from codemodder.diff import create_diff_from_tree from codemodder.registry import load_registered_codemods from codemodder.result import ResultSet @@ -57,7 +58,7 @@ def func(foo=[]): return code_dir, codetf -class TestRun: +class TestRunCli: @mock.patch("libcst.parse_module") def test_no_files_matched(self, mock_parse, dir_structure): code_dir, codetf = dir_structure @@ -69,7 +70,7 @@ def test_no_files_matched(self, mock_parse, dir_structure): "--path-exclude", "*py", ] - res = run(args) + res = _run_cli(args) assert res == 0 mock_parse.assert_not_called() @@ -86,7 +87,7 @@ def test_skip_symlinks(self, mocker, dir_structure): str(codetf), "--codemod-include=pixee:python/url-sandbox", ] - res = run(args) + res = _run_cli(args) assert res == 0 @mock.patch("libcst.parse_module", side_effect=Exception) @@ -103,7 +104,7 @@ def test_cst_parsing_fails(self, build_report, mock_parse, dir_structure): "*request.py", ] - res = run(args) + res = _run_cli(args) assert res == 0 mock_parse.assert_called() @@ -145,7 +146,7 @@ def test_dry_run(self, mocker, dir_structure): assert not codetf.exists() - res = run(args) + res = _run_cli(args) assert res == 0 assert codetf.exists() transform_apply.assert_called() @@ -165,7 +166,7 @@ def test_reporting(self, mock_reporting, dry_run, dir_structure): if dry_run: args += ["--dry-run"] - res = run(args) + res = _run_cli(args) assert res == 0 mock_reporting.assert_called_once() @@ -178,7 +179,7 @@ def test_reporting(self, mock_reporting, dry_run, dir_structure): mock_reporting.return_value.write_report.assert_called_once() -class TestCodemodIncludeExclude: +class TestCliCodemodIncludeExclude: @mock.patch("codemodder.codetf.CodeTF.write_report") def test_codemod_include_no_match(self, write_report, dir_structure, caplog): @@ -192,7 +193,7 @@ def test_codemod_include_no_match(self, write_report, dir_structure, caplog): ] caplog.set_level(logging.INFO) - run(args) + _run_cli(args) write_report.assert_called_once() assert "no codemods to run" in caplog.text @@ -214,7 +215,7 @@ def test_codemod_include_some_match(self, write_report, dir_structure, caplog): f"--codemod-include={bad_codemod},{good_codemod}", ] caplog.set_level(logging.INFO) - run(args) + _run_cli(args) write_report.assert_called_once() assert f"running codemod {good_codemod}" in caplog.text assert ( @@ -234,7 +235,7 @@ def test_codemod_exclude_some_match(self, write_report, dir_structure, caplog): f"--codemod-exclude={bad_codemod},{good_codemod}", ] caplog.set_level(logging.INFO) - run(args) + _run_cli(args) write_report.assert_called_once() assert f"running codemod {good_codemod}" not in caplog.text @@ -252,7 +253,7 @@ def test_codemod_exclude_no_match(self, apply, write_report, dir_structure, capl f"--codemod-exclude={bad_codemod}", ] caplog.set_level(logging.INFO) - run(args) + _run_cli(args) write_report.assert_called_once() assert "running codemod " in caplog.text @@ -270,13 +271,13 @@ def test_exclude_all_registered_codemods(self, mock_semgrep_run, dir_structure): f"--codemod-exclude={names}", ] - exit_code = run(args) + exit_code = _run_cli(args) assert exit_code == 0 mock_semgrep_run.assert_not_called() assert codetf.exists() -class TestExitCode: +class TestCliExitCode: @mock.patch("codemodder.codetf.CodeTF.write_report") def test_no_changes_success_0(self, mock_report, dir_structure): del mock_report @@ -290,7 +291,7 @@ def test_no_changes_success_0(self, mock_report, dir_structure): "*request.py", ] - exit_code = run(args) + exit_code = _run_cli(args) assert exit_code == 0 @mock.patch("codemodder.codetf.CodeTF.write_report") @@ -303,7 +304,7 @@ def test_bad_project_dir_1(self, mock_report): "--codemod-include=pixee:python/url-sandbox", ] - exit_code = run(args) + exit_code = _run_cli(args) assert exit_code == 1 @mock.patch("codemodder.codetf.CodeTF.write_report") @@ -319,7 +320,7 @@ def test_conflicting_include_exclude(self, mock_report): "secure-random", ] with pytest.raises(SystemExit) as err: - run(args) + _run_cli(args) assert err.value.args[0] == 3 def test_find_semgrep_results(self, mocker): @@ -383,7 +384,36 @@ def test_bad_sarif_path(self, mock_report, caplog, flag): f"--{flag}=bad.json", ] - exit_code = run(args) + exit_code = _run_cli(args) assert exit_code == 1 assert "No such file or directory: 'bad.json'" in caplog.text mock_report.assert_not_called() + + +class TestRun: + @mock.patch("libcst.parse_module") + def test_run_basic_call(self, mock_parse, dir_structure): + code_dir, codetf = dir_structure + + codetf_output, status = run(code_dir, dry_run=True) + assert status == 0 + assert codetf_output + assert codetf_output.run.directory == str(code_dir) + mock_parse.assert_not_called() + assert not codetf.exists() + + @mock.patch("libcst.parse_module") + def test_run_with_output(self, mock_parse, dir_structure): + code_dir, codetf = dir_structure + + codetf_output, status = run( + code_dir, + output=codetf, + dry_run=True, + codemod_include=["pixee:python/url-sandbox"], + ) + assert status == 0 + assert codetf_output + assert codetf_output.run.directory == str(code_dir) + mock_parse.assert_not_called() + assert codetf.exists()