Skip to content

Commit

Permalink
Enable running codemodder as a library import (#879)
Browse files Browse the repository at this point in the history
* First steps towards refactoring as a library

* Make sure no threads are used for maxworkers=1 case

* Fix up some types and defaults

* organize run args

* run returns output

* make output a Path

* make dry run required

* fix sast only filtering

---------

Co-authored-by: Daniel D'Avella <[email protected]>
  • Loading branch information
clavedeluna and drdavella authored Oct 15, 2024
1 parent c2560fc commit 1361e68
Show file tree
Hide file tree
Showing 8 changed files with 207 additions and 96 deletions.
19 changes: 17 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,11 @@ To install the package from source, use `pip`:
$ pip install /path/to/codemodder-python
```

## Running Locally
## Running `codemodder`

The codemodder package provides an executable called `codemodder`. This should be available on your path by default after installation.
### CLI

Codemodder can be run as a CLI. The codemodder package provides an executable called `codemodder`. This should be available on your path by default after installation.

For basic usage, run the `codemodder` command with a target directory path:

Expand All @@ -55,6 +57,19 @@ For a full list of options, use the `--help` flag:
$ codemodder --help
```

### Library

You can also use `codemodder` as a library by importing the package and calling `codemodder.run`. For basic usage, pass a target directory path and the `dry_run` argument:

```python
import codemodder

output, exit_code = codemodder.run("/path/to/my-project", dry_run=True)
```

Unlike the CLI, which has a default `dry_run` of `False`, the library API has no default: when calling `codemodder.run` you must pass `dry_run` explicitly to indicate whether you want `codemodder` to make changes to your files.


## Architecture

Codemods are composed of the following key components:
Expand Down
4 changes: 4 additions & 0 deletions src/codemodder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
from ._version import __version__
except ImportError: # pragma: no cover
__version__ = "unknown"

from codemodder.codemodder import run

__all__ = ["run", "__version__"]
1 change: 1 addition & 0 deletions src/codemodder/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def parse_args(argv, codemod_registry: CodemodRegistry):
parser.add_argument(
"--dry-run",
action=argparse.BooleanOptionalAction,
default=False,
help="do everything except make changes to files",
)
parser.add_argument(
Expand Down
162 changes: 108 additions & 54 deletions src/codemodder/codemodder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import sys
from collections import defaultdict
from pathlib import Path
from typing import DefaultDict, Sequence

Expand All @@ -14,7 +15,13 @@
from codemodder.context import CodemodExecutionContext
from codemodder.dependency import Dependency
from codemodder.llm import MisconfiguredAIClient
from codemodder.logging import configure_logger, log_list, log_section, logger
from codemodder.logging import (
OutputFormat,
configure_logger,
log_list,
log_section,
logger,
)
from codemodder.project_analysis.file_parsers.package_store import PackageStore
from codemodder.project_analysis.python_repo_manager import PythonRepoManager
from codemodder.result import ResultSet
Expand Down Expand Up @@ -45,7 +52,7 @@ def find_semgrep_results(
return run_semgrep(context, yaml_files, files_to_analyze)


def log_report(context, argv, elapsed_ms, files_to_analyze):
def log_report(context, output, elapsed_ms, files_to_analyze):
log_section("report")
logger.info("scanned: %s files", len(files_to_analyze))
all_failures = context.get_failed_files()
Expand All @@ -60,7 +67,7 @@ def log_report(context, argv, elapsed_ms, files_to_analyze):
len(all_changes),
len(set(all_changes)),
)
logger.info("report file: %s", argv.output)
logger.info("report file: %s", output)
logger.info("total elapsed: %s ms", elapsed_ms)
logger.info(" semgrep: %s ms", context.timer.get_time_ms("semgrep"))
logger.info(" parse: %s ms", context.timer.get_time_ms("parse"))
Expand Down Expand Up @@ -111,79 +118,79 @@ def record_dependency_update(dependency_results: dict[Dependency, PackageStore |
logger.debug("The following dependencies could not be added: %s", str_list)


def run(original_args) -> int:
def run(
directory: Path | str,
dry_run: bool,
output: Path | str | None = None,
output_format: str = "codetf",
verbose: bool = False,
log_format: OutputFormat = OutputFormat.JSON,
project_name: str | None = None,
tool_result_files_map: DefaultDict[str, list[str]] = defaultdict(list),
path_include: list[str] | None = None,
path_exclude: list[str] | None = None,
codemod_include: list[str] | None = None,
codemod_exclude: list[str] | None = None,
max_workers: int = 1,
original_cli_args: list[str] | None = None,
codemod_registry: registry.CodemodRegistry | None = None,
sast_only: bool = False,
) -> tuple[CodeTF | None, int]:
start = datetime.datetime.now()

codemod_registry = registry.load_registered_codemods()
provider_registry = providers.load_providers()
codemod_registry = codemod_registry or registry.load_registered_codemods()

# A little awkward, but we need the codemod registry in order to validate potential arguments
argv = parse_args(original_args, codemod_registry)
if not os.path.exists(argv.directory):
logger.error(
"given directory '%s' doesn't exist or can’t be read",
argv.directory,
)
return 1
path_include = path_include or []
path_exclude = path_exclude or []
codemod_include = codemod_include or []
codemod_exclude = codemod_exclude or []

provider_registry = providers.load_providers()

configure_logger(argv.verbose, argv.log_format, argv.project_name)
configure_logger(verbose, log_format, project_name)

log_section("startup")
logger.info("codemodder: python/%s", __version__)
logger.info("command: %s %s", Path(sys.argv[0]).name, " ".join(original_args))

try:
# TODO: this should be dict[str, list[Path]]
tool_result_files_map: DefaultDict[str, list[str]] = detect_sarif_tools(
[Path(name) for name in argv.sarif or []]
)
except (DuplicateToolError, FileNotFoundError) as err:
logger.error(err)
return 1

tool_result_files_map["sonar"].extend(argv.sonar_issues_json or [])
tool_result_files_map["sonar"].extend(argv.sonar_hotspots_json or [])
tool_result_files_map["defectdojo"] = argv.defectdojo_findings_json or []

for file_name in itertools.chain(*tool_result_files_map.values()):
if not os.path.exists(file_name):
logger.error(
f"FileNotFoundError: [Errno 2] No such file or directory: '{file_name}'"
)
return 1
return None, 1

repo_manager = PythonRepoManager(Path(argv.directory))
repo_manager = PythonRepoManager(Path(directory))

try:
context = CodemodExecutionContext(
Path(argv.directory),
argv.dry_run,
argv.verbose,
Path(directory),
dry_run,
verbose,
codemod_registry,
provider_registry,
repo_manager,
argv.path_include,
argv.path_exclude,
path_include,
path_exclude,
tool_result_files_map,
argv.max_workers,
max_workers,
)
except MisconfiguredAIClient as e:
logger.error(e)
return 3 # Codemodder instructions conflicted (according to spec)
return None, 3 # Codemodder instructions conflicted (according to spec)

repo_manager.parse_project()
context.repo_manager.parse_project()

# TODO: this should be a method of CodemodExecutionContext
codemods_to_run = codemod_registry.match_codemods(
argv.codemod_include,
argv.codemod_exclude,
sast_only=argv.sonar_issues_json or argv.sarif,
codemod_include,
codemod_exclude,
sast_only=sast_only,
)

log_section("setup")
log_list(logging.INFO, "running", codemods_to_run, predicate=lambda c: c.id)
log_list(logging.INFO, "including paths", context.included_paths)
log_list(logging.INFO, "excluding paths", argv.path_exclude)
log_list(logging.INFO, "excluding paths", path_exclude)

log_list(
logging.DEBUG, "matched files", (str(path) for path in context.files_to_analyze)
Expand All @@ -203,24 +210,71 @@ def run(original_args) -> int:
elapsed = datetime.datetime.now() - start
elapsed_ms = int(elapsed.total_seconds() * 1000)

if argv.output:
codetf = CodeTF.build(
context,
elapsed_ms,
original_args,
context.compile_results(codemods_to_run),
)
codetf.write_report(argv.output)
logger.debug("Output format %s", output_format)
codetf = CodeTF.build(
context,
elapsed_ms,
original_cli_args or [],
context.compile_results(codemods_to_run),
)
if output:
codetf.write_report(output)

log_report(
context,
argv,
output,
elapsed_ms,
[] if not codemods_to_run else context.files_to_analyze,
)
return 0
return codetf, 0


def _run_cli(original_args) -> int:
    """Parse command-line arguments and delegate to :func:`run`.

    Args:
        original_args: The raw CLI argument list (without the program name).

    Returns:
        The process exit status: ``1`` if the target directory does not exist
        or the SARIF inputs cannot be resolved to tools, otherwise the status
        code returned by :func:`run`.
    """
    # The registry is loaded first because ``parse_args`` needs it.
    codemod_registry = registry.load_registered_codemods()
    argv = parse_args(original_args, codemod_registry)
    if not os.path.exists(argv.directory):
        logger.error(
            "given directory '%s' doesn't exist or can’t be read",
            argv.directory,
        )
        return 1

    try:
        # TODO: this should be dict[str, list[Path]]
        tool_result_files_map: DefaultDict[str, list[str]] = detect_sarif_tools(
            [Path(name) for name in argv.sarif or []]
        )
    except (DuplicateToolError, FileNotFoundError) as err:
        logger.error(err)
        return 1

    tool_result_files_map["sonar"].extend(argv.sonar_issues_json or [])
    tool_result_files_map["sonar"].extend(argv.sonar_hotspots_json or [])
    tool_result_files_map["defectdojo"].extend(argv.defectdojo_findings_json or [])

    # NOTE(review): this message is emitted before ``run`` calls
    # ``configure_logger`` -- confirm that is the intended ordering.
    logger.info("command: %s %s", Path(sys.argv[0]).name, " ".join(original_args))

    # Keyword arguments keep this call correct even if ``run``'s long
    # parameter list is reordered or extended.
    _, status = run(
        directory=argv.directory,
        dry_run=argv.dry_run,
        output=argv.output,
        output_format=argv.output_format,
        verbose=argv.verbose,
        log_format=argv.log_format,
        project_name=argv.project_name,
        tool_result_files_map=tool_result_files_map,
        path_include=argv.path_include,
        path_exclude=argv.path_exclude,
        codemod_include=argv.codemod_include,
        codemod_exclude=argv.codemod_exclude,
        max_workers=argv.max_workers,
        original_cli_args=original_args,
        codemod_registry=codemod_registry,
        # ``run`` declares ``sast_only`` as a bool; the argparse values are
        # lists (or None), so collapse them to a real boolean.
        sast_only=bool(argv.sonar_issues_json or argv.sarif),
    )
    return status


def main():
sys_argv = sys.argv[1:]
sys.exit(run(sys_argv))
sys.exit(_run_cli(sys_argv))
13 changes: 9 additions & 4 deletions src/codemodder/codemods/base_codemod.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,15 @@ def _apply(
self._process_file, context=context, results=results, rules=rules
)

with ThreadPoolExecutor() as executor:
logger.debug("using executor with %s workers", context.max_workers)
contexts = executor.map(process_file, files_to_analyze)
executor.shutdown(wait=True)
contexts = []
if context.max_workers == 1:
logger.debug("processing files serially")
contexts.extend([process_file(file) for file in files_to_analyze])
else:
with ThreadPoolExecutor() as executor:
logger.debug("using executor with %s workers", context.max_workers)
contexts.extend(executor.map(process_file, files_to_analyze))
executor.shutdown(wait=True)

context.process_results(self.id, contexts)

Expand Down
8 changes: 4 additions & 4 deletions src/codemodder/codetf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os
import sys
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Optional

from pydantic import BaseModel, model_validator
Expand Down Expand Up @@ -165,7 +166,7 @@ def build(
cls,
context: CodemodExecutionContext,
elapsed_ms,
original_args,
original_args: list,
results: list[Result],
):
command_name = os.path.basename(sys.argv[0])
Expand All @@ -183,10 +184,9 @@ def build(
)
return cls(run=run, results=results)

def write_report(self, outfile):
def write_report(self, outfile: Path | str):
try:
with open(outfile, "w", encoding="utf-8") as f:
f.write(self.model_dump_json(exclude_none=True))
Path(outfile).write_text(self.model_dump_json(exclude_none=True))
except Exception:
logger.exception("failed to write report file.")
# Any issues with writing the output file should exit status 2.
Expand Down
Loading

0 comments on commit 1361e68

Please sign in to comment.