# From patch 91aeb9bfd6aade6d305c7cfe7341e8cde52a834f
# "3432: ClamAV benchmarking scripts (#3453)" — Phil Dominguez, 2024-02-28
#
# backend/tools/collect_scan_metrics.py
import argparse
import glob
import logging
import time
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO

import requests


logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])


class ClamAVError(Exception):
    """Raised when a file could not be scanned by the ClamAV REST service."""

    def __init__(self, file):
        self.file = file

    def __str__(self):
        return "Static virus scan failed"


def _scan_file(file, filepath):
    """POST *file* to the local ClamAV REST endpoint and return the response.

    Args:
        file: a binary file-like object holding the file contents.
        filepath: original path, sent as the scan's display name.

    Raises:
        ClamAVError: on connection failure or any other request error.
    """
    try:
        logging.info(f"Scanning {filepath}")
        return requests.post(
            "http://clamav-rest:9000/scan",
            files={"file": file},
            data={"name": filepath},
            timeout=300,  # large files can take a while to scan
        )
    except requests.exceptions.ConnectionError:
        logging.error("SCAN Connection error")
        raise ClamAVError(filepath)
    except Exception as e:
        logging.error(f"SCAN EXCEPTION UNKNOWN {filepath} {e}")
        raise ClamAVError(filepath)


def scan_file(filepath):
    """Scan a single file and return its wall-clock duration in seconds.

    Returns None (after logging) when the scan failed, so a single bad file
    does not abort a whole benchmarking batch.
    """
    try:
        with open(filepath, "rb") as fh:
            file = BytesIO(fh.read())

        start = time.perf_counter()
        result = _scan_file(file, filepath)
        elapsed = time.perf_counter() - start

        # Surface non-200 scan responses instead of silently timing them.
        if not check_scan_ok(result):
            logging.error(f"SCAN NOT OK {filepath}")

        return elapsed
    except Exception as e:
        logging.error(f"SCAN SCAN_FILE {e}")
        return None


def scan_files_at_path(path, num_to_scan, max_workers):
    """Scan up to *num_to_scan* files matching ``path + '*'`` in parallel.

    Returns a list of per-file durations (seconds) for the scans that
    succeeded; failed scans (which yield None) are dropped so callers can
    safely aggregate with sum()/max().

    Raises:
        Exception: when no files match the pattern, or when every scan failed.
    """
    filepaths = glob.glob(path + "*")[:num_to_scan]
    if not filepaths:
        raise Exception(f"No files found at {path}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        timings = list(executor.map(scan_file, filepaths))

    # scan_file returns None on failure; filtering here prevents the stats
    # in main() from raising TypeError on a None entry.
    results = [t for t in timings if t is not None]
    if not results:
        raise Exception(f"All {len(filepaths)} scans failed")
    return results


def is_stringlike(o):
    """True when *o* is a str or bytes instance."""
    return isinstance(o, (str, bytes))


def not_a_stringlike(o):
    """Negation of is_stringlike()."""
    return not is_stringlike(o)


def check_scan_ok(result):
    """True when *result* is a non-string response object with HTTP status 200."""
    return bool(result and not_a_stringlike(result) and result.status_code == 200)


def main():
    """
    Outputs metrics from performing ClamAV file scans. Beware: ClamAV must be restarted
    between runs of this script (`docker restart backend-clamav-rest-1`) in order to
    clear the file cache.
    Usage:
        python collect_scan_metrics --path --num_to_scan --num_workers
    Example:
        python collect_scan_metrics --path 'metrics_files/*.xlsx' --num_to_scan 20 --num_workers 5
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--path", type=str, required=True, default=None)
    parser.add_argument("--num_to_scan", type=int, required=False, default=1)
    parser.add_argument("--num_workers", type=int, required=False, default=1)

    args = parser.parse_args()

    start = time.perf_counter()
    results = scan_files_at_path(args.path, args.num_to_scan, args.num_workers)
    real_time = time.perf_counter() - start

    # Report the number of files actually scanned successfully, which may be
    # fewer than --num_to_scan when the glob matches fewer files or scans fail.
    logging.info(f"Num files: {len(results)}")
    logging.info(f"Num workers: {args.num_workers}")
    logging.info(f"Real time: {real_time / 60} minutes")
    logging.info(f"Total time: {sum(results) / 60} minutes")
    logging.info(f"Max time: {max(results)} seconds")
    logging.info(f"Avg time: {sum(results) / len(results)} seconds")
if __name__ == "__main__":
    main()


# backend/tools/generate_xlsx_files.py (second file added by the same patch)
import argparse
import datetime
import logging
import os
import sys

import openpyxl


logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])


def generate_files(base_xlsx, num_files, output):
    """Create *num_files* unique copies of *base_xlsx* in directory *output*.

    Each copy differs from the base only in cell A1, which is stamped with the
    current timestamp; that timestamp (filesystem-safe form) also names the
    file, so every copy has distinct content and a distinct path.
    """
    logging.info(f"Loading base XLSX {base_xlsx}...")

    wb = openpyxl.load_workbook(base_xlsx)
    ws = wb.active

    logging.info(f"Creating {num_files} files from {base_xlsx} in {output}")

    for i in range(num_files):
        dt = datetime.datetime.now()
        ws["A1"] = dt
        # strftime rather than str(dt): the default string contains spaces
        # and colons, which are not valid in filenames on all platforms.
        path = os.path.join(output, f"{dt.strftime('%Y%m%d-%H%M%S-%f')}.xlsx")
        wb.save(path)
        logging.info(f"#{i + 1} Created: {path}")

    logging.info("Done")


def main():
    """
    Generates unique XLSX files by slightly modifying copies of the given base file.
    Used in conjunction with the collect_scan_metrics cmd.
    Usage:
        python tools/generate_xlsx_files.py --base_xlsx --num_files
    Example:
        python tools/generate_xlsx_files.py --base_xlsx 'output/181744-22/federal-awards-workbook-181744.xlsx' --num_files 5
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--output", type=str, required=False, default="./metrics_files")
    parser.add_argument("--base_xlsx", type=str, required=True, default=None)
    parser.add_argument("--num_files", type=int, required=False, default=1)

    args = parser.parse_args()

    output = args.output
    base_xlsx = args.base_xlsx
    num_files = args.num_files

    if not os.path.exists(output):
        try:
            # makedirs handles nested paths; plain mkdir would fail on them.
            os.makedirs(output)
            logging.info(f"Made directory {output}")
        except Exception as e:
            logging.error(f"Could not create directory {output}: {e}")
            # Exit non-zero so callers/scripts can detect the failure;
            # bare sys.exit() would report success (status 0).
            sys.exit(1)

    if not os.path.exists(base_xlsx):
        logging.error(f"Given base_xlsx {base_xlsx} does not exist")
        sys.exit(1)

    generate_files(base_xlsx, num_files, output)


if __name__ == "__main__":
    main()