3432: ClamAV benchmarking scripts (#3453)

* Creating cmd to scan local files * Collecting metrics * Increasing timeout * Creating cmd to generate xlsx files * Cleanup * Running scans in parallel * Adding real time * Moving cmds to standalone scripts * Comment * Minor tweaks * Comment * Lint * Lint
GSA-TTS · Feb 28, 2024 · 91aeb9b · 91aeb9b
1 parent dc1e0c1
commit 91aeb9b
Show file tree

Hide file tree

Showing 2 changed files with 179 additions and 0 deletions.
diff --git a/backend/tools/collect_scan_metrics.py b/backend/tools/collect_scan_metrics.py
@@ -0,0 +1,112 @@
+import argparse
+import glob
+import logging
+import requests
+import time
+from concurrent.futures import ThreadPoolExecutor
+from io import BytesIO
+
+
+# Configure the root logger at import time: INFO level, emitted through a single
+# StreamHandler (which writes to stderr by default).
+logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])
+
+
+class ClamAVError(Exception):
+    """Error raised when a ClamAV scan request could not be completed.
+
+    The constructor argument is kept on ``self.file``; the string form of the
+    exception is a fixed message that does not include it.
+    """
+
+    _MESSAGE = "Static virus scan failed"
+
+    def __init__(self, file):
+        self.file = file
+
+    def __str__(self):
+        return self._MESSAGE
+
+
+def _scan_file(file, filepath):
+    """POST a file to the ClamAV REST scan endpoint and return the HTTP response.
+
+    Args:
+        file: Object uploaded as the ``file`` multipart field.
+        filepath: Path of the file; logged and sent as the ``name`` form field.
+
+    Returns:
+        The response object returned by ``requests.post``. Its status code is not
+        inspected here.
+
+    Raises:
+        ClamAVError: If the request raises for any reason. The underlying exception
+            is chained as ``__cause__`` so the root cause stays visible in tracebacks.
+    """
+    try:
+        logging.info(f"Scanning {filepath}")
+        return requests.post(
+            "http://clamav-rest:9000/scan",
+            files={"file": file},
+            data={"name": filepath},
+            # Generous timeout (seconds): scans of large files can be slow.
+            timeout=300,
+        )
+    except requests.exceptions.ConnectionError as e:
+        logging.error("SCAN Connection error")
+        raise ClamAVError(filepath) from e
+    except Exception as e:
+        logging.error(f"SCAN EXCEPTION UNKNOWN {filepath} {e}")
+        raise ClamAVError(filepath) from e
+
+
+def scan_file(filepath):
+    """Scan a single file and return how long the scan request took.
+
+    The file is read fully into memory before the timer starts, so the measured
+    interval covers only the call to ``_scan_file``.
+
+    Args:
+        filepath: Path of the file to read and submit for scanning.
+
+    Returns:
+        Elapsed wall-clock seconds (``time.perf_counter``) for the scan request, or
+        ``None`` if reading the file or the request failed (the error is logged).
+    """
+    try:
+        with open(filepath, "rb") as fh:
+            file = BytesIO(fh.read())
+
+        started = time.perf_counter()
+        response = _scan_file(file, filepath)
+        elapsed = time.perf_counter() - started
+    except Exception as e:
+        # Best-effort: one bad file must not abort the whole benchmark run.
+        logging.error(f"SCAN SCAN_FILE {e}")
+        return None
+
+    # The timing is still reported, but flag responses that were not a clean 200 so
+    # that they are not silently mistaken for representative scans.
+    if not check_scan_ok(response):
+        logging.warning(f"SCAN NOT OK {filepath}")
+
+    return elapsed
+
+
+def scan_files_at_path(path, num_to_scan, max_workers):
+    """Scan up to ``num_to_scan`` files matching ``path`` using a thread pool.
+
+    A ``*`` is appended to ``path`` before globbing, and only the first
+    ``num_to_scan`` matches are kept.
+
+    Args:
+        path: Glob pattern prefix for the files to scan.
+        num_to_scan: Maximum number of matched files to scan.
+        max_workers: Size of the thread pool.
+
+    Returns:
+        A list with one ``scan_file`` result per selected file, in the same order
+        as the selected paths.
+
+    Raises:
+        Exception: If no files are selected.
+    """
+    matches = glob.glob(path + "*")
+    selected = matches[:num_to_scan]
+    if len(selected) == 0:
+        raise Exception(f"No files found at {path}")
+
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        timings = pool.map(scan_file, selected)
+        return list(timings)
+
+
+def is_stringlike(o):
+    """Return True when ``o`` is a ``str`` or ``bytes`` instance."""
+    return isinstance(o, (str, bytes))
+
+
+def not_a_stringlike(o):
+    """Return True when ``o`` is neither a ``str`` nor a ``bytes`` instance."""
+    if isinstance(o, (str, bytes)):
+        return False
+    return True
+
+
+def check_scan_ok(result):
+    """Return True only for a truthy, non-string result whose status code is 200."""
+    # Guard clauses, evaluated in the same order as the checks they replace.
+    if not result:
+        return False
+    if not not_a_stringlike(result):
+        return False
+    return True if result.status_code == 200 else False
+
+
+def main():
+    """
+    Outputs metrics from performing ClamAV file scans. Beware: ClamAV must be restarted
+    between runs of this script (`docker restart backend-clamav-rest-1`) in order to
+    clear the file cache.
+
+    Scans that fail are reported separately and excluded from the timing statistics,
+    so a single failed file no longer aborts the summary.
+
+    Usage:
+    python tools/collect_scan_metrics.py --path <path pattern> --num_to_scan <int> --num_workers <int>
+    Example:
+    python tools/collect_scan_metrics.py --path 'metrics_files/*.xlsx' --num_to_scan 20 --num_workers 5
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--path", type=str, required=True, default=None)
+    parser.add_argument("--num_to_scan", type=int, required=False, default=1)
+    parser.add_argument("--num_workers", type=int, required=False, default=1)
+
+    args = parser.parse_args()
+
+    path = args.path
+    num_to_scan = args.num_to_scan
+    num_workers = args.num_workers
+
+    # Wall-clock time for the whole (possibly parallel) run.
+    started = time.perf_counter()
+    results = scan_files_at_path(path, num_to_scan, num_workers)
+    real_time = time.perf_counter() - started
+
+    # A failed scan is reported as None; keep only real timings for the statistics.
+    timings = [elapsed for elapsed in results if elapsed is not None]
+    num_failed = len(results) - len(timings)
+
+    # Report the number of files actually scanned, which can be lower than
+    # --num_to_scan when fewer files match the pattern.
+    logging.info(f"Num files: {len(results)}")
+    logging.info(f"Num workers: {num_workers}")
+    logging.info(f"Real time: {real_time / 60} minutes")
+
+    if num_failed:
+        logging.warning(f"Num failed scans: {num_failed}")
+
+    if not timings:
+        logging.error("No scans succeeded; no timing statistics available")
+        return
+
+    total = sum(timings)
+    logging.info(f"Total time: {total / 60} minutes")
+    logging.info(f"Max time: {max(timings)} seconds")
+    logging.info(f"Avg time: {total / len(timings)} seconds")
+
+
+# Run the benchmark only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/backend/tools/generate_xlsx_files.py b/backend/tools/generate_xlsx_files.py
@@ -0,0 +1,67 @@
+import argparse
+import datetime
+import logging
+import openpyxl
+import os
+import sys
+
+
+# Configure the root logger at import time: INFO level, emitted through a single
+# StreamHandler (which writes to stderr by default).
+logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])
+
+
+def generate_files(base_xlsx, num_files, output):
+    """Write ``num_files`` timestamped copies of a base workbook into ``output``.
+
+    The base workbook is loaded once. For every copy, cell A1 of the active sheet is
+    overwritten with the current datetime and the workbook is saved under a file
+    name built from that same datetime.
+
+    Args:
+        base_xlsx: Path of the workbook to load.
+        num_files: Number of copies to write.
+        output: Directory the copies are saved into.
+    """
+    logging.info(f"Loading base XLSX {base_xlsx}...")
+
+    workbook = openpyxl.load_workbook(base_xlsx)
+    sheet = workbook.active
+
+    logging.info(f"Creating {num_files} files from {base_xlsx} in {output}")
+
+    for count in range(1, num_files + 1):
+        # The timestamp is both the cell value and the file name.
+        stamp = datetime.datetime.now()
+        sheet["A1"] = stamp
+        target = os.path.join(output, f"{stamp}.xlsx")
+        workbook.save(target)
+        logging.info(f"#{count} Created: {target}")
+
+    logging.info("Done")
+
+
+def main():
+    """
+    Generates unique XLSX files by slightly modifying copies of the given base file.
+    Used in conjunction with the collect_scan_metrics cmd.
+
+    Exits with a non-zero status if the output directory cannot be created or the
+    base file does not exist.
+
+    Usage:
+    python tools/generate_xlsx_files.py --base_xlsx <xlsx file path> --num_files <int> [--output <dir>]
+    Example:
+    python tools/generate_xlsx_files.py --base_xlsx 'output/181744-22/federal-awards-workbook-181744.xlsx' --num_files 5
+    """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--output", type=str, required=False, default="./metrics_files")
+    parser.add_argument("--base_xlsx", type=str, required=True, default=None)
+    parser.add_argument("--num_files", type=int, required=False, default=1)
+
+    args = parser.parse_args()
+
+    output = args.output
+    base_xlsx = args.base_xlsx
+    num_files = args.num_files
+
+    # isdir (not exists) so that a regular file at the output path is reported here
+    # rather than failing later on the first save.
+    if not os.path.isdir(output):
+        try:
+            # makedirs also creates missing parent directories for nested paths.
+            os.makedirs(output, exist_ok=True)
+            logging.info(f"Made directory {output}")
+        except OSError as e:
+            logging.error(f"Could not create directory {output}: {e}")
+            sys.exit(1)
+
+    if not os.path.exists(base_xlsx):
+        logging.error(f"Given base_xlsx {base_xlsx} does not exist")
+        sys.exit(1)
+
+    generate_files(base_xlsx, num_files, output)
+
+
+# Generate the files only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()