Skip to content

Commit

Permalink
3432: ClamAV benchmarking scripts (#3453)
Browse files Browse the repository at this point in the history
* Creating cmd to scan local files

* Collecting metrics

* Increasing timeout

* Creating cmd to generate xlsx files

* Cleanup

* Running scans in parallel

* Adding real time

* Moving cmds to standalone scripts

* Comment

* Minor tweaks

* Comment

* Lint

* Lint
  • Loading branch information
phildominguez-gsa authored Feb 28, 2024
1 parent dc1e0c1 commit 91aeb9b
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 0 deletions.
112 changes: 112 additions & 0 deletions backend/tools/collect_scan_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import argparse
import glob
import logging
import requests
import time
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO


# Log INFO and above straight to stderr so scan progress is visible when the
# script is run from the command line.
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])


class ClamAVError(Exception):
    """Raised when a ClamAV scan request fails for a file.

    Attributes:
        file: path of the file whose scan failed.
    """

    def __init__(self, file):
        # Pass the file to the base class so args/repr/pickling work normally.
        super().__init__(file)
        self.file = file

    def __str__(self):
        # Include the offending path so log lines identify which scan failed
        # (previously self.file was stored but never surfaced).
        return f"Static virus scan failed: {self.file}"


def _scan_file(file, filepath):
    """POST a file's contents to the local clamav-rest service for scanning.

    Args:
        file: binary file-like object containing the data to scan.
        filepath: original path of the file, sent as the scan's name and used
            in error reporting.

    Returns:
        The `requests.Response` from the scan service.

    Raises:
        ClamAVError: if the request fails for any reason.
    """
    try:
        logging.info(f"Scanning {filepath}")
        return requests.post(
            "http://clamav-rest:9000/scan",
            files={"file": file},
            data={"name": filepath},
            # Generous timeout: large workbooks can take a while to scan.
            timeout=300,
        )
    except requests.exceptions.ConnectionError as err:
        logging.error("SCAN Connection error")
        # Chain the original exception so the root cause isn't lost.
        raise ClamAVError(filepath) from err
    except Exception as err:
        logging.error(f"SCAN EXCEPTION UNKNOWN {filepath} {err}")
        raise ClamAVError(filepath) from err


def scan_file(filepath):
    """Scan a single file and return the elapsed wall-clock time in seconds.

    Args:
        filepath: path of the file to scan.

    Returns:
        float seconds the scan took, or None if reading or scanning failed
        (callers must filter out None before aggregating).
    """
    try:
        with open(filepath, "rb") as fh:
            payload = BytesIO(fh.read())

        # Only wall-clock time is reported; the previous
        # (perf_counter, process_time) tuples were never used beyond index 0.
        start = time.perf_counter()
        _scan_file(payload, filepath)
        return time.perf_counter() - start
    except Exception as e:
        logging.error(f"SCAN SCAN_FILE {e}")
        return None


def scan_files_at_path(path, num_to_scan, max_workers):
    """Scan up to `num_to_scan` files matching `path` in parallel.

    Args:
        path: glob prefix/pattern; a trailing "*" is appended before matching.
        num_to_scan: maximum number of matched files to scan.
        max_workers: number of scanning threads (scans are I/O-bound).

    Returns:
        list of per-file scan times (floats), with None entries for failures,
        in the same order as the matched files.

    Raises:
        FileNotFoundError: if the pattern matches no files.
    """
    filepaths = glob.glob(path + "*")[:num_to_scan]
    if not filepaths:
        # Narrower than the previous bare Exception; still an Exception
        # subclass, so existing broad handlers keep working.
        raise FileNotFoundError(f"No files found at {path}")

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(scan_file, filepaths))


def is_stringlike(o):
    """Return True when `o` is a str or bytes instance."""
    return isinstance(o, (str, bytes))


def not_a_stringlike(o):
    """Return True when `o` is neither a str nor a bytes instance."""
    return not is_stringlike(o)


def check_scan_ok(result):
    """Return True when `result` looks like a successful scan response.

    A success is a truthy, non-string object whose status_code is 200
    (error paths hand back strings or None instead of a response).
    """
    return bool(result) and not_a_stringlike(result) and result.status_code == 200


def main():
    """
    Outputs metrics from performing ClamAV file scans. Beware: ClamAV must be restarted
    between runs of this script (`docker restart backend-clamav-rest-1`) in order to
    clear the file cache.
    Usage:
        python tools/collect_scan_metrics.py --path <path pattern> --num_to_scan <int> --num_workers <int>
    Example:
        python tools/collect_scan_metrics.py --path 'metrics_files/*.xlsx' --num_to_scan 20 --num_workers 5
    """
    parser = argparse.ArgumentParser()

    # --path is required, so a default is meaningless.
    parser.add_argument("--path", type=str, required=True)
    parser.add_argument("--num_to_scan", type=int, required=False, default=1)
    parser.add_argument("--num_workers", type=int, required=False, default=1)

    args = parser.parse_args()

    start = time.perf_counter()
    results = scan_files_at_path(args.path, args.num_to_scan, args.num_workers)
    real_time = time.perf_counter() - start

    # scan_file returns None for files that failed to read or scan; drop them
    # so sum()/max() below don't raise TypeError.
    times = [t for t in results if t is not None]
    if not times:
        logging.error("No files were scanned successfully")
        return

    logging.info(f"Num files: {args.num_to_scan}")
    logging.info(f"Num workers: {args.num_workers}")
    logging.info(f"Real time: {real_time / 60} minutes")
    logging.info(f"Total time: {sum(times) / 60} minutes")
    logging.info(f"Max time: {max(times)} seconds")
    logging.info(f"Avg time: {sum(times) / len(times)} seconds")


if __name__ == "__main__":
    main()
67 changes: 67 additions & 0 deletions backend/tools/generate_xlsx_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import argparse
import datetime
import logging
import openpyxl
import os
import sys


# Log INFO and above straight to stderr so generation progress is visible
# when the script is run from the command line.
logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()])


def generate_files(base_xlsx, num_files, output):
    """Write `num_files` unique copies of `base_xlsx` into `output`.

    Each copy gets the current timestamp written into cell A1 so the saved
    workbooks differ from one another (needed so ClamAV's cache doesn't
    short-circuit repeated scans of identical content).

    Args:
        base_xlsx: path to the workbook used as the template.
        num_files: how many copies to generate.
        output: existing directory to write the copies into.
    """
    logging.info(f"Loading base XLSX {base_xlsx}...")

    wb = openpyxl.load_workbook(base_xlsx)
    ws = wb.active

    logging.info(f"Creating {num_files} files from {base_xlsx} in {output}")

    for i in range(num_files):
        dt = datetime.datetime.now()
        ws["A1"] = dt
        # Filename includes the loop index and a filesystem-safe timestamp:
        # str(datetime) contains ":" (illegal in Windows filenames), and two
        # iterations in the same microsecond would silently overwrite.
        filename = f"{i + 1:05d}_{dt.strftime('%Y%m%d%H%M%S%f')}.xlsx"
        path = os.path.join(output, filename)
        wb.save(path)
        logging.info(f"#{i + 1} Created: {path}")

    logging.info("Done")


def main():
    """
    Generates unique XLSX files by slightly modifying copies of the given base file.
    Used in conjunction with the collect_scan_metrics cmd.
    Usage:
        python tools/generate_xlsx_files.py --base_xlsx <xlsx file path> --num_files <int>
    Example:
        python tools/generate_xlsx_files.py --base_xlsx 'output/181744-22/federal-awards-workbook-181744.xlsx' --num_files 5
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--output", type=str, required=False, default="./metrics_files")
    # --base_xlsx is required, so a default is meaningless.
    parser.add_argument("--base_xlsx", type=str, required=True)
    parser.add_argument("--num_files", type=int, required=False, default=1)

    args = parser.parse_args()

    output = args.output
    base_xlsx = args.base_xlsx
    num_files = args.num_files

    if not os.path.isdir(output):
        try:
            # makedirs also creates intermediate directories for nested paths.
            os.makedirs(output)
            logging.info(f"Made directory {output}")
        except OSError as e:
            logging.error(f"Could not create directory {output}: {e}")
            # Non-zero status so shell scripts can detect the failure.
            sys.exit(1)

    if not os.path.exists(base_xlsx):
        logging.error(f"Given base_xlsx {base_xlsx} does not exist")
        sys.exit(1)

    generate_files(base_xlsx, num_files, output)


if __name__ == "__main__":
    main()

0 comments on commit 91aeb9b

Please sign in to comment.