-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
3432: ClamAV benchmarking scripts (#3453)
* Creating cmd to scan local files * Collecting metrics * Increasing timeout * Creating cmd to generate xlsx files * Cleanup * Running scans in parallel * Adding real time * Moving cmds to standalone scripts * Comment * Minor tweaks * Comment * Lint * Lint
- Loading branch information
1 parent
dc1e0c1
commit 91aeb9b
Showing
2 changed files
with
179 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
import argparse | ||
import glob | ||
import logging | ||
import requests | ||
import time | ||
from concurrent.futures import ThreadPoolExecutor | ||
from io import BytesIO | ||
|
||
|
||
# Configure the root logger once at import time: INFO and above, routed
# through a single stream handler.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
|
||
|
||
class ClamAVError(Exception):
    """Error signalling that a virus scan request did not complete.

    The value passed to the constructor is kept on the ``file`` attribute so
    handlers can tell which scan failed.
    """

    def __init__(self, file):
        # Remember what was being scanned when the failure happened.
        self.file = file

    def __str__(self):
        """Return the fixed failure message (it does not include ``file``)."""
        message = "Static virus scan failed"
        return message
|
||
|
||
def _scan_file(file, filepath):
    """POST a file to the ClamAV REST endpoint and return the HTTP response.

    Args:
        file: The payload uploaded as the ``file`` multipart field.
        filepath: Identifier of the file; it is logged and sent as the
            ``name`` form field.

    Returns:
        The response object returned by ``requests.post``. The status code
        is not inspected here.

    Raises:
        ClamAVError: If the request raises for any reason. The underlying
            exception is chained as ``__cause__`` so the root cause stays
            visible in tracebacks.
    """
    try:
        logging.info(f"Scanning {filepath}")
        return requests.post(
            "http://clamav-rest:9000/scan",
            files={"file": file},
            data={"name": filepath},
            timeout=300,
        )
    except requests.exceptions.ConnectionError as err:
        logging.error("SCAN Connection error")
        raise ClamAVError(filepath) from err
    except Exception as err:
        # Anything else (timeouts included) is reported with its details.
        logging.error(f"SCAN EXCEPTION UNKNOWN {filepath} {err}")
        raise ClamAVError(filepath) from err
|
||
|
||
def scan_file(filepath):
    """Scan one file and return how long the scan request took.

    The file is read fully into memory before the timer starts, so the
    measurement covers only the call to ``_scan_file`` and not the disk read.

    Args:
        filepath: Path of the file to read and submit for scanning.

    Returns:
        Elapsed wall-clock time in seconds as measured by
        ``time.perf_counter``, or ``None`` when reading or scanning the file
        raised an exception (the error is logged, not propagated).
    """
    try:
        with open(filepath, "rb") as fh:
            file = BytesIO(fh.read())

        started = time.perf_counter()
        _scan_file(file, filepath)
        return time.perf_counter() - started
    except Exception as e:
        logging.error(f"SCAN SCAN_FILE {e}")
        # Explicit sentinel: callers must skip failed scans when aggregating.
        return None
|
||
|
||
def scan_files_at_path(path, num_to_scan, max_workers):
    """Scan the files matching ``path`` concurrently and collect the results.

    Args:
        path: Glob pattern prefix; a ``*`` is appended before matching.
        num_to_scan: Upper bound on how many matched files are scanned.
        max_workers: Size of the thread pool used to run the scans.

    Returns:
        A list with one ``scan_file`` result per selected file, in the same
        order as the selected files.

    Raises:
        Exception: If no file is selected.
    """
    matches = glob.glob(path + "*")
    selected = matches[:num_to_scan]
    if len(selected) == 0:
        raise Exception(f"No files found at {path}")

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        outcomes = list(pool.map(scan_file, selected))
    return outcomes
|
||
|
||
def is_stringlike(o):
    """Return True when ``o`` is a ``str`` or ``bytes`` instance."""
    return isinstance(o, (str, bytes))
|
||
|
||
def not_a_stringlike(o):
    """Return True when ``o`` is not string-like, per ``is_stringlike``."""
    stringlike = is_stringlike(o)
    return not stringlike
|
||
|
||
def check_scan_ok(result):
    """Report whether a scan result represents a successful response.

    Returns True only when ``result`` is truthy, is not string-like, and its
    ``status_code`` equals 200; otherwise returns False.
    """
    # Falsy results (None, empty values, ...) are never OK.
    if not result:
        return False
    # String-like results have no status code to inspect.
    if not not_a_stringlike(result):
        return False
    return True if result.status_code == 200 else False
|
||
|
||
def main():
    """
    Outputs metrics from performing ClamAV file scans. Beware: ClamAV must be restarted
    between runs of this script (`docker restart backend-clamav-rest-1`) in order to
    clear the file cache.

    Failed scans are excluded from the timing statistics and reported
    separately; if no scan succeeds, no statistics are logged.

    Usage:
        python collect_scan_metrics --path <path pattern> --num_to_scan <int> --num_workers <int>
    Example:
        python collect_scan_metrics --path 'metrics_files/*.xlsx' --num_to_scan 20 --num_workers 5
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--path", type=str, required=True, default=None)
    parser.add_argument("--num_to_scan", type=int, required=False, default=1)
    parser.add_argument("--num_workers", type=int, required=False, default=1)

    args = parser.parse_args()

    path = args.path
    num_to_scan = args.num_to_scan
    num_workers = args.num_workers

    started = time.perf_counter()
    results = scan_files_at_path(path, num_to_scan, num_workers)
    real_time = time.perf_counter() - started

    # A failed scan yields None instead of a duration; drop those entries so
    # the arithmetic below cannot hit a TypeError.
    timings = [elapsed for elapsed in results if elapsed is not None]
    failed = len(results) - len(timings)

    logging.info(f"Num files: {num_to_scan}")
    logging.info(f"Num workers: {num_workers}")
    logging.info(f"Real time: {real_time / 60} minutes")

    if failed:
        logging.warning(f"Failed scans: {failed} of {len(results)}")

    if not timings:
        # max() and the average are undefined for an empty sample.
        logging.error("No scans completed; no timing statistics available")
        return

    logging.info(f"Total time: {sum(timings) / 60} minutes")
    logging.info(f"Max time: {max(timings)} seconds")
    logging.info(f"Avg time: {sum(timings) / len(timings)} seconds")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import argparse | ||
import datetime | ||
import logging | ||
import openpyxl | ||
import os | ||
import sys | ||
|
||
|
||
# Configure the root logger once at import time: INFO and above, routed
# through a single stream handler.
logging.basicConfig(
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
|
||
|
||
def generate_files(base_xlsx, num_files, output):
    """Save ``num_files`` timestamped copies of a base workbook into ``output``.

    The workbook is loaded once. On each iteration the current time is
    written to cell A1 of the active sheet and the workbook is saved under a
    new name.

    Args:
        base_xlsx: Path of the XLSX file to load.
        num_files: Number of copies to write.
        output: Directory the copies are written into; it must already exist.
    """
    logging.info(f"Loading base XLSX {base_xlsx}...")

    wb = openpyxl.load_workbook(base_xlsx)
    ws = wb.active

    logging.info(f"Creating {num_files} files from {base_xlsx} in {output}")

    for i in range(num_files):
        dt = datetime.datetime.now()
        ws["A1"] = dt
        # str(dt) contains a space and colons (colons are not valid in
        # Windows file names), and two fast iterations could share a
        # timestamp. A compact timestamp plus the iteration number is
        # portable and cannot collide within a run.
        filename = f"{dt:%Y%m%dT%H%M%S%f}_{i + 1}.xlsx"
        path = os.path.join(output, filename)
        wb.save(path)
        logging.info(f"#{i + 1} Created: {path}")

    logging.info("Done")
|
||
|
||
def main():
    """
    Generates unique XLSX files by slightly modifying copies of the given base file. Used in conjunction with the
    collect_scan_metrics cmd.

    Exits with status 1 if the output directory cannot be created or the
    base file does not exist.

    Usage:
        python tools/generate_xlsx_files.py --base_xlsx <xlsx file path> --num_files <int>
    Example:
        python tools/generate_xlsx_files.py --base_xlsx 'output/181744-22/federal-awards-workbook-181744.xlsx' --num_files 5
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--output", type=str, required=False, default="./metrics_files")
    parser.add_argument("--base_xlsx", type=str, required=True, default=None)
    parser.add_argument("--num_files", type=int, required=False, default=1)

    args = parser.parse_args()

    output = args.output
    base_xlsx = args.base_xlsx
    num_files = args.num_files

    if not os.path.exists(output):
        try:
            # makedirs (not mkdir) so a nested --output path also works.
            os.makedirs(output, exist_ok=True)
            logging.info(f"Made directory {output}")
        except OSError as e:
            logging.error(f"Could not create directory {output}: {e}")
            # Non-zero status so shells and CI see the failure.
            sys.exit(1)

    if not os.path.exists(base_xlsx):
        logging.error(f"Given base_xlsx {base_xlsx} does not exist")
        sys.exit(1)

    generate_files(base_xlsx, num_files, output)


if __name__ == "__main__":
    main()