diff --git a/backend/dissemination/forms.py b/backend/dissemination/forms.py index 2271cbe227..760e0cca0c 100644 --- a/backend/dissemination/forms.py +++ b/backend/dissemination/forms.py @@ -2,9 +2,43 @@ class SearchForm(forms.Form): - AY_choices = ( + # Multiple choice field mappings + findings_field_mapping = { + "field_name": [ + "all_findings", + "is_modified_opinion", + "is_other_findings", + "is_material_weakness", + "is_significant_deficiency", + "is_other_matters", + "is_questioned_costs", + "is_repeat_finding", + ], + "friendly_name": [ + "Any findings", + "Modified opinion", + "Other findings", + "Material weakness", + "Significant deficiency", + "Other matters", + "Questioned costs", + "Repeat finding", + ], + } + + # Multiple choice field Tuples. "choices" variable in field declaration. + AY_choices = (("all_years", "All years"),) + tuple( (x, str(x)) for x in reversed(range(2016, 2024)) - ) # ((2016, "2016"), (2017, "2017"), ..., (2023, "2023")) + ) + findings_choices = list(zip(*findings_field_mapping.values())) + direct_funding_choices = ( + ("direct_funding", "Direct funding"), + ("passthrough_funding", "Passthrough funding"), + ) + major_program_choices = ( + (True, "Y"), + (False, "N"), + ) # Query params entity_name = forms.CharField(required=False) @@ -14,7 +48,16 @@ class SearchForm(forms.Form): end_date = forms.DateField(required=False) cog_or_oversight = forms.CharField(required=False) agency_name = forms.CharField(required=False) - audit_year = forms.MultipleChoiceField(choices=AY_choices, required=False) + audit_year = forms.MultipleChoiceField( + choices=AY_choices, initial=[2023], required=False + ) + findings = forms.MultipleChoiceField(choices=findings_choices, required=False) + direct_funding = forms.MultipleChoiceField( + choices=direct_funding_choices, required=False + ) + major_program = forms.MultipleChoiceField( + choices=major_program_choices, required=False + ) auditee_state = forms.CharField(required=False) # Display params @@ 
-22,3 +65,72 @@ class SearchForm(forms.Form): page = forms.CharField(required=False) order_by = forms.CharField(required=False) order_direction = forms.CharField(required=False) + + # Variables for cleaning functions + text_input_delimiters = [ + ",", + ":", + ";", + "-", + " ", + ] + + def clean_aln(self): + """ + Clean up the ALN field. Replace common separators with a newline. + Split on the newlines. Strip all the resulting elements. + """ + text_input = self.cleaned_data["aln"] + for delimiter in self.text_input_delimiters: + text_input = text_input.replace(delimiter, "\n") + text_input = [x.strip() for x in text_input.splitlines()] + return text_input + + def clean_entity_name(self): + """ + Clean the name field. We can't trust that separators aren't a part of a name somewhere, + so just split on newlines. + """ + text_input = self.cleaned_data["entity_name"] + return text_input.splitlines() + + def clean_uei_or_ein(self): + """ + Clean up the UEI/EIN field. Replace common separators with a newline. + Split on the newlines. Strip all the resulting elements. + """ + text_input = self.cleaned_data["uei_or_ein"] + for delimiter in self.text_input_delimiters: + text_input = text_input.replace(delimiter, "\n") + text_input = [x.strip() for x in text_input.splitlines()] + return text_input + + def clean_audit_year(self): + """ + If "All years" is selected, don't include any years. + """ + audit_year = self.cleaned_data["audit_year"] + if "all_years" in audit_year: + return [] + return audit_year + + def clean_major_program(self): + """ + If 'Any' is selected, don't include the more specific fields. + """ + major_program = self.cleaned_data["major_program"] + if "any" in major_program: + return ["any"] + return major_program + + def clean_page(self): + """ + Default page number to one. + """ + return int(self.cleaned_data["page"] or 1) + + def clean_limit(self): + """ + Default page limit to 30. 
+ """ + return int(self.cleaned_data["limit"] or 30) diff --git a/backend/dissemination/management/commands/populate_aln_table.py b/backend/dissemination/management/commands/populate_aln_table.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/dissemination/search.py b/backend/dissemination/search.py index 28ee9e7d60..1533e82185 100644 --- a/backend/dissemination/search.py +++ b/backend/dissemination/search.py @@ -1,8 +1,11 @@ import logging import time -from .search_constants import ORDER_BY, DIRECTION, DAS_LIMIT -from .search_general import report_timing, search_general -from .search_alns import search_alns +from .searchlib.search_constants import ORDER_BY, DIRECTION, DAS_LIMIT +from .searchlib.search_general import report_timing, search_general +from .searchlib.search_alns import search_alns +from .searchlib.search_findings import search_findings +from .searchlib.search_direct_funding import search_direct_funding +from .searchlib.search_major_program import search_major_program logger = logging.getLogger(__name__) @@ -30,6 +33,10 @@ def search(params): results = search_general(params) results = _sort_results(results, params) results = search_alns(results, params) + results = search_findings(results, params) + results = search_direct_funding(results, params) + results = search_major_program(results, params) + results = results.distinct("report_id", params.get("order_by", "fac_accepted_date")) t1 = time.time() report_timing("search", params, t0, t1) diff --git a/backend/dissemination/searchlib/__init__.py b/backend/dissemination/searchlib/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/backend/dissemination/search_alns.py b/backend/dissemination/searchlib/search_alns.py similarity index 100% rename from backend/dissemination/search_alns.py rename to backend/dissemination/searchlib/search_alns.py diff --git a/backend/dissemination/search_constants.py b/backend/dissemination/searchlib/search_constants.py similarity 
index 100% rename from backend/dissemination/search_constants.py rename to backend/dissemination/searchlib/search_constants.py diff --git a/backend/dissemination/searchlib/search_direct_funding.py b/backend/dissemination/searchlib/search_direct_funding.py new file mode 100644 index 0000000000..f3fbb69744 --- /dev/null +++ b/backend/dissemination/searchlib/search_direct_funding.py @@ -0,0 +1,28 @@ +from django.db.models import Q +import time +from .search_general import report_timing + +import logging + +logger = logging.getLogger(__name__) + + +def search_direct_funding(general_results, params): + t0 = time.time() + q = Q() + direct_funding_fields = params.get("direct_funding", []) + + for field in direct_funding_fields: + match field: + case "direct_funding": + q |= Q(federalaward__is_direct="Y") + case "passthrough_funding": + q |= Q(federalaward__is_direct="N") + case _: + pass + + filtered_general_results = general_results.filter(q) + + t1 = time.time() + report_timing("search_direct_funding", params, t0, t1) + return filtered_general_results diff --git a/backend/dissemination/searchlib/search_findings.py b/backend/dissemination/searchlib/search_findings.py new file mode 100644 index 0000000000..6d0aab9fbd --- /dev/null +++ b/backend/dissemination/searchlib/search_findings.py @@ -0,0 +1,47 @@ +from django.db.models import Q +import time +from .search_general import report_timing + +import logging + +logger = logging.getLogger(__name__) + + +def search_findings(general_results, params): + t0 = time.time() + q = Q() + findings_fields = params.get("findings", []) + + for field in findings_fields: + match field: + case "all_findings": + # This can be achieved via federalaward__findings_count__gt=0, + # But, it's faster to chain ORs in the Finding table than it is to walk the FederalAward table. 
+ q |= Q(finding__is_modified_opinion="Y") + q |= Q(finding__is_other_findings="Y") + q |= Q(finding__is_material_weakness="Y") + q |= Q(finding__is_significant_deficiency="Y") + q |= Q(finding__is_other_matters="Y") + q |= Q(finding__is_questioned_costs="Y") + q |= Q(finding__is_repeat_finding="Y") + case "is_modified_opinion": + q |= Q(finding__is_modified_opinion="Y") + case "is_other_findings": + q |= Q(finding__is_other_findings="Y") + case "is_material_weakness": + q |= Q(finding__is_material_weakness="Y") + case "is_significant_deficiency": + q |= Q(finding__is_significant_deficiency="Y") + case "is_other_matters": + q |= Q(finding__is_other_matters="Y") + case "is_questioned_costs": + q |= Q(finding__is_questioned_costs="Y") + case "is_repeat_finding": + q |= Q(finding__is_repeat_finding="Y") + case _: + pass + filtered_general_results = general_results.filter(q) + + t1 = time.time() + report_timing("search_findings", params, t0, t1) + return filtered_general_results diff --git a/backend/dissemination/search_general.py b/backend/dissemination/searchlib/search_general.py similarity index 99% rename from backend/dissemination/search_general.py rename to backend/dissemination/searchlib/search_general.py index d4aaa576cc..690f811150 100644 --- a/backend/dissemination/search_general.py +++ b/backend/dissemination/searchlib/search_general.py @@ -47,7 +47,7 @@ def search_general(params=None): r_end_date = General.objects.filter(q_end_date) ############## - # Intersection + # Cog/Over q_cogover = _get_cog_or_oversight_match_query( params.get("agency_name", None), params.get("cog_or_oversight", None) ) diff --git a/backend/dissemination/searchlib/search_major_program.py b/backend/dissemination/searchlib/search_major_program.py new file mode 100644 index 0000000000..e1caf4cf7b --- /dev/null +++ b/backend/dissemination/searchlib/search_major_program.py @@ -0,0 +1,27 @@ +from django.db.models import Q +import time +from .search_general import report_timing + +import 
logging + +logger = logging.getLogger(__name__) + + +def search_major_program(general_results, params): + """ + Searches on FederalAward columns 'is_major'. Comes in as True/False, to search on Y/N. + """ + t0 = time.time() + q = Q() + major_program_fields = params.get("major_program", []) + + if True in major_program_fields: + q |= Q(federalaward__is_major="Y") + elif False in major_program_fields: + q |= Q(federalaward__is_major="N") + + filtered_general_results = general_results.filter(q).distinct() + + t1 = time.time() + report_timing("search_major_program", params, t0, t1) + return filtered_general_results diff --git a/backend/dissemination/templates/search.html b/backend/dissemination/templates/search.html index 84159808c8..63c31561d2 100644 --- a/backend/dissemination/templates/search.html +++ b/backend/dissemination/templates/search.html @@ -33,7 +33,7 @@

Filters

id="audit-year-{{ text }}" name="audit_year" type="checkbox" value={{value}} - {% if text in form.cleaned_data.audit_year %}checked{% endif %} /> + {% if value in form_user_input.audit_year or text in form_user_input.audit_year %}checked{% endif %} /> {% endfor %} @@ -42,37 +42,37 @@

Filters

- + {% comment %} ALN/CFDA {% endcomment %} - + {% comment %} Name {% endcomment %} - + {% comment %} Release Date(s) {% endcomment %}
mm/dd/yyyy
+ data-default-value="{{ form_user_input.start_date }}"> Filters
mm/dd/yyyy
+ data-default-value="{{ form_user_input.end_date }}"> Filters
+ {% comment %} State {% endcomment %} + +
+ + +
{% comment %} Cog/Over {% endcomment %}
@@ -104,40 +122,74 @@

Filters

+ id="agency-name" + name="agency_name" + value="{{ form_user_input.agency_name }}" />
- {% comment %} State {% endcomment %} + {% comment %} Findings {% endcomment %} -
- - + aria-expanded={% if form_user_input.findings %}"true"{% else %}"false"{% endif %} + aria-controls="auditfindings">Audit findings +
+ {% for value, text in form.findings.field.choices %} +
+ + +
+ {% endfor %} +
+ {% comment %} Direct funding {% endcomment %} + +
+ {% for value, text in form.direct_funding.field.choices %} +
+ + +
+ {% endfor %} +
+ {% comment %} Major program {% endcomment %} + +
+ {% for value, text in form.major_program.field.choices %} +
+ + +
+ {% endfor %}
+ {% comment %} Submission {% endcomment %}
diff --git a/backend/dissemination/test_search.py b/backend/dissemination/test_search.py index 4148ca4652..48dbba6a73 100644 --- a/backend/dissemination/test_search.py +++ b/backend/dissemination/test_search.py @@ -1,7 +1,7 @@ from django.test import TestCase -from dissemination.models import General, FederalAward -from dissemination.search import search_general, search_alns +from dissemination.models import General, FederalAward, Finding +from dissemination.search import search_general, search_alns, search from model_bakery import baker @@ -480,3 +480,109 @@ def test_alns_no_findings(self): results_alns[0].finding_my_aln is False and results_alns[0].finding_all_aln is False ) + + +class SearchAdvancedFilterTests(TestCase): + def test_search_findings(self): + """ + When making a search on a particular type of finding, search_general should only return records with a finding of that type. + """ + findings_fields = [ + {"is_modified_opinion": "Y"}, + {"is_other_findings": "Y"}, + {"is_material_weakness": "Y"}, + {"is_significant_deficiency": "Y"}, + {"is_other_matters": "Y"}, + {"is_questioned_costs": "Y"}, + {"is_repeat_finding": "Y"}, + ] + + # For every field, create a General object with an associated Finding with a 'Y' in that field. 
+ gen_objects = [] + finding_objects = [] + for field in findings_fields: + general = baker.make( + General, + is_public=True, + ) + finding = baker.make(Finding, report_id=general, **field) + finding_objects.append(finding) + gen_objects.append(general) + + # One field returns the one appropriate general + params = {"findings": ["is_modified_opinion"]} + results = search(params) + self.assertEqual(len(results), 1) + + # Three fields returns three appropriate generals + params = { + "findings": [ + "is_other_findings", + "is_material_weakness", + "is_significant_deficiency", + ] + } + results = search(params) + self.assertEqual(len(results), 3) + + # Garbage fields don't apply any filters, so everything comes back + params = {"findings": ["a_garbage_field"]} + results = search(params) + self.assertEqual(len(results), 7) + + def test_search_direct_funding(self): + """ + When making a search on direct/passthrough funding, search_general should only return records with an award of that type. + """ + general_direct = baker.make( + General, + is_public=True, + ) + baker.make(FederalAward, report_id=general_direct, is_direct="Y") + + general_passthrough = baker.make( + General, + is_public=True, + ) + baker.make(FederalAward, report_id=general_passthrough, is_direct="N") + + params = {"direct_funding": ["direct_funding"]} + results = search(params) + self.assertEqual(len(results), 1) + self.assertEqual(results[0], general_direct) + + params = {"direct_funding": ["passthrough_funding"]} + results = search(params) + self.assertEqual(len(results), 1) + self.assertEqual(results[0], general_passthrough) + + # One can search on both, even if there's not much reason to. + params = {"direct_funding": ["direct_funding", "passthrough_funding"]} + results = search(params) + self.assertEqual(len(results), 2) + + def test_search_major_program(self): + """ + When making a search on major program, search_general should only return records with an award of that type. 
+ """ + general_major = baker.make( + General, + is_public=True, + ) + baker.make(FederalAward, report_id=general_major, is_major="Y") + + general_non_major = baker.make( + General, + is_public=True, + ) + baker.make(FederalAward, report_id=general_non_major, is_major="N") + + params = {"major_program": [True]} + results = search(params) + self.assertEqual(len(results), 1) + self.assertEqual(results[0], general_major) + + params = {"major_program": [False]} + results = search(params) + self.assertEqual(len(results), 1) + self.assertEqual(results[0], general_non_major) diff --git a/backend/dissemination/views.py b/backend/dissemination/views.py index 1dd21ea63d..79bc293f98 100644 --- a/backend/dissemination/views.py +++ b/backend/dissemination/views.py @@ -1,4 +1,3 @@ -from collections import namedtuple from datetime import timedelta import logging import math @@ -81,57 +80,6 @@ def include_private_results(request): return True -def clean_form_data(form): - """ - Given a SearchForm, return a namedtuple with its cleaned and formatted data. - Ideally, this makes accessing form data later a little more readable. 
- """ - FormData = namedtuple( - "FormData", - "uei_or_eins alns names start_date end_date cog_or_oversight agency_name audit_years auditee_state order_by order_direction limit page", - ) - - if form.is_valid(): - uei_or_eins = form.cleaned_data["uei_or_ein"].splitlines() - alns = form.cleaned_data["aln"].replace(", ", " ").split() - names = form.cleaned_data["entity_name"].splitlines() - start_date = form.cleaned_data["start_date"] - end_date = form.cleaned_data["end_date"] - cog_or_oversight = form.cleaned_data["cog_or_oversight"] - agency_name = form.cleaned_data["agency_name"] - audit_years = [ - int(year) for year in form.cleaned_data["audit_year"] - ] # Cast strings from HTML to int - auditee_state = form.cleaned_data["auditee_state"] - order_by = form.cleaned_data["order_by"] - order_direction = form.cleaned_data["order_direction"] - - # TODO: Add a limit choice field to the form - limit = form.cleaned_data["limit"] or 30 - page = int(form.cleaned_data["page"] or 1) - - form_data = FormData( - uei_or_eins, - alns, - names, - start_date, - end_date, - cog_or_oversight, - agency_name, - audit_years, - auditee_state, - order_by, - order_direction, - limit, - page, - ) - - else: - raise BadRequest("Form data validation error.", form.errors) - - return form_data - - def run_search(form_data, include_private): """ Given cleaned form data and an 'include_private' boolean, run the search. @@ -140,18 +88,21 @@ def run_search(form_data, include_private): # As a dictionary, this is easily extensible. 
search_parameters = { - "alns": form_data.alns, - "names": form_data.names, - "uei_or_eins": form_data.uei_or_eins, - "start_date": form_data.start_date, - "end_date": form_data.end_date, - "cog_or_oversight": form_data.cog_or_oversight, - "agency_name": form_data.agency_name, - "audit_years": form_data.audit_years, - "auditee_state": form_data.auditee_state, + "alns": form_data["aln"], + "names": form_data["entity_name"], + "uei_or_eins": form_data["uei_or_ein"], + "start_date": form_data["start_date"], + "end_date": form_data["end_date"], + "cog_or_oversight": form_data["cog_or_oversight"], + "agency_name": form_data["agency_name"], + "audit_years": form_data["audit_year"], + "findings": form_data["findings"], + "direct_funding": form_data["direct_funding"], + "major_program": form_data["major_program"], + "auditee_state": form_data["auditee_state"], "include_private": include_private, - "order_by": form_data.order_by, - "order_direction": form_data.order_direction, + "order_by": form_data["order_by"], + "order_direction": form_data["order_direction"], } _add_search_params_to_newrelic(search_parameters) @@ -171,6 +122,7 @@ def get(self, request, *args, **kwargs): "search.html", { "form": form, + "form_user_input": {"audit_year": ["2023"]}, "state_abbrevs": STATE_ABBREVS, "summary_report_download_limit": SUMMARY_REPORT_DOWNLOAD_LIMIT, }, @@ -186,57 +138,53 @@ def post(self, request, *args, **kwargs): results = [] context = {} - form_data = clean_form_data(form) + # form_data = clean_form_data(form) + if form.is_valid(): + form_data = form.cleaned_data + form_user_input = { + k: v[0] if len(v) == 1 else v for k, v in form.data.lists() + } + else: + raise ValidationError(f"Form error in Search POST. {form.errors}") - # If no years are checked, check 2023. 
- logger.info(form_data) - if len(form_data.audit_years) == 0: - form_data = form_data._replace( - audit_years=[2023] - ) # To search on the correct year - form.cleaned_data["audit_year"] = [ - "2023" - ] # To include the param into the rendered page + logger.info(f"Searching on fields: {form_data}") include_private = include_private_results(request) results = run_search(form_data, include_private) - results_count = len(results) + results_count = results.count() # Reset page to one if the page number surpasses how many pages there actually are - page = form_data.page - ceiling = math.ceil(results_count / form_data.limit) + page = form_data["page"] + ceiling = math.ceil(results_count / form_data["limit"]) if page > ceiling: page = 1 logger.info(f"TOTAL: results_count: [{results_count}]") # The paginator object handles splicing the results to a one-page iterable and calculates which page numbers to show. - paginator = Paginator(object_list=results, per_page=form_data.limit) + paginator = Paginator(object_list=results, per_page=form_data["limit"]) paginator_results = paginator.get_page(page) paginator_results.adjusted_elided_pages = paginator.get_elided_page_range( page, on_each_side=1 ) # Reformat these so the date-picker elements in HTML prepopulate - if form.cleaned_data["start_date"]: - form.cleaned_data["start_date"] = form.cleaned_data["start_date"].strftime( - "%Y-%m-%d" - ) - if form.cleaned_data["end_date"]: - form.cleaned_data["end_date"] = form.cleaned_data["end_date"].strftime( - "%Y-%m-%d" - ) + if form_data["start_date"]: + form_data["start_date"] = form_data["start_date"].strftime("%Y-%m-%d") + if form_data["end_date"]: + form_data["end_date"] = form_data["end_date"].strftime("%Y-%m-%d") context = context | { "form": form, + "form_user_input": form_user_input, "state_abbrevs": STATE_ABBREVS, - "limit": form_data.limit, + "limit": form_data["limit"], "results": paginator_results, "results_count": results_count, "page": page, - "order_by": 
form_data.order_by, - "order_direction": form_data.order_direction, + "order_by": form_data["order_by"], + "order_direction": form_data["order_direction"], "summary_report_download_limit": SUMMARY_REPORT_DOWNLOAD_LIMIT, } time_beginning_render = time.time() @@ -405,9 +353,13 @@ def post(self, request): form = SearchForm(request.POST) try: - cleaned_data = clean_form_data(form) + if form.is_valid(): + form_data = form.cleaned_data + else: + raise ValidationError("Form error in Search POST.") + include_private = include_private_results(request) - results = run_search(cleaned_data, include_private) + results = run_search(form_data, include_private) results = results[:SUMMARY_REPORT_DOWNLOAD_LIMIT] # Hard limit XLSX size if len(results) == 0: diff --git a/backend/static/js/search-results.js b/backend/static/js/search-results.js index bb7afee90c..22dc85323f 100644 --- a/backend/static/js/search-results.js +++ b/backend/static/js/search-results.js @@ -47,7 +47,9 @@ function attachEventHandlersReset() { input.value = ''; }); // Uncheck checkboxes - var checkboxes = document.querySelectorAll('[type="checkbox"]'); + var checkboxes = document.querySelectorAll( + '[type="checkbox"], [type="radio"]' + ); Array.from(checkboxes).forEach((checkbox) => { checkbox.checked = false; }); diff --git a/backend/tools/collect_scan_metrics.py b/backend/tools/collect_scan_metrics.py new file mode 100644 index 0000000000..7d0cfae9f2 --- /dev/null +++ b/backend/tools/collect_scan_metrics.py @@ -0,0 +1,112 @@ +import argparse +import glob +import logging +import requests +import time +from concurrent.futures import ThreadPoolExecutor +from io import BytesIO + + +logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()]) + + +class ClamAVError(Exception): + def __init__(self, file): + self.file = file + + def __str__(self): + return "Static virus scan failed" + + +def _scan_file(file, filepath): + try: + logging.info(f"Scanning {filepath}") + return requests.post( + 
"http://clamav-rest:9000/scan", + files={"file": file}, + data={"name": filepath}, + timeout=300, + ) + except requests.exceptions.ConnectionError: + logging.error("SCAN Connection error") + raise ClamAVError(filepath) + except Exception as e: + logging.error(f"SCAN EXCEPTION UNKNOWN {filepath} {e}") + raise ClamAVError(filepath) + + +def scan_file(filepath): + try: + with open(filepath, "rb") as fh: + file = BytesIO(fh.read()) + + t1 = time.perf_counter(), time.process_time() + _scan_file(file, filepath) + t2 = time.perf_counter(), time.process_time() + + return t2[0] - t1[0] + except Exception as e: + logging.error(f"SCAN SCAN_FILE {e}") + + +def scan_files_at_path(path, num_to_scan, max_workers): + filepaths = glob.glob(path + "*")[:num_to_scan] + if not filepaths: + raise Exception(f"No files found at {path}") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + return list(executor.map(scan_file, filepaths)) + + +def is_stringlike(o): + return isinstance(o, str) or isinstance(o, bytes) + + +def not_a_stringlike(o): + return not is_stringlike(o) + + +def check_scan_ok(result): + if result and not_a_stringlike(result) and result.status_code == 200: + return True + else: + return False + + +def main(): + """ + Outputs metrics from performing ClamAV file scans. Beware: ClamAV must be restarted + between runs of this script (`docker restart backend-clamav-rest-1`) in order to + clear the file cache. 
+ Usage: + python collect_scan_metrics --path --num_to_scan --num_workers + Example: + python collect_scan_metrics --path 'metrics_files/*.xlsx' --num_to_scan 20 --num_workers 5 + """ + parser = argparse.ArgumentParser() + + parser.add_argument("--path", type=str, required=True, default=None) + parser.add_argument("--num_to_scan", type=int, required=False, default=1) + parser.add_argument("--num_workers", type=int, required=False, default=1) + + args = parser.parse_args() + + path = args.path + num_to_scan = args.num_to_scan + num_workers = args.num_workers + + t1 = time.perf_counter(), time.process_time() + results = scan_files_at_path(path, num_to_scan, num_workers) + t2 = time.perf_counter(), time.process_time() + real_time = t2[0] - t1[0] + + logging.info(f"Num files: {num_to_scan}") + logging.info(f"Num workers: {num_workers}") + logging.info(f"Real time: {real_time / 60} minutes") + logging.info(f"Total time: {sum(results) / 60} minutes") + logging.info(f"Max time: {max(results)} seconds") + logging.info(f"Avg time: {sum(results) / len(results)} seconds") + + +if __name__ == "__main__": + main() diff --git a/backend/tools/generate_xlsx_files.py b/backend/tools/generate_xlsx_files.py new file mode 100644 index 0000000000..c3a827f366 --- /dev/null +++ b/backend/tools/generate_xlsx_files.py @@ -0,0 +1,67 @@ +import argparse +import datetime +import logging +import openpyxl +import os +import sys + + +logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler()]) + + +def generate_files(base_xlsx, num_files, output): + logging.info(f"Loading base XLSX {base_xlsx}...") + + wb = openpyxl.load_workbook(base_xlsx) + ws = wb.active + + logging.info(f"Creating {num_files} files from {base_xlsx} in {output}") + + for i in range(num_files): + dt = datetime.datetime.now() + ws["A1"] = dt + path = os.path.join(output, f"{dt}.xlsx") + wb.save(path) + logging.info(f"#{i + 1} Created: {path}") + + logging.info("Done") + + +def main(): + """ + Generates unique 
XLSX files by slightly modifying copies of the given base file. Used in conjunction with the + collect_scan_metrics cmd. + Usage: + python tools/generate_xlsx_files.py --base_xlsx --num_files + Example: + python tools/generate_xlsx_files.py --base_xlsx 'output/181744-22/federal-awards-workbook-181744.xlsx' --num_files 5 + """ + parser = argparse.ArgumentParser() + + parser.add_argument("--output", type=str, required=False, default="./metrics_files") + parser.add_argument("--base_xlsx", type=str, required=True, default=None) + parser.add_argument("--num_files", type=int, required=False, default=1) + + args = parser.parse_args() + + output = args.output + base_xlsx = args.base_xlsx + num_files = args.num_files + + if not os.path.exists(output): + try: + os.mkdir(output) + logging.info(f"Made directory {output}") + except Exception as e: + logging.error(f"Could not create directory {output}: {e}") + sys.exit() + + if not os.path.exists(base_xlsx): + logging.error(f"Given base_xlsx {base_xlsx} does not exist") + sys.exit() + + generate_files(base_xlsx, num_files, output) + + +if __name__ == "__main__": + main()