diff --git a/backend/dissemination/management/commands/remove-unneeded-workbooks.py b/backend/dissemination/management/commands/remove-unneeded-workbooks.py index c8cdf683f6..c42600cf28 100644 --- a/backend/dissemination/management/commands/remove-unneeded-workbooks.py +++ b/backend/dissemination/management/commands/remove-unneeded-workbooks.py @@ -8,14 +8,20 @@ class Command(BaseCommand): - help = "Delete workbook artifacts for specified years" + help = "Delete workbook artifacts for a specific partition of disseminated reports." def add_arguments(self, parser): parser.add_argument( - "--year", + "--partition_number", type=int, required=True, - help="Comma-separated list of audit years to process", + help="The partition number to process (e.g., 1, 2, 3).", + ) + parser.add_argument( + "--total_partitions", + type=int, + required=True, + help="The total number of partitions (e.g., 4 if splitting the load into four parts).", ) parser.add_argument( "--page_size", @@ -28,14 +34,21 @@ def add_arguments(self, parser): "--pages", type=int, required=False, - default=100, + default=None, help="Maximum number of pages to process", ) def handle(self, *args, **options): - year = options["year"] + partition_number = options["partition_number"] + total_partitions = options["total_partitions"] page_size = options["page_size"] pages = options["pages"] - self.stdout.write(self.style.SUCCESS(f"Processing audit year {year}")) - delete_workbooks(year, page_size=page_size, pages=pages) + self.stdout.write( + self.style.SUCCESS( + f"Processing partition {partition_number} of {total_partitions}" + ) + ) + delete_workbooks( + partition_number, total_partitions, page_size=page_size, pages=pages + ) diff --git a/backend/dissemination/remove_workbook_artifacts.py b/backend/dissemination/remove_workbook_artifacts.py index 5499ae705f..be2c00aa32 100644 --- a/backend/dissemination/remove_workbook_artifacts.py +++ b/backend/dissemination/remove_workbook_artifacts.py @@ -1,4 +1,5 @@ import logging +import math from django.conf import settings from audit.models.models import ExcelFile, SingleAuditChecklist @@ -79,26 +80,22 @@ def delete_files_in_bulk(filenames, sac): def clean_artifacts(sac_list): """ - Remove all workbook artifacts associated with the given list of sac values. + Perform necessary cleanup associated with the given list of sac values. """ try: excel_files = ExcelFile.objects.filter(sac__in=sac_list) files = [f"excel/{excel_file.filename}" for excel_file in excel_files] - sac_to_file_map = { - excel_file.filename: excel_file.sac for excel_file in excel_files - } if files: - # Delete the records from the database - count = excel_files.count() - excel_files.delete() logger.info( - f"Deleted {count} ExcelFile records from fac-db for reports: {[sac.report_id for sac in sac_list]}" + f"Found {len(files)} ExcelFile records for reports: {[sac.report_id for sac in sac_list]}" ) - # Delete the files from S3 in bulk and track results - successful_deletes, failed_deletes = delete_files_in_bulk( - files, sac_list, sac_to_file_map + # Track results but do not delete the ExcelFile records from the database + successful_deletes, failed_deletes = batch_removal( + files, + sac_list, + {excel_file.filename: excel_file.sac for excel_file in excel_files}, ) if failed_deletes: @@ -110,12 +107,8 @@ def clean_artifacts(sac_list): f"Successfully deleted the following files from S3: {successful_deletes}" ) - except ExcelFile.DoesNotExist: - logger.info("No files found to delete for the provided sac reports.") except Exception as e: - logger.error( - f"Failed to delete files from fac-db and S3 for the provided sac values. Error: {e}" - ) + logger.error(f"Failed to process files for the provided sac values. Error: {e}") def batch_removal(filenames, sac_list, sac_to_file_map): @@ -171,19 +164,40 @@ def batch_removal(filenames, sac_list, sac_to_file_map): return [], [{"error_message": str(e)}] -def delete_workbooks(audit_year, page_size=10, pages=None): - """Iterates over disseminated reports for the given audit year.""" +def delete_workbooks(partition_number, total_partitions, page_size=10, pages=None): + """Iterates over disseminated reports for the specified partition.""" - sacs = SingleAuditChecklist.objects.filter( - AUDITYEAR=audit_year, submission_status=SingleAuditChecklist.STATUS.DISSEMINATED - ).order_by("id") - paginator = Paginator(sacs, page_size) + if partition_number < 1 or partition_number > total_partitions: + raise ValueError( + "Invalid partition number. It must be between 1 and the total number of partitions." + ) + all_ids = ( + SingleAuditChecklist.objects.filter( + submission_status=SingleAuditChecklist.STATUS.DISSEMINATED + ) + .values_list("id", flat=True) + .order_by("id") + ) + + total_ids = len(all_ids) + ids_per_partition = math.ceil(total_ids / total_partitions) + + start_index = (partition_number - 1) * ids_per_partition + end_index = min(partition_number * ids_per_partition, total_ids) + + ids_to_process = all_ids[start_index:end_index] + + sacs = SingleAuditChecklist.objects.filter(id__in=ids_to_process).order_by("id") + + paginator = Paginator(sacs, page_size) total_pages = ( paginator.num_pages if pages is None else min(pages, paginator.num_pages) ) - logger.info(f"Retrieving {sacs.count()} reports for audit year {audit_year}") + logger.info( + f"Retrieving {sacs.count()} reports for partition {partition_number} of {total_partitions}" + ) for page_number in range(1, total_pages + 1): try: diff --git a/backend/dissemination/test_remove_workbook_artifacts.py b/backend/dissemination/test_remove_workbook_artifacts.py index b126486b7e..801a0d246c 100644 --- a/backend/dissemination/test_remove_workbook_artifacts.py +++ b/backend/dissemination/test_remove_workbook_artifacts.py @@ -4,8 +4,8 @@ from model_bakery import baker from dissemination.remove_workbook_artifacts import ( - # clean_artifacts, - # delete_workbooks, + clean_artifacts, + delete_workbooks, remove_workbook_artifacts, ) @@ -60,105 +60,96 @@ def test_removed_workbook_artifacts_no_files(self, mock_delete_files_in_bulk): mock_delete_files_in_bulk.assert_not_called() -# class CleanArtifactsTestCase(TestCase): - -# @patch("dissemination.remove_workbook_artifacts.batch_removal") -# def test_clean_artifacts_success(self, mock_batch_removal): -# # Create SAC instances -# sac_1 = baker.make(SingleAuditChecklist, report_id="report_id_1") -# sac_2 = baker.make(SingleAuditChecklist, report_id="report_id_2") - -# # Create ExcelFile instances -# excel_file_1 = baker.make(ExcelFile, sac=sac_1, form_section="section_1") -# excel_file_2 = baker.make(ExcelFile, sac=sac_2, form_section="section_2") - -# sac_list = [sac_1, sac_2] -# clean_artifacts(sac_list) - -# # Assert that the ExcelFile instances are deleted -# self.assertFalse(ExcelFile.objects.filter(sac__in=sac_list).exists()) - -# # Assert S3 bulk delete was called with the correct filenames -# mock_batch_removal.assert_called_once_with( -# [ -# f"excel/{sac_1.report_id}--{excel_file_1.form_section}.xlsx", -# f"excel/{sac_2.report_id}--{excel_file_2.form_section}.xlsx", -# ], -# sac_list, -# { -# f"{sac_1.report_id}--{excel_file_1.form_section}.xlsx": sac_1, -# f"{sac_2.report_id}--{excel_file_2.form_section}.xlsx": sac_2, -# }, -# ) - -# @patch("dissemination.remove_workbook_artifacts.batch_removal") -# def test_clean_artifacts_no_files(self, mock_batch_removal): -# sac = baker.make(SingleAuditChecklist, report_id="test_report_id") - -# # Ensure no ExcelFile instances exist for this SAC -# ExcelFile.objects.filter(sac=sac).delete() - -# clean_artifacts([sac]) - -# # Assert that no ExcelFile instances exist -# self.assertFalse(ExcelFile.objects.filter(sac=sac).exists()) - -# # Assert S3 bulk delete was not called -# mock_batch_removal.assert_not_called() - - -# class DeleteWorkbooksTestCase(TestCase): - -# @patch("dissemination.remove_workbook_artifacts.clean_artifacts") -# def test_delete_workbooks_single_page(self, mock_clean_artifacts): -# sac = baker.make( -# SingleAuditChecklist, -# AUDITYEAR=22, -# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED, -# ) -# baker.make(ExcelFile, sac=sac, filename="file_1.xlsx") - -# delete_workbooks(22, page_size=10, pages=1) - -# # Assert clean_artifacts was called with the correct sac list -# mock_clean_artifacts.assert_called_once() - -# @patch("dissemination.remove_workbook_artifacts.clean_artifacts") -# def test_delete_workbooks_multiple_pages(self, mock_clean_artifacts): -# sac_1 = baker.make( -# SingleAuditChecklist, -# AUDITYEAR=22, -# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED, -# ) -# sac_2 = baker.make( -# SingleAuditChecklist, -# AUDITYEAR=22, -# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED, -# ) -# baker.make(ExcelFile, sac=sac_1, filename="file_1.xlsx") -# baker.make(ExcelFile, sac=sac_2, filename="file_2.xlsx") - -# delete_workbooks(22, page_size=1, pages=2) - -# # Assert clean_artifacts was called twice, once for each page -# self.assertEqual(mock_clean_artifacts.call_count, 2) - -# @patch("dissemination.remove_workbook_artifacts.clean_artifacts") -# def test_delete_workbooks_all_pages(self, mock_clean_artifacts): -# sac_1 = baker.make( -# SingleAuditChecklist, -# AUDITYEAR=22, -# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED, -# ) -# sac_2 = baker.make( -# SingleAuditChecklist, -# AUDITYEAR=22, -# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED, -# ) -# baker.make(ExcelFile, sac=sac_1, filename="file_1.xlsx") -# baker.make(ExcelFile, sac=sac_2, filename="file_2.xlsx") - -# delete_workbooks(22, page_size=1) - -# # Assert clean_artifacts was called twice, once for each page since we don't limit pages -# self.assertEqual(mock_clean_artifacts.call_count, 2) +class CleanArtifactsTestCase(TestCase): + + @patch("dissemination.remove_workbook_artifacts.batch_removal") + def test_clean_artifacts_success(self, mock_batch_removal): + # Create SAC instances + sac_1 = baker.make(SingleAuditChecklist, report_id="report_id_1") + sac_2 = baker.make(SingleAuditChecklist, report_id="report_id_2") + + # Create ExcelFile instances + excel_file_1 = baker.make(ExcelFile, sac=sac_1, form_section="section_1") + excel_file_2 = baker.make(ExcelFile, sac=sac_2, form_section="section_2") + + sac_list = [sac_1, sac_2] + clean_artifacts(sac_list) + + # Assert that the ExcelFile instances still exist (no deletion) + self.assertTrue(ExcelFile.objects.filter(sac__in=sac_list).exists()) + + # Assert S3 bulk delete was called with the correct filenames + mock_batch_removal.assert_called_once_with( + [ + f"excel/{sac_1.report_id}--{excel_file_1.form_section}.xlsx", + f"excel/{sac_2.report_id}--{excel_file_2.form_section}.xlsx", + ], + sac_list, + { + f"{sac_1.report_id}--{excel_file_1.form_section}.xlsx": sac_1, + f"{sac_2.report_id}--{excel_file_2.form_section}.xlsx": sac_2, + }, + ) + + @patch("dissemination.remove_workbook_artifacts.batch_removal") + def test_clean_artifacts_no_files(self, mock_batch_removal): + sac = baker.make(SingleAuditChecklist, report_id="test_report_id") + + # Ensure no ExcelFile instances exist for this SAC + ExcelFile.objects.filter(sac=sac).delete() + + clean_artifacts([sac]) + + # Assert that no ExcelFile instances exist + self.assertFalse(ExcelFile.objects.filter(sac=sac).exists()) + + # Assert S3 bulk delete was not called + mock_batch_removal.assert_not_called() + + +class DeleteWorkbooksTestCase(TestCase): + + def setUp(self): + # Common setup for SAC instances + self.sac_1 = baker.make(SingleAuditChecklist, id=1, report_id="report_1") + self.sac_2 = baker.make(SingleAuditChecklist, id=2, report_id="report_2") + # Create associated ExcelFile instances + self.excel_file_1 = baker.make( + ExcelFile, sac=self.sac_1, form_section="section_1" + ) + self.excel_file_2 = baker.make( + ExcelFile, sac=self.sac_2, form_section="section_2" + ) + # Update submission status to DISSEMINATED + self.sac_1.submission_status = SingleAuditChecklist.STATUS.DISSEMINATED + self.sac_2.submission_status = SingleAuditChecklist.STATUS.DISSEMINATED + self.sac_1.save() + self.sac_2.save() + + @patch("dissemination.remove_workbook_artifacts.clean_artifacts") + def test_delete_workbooks_single_page(self, mock_clean_artifacts): + """Test delete_workbooks with a single page of workbooks""" + delete_workbooks(partition_number=1, total_partitions=1, page_size=10, pages=1) + + mock_clean_artifacts.assert_called_once_with([self.sac_1, self.sac_2]) + + @patch("dissemination.remove_workbook_artifacts.clean_artifacts") + def test_delete_workbooks_multiple_pages(self, mock_clean_artifacts): + """Test delete_workbooks with multiple pages of workbooks""" + delete_workbooks(partition_number=1, total_partitions=1, page_size=1, pages=2) + + self.assertEqual(mock_clean_artifacts.call_count, 2) + + mock_clean_artifacts.assert_any_call([self.sac_1]) + mock_clean_artifacts.assert_any_call([self.sac_2]) + + @patch("dissemination.remove_workbook_artifacts.clean_artifacts") + def test_delete_workbooks_all_pages(self, mock_clean_artifacts): + """Test delete_workbooks with all pages of workbooks""" + + delete_workbooks(partition_number=1, total_partitions=1, page_size=1) + + self.assertEqual(mock_clean_artifacts.call_count, 2) + + mock_clean_artifacts.assert_any_call([self.sac_1]) + mock_clean_artifacts.assert_any_call([self.sac_2])