Skip to content

Commit

Permalink
#3748 Updated django command to remove files from S3 only
Browse files Browse the repository at this point in the history
  • Loading branch information
sambodeme committed Aug 17, 2024
1 parent df6d110 commit 9d58353
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 134 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,20 @@


class Command(BaseCommand):
help = "Delete workbook artifacts for specified years"
help = "Delete workbook artifacts for a specific partition of disseminated reports."

def add_arguments(self, parser):
parser.add_argument(
"--year",
"--partition_number",
type=int,
required=True,
help="Comma-separated list of audit years to process",
help="The partition number to process (e.g., 1, 2, 3).",
)
parser.add_argument(
"--total_partitions",
type=int,
required=True,
help="The total number of partitions (e.g., 4 if splitting the load into four parts).",
)
parser.add_argument(
"--page_size",
Expand All @@ -28,14 +34,21 @@ def add_arguments(self, parser):
"--pages",
type=int,
required=False,
default=100,
default=None,
help="Maximum number of pages to process",
)

def handle(self, *args, **options):
year = options["year"]
partition_number = options["partition_number"]
total_partitions = options["total_partitions"]
page_size = options["page_size"]
pages = options["pages"]

self.stdout.write(self.style.SUCCESS(f"Processing audit year {year}"))
delete_workbooks(year, page_size=page_size, pages=pages)
self.stdout.write(
self.style.SUCCESS(
f"Processing partition {partition_number} of {total_partitions}"
)
)
delete_workbooks(
partition_number, total_partitions, page_size=page_size, pages=pages
)
60 changes: 37 additions & 23 deletions backend/dissemination/remove_workbook_artifacts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import math

from django.conf import settings
from audit.models.models import ExcelFile, SingleAuditChecklist
Expand Down Expand Up @@ -79,26 +80,22 @@ def delete_files_in_bulk(filenames, sac):

def clean_artifacts(sac_list):
"""
Remove all workbook artifacts associated with the given list of sac values.
Perform necessary cleanup associated with the given list of sac values.
"""
try:
excel_files = ExcelFile.objects.filter(sac__in=sac_list)
files = [f"excel/{excel_file.filename}" for excel_file in excel_files]
sac_to_file_map = {
excel_file.filename: excel_file.sac for excel_file in excel_files
}

if files:
# Delete the records from the database
count = excel_files.count()
excel_files.delete()
logger.info(
f"Deleted {count} ExcelFile records from fac-db for reports: {[sac.report_id for sac in sac_list]}"
f"Found {len(files)} ExcelFile records for reports: {[sac.report_id for sac in sac_list]}"
)

# Delete the files from S3 in bulk and track results
successful_deletes, failed_deletes = delete_files_in_bulk(
files, sac_list, sac_to_file_map
# Track results but do not delete the ExcelFile records from the database
successful_deletes, failed_deletes = batch_removal(
files,
sac_list,
{excel_file.filename: excel_file.sac for excel_file in excel_files},
)

if failed_deletes:
Expand All @@ -110,12 +107,8 @@ def clean_artifacts(sac_list):
f"Successfully deleted the following files from S3: {successful_deletes}"
)

except ExcelFile.DoesNotExist:
logger.info("No files found to delete for the provided sac reports.")
except Exception as e:
logger.error(
f"Failed to delete files from fac-db and S3 for the provided sac values. Error: {e}"
)
logger.error(f"Failed to process files for the provided sac values. Error: {e}")


def batch_removal(filenames, sac_list, sac_to_file_map):
Expand Down Expand Up @@ -171,19 +164,40 @@ def batch_removal(filenames, sac_list, sac_to_file_map):
return [], [{"error_message": str(e)}]


def delete_workbooks(audit_year, page_size=10, pages=None):
"""Iterates over disseminated reports for the given audit year."""
def delete_workbooks(partition_number, total_partitions, page_size=10, pages=None):
"""Iterates over disseminated reports for the specified partition."""

sacs = SingleAuditChecklist.objects.filter(
AUDITYEAR=audit_year, submission_status=SingleAuditChecklist.STATUS.DISSEMINATED
).order_by("id")
paginator = Paginator(sacs, page_size)
if partition_number < 1 or partition_number > total_partitions:
raise ValueError(
"Invalid partition number. It must be between 1 and the total number of partitions."
)

all_ids = (
SingleAuditChecklist.objects.filter(
submission_status=SingleAuditChecklist.STATUS.DISSEMINATED
)
.values_list("id", flat=True)
.order_by("id")
)

total_ids = len(all_ids)
ids_per_partition = math.ceil(total_ids / total_partitions)

start_index = (partition_number - 1) * ids_per_partition
end_index = min(partition_number * ids_per_partition, total_ids)

ids_to_process = all_ids[start_index:end_index]

sacs = SingleAuditChecklist.objects.filter(id__in=ids_to_process).order_by("id")

paginator = Paginator(sacs, page_size)
total_pages = (
paginator.num_pages if pages is None else min(pages, paginator.num_pages)
)

logger.info(f"Retrieving {sacs.count()} reports for audit year {audit_year}")
logger.info(
f"Retrieving {sacs.count()} reports for partition {partition_number} of {total_partitions}"
)

for page_number in range(1, total_pages + 1):
try:
Expand Down
199 changes: 95 additions & 104 deletions backend/dissemination/test_remove_workbook_artifacts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from model_bakery import baker

from dissemination.remove_workbook_artifacts import (
# clean_artifacts,
# delete_workbooks,
clean_artifacts,
delete_workbooks,
remove_workbook_artifacts,
)

Expand Down Expand Up @@ -60,105 +60,96 @@ def test_removed_workbook_artifacts_no_files(self, mock_delete_files_in_bulk):
mock_delete_files_in_bulk.assert_not_called()


# class CleanArtifactsTestCase(TestCase):

# @patch("dissemination.remove_workbook_artifacts.batch_removal")
# def test_clean_artifacts_success(self, mock_batch_removal):
# # Create SAC instances
# sac_1 = baker.make(SingleAuditChecklist, report_id="report_id_1")
# sac_2 = baker.make(SingleAuditChecklist, report_id="report_id_2")

# # Create ExcelFile instances
# excel_file_1 = baker.make(ExcelFile, sac=sac_1, form_section="section_1")
# excel_file_2 = baker.make(ExcelFile, sac=sac_2, form_section="section_2")

# sac_list = [sac_1, sac_2]
# clean_artifacts(sac_list)

# # Assert that the ExcelFile instances are deleted
# self.assertFalse(ExcelFile.objects.filter(sac__in=sac_list).exists())

# # Assert S3 bulk delete was called with the correct filenames
# mock_batch_removal.assert_called_once_with(
# [
# f"excel/{sac_1.report_id}--{excel_file_1.form_section}.xlsx",
# f"excel/{sac_2.report_id}--{excel_file_2.form_section}.xlsx",
# ],
# sac_list,
# {
# f"{sac_1.report_id}--{excel_file_1.form_section}.xlsx": sac_1,
# f"{sac_2.report_id}--{excel_file_2.form_section}.xlsx": sac_2,
# },
# )

# @patch("dissemination.remove_workbook_artifacts.batch_removal")
# def test_clean_artifacts_no_files(self, mock_batch_removal):
# sac = baker.make(SingleAuditChecklist, report_id="test_report_id")

# # Ensure no ExcelFile instances exist for this SAC
# ExcelFile.objects.filter(sac=sac).delete()

# clean_artifacts([sac])

# # Assert that no ExcelFile instances exist
# self.assertFalse(ExcelFile.objects.filter(sac=sac).exists())

# # Assert S3 bulk delete was not called
# mock_batch_removal.assert_not_called()


# class DeleteWorkbooksTestCase(TestCase):

# @patch("dissemination.remove_workbook_artifacts.clean_artifacts")
# def test_delete_workbooks_single_page(self, mock_clean_artifacts):
# sac = baker.make(
# SingleAuditChecklist,
# AUDITYEAR=22,
# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED,
# )
# baker.make(ExcelFile, sac=sac, filename="file_1.xlsx")

# delete_workbooks(22, page_size=10, pages=1)

# # Assert clean_artifacts was called with the correct sac list
# mock_clean_artifacts.assert_called_once()

# @patch("dissemination.remove_workbook_artifacts.clean_artifacts")
# def test_delete_workbooks_multiple_pages(self, mock_clean_artifacts):
# sac_1 = baker.make(
# SingleAuditChecklist,
# AUDITYEAR=22,
# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED,
# )
# sac_2 = baker.make(
# SingleAuditChecklist,
# AUDITYEAR=22,
# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED,
# )
# baker.make(ExcelFile, sac=sac_1, filename="file_1.xlsx")
# baker.make(ExcelFile, sac=sac_2, filename="file_2.xlsx")

# delete_workbooks(22, page_size=1, pages=2)

# # Assert clean_artifacts was called twice, once for each page
# self.assertEqual(mock_clean_artifacts.call_count, 2)

# @patch("dissemination.remove_workbook_artifacts.clean_artifacts")
# def test_delete_workbooks_all_pages(self, mock_clean_artifacts):
# sac_1 = baker.make(
# SingleAuditChecklist,
# AUDITYEAR=22,
# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED,
# )
# sac_2 = baker.make(
# SingleAuditChecklist,
# AUDITYEAR=22,
# submission_status=SingleAuditChecklist.STATUS.DISSEMINATED,
# )
# baker.make(ExcelFile, sac=sac_1, filename="file_1.xlsx")
# baker.make(ExcelFile, sac=sac_2, filename="file_2.xlsx")

# delete_workbooks(22, page_size=1)

# # Assert clean_artifacts was called twice, once for each page since we don't limit pages
# self.assertEqual(mock_clean_artifacts.call_count, 2)
class CleanArtifactsTestCase(TestCase):

@patch("dissemination.remove_workbook_artifacts.batch_removal")
def test_clean_artifacts_success(self, mock_batch_removal):
# Create SAC instances
sac_1 = baker.make(SingleAuditChecklist, report_id="report_id_1")
sac_2 = baker.make(SingleAuditChecklist, report_id="report_id_2")

# Create ExcelFile instances
excel_file_1 = baker.make(ExcelFile, sac=sac_1, form_section="section_1")
excel_file_2 = baker.make(ExcelFile, sac=sac_2, form_section="section_2")

sac_list = [sac_1, sac_2]
clean_artifacts(sac_list)

# Assert that the ExcelFile instances still exist (no deletion)
self.assertTrue(ExcelFile.objects.filter(sac__in=sac_list).exists())

# Assert S3 bulk delete was called with the correct filenames
mock_batch_removal.assert_called_once_with(
[
f"excel/{sac_1.report_id}--{excel_file_1.form_section}.xlsx",
f"excel/{sac_2.report_id}--{excel_file_2.form_section}.xlsx",
],
sac_list,
{
f"{sac_1.report_id}--{excel_file_1.form_section}.xlsx": sac_1,
f"{sac_2.report_id}--{excel_file_2.form_section}.xlsx": sac_2,
},
)

@patch("dissemination.remove_workbook_artifacts.batch_removal")
def test_clean_artifacts_no_files(self, mock_batch_removal):
sac = baker.make(SingleAuditChecklist, report_id="test_report_id")

# Ensure no ExcelFile instances exist for this SAC
ExcelFile.objects.filter(sac=sac).delete()

clean_artifacts([sac])

# Assert that no ExcelFile instances exist
self.assertFalse(ExcelFile.objects.filter(sac=sac).exists())

# Assert S3 bulk delete was not called
mock_batch_removal.assert_not_called()


class DeleteWorkbooksTestCase(TestCase):

def setUp(self):
# Common setup for SAC instances
self.sac_1 = baker.make(SingleAuditChecklist, id=1, report_id="report_1")
self.sac_2 = baker.make(SingleAuditChecklist, id=2, report_id="report_2")
# Create associated ExcelFile instances
self.excel_file_1 = baker.make(
ExcelFile, sac=self.sac_1, form_section="section_1"
)
self.excel_file_2 = baker.make(
ExcelFile, sac=self.sac_2, form_section="section_2"
)
# Update submission status to DISSEMINATED
self.sac_1.submission_status = SingleAuditChecklist.STATUS.DISSEMINATED
self.sac_2.submission_status = SingleAuditChecklist.STATUS.DISSEMINATED
self.sac_1.save()
self.sac_2.save()

@patch("dissemination.remove_workbook_artifacts.clean_artifacts")
def test_delete_workbooks_single_page(self, mock_clean_artifacts):
"""Test delete_workbooks with a single page of workbooks"""
delete_workbooks(partition_number=1, total_partitions=1, page_size=10, pages=1)

mock_clean_artifacts.assert_called_once_with([self.sac_1, self.sac_2])

@patch("dissemination.remove_workbook_artifacts.clean_artifacts")
def test_delete_workbooks_multiple_pages(self, mock_clean_artifacts):
"""Test delete_workbooks with multiple pages of workbooks"""
delete_workbooks(partition_number=1, total_partitions=1, page_size=1, pages=2)

self.assertEqual(mock_clean_artifacts.call_count, 2)

mock_clean_artifacts.assert_any_call([self.sac_1])
mock_clean_artifacts.assert_any_call([self.sac_2])

@patch("dissemination.remove_workbook_artifacts.clean_artifacts")
def test_delete_workbooks_all_pages(self, mock_clean_artifacts):
"""Test delete_workbooks with all pages of workbooks"""

delete_workbooks(partition_number=1, total_partitions=1, page_size=1)

self.assertEqual(mock_clean_artifacts.call_count, 2)

mock_clean_artifacts.assert_any_call([self.sac_1])
mock_clean_artifacts.assert_any_call([self.sac_2])

0 comments on commit 9d58353

Please sign in to comment.