diff --git a/biothings/hub/__init__.py b/biothings/hub/__init__.py
index 8a8e8eb32..1d133e6bd 100644
--- a/biothings/hub/__init__.py
+++ b/biothings/hub/__init__.py
@@ -332,6 +332,7 @@ class HubServer(object):
         "diff",
         "index",
         "snapshot",
+        "auto_snapshot_cleaner",
         "release",
         "inspect",
         "sync",
@@ -706,6 +707,18 @@ def configure_snapshot_manager(self):
         snapshot_manager.poll("snapshot", snapshot_manager.snapshot_a_build)
         self.managers["snapshot_manager"] = snapshot_manager
 
+    def configure_auto_snapshot_cleaner_manager(self):
+        """Create the auto snapshot cleaner manager, configure it and store it in ``self.managers``."""
+        assert "snapshot" in self.features, "'auto_snapshot_cleaner' feature requires 'snapshot'"
+        from biothings.hub.dataindex.auto_snapshot_cleanup import AutoSnapshotCleanupManager
+
+        auto_snapshot_cleaner_manager = AutoSnapshotCleanupManager(
+            snapshot_manager=self.managers["snapshot_manager"],
+            job_manager=self.managers["job_manager"],
+        )
+        auto_snapshot_cleaner_manager.configure(config.AUTO_SNAPSHOT_CLEANUP_CONFIG)
+        self.managers["auto_snapshot_cleaner_manager"] = auto_snapshot_cleaner_manager
+
     def configure_release_manager(self):
         assert "diff" in self.features, "'release' feature requires 'diff'"
         assert "snapshot" in self.features, "'release' feature requires 'snapshot'"
diff --git a/biothings/hub/dataindex/auto_snapshot_cleanup.py b/biothings/hub/dataindex/auto_snapshot_cleanup.py
new file mode 100644
index 000000000..6048f5532
--- /dev/null
+++ b/biothings/hub/dataindex/auto_snapshot_cleanup.py
@@ -0,0 +1,67 @@
+from functools import partial
+
+from biothings import config as btconfig
+from biothings.utils.manager import BaseManager
+
+logger = btconfig.logger
+
+
+class AutoSnapshotCleanupManager(BaseManager):
+    """Schedule automatic cleanup of old snapshots, one recurring job per snapshot environment.
+
+    This feature is configured with the AUTO_SNAPSHOT_CLEANUP_CONFIG variable, which maps a
+    snapshot environment name to its options. Environments without a dict entry are skipped.
+
+    AUTO_SNAPSHOT_CLEANUP_CONFIG = {
+        "environment_name": {
+            "schedule": "0 0 * * *",  # run daily at 0am UTC
+            "keep": 3,  # the number of most recent snapshots to keep in one group
+            "group_by": "build_config",  # the attr of which its values form groups
+            "extra_filters": {}  # a set of criterions to limit which snapshots are to be cleaned
+        },
+        ...
+    }
+    """
+
+    # Cron fields are "minute hour day month weekday": run daily at 0am UTC.
+    # A "*" in the minute field would fire every minute of that hour instead.
+    DEFAULT_SCHEDULE = "0 0 * * *"
+
+    def __init__(self, snapshot_manager, job_manager, *args, **kwargs):
+        """Keep a reference to the snapshot manager whose ``cleanup`` gets scheduled."""
+        super().__init__(job_manager, *args, **kwargs)
+
+        self.snapshot_manager = snapshot_manager
+
+    def configure(self, conf=None):
+        """Submit one scheduled cleanup job for each registered environment that has a config.
+
+        ``conf`` maps environment names to option dicts (see the class docstring);
+        a falsy ``conf`` is treated as an empty mapping, so nothing gets scheduled.
+        """
+        self.auto_snapshot_cleaner_config = conf or {}
+
+        for env_name in self.snapshot_manager.register.keys():
+            cleaner_config = self.auto_snapshot_cleaner_config.get(env_name)
+
+            if not isinstance(cleaner_config, dict):
+                logger.info(f"Snapshot environment: {env_name}: No cleaner config found!")
+                continue
+
+            schedule = cleaner_config.get("schedule") or self.DEFAULT_SCHEDULE
+            keep = cleaner_config.get("keep")
+            group_by = cleaner_config.get("group_by")
+            # "extra_filters" is optional; unpacking None with ** would raise TypeError.
+            extra_filters = cleaner_config.get("extra_filters") or {}
+
+            self.job_manager.submit(
+                partial(
+                    self.snapshot_manager.cleanup,
+                    env=env_name,
+                    keep=keep,
+                    group_by=group_by,
+                    dryrun=False,
+                    **extra_filters,
+                ),
+                schedule=schedule,
+            )
diff --git a/biothings/hub/default_config.py b/biothings/hub/default_config.py
index f1000a2d6..c01e3f64b 100644
--- a/biothings/hub/default_config.py
+++ b/biothings/hub/default_config.py
@@ -225,6 +225,20 @@
 # Snapshot environment configuration
 SNAPSHOT_CONFIG = {}
 
+# Auto snapshot cleaner feature will use this configuration to get schedule config for corresponding environment.
+AUTO_SNAPSHOT_CLEANUP_CONFIG = None
+"""
+AUTO_SNAPSHOT_CLEANUP_CONFIG = {
+    "environment_name": {
+        "schedule": "0 0 * * *",  # run daily at 0am UTC
+        "keep": 3,  # the number of most recent snapshots to keep in one group
+        "group_by": "build_config",  # the attr of which its values form groups
+        "extra_filters": {}  # a set of criterions to limit which snapshots are to be cleaned
+    },
+    ...
+}
+"""
+
 # reporting diff results, number of IDs to consider (to avoid too much mem usage)
 MAX_REPORTED_IDS = 1000
 # for diff updates, number of IDs randomly picked as examples when rendering the report