Adam tv detection #16

Open · wants to merge 47 commits into main from adam-tv-detection

Changes from all commits · 47 commits
962bf26
tv-detection first commit
adam-peaston-SC Sep 5, 2023
41e4d05
tv-detections working, pending epoch completion
adam-peaston-SC Sep 6, 2023
a09c24b
all detection and segmentation working pending full epoch
adam-peaston-SC Sep 6, 2023
7a3cb0c
removed WIP from isc-demos README
adam-peaston-SC Sep 6, 2023
ccbc0f6
Updated tv-detection and tv-segmentation to checkpoint (and report) a…
adam-peaston-SC Sep 6, 2023
9896974
Updated tv-detection to fix epoch roll-over error and added sophistic…
adam-peaston-SC Sep 7, 2023
cea7e8b
Updates
adam-peaston-SC Sep 7, 2023
7ce8cb4
updates
adam-peaston-SC Sep 7, 2023
ba815e8
updates
adam-peaston-SC Sep 7, 2023
3539a17
Added Timer to cycling_utils/saving
adam-peaston-SC Sep 7, 2023
b61aec5
moved Timer to its own file, updated training scripts to time imports
adam-peaston-SC Sep 7, 2023
228d6e3
Updated timer to only report if the global rank of the process is == …
adam-peaston-SC Sep 7, 2023
a887392
Updated tv-detection to use new InterruptableDistributedGroupedBatchS…
adam-peaston-SC Sep 10, 2023
1595277
detection and segmentation working all but detection eval
adam-peaston-SC Sep 10, 2023
d44cd5b
minor updates
adam-peaston-SC Sep 10, 2023
62df2d4
updates thinking about checkpointing evaluation better
adam-peaston-SC Sep 10, 2023
60db14f
minor updates and full training kickoffs
adam-peaston-SC Sep 10, 2023
0b7c932
monai autoencoder traininig interruptably
adam-peaston-SC Sep 13, 2023
1ec3b1c
updateds
adam-peaston-SC Sep 13, 2023
bd04c29
updates
adam-peaston-SC Sep 13, 2023
fd82382
updates
adam-peaston-SC Sep 13, 2023
7259776
potential fix for cudnnbatchnorm error
adam-peaston-SC Sep 13, 2023
91fc314
minor update
adam-peaston-SC Sep 13, 2023
70cba9d
still failing when discriminator included in loss
adam-peaston-SC Sep 13, 2023
e2863d1
autoencoder issue fixed
adam-peaston-SC Sep 14, 2023
43b3bd2
diffusion model training, finished 3 epochs
adam-peaston-SC Sep 15, 2023
29c6bfb
updates to args to fix issue resuming
adam-peaston-SC Sep 15, 2023
288f0a9
Latest updates
adam-peaston-SC Sep 18, 2023
a560f59
Integrated tensorboard with maskrcnn and monai
adam-peaston-SC Sep 19, 2023
6859b18
backup
adam-peaston-SC Sep 19, 2023
5d7a69b
updates, updates, updates
adam-peaston-SC Sep 19, 2023
89bf099
code tidier
adam-peaston-SC Sep 19, 2023
fcd1387
Updates and experiments aligned with literature benchmarks
adam-peaston-SC Sep 20, 2023
953cdc4
Fixed reporting issue with mask/retina
adam-peaston-SC Sep 20, 2023
580d095
Merge branch 'main' into adam-tv-detection
StrongFennecs Sep 21, 2023
ec3760a
tidy up and update of timer etc.
adam-peaston-SC Sep 21, 2023
62e5360
just in case
adam-peaston-SC Sep 21, 2023
dc55f1b
changes
adam-peaston-SC Sep 21, 2023
4683fc0
changes
adam-peaston-SC Sep 21, 2023
43e7bd1
fixed mess thank you Calvin!
adam-peaston-SC Sep 21, 2023
37d878b
tidy up, linting, deleting log files
adam-peaston-SC Sep 22, 2023
4ef575e
removed local pyproject.toml files
adam-peaston-SC Sep 22, 2023
d09abfb
removed monai
adam-peaston-SC Sep 22, 2023
addc304
updated readmes
adam-peaston-SC Sep 22, 2023
73531f4
updated readme with ref to requirements
adam-peaston-SC Sep 22, 2023
0d09904
removed resuming dir arg from fcn_resnet101.isc
adam-peaston-SC Sep 22, 2023
e043663
tackling instabilities
adam-peaston-SC Sep 25, 2023
8 changes: 4 additions & 4 deletions README.md

@@ -64,12 +64,12 @@ isc clusters # view the status of the clusters

(from https://github.com/pytorch/vision/tree/main/references/segmentation)

-- WIP [fcn_resnet101.isc](./tv-segmentation/fcn_resnet101.isc)
-- WIP [deeplabv3_mobilenet_v3_large.isc](./tv-segmentation/deeplabv3_mobilenet_v3_large.isc)
+- [fcn_resnet101.isc](./tv-segmentation/fcn_resnet101.isc)
+- [deeplabv3_mobilenet_v3_large.isc](./tv-segmentation/deeplabv3_mobilenet_v3_large.isc)

### tv-detection

(from https://github.com/pytorch/vision/tree/main/references/detection)

-- WIP [maskrcnn_resnet50_fpn.isc](./tv-detection/fasterrcnn_resnet50_fpn.isc)
-- WIP [retinanet_resnet50_fpn.isc](./tv-detection/retinanet_resnet50_fpn.isc)
+- [maskrcnn_resnet50_fpn.isc](./tv-detection/fasterrcnn_resnet50_fpn.isc)
+- [retinanet_resnet50_fpn.isc](./tv-detection/retinanet_resnet50_fpn.isc)
8 changes: 4 additions & 4 deletions cycling_utils/cycling_utils/__init__.py

@@ -1,5 +1,5 @@
-from .saving import atomic_torch_save
-from .sampler import InterruptableDistributedSampler
-from .lightning_utils import EpochHandler
+from .timer import Timer, TimestampedTimer
+from .saving import atomic_torch_save, MetricsTracker
+from .sampler import InterruptableDistributedSampler, InterruptableDistributedGroupedBatchSampler

-__all__ = ["InterruptableDistributedSampler", "atomic_torch_save", "EpochHandler"]
+__all__ = ["InterruptableDistributedSampler", "InterruptableDistributedGroupedBatchSampler", "atomic_torch_save", "Timer", "TimestampedTimer"]
164 changes: 164 additions & 0 deletions cycling_utils/cycling_utils/sampler.py

@@ -2,6 +2,8 @@
import torch
from torch.utils.data import Dataset, DistributedSampler
from contextlib import contextmanager
from collections import defaultdict
from itertools import chain, repeat

class HasNotResetProgressError(Exception):
pass
@@ -113,3 +115,165 @@ def in_epoch(self, epoch):
self.set_epoch(epoch)
yield
self._reset_progress()

def _repeat_to_at_least(iterable, n):
repeat_times = math.ceil(n / len(iterable))
repeated = chain.from_iterable(repeat(iterable, repeat_times))
return list(repeated)

class InterruptableDistributedGroupedBatchSampler(DistributedSampler):
def __init__(
self,
dataset: Dataset,
group_ids: list,
batch_size: int,
num_replicas: int | None = None,
rank: int | None = None,
shuffle: bool = True,
seed: int = 0,
drop_last: bool = False,
) -> None:
"""
This is a DistributedSampler that can be suspended and resumed.

This works by keeping track of the sample batches that have already been
dispatched. This InterruptableDistributedGroupedBatchSampler also
        reproduces the sampling strategy exhibited in the torchvision detection
        reference, wherein batches are created from images within the same
'group', defined in the torchvision example by similarity of image
aspect ratio.

https://github.com/pytorch/vision/tree/main/references/detection

For this reason, InterruptableDistributedGroupedBatchSampler progress is
tracked in units of batches, not samples. This is an important
        distinction from the InterruptableDistributedSampler, which tracks progress
in units of samples. The progress is reset to 0 at the end of each epoch.

The epoch is set to 0 at initialization and incremented at the start
of each epoch.

Suspending and resuming the sampler is done by saving and loading the
state dict. The state dict contains the epoch and progress.
"""
super().__init__(dataset, num_replicas, rank, shuffle, seed, drop_last)

# OVERALL STATUS INDICATOR
self.progress = 0
self._has_reset_progress = True

# PRE-PROCESS DATASET
if shuffle:
# deterministically shuffle based on seed
g = torch.Generator()
g.manual_seed(seed)
indices = torch.randperm(len(dataset), generator=g).tolist() # type: ignore[arg-type]
else:
indices = list(range(len(dataset))) # type: ignore[arg-type]

if not self.drop_last:
            # add extra samples to make the dataset evenly divisible across ranks
padding_size = self.total_size - len(indices)
if padding_size <= len(indices):
indices += indices[:padding_size]
else:
indices += (indices * math.ceil(padding_size / len(indices)))[:padding_size]
else:
            # remove tail of data to make the dataset evenly divisible across ranks
indices = indices[: self.total_size]
assert len(indices) == self.total_size

# subsample indices to use on this rank
indices = indices[self.rank : self.total_size : self.num_replicas]
        # num_samples is the number of samples to be processed on each rank
assert len(indices) == self.num_samples

# PRE-COMPUTE GROUPED BATCHES

buffer_per_group = defaultdict(list)
samples_per_group = defaultdict(list)
self.num_batches = math.ceil(len(indices)/ batch_size)

self.batches = [] # pre-computed so progress refers to batches, not samples.
for idx in indices:
group_id = group_ids[idx]
buffer_per_group[group_id].append(idx)
samples_per_group[group_id].append(idx)
if len(buffer_per_group[group_id]) == batch_size:
self.batches.append(buffer_per_group[group_id])
del buffer_per_group[group_id]
assert len(buffer_per_group[group_id]) < batch_size

# now we have run out of elements that satisfy
# the group criteria, let's return the remaining
# elements so that the size of the sampler is
# deterministic
num_remaining = self.num_batches - len(self.batches)
if num_remaining > 0:
# for the remaining batches, take first the buffers with the largest number
# of elements
for group_id, _ in sorted(buffer_per_group.items(), key=lambda x: len(x[1]), reverse=True):
remaining = batch_size - len(buffer_per_group[group_id])
samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining)
buffer_per_group[group_id].extend(samples_from_group_id[:remaining])
assert len(buffer_per_group[group_id]) == batch_size
self.batches.append(buffer_per_group[group_id])
num_remaining -= 1
if num_remaining == 0:
break

assert len(self.batches) == self.num_batches


def _reset_progress(self):
self.progress = 0
self._has_reset_progress = True

def set_epoch(self, epoch: int) -> None:
raise NotImplementedError("Use `with sampler.in_epoch(epoch)` instead of `sampler.set_epoch(epoch)`")

def _set_epoch(self, epoch):
if not self._has_reset_progress:
raise HasNotResetProgressError("You must reset progress before setting epoch e.g. `sampler.reset_progress()`\nor use `with sampler.in_epoch(epoch)` instead of `sampler.set_epoch(epoch)`")
self.epoch = epoch

def state_dict(self):
return {"progress": self.progress, "epoch": self.epoch}

def load_state_dict(self, state_dict):
self.progress = state_dict["progress"]
if not self.progress <= self.num_batches:
raise AdvancedTooFarError(f"progress should be less than or equal to the number of batches. progress: {self.progress}, num_batches: {self.num_batches}")
self.epoch = state_dict["epoch"]

def advance(self):
"""
Record that one batch has been consumed.
"""
self.progress += 1
if self.progress > self.num_batches:
raise AdvancedTooFarError(f"You have advanced too far. You can only advance up to the total number of batches: {self.num_batches}.")

def __iter__(self):

# slice from progress to pick up where we left off
for batch in self.batches[self.progress:]:
yield batch

def __len__(self):
return self.num_batches

@contextmanager
def in_epoch(self, epoch):
"""
This context manager is used to set the epoch. It is used like this:
```
for epoch in range(0, 10):
with sampler.in_epoch(epoch):
for step, (x, ) in enumerate(dataloader):
# work would be done here...
```
"""
self._set_epoch(epoch)
yield
self._reset_progress()
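The class above supports the suspend/resume pattern described in its docstring. The following is a minimal, illustrative sketch (not code from this PR) of how it might be wired into a detection training loop; it assumes `torch.distributed` is already initialised, and names such as `dataset`, `group_ids`, `model`, `optimizer`, `collate_fn`, `checkpoint_path` and `epochs` are placeholders, not identifiers from this repository.

```python
# Illustrative resume-capable loop; dataset, group_ids, model, optimizer,
# collate_fn, checkpoint_path and epochs are assumed to be defined elsewhere.
import os
import torch
from torch.utils.data import DataLoader
from cycling_utils import InterruptableDistributedGroupedBatchSampler, atomic_torch_save

sampler = InterruptableDistributedGroupedBatchSampler(dataset, group_ids, batch_size=2)
loader = DataLoader(dataset, batch_sampler=sampler, collate_fn=collate_fn)

if os.path.isfile(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(checkpoint["model"])
    sampler.load_state_dict(checkpoint["sampler"])  # restores epoch and batch progress

for epoch in range(sampler.epoch, epochs):
    with sampler.in_epoch(epoch):  # sets the epoch; progress resets when the epoch completes
        for step, (images, targets) in enumerate(loader):
            # ... forward / backward / optimizer step ...
            sampler.advance()  # record that one batch was consumed
            if step % 50 == 0:  # checkpoint periodically, not every step
                atomic_torch_save(
                    {"model": model.state_dict(), "sampler": sampler.state_dict()},
                    checkpoint_path,
                )
```

Because `__iter__` slices `self.batches[self.progress:]`, a job restored from the checkpoint picks up at the first undispatched batch of the interrupted epoch rather than restarting the epoch.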
82 changes: 80 additions & 2 deletions cycling_utils/cycling_utils/saving.py

@@ -1,9 +1,87 @@
from pathlib import Path
import os
import torch
import torch.distributed as dist
from collections import defaultdict

-def atomic_torch_save(obj, f: str | Path, **kwargs):
+def atomic_torch_save(obj, f: str | Path, timer=None, **kwargs):
     f = str(f)
     temp_f = f + ".temp"
     torch.save(obj, temp_f, **kwargs)
-    os.replace(temp_f, f)
+    if timer is not None:
+        timer.report(f'saving temp checkpoint')
+    os.replace(temp_f, f)
+    if timer is not None:
+        timer.report(f'replacing temp checkpoint with checkpoint')
+        return timer
+    else:
+        return

class MetricsTracker:
'''
    This is a general-purpose MetricsTracker to assist with recording metrics from
    a distributed cluster.

The MetricsTracker is initialised without any prior knowledge of the metrics
to be tracked.

>>> metrics = MetricsTracker()

    Metrics can be accumulated as required, for example after each batch is processed
    by the model, by passing a dictionary of metrics to be updated, then reduced
    across all nodes. Metric values are stored in a defaultdict.

>>> preds = model(input)
>>> loss = loss_fn(preds, targs)
>>> metrics.update({"images_seen": len(images), "loss": loss.item()})
>>> metrics.reduce()

Metrics are assumed to be summable scalar values. After calling reduce(), the
metrics.local object contains the sum of corresponding metrics from all nodes
which can be used for intermediate reporting or logging.

>>> writer = SummaryWriter()
>>> for metric,val in metrics.local.items():
>>> writer.add_scalar(metric, val, step)
>>> writer.flush()
>>> writer.close()

Once all processing of the current batch has been completed, the MetricsTracker
can be prepared for the next batch using reset_local().

    >>> metrics.reset_local()

Metrics are also accumulated for consecutive batches in the metrics.agg object.
At the end of an epoch the MetricsTracker can be reset using end_epoch().

>>> metrics.end_epoch()

The MetricsTracker saves a copy of the accumulated metrics (metrics.agg) for
each epoch which can be stored within a checkpoint.
'''
def __init__(self):
self.local = defaultdict(float)
self.agg = defaultdict(float)
self.epoch_reports = []

def update(self, metrics: dict):
for m,v in metrics.items():
self.local[m] += v

def reduce(self):
names, local = zip(*self.local.items())
local = torch.tensor(local, dtype=torch.float16, requires_grad=False, device='cuda')
dist.all_reduce(local, op=dist.ReduceOp.SUM)
self.local = defaultdict(float, zip(names, local.cpu().numpy()))
for k in self.local:
self.agg[k] += self.local[k]

def reset_local(self):
self.local = defaultdict(float)

def end_epoch(self):
self.epoch_reports.append(dict(self.agg))
self.local = defaultdict(float)
self.agg = defaultdict(float)
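As a rough sketch of how MetricsTracker and atomic_torch_save might be combined in a single training step (not code from this PR): it assumes a process group is already initialised, CUDA is available for the all-reduce inside `reduce()`, and `dataloader`, `model`, `loss_fn` and `optimizer` are placeholders defined elsewhere.

```python
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
from cycling_utils import MetricsTracker, atomic_torch_save

metrics = MetricsTracker()
writer = SummaryWriter(log_dir="runs/example")  # illustrative log directory

for step, (images, targets) in enumerate(dataloader):
    preds = model(images)
    loss = loss_fn(preds, targets)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    metrics.update({"images_seen": len(images), "loss": loss.item()})
    metrics.reduce()  # sum each metric across all ranks
    if dist.get_rank() == 0:
        for name, val in metrics.local.items():
            writer.add_scalar(name, val, step)
    metrics.reset_local()

    atomic_torch_save(
        {"model": model.state_dict(), "metrics": metrics.epoch_reports},
        "checkpoint.pt",  # illustrative path; written via temp file then os.replace
    )

metrics.end_epoch()  # snapshot per-epoch aggregates for the next checkpoint
```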


64 changes: 64 additions & 0 deletions cycling_utils/cycling_utils/timer.py

@@ -0,0 +1,64 @@
import os, time
from datetime import datetime

class Timer:
'''
This Timer can be integrated within a training routine to provide point-to-point
script timing and reporting.

def main():
timer = Timer()
time.sleep(2)
timer.report("sleeping for 2 seconds")
time.sleep(3)
timer.report("sleeping for 3 seconds")

>>> main()
Start 0.000 ms 0.000 s total
Completed sleeping for 2 seconds 2,000.000 ms 2.000 s total
Completed sleeping for 3 seconds 3,000.000 ms 5.000 s total
'''
def __init__(self, report=None, start_time=None, running=0):
self.start_time = start_time if start_time is not None else time.time()
self.running = running
if str(os.environ["RANK"]) == "0":
report = report if report else "Start"
print("[{:<80}] {:>12} ms, {:>12} s total".format(report, f'{0.0:,.3f}', f'{0.0:,.2f}'))
def report(self, annot):
if str(os.environ["RANK"]) == "0":
now = time.time()
duration = now - self.start_time
self.running += duration
print("Completed {:<70}{:>12} ms, {:>12} s total".format(annot, f'{1000*duration:,.3f}', f'{self.running:,.2f}'))
self.start_time = now

class TimestampedTimer:
'''
This TimestampedTimer can be integrated within a training routine to provide
point-to-point script timing and reporting.

def main():
timer = TimestampedTimer()
time.sleep(2)
timer.report("sleeping for 2 seconds")
time.sleep(3)
timer.report("sleeping for 3 seconds")

>>> main()
[TIME] Start 0.000 ms 0.000 s total
[TIME] Completed sleeping for 2 seconds 2,000.000 ms 2.000 s total
[TIME] Completed sleeping for 3 seconds 3,000.000 ms 5.000 s total
'''
def __init__(self, report=None, start_time=None, running=0):
if str(os.environ["RANK"]) == "0":
self.start_time = start_time if start_time is not None else time.time()
self.running = running
report = report if report else "Start"
print("[ {} ] Completed {:<70}{:>12} ms, {:>12} s total".format(time.strftime("%Y-%m-%d %H:%M:%S"), report, f'{0.0:,.3f}', f'{0.0:,.2f}'))
def report(self, annot):
if str(os.environ["RANK"]) == "0":
now = time.time()
duration = now - self.start_time
self.running += duration
print("[ {} ] Completed {:<70}{:>12} ms, {:>12} s total".format(time.strftime("%Y-%m-%d %H:%M:%S"), annot, f'{1000*duration:,.3f}', f'{self.running:,.2f}'))
self.start_time = now
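The commit history mentions updating the training scripts to time imports; a minimal sketch of that pattern (assuming the script is launched with `torchrun`, so `RANK` is set in the environment) might look like the following.

```python
import os
os.environ.setdefault("RANK", "0")  # hypothetical fallback so the sketch also runs stand-alone

from cycling_utils import TimestampedTimer

timer = TimestampedTimer("importing cycling_utils")

import torch  # noqa: E402 - imported here deliberately so the import can be timed
timer.report("importing torch")

import torchvision  # noqa: E402
timer.report("importing torchvision")
```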
28 changes: 28 additions & 0 deletions tv-detection/README.md

@@ -0,0 +1,28 @@
# Object detection reference training scripts

This folder contains reference training scripts for object detection.
They serve as a log of how to train specific models and provide baseline
training and evaluation scripts to quickly bootstrap research.

To execute the example commands below, you must install the following packages:

```
cython
pycocotools
matplotlib
```

You must also run `prep.py` to download pretrained model weights before
launching your training job.
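A hypothetical sketch of the kind of prefetch a script like `prep.py` performs (not the actual script in this repository): constructing the torchvision detection models once, for example the ResNet-50 FPN variants, so their pretrained weights land in the local cache before the cluster job starts.

```python
# Hypothetical weight-prefetch sketch, not the actual prep.py from this repository.
from torchvision.models.detection import (
    maskrcnn_resnet50_fpn,
    retinanet_resnet50_fpn,
    MaskRCNN_ResNet50_FPN_Weights,
    RetinaNet_ResNet50_FPN_Weights,
)

# Instantiating with explicit weights downloads them into the local torch hub cache.
maskrcnn_resnet50_fpn(weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT)
retinanet_resnet50_fpn(weights=RetinaNet_ResNet50_FPN_Weights.DEFAULT)
```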

You can then run the training routines for the following models from the CLI.

### RetinaNet
```
isc train ./retinanet_resnet101_fpn.isc
```

### Mask R-CNN
```
isc train ./maskrcnn_resnet101_fpn.isc
```