MLFlow Upload Fix #10

Merged · 6 commits · Feb 20, 2024
9 changes: 9 additions & 0 deletions luxonis_train/callbacks/README.md
@@ -9,6 +9,7 @@ List of all supported callbacks.
- [LuxonisProgressBar](#luxonisprogressbar)
- [MetadataLogger](#metadatalogger)
- [TestOnTrainEnd](#testontrainend)
- [UploadCheckpoint](#uploadcheckpoint)

## PytorchLightning Callbacks

@@ -51,3 +52,11 @@ Metadata include all defined hyperparameters together with git hashes of `luxoni
## TestOnTrainEnd

Callback to perform a test run at the end of the training.

## UploadCheckpoint

Callback that uploads the current best checkpoint (based on validation loss) to the specified cloud directory after every validation epoch.

| Key | Type | Default value | Description |
| ---------------- | ---- | ------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| upload_directory | str | / | Path to the cloud directory where checkpoints should be uploaded. To upload to the current MLFlow run, set it to `mlflow://`. |
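
For illustration, a minimal sketch of wiring the callback up directly in Python. In practice `luxonis_train` builds callbacks from the training config; the bucket path and monitor key below are made up:

```python
import lightning.pytorch as pl

from luxonis_train.callbacks import UploadCheckpoint

# Hypothetical cloud path; use "mlflow://" to target the active MLFlow run instead.
upload_cb = UploadCheckpoint(upload_directory="gs://my-bucket/checkpoints")

trainer = pl.Trainer(
    max_epochs=10,
    callbacks=[
        # The callback assumes the *first* ModelCheckpoint tracks validation loss,
        # so keep it ahead of any other ModelCheckpoint callbacks.
        pl.callbacks.ModelCheckpoint(monitor="val/loss"),  # monitor key is illustrative
        upload_cb,
    ],
)
# trainer.fit(model)  # `model` is any LightningModule; omitted here
```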
4 changes: 2 additions & 2 deletions luxonis_train/callbacks/__init__.py
@@ -13,7 +13,7 @@
from .metadata_logger import MetadataLogger
from .module_freezer import ModuleFreezer
from .test_on_train_end import TestOnTrainEnd
from .upload_checkpoint_on_train_end import UploadCheckpointOnTrainEnd
from .upload_checkpoint import UploadCheckpoint

CALLBACKS.register_module(module=EarlyStopping)
CALLBACKS.register_module(module=LearningRateMonitor)
@@ -28,5 +28,5 @@
"MetadataLogger",
"ModuleFreezer",
"TestOnTrainEnd",
"UploadCheckpointOnTrainEnd",
"UploadCheckpoint",
]
4 changes: 2 additions & 2 deletions luxonis_train/callbacks/export_on_train_end.py
Expand Up @@ -51,8 +51,8 @@ def on_train_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> No
if self.upload_to_mlflow:
if cfg.tracker.is_mlflow:
tracker = cast(LuxonisTrackerPL, trainer.logger)
new_upload_directory = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.exporter.upload_directory = new_upload_directory
new_upload_url = f"mlflow://{tracker.project_id}/{tracker.run_id}"
cfg.exporter.upload_url = new_upload_url
else:
logging.getLogger(__name__).warning(
"`upload_to_mlflow` is set to True, "
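
For context, a rough sketch of how the renamed `upload_url` might be consumed downstream, reusing the `LuxonisFileSystem` call pattern from the new callback below. The project/run IDs and file name are placeholders, not part of this PR:

```python
from luxonis_ml.utils.filesystem import LuxonisFileSystem


def build_mlflow_upload_url(project_id: str, run_id: str) -> str:
    # Mirrors the f-string in on_train_end: artifacts land in the active MLFlow run.
    return f"mlflow://{project_id}/{run_id}"


# Placeholder IDs; in the exporter these come from the LuxonisTrackerPL logger.
fs = LuxonisFileSystem(
    build_mlflow_upload_url("project-id", "run-id"),
    allow_active_mlflow_run=True,
    allow_local=False,
)
# fs.put_file(local_path="model.onnx", remote_path="model.onnx")
```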
61 changes: 61 additions & 0 deletions luxonis_train/callbacks/upload_checkpoint.py
@@ -0,0 +1,61 @@
import logging
import os
from typing import Any

import lightning.pytorch as pl
import torch
from luxonis_ml.utils.filesystem import LuxonisFileSystem

from luxonis_train.utils.registry import CALLBACKS


@CALLBACKS.register_module()
class UploadCheckpoint(pl.Callback):
    """Callback that uploads best checkpoint based on the validation loss."""

    def __init__(self, upload_directory: str):
        """Constructs `UploadCheckpoint`.

        @type upload_directory: str
        @param upload_directory: Path used as upload directory
        """
        super().__init__()
        self.fs = LuxonisFileSystem(
            upload_directory, allow_active_mlflow_run=True, allow_local=False
        )
        self.logger = logging.getLogger(__name__)
        self.last_logged_epoch = None
        self.last_best_checkpoint = None

    def on_save_checkpoint(
        self,
        trainer: pl.Trainer,
        pl_module: pl.LightningModule,
        checkpoint: dict[str, Any],
    ) -> None:
        # Log only once per epoch in case there are multiple ModelCheckpoint callbacks
        if not self.last_logged_epoch == trainer.current_epoch:
            model_checkpoint_callbacks = [
                c
                for c in trainer.callbacks  # type: ignore
                if isinstance(c, pl.callbacks.ModelCheckpoint)  # type: ignore
            ]
            # NOTE: assume that first checkpoint callback is based on val loss
            curr_best_checkpoint = model_checkpoint_callbacks[0].best_model_path

            if self.last_best_checkpoint != curr_best_checkpoint:
                self.logger.info(f"Started checkpoint upload to {self.fs.full_path}...")
                temp_filename = "curr_best_val_loss.ckpt"
                torch.save(checkpoint, temp_filename)
                self.fs.put_file(
                    local_path=temp_filename,
                    remote_path=temp_filename,
                    mlflow_instance=trainer.logger.experiment.get(  # type: ignore
                        "mlflow", None
                    ),
                )
                os.remove(temp_filename)
                self.logger.info("Checkpoint upload finished")
                self.last_best_checkpoint = curr_best_checkpoint

            self.last_logged_epoch = trainer.current_epoch
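
A toy, standalone illustration of the guard logic above: the upload fires at most once per epoch, and only when the best checkpoint path actually changes. The checkpoint paths are made up:

```python
# Standalone sketch of the de-duplication guard; it does not use the class itself.
last_logged_epoch = None
last_best_checkpoint = None


def would_upload(current_epoch: int, best_checkpoint_path: str) -> bool:
    """Return True if UploadCheckpoint's logic would trigger an upload."""
    global last_logged_epoch, last_best_checkpoint
    if last_logged_epoch == current_epoch:
        return False  # already handled this epoch (multiple ModelCheckpoint callbacks)
    last_logged_epoch = current_epoch
    if last_best_checkpoint == best_checkpoint_path:
        return False  # best checkpoint unchanged, nothing new to upload
    last_best_checkpoint = best_checkpoint_path
    return True


assert would_upload(0, "epoch=0.ckpt")      # new best -> upload
assert not would_upload(0, "epoch=0.ckpt")  # same epoch, second callback -> skip
assert not would_upload(1, "epoch=0.ckpt")  # new epoch, best unchanged -> skip
assert would_upload(2, "epoch=2.ckpt")      # new best in a later epoch -> upload
```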
41 changes: 0 additions & 41 deletions luxonis_train/callbacks/upload_checkpoint_on_train_end.py

This file was deleted.

6 changes: 4 additions & 2 deletions luxonis_train/core/exporter.py
@@ -200,7 +200,7 @@ def _upload(self, files_to_upload: list[str]):
remote_path=self.cfg.exporter.export_model_name + suffix,
)

with tempfile.TemporaryFile() as f:
with tempfile.NamedTemporaryFile(prefix="config", suffix=".yaml") as f:
self.cfg.save_data(f.name)
fs.put_file(local_path=f.name, remote_path="config.yaml")

@@ -209,7 +209,9 @@
)
modelconverter_config = self._get_modelconverter_config(onnx_path)

with tempfile.TemporaryFile() as f:
with tempfile.NamedTemporaryFile(
prefix="config_export", suffix=".yaml", mode="w+"
) as f:
yaml.dump(modelconverter_config, f, default_flow_style=False)
fs.put_file(local_path=f.name, remote_path="config_export.yaml")
