From c0a51a2a588c94c9369aeae108d0dc582c656091 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Tue, 5 Sep 2023 13:57:00 +0200 Subject: [PATCH 01/23] ADJUST io.py for logging mlflow; Credits: Blirona --- trainer/io.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/trainer/io.py b/trainer/io.py index 6e08aea..ee5d674 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, Union from urllib.parse import urlparse - +import mlflow import fsspec import torch from coqpit import Coqpit @@ -161,6 +161,15 @@ def save_checkpoint( save_func=save_func, **kwargs, ) + from TTS.tts.models.modelWrapper import MyModel + from TTS.utils.synthesizer import Synthesizer + synthesizer = Synthesizer(checkpoint_path, os.path.join(output_folder, 'config.json')) + model = MyModel(synthesizer=synthesizer) + # For MLflow logging, , artifacts={'checkpoint': file_name, "output": output_folder} + + run_name = os.getenv("RUN_NAME", "vits_janika_de") + + mlflow.pyfunc.log_model(python_model=model, artifact_path="models/TTS",registered_model_name=run_name, code_path=[output_folder]) if save_n_checkpoints is not None: keep_n_checkpoints(output_folder, save_n_checkpoints) From fe3737d56be9a96a11cbc9e50e7ad890d9276b88 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 7 Sep 2023 15:21:03 +0200 Subject: [PATCH 02/23] MADE ajustments as commented by PR --- trainer/io.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/trainer/io.py b/trainer/io.py index ee5d674..e6611e2 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -165,15 +165,11 @@ def save_checkpoint( from TTS.utils.synthesizer import Synthesizer synthesizer = Synthesizer(checkpoint_path, os.path.join(output_folder, 'config.json')) model = MyModel(synthesizer=synthesizer) - # For MLflow logging, , artifacts={'checkpoint': file_name, "output": output_folder} - - run_name = os.getenv("RUN_NAME", "vits_janika_de") - mlflow.pyfunc.log_model(python_model=model, artifact_path="models/TTS",registered_model_name=run_name, code_path=[output_folder]) + mlflow.pyfunc.log_model(python_model=model, artifact_path="models/TTS", code_path=[output_folder]) if save_n_checkpoints is not None: keep_n_checkpoints(output_folder, save_n_checkpoints) - def save_best_model( current_loss, best_loss, From f884c8882e556f169addd283d00c419fdfcc51e1 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 7 Sep 2023 16:56:05 +0200 Subject: [PATCH 03/23] ADD rclone utils file; ADJUST trainer, so that intermediate results are stored in lakefs --- trainer/trainer.py | 6 ++++++ trainer/utils/rclone.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 trainer/utils/rclone.py diff --git a/trainer/trainer.py b/trainer/trainer.py index 9c2c9ab..ef085d1 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -54,6 +54,7 @@ rank_zero_logger_info, rank_zero_only, ) +from trainer.utils.rclone import update_local_data logger = logging.getLogger("trainer") @@ -1484,6 +1485,7 @@ def train_epoch(self) -> None: loader_start_time = time.time() # TRAINING EPOCH -> iterate over the training samples batch_num_steps = len(self.train_loader) + intermediate_eval = False for cur_step, batch in enumerate(self.train_loader): outputs, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) if outputs is None: @@ -1494,6 +1496,7 @@ def train_epoch(self) -> None: # RUN EVAL -> run evaluation epoch in the middle of training. Useful for big datasets. if self.config.run_eval_steps is not None and (self.total_steps_done % self.config.run_eval_steps == 0): + intermediate_eval = True self.eval_epoch() if self.num_gpus > 1: self.model.module.train() @@ -1521,6 +1524,8 @@ def train_epoch(self) -> None: self.dashboard_logger.train_epoch_stats(self.total_steps_done, epoch_stats) if self.config.model_param_stats: self.dashboard_logger.model_weights(self.model, self.total_steps_done) + if intermediate_eval: + update_local_data(os.getenv("LAKEFS_REPO"), os.getenv("LAKEFS_BRANCH"), os.getenv("OUTPUT_PATH"), upload=True) torch.cuda.empty_cache() ####################### @@ -1771,6 +1776,7 @@ def _fit(self) -> None: self.save_best_model() self.callbacks.on_epoch_end(self) self.start_with_eval = False + def fit_with_largest_batch_size(self, starting_batch_size=2048) -> None: cuda_meminfo() diff --git a/trainer/utils/rclone.py b/trainer/utils/rclone.py new file mode 100644 index 0000000..21c5ba6 --- /dev/null +++ b/trainer/utils/rclone.py @@ -0,0 +1,16 @@ +import os +import subprocess + +def update_local_data(repo_name:str, branch_name:str, target_folder:str, upload: bool = False): + #TODO do not sync, only copy metadata.csv and wavs folder from lakefs to local + if upload: + command = ["rclone", "sync", target_folder, + f"lakefs:{repo_name}/{branch_name}"] + else: + command = ["rclone", "sync", + f"lakefs:{repo_name}/{branch_name}", target_folder] + print(command) + result = subprocess.run(command, capture_output=True) + print(result.stderr) + print(os.listdir(target_folder)) + assert result.returncode == 0 \ No newline at end of file From 40bb9cac4434e07c6c9d15cc7b31bf0a37be731a Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 8 Sep 2023 00:27:37 +0200 Subject: [PATCH 04/23] ADJUST trainer file --- trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trainer/trainer.py b/trainer/trainer.py index ef085d1..1a24427 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1617,7 +1617,7 @@ def eval_epoch(self) -> None: self.eval_samples, verbose=True, ) - if self.config.run_eval + if self.config.run_eval or isinstance(self.config.run_eval_steps,type(None)) else None ) From 5a85257462e8a0cebd8dba436f14e40511022749 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 8 Sep 2023 09:49:54 +0200 Subject: [PATCH 05/23] ADJUST trainer/io.py to have the imports at the beginning of the file --- trainer/io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/trainer/io.py b/trainer/io.py index e6611e2..f7e2df2 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -12,6 +12,8 @@ from coqpit import Coqpit from trainer.logger import logger +from TTS.tts.models.modelWrapper import MyModel +from TTS.utils.synthesizer import Synthesizer def get_user_data_dir(appname): @@ -161,8 +163,7 @@ def save_checkpoint( save_func=save_func, **kwargs, ) - from TTS.tts.models.modelWrapper import MyModel - from TTS.utils.synthesizer import Synthesizer + synthesizer = Synthesizer(checkpoint_path, os.path.join(output_folder, 'config.json')) model = MyModel(synthesizer=synthesizer) From 9a6e0e1d47fdc2a41ee45961d4bd3d25a7a6b390 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 8 Sep 2023 14:06:56 +0200 Subject: [PATCH 06/23] ADJUST files --- trainer/io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/trainer/io.py b/trainer/io.py index f7e2df2..c5404d0 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -12,8 +12,7 @@ from coqpit import Coqpit from trainer.logger import logger -from TTS.tts.models.modelWrapper import MyModel -from TTS.utils.synthesizer import Synthesizer + def get_user_data_dir(appname): @@ -163,6 +162,8 @@ def save_checkpoint( save_func=save_func, **kwargs, ) + from TTS.tts.models.modelWrapper import MyModel + from TTS.utils.synthesizer import Synthesizer synthesizer = Synthesizer(checkpoint_path, os.path.join(output_folder, 'config.json')) model = MyModel(synthesizer=synthesizer) From 8de4be8eb86c71a208136b7559b74b020f169f2d Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 8 Sep 2023 15:15:46 +0200 Subject: [PATCH 07/23] ADJUST imports again circular import issue --- trainer/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trainer/io.py b/trainer/io.py index c5404d0..8496fea 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -11,8 +11,8 @@ import torch from coqpit import Coqpit -from trainer.logger import logger +from trainer.logger import logger def get_user_data_dir(appname): @@ -164,7 +164,7 @@ def save_checkpoint( ) from TTS.tts.models.modelWrapper import MyModel from TTS.utils.synthesizer import Synthesizer - + synthesizer = Synthesizer(checkpoint_path, os.path.join(output_folder, 'config.json')) model = MyModel(synthesizer=synthesizer) From 098bc7ef8e32928c6f29f8587ecb5a3e8667f412 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Sat, 9 Sep 2023 00:59:12 +0200 Subject: [PATCH 08/23] ADJUST output_path name --- trainer/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/trainer/trainer.py b/trainer/trainer.py index 1a24427..baa68ab 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -414,6 +414,8 @@ def __init__( # pylint: disable=dangerous-default-value # create a new output folder name output_path = get_experiment_folder_path(config.output_path, config.run_name) os.makedirs(output_path, exist_ok=True) + + self.experiment_output_path = output_path # copy training assets to the output folder copy_model_files(config, output_path, new_fields) From bce3f8894a9ec3772a945d202a0d795ecaca0109 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Sun, 10 Sep 2023 20:44:53 +0200 Subject: [PATCH 09/23] DELETE intermediate saving of the results to lakefs <- otherwise, storage blows up --- trainer/trainer.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/trainer/trainer.py b/trainer/trainer.py index baa68ab..1662cdd 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1487,7 +1487,6 @@ def train_epoch(self) -> None: loader_start_time = time.time() # TRAINING EPOCH -> iterate over the training samples batch_num_steps = len(self.train_loader) - intermediate_eval = False for cur_step, batch in enumerate(self.train_loader): outputs, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) if outputs is None: @@ -1498,7 +1497,6 @@ def train_epoch(self) -> None: # RUN EVAL -> run evaluation epoch in the middle of training. Useful for big datasets. if self.config.run_eval_steps is not None and (self.total_steps_done % self.config.run_eval_steps == 0): - intermediate_eval = True self.eval_epoch() if self.num_gpus > 1: self.model.module.train() @@ -1526,8 +1524,6 @@ def train_epoch(self) -> None: self.dashboard_logger.train_epoch_stats(self.total_steps_done, epoch_stats) if self.config.model_param_stats: self.dashboard_logger.model_weights(self.model, self.total_steps_done) - if intermediate_eval: - update_local_data(os.getenv("LAKEFS_REPO"), os.getenv("LAKEFS_BRANCH"), os.getenv("OUTPUT_PATH"), upload=True) torch.cuda.empty_cache() ####################### From 51df431f29fb0d2ce7b8efe4062a87ade8d21a23 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 11 Sep 2023 10:49:18 +0200 Subject: [PATCH 10/23] ADJUST rclone methods and trainer.py, so that intermediate results are stored --- trainer/trainer.py | 6 +++++- trainer/utils/rclone.py | 19 +++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/trainer/trainer.py b/trainer/trainer.py index 1662cdd..508fbd9 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -54,7 +54,7 @@ rank_zero_logger_info, rank_zero_only, ) -from trainer.utils.rclone import update_local_data +from trainer.utils.rclone import sync_data2s3bucket logger = logging.getLogger("trainer") @@ -1487,6 +1487,7 @@ def train_epoch(self) -> None: loader_start_time = time.time() # TRAINING EPOCH -> iterate over the training samples batch_num_steps = len(self.train_loader) + intermediate_eval = False for cur_step, batch in enumerate(self.train_loader): outputs, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time) if outputs is None: @@ -1497,6 +1498,7 @@ def train_epoch(self) -> None: # RUN EVAL -> run evaluation epoch in the middle of training. Useful for big datasets. if self.config.run_eval_steps is not None and (self.total_steps_done % self.config.run_eval_steps == 0): + intermediate_eval = True self.eval_epoch() if self.num_gpus > 1: self.model.module.train() @@ -1524,6 +1526,8 @@ def train_epoch(self) -> None: self.dashboard_logger.train_epoch_stats(self.total_steps_done, epoch_stats) if self.config.model_param_stats: self.dashboard_logger.model_weights(self.model, self.total_steps_done) + if intermediate_eval: + sync_data2s3bucket(os.getenv("VOICE_GENERATION_RESULTS_BUCKET"), os.getenv("OUTPUT_PATH")) torch.cuda.empty_cache() ####################### diff --git a/trainer/utils/rclone.py b/trainer/utils/rclone.py index 21c5ba6..fddece8 100644 --- a/trainer/utils/rclone.py +++ b/trainer/utils/rclone.py @@ -1,16 +1,15 @@ import os import subprocess -def update_local_data(repo_name:str, branch_name:str, target_folder:str, upload: bool = False): - #TODO do not sync, only copy metadata.csv and wavs folder from lakefs to local - if upload: - command = ["rclone", "sync", target_folder, - f"lakefs:{repo_name}/{branch_name}"] - else: - command = ["rclone", "sync", +def get_data_from_lakefs(repo_name:str, branch_name:str, target_folder:str): + command = ["rclone", "sync", f"lakefs:{repo_name}/{branch_name}", target_folder] - print(command) result = subprocess.run(command, capture_output=True) - print(result.stderr) - print(os.listdir(target_folder)) + print("Rclone stderr:", result.stderr) + assert result.returncode == 0 + +def sync_data2s3bucket(bucket_name:str, source_folder:str): + command = ["rclone", "sync", source_folder, f"s3:{bucket_name}"] + result = subprocess.run(command, capture_output=True) + print("Rclone stderr:", result.stderr) assert result.returncode == 0 \ No newline at end of file From ee37597c7c80f9a952c4f71e358bd75c876beb25 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 11 Sep 2023 12:53:16 +0200 Subject: [PATCH 11/23] FIX minor bug --- trainer/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trainer/trainer.py b/trainer/trainer.py index 508fbd9..6343c66 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1619,7 +1619,7 @@ def eval_epoch(self) -> None: self.eval_samples, verbose=True, ) - if self.config.run_eval or isinstance(self.config.run_eval_steps,type(None)) + if self.config.run_eval or not isinstance(self.config.run_eval_steps,type(None)) else None ) From bf410740074e22a7064637a63f09ff40593e75d6 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 11 Sep 2023 13:31:35 +0200 Subject: [PATCH 12/23] ADJUST somethiong :D --- trainer/trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/trainer/trainer.py b/trainer/trainer.py index 6343c66..56738f6 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1622,6 +1622,8 @@ def eval_epoch(self) -> None: if self.config.run_eval or not isinstance(self.config.run_eval_steps,type(None)) else None ) + + print(self.config.run_eval_steps, self.config.run_eval) torch.set_grad_enabled(False) self.model.eval() From 36d5b18566288007025ab3c25fdc4ebb29a6b40b Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 11 Sep 2023 15:16:08 +0200 Subject: [PATCH 13/23] ADJUST somethiong :D --- trainer/trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/trainer/trainer.py b/trainer/trainer.py index 56738f6..8e6d5e3 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1623,7 +1623,8 @@ def eval_epoch(self) -> None: else None ) - print(self.config.run_eval_steps, self.config.run_eval) + print("self.config.run_eval_steps",self.config.run_eval_steps) + print("self.config.run_eval",self.config.run_eval) torch.set_grad_enabled(False) self.model.eval() From 6958fda354c0e5dabb74a35d1ffa1b27327b40df Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 11 Sep 2023 15:44:36 +0200 Subject: [PATCH 14/23] ADJUST stuff --- trainer/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trainer/trainer.py b/trainer/trainer.py index 8e6d5e3..78b6a95 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1768,8 +1768,8 @@ def _fit(self) -> None: self.c_logger.print_epoch_start(epoch, self.config.epochs, self.output_path) if not self.skip_train_epoch and not self.start_with_eval: self.train_epoch() - if self.config.run_eval: - self.eval_epoch() + # if self.config.run_eval: + # self.eval_epoch() if epoch >= self.config.test_delay_epochs and self.args.rank <= 0: self.test_run() From 84999c2c588875d396ff2148c7f1251b64bbbc4e Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Mon, 11 Sep 2023 16:05:37 +0200 Subject: [PATCH 15/23] ADJUST stuff --- trainer/utils/rclone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trainer/utils/rclone.py b/trainer/utils/rclone.py index fddece8..88854b5 100644 --- a/trainer/utils/rclone.py +++ b/trainer/utils/rclone.py @@ -8,8 +8,8 @@ def get_data_from_lakefs(repo_name:str, branch_name:str, target_folder:str): print("Rclone stderr:", result.stderr) assert result.returncode == 0 -def sync_data2s3bucket(bucket_name:str, source_folder:str): - command = ["rclone", "sync", source_folder, f"s3:{bucket_name}"] +def sync_data2s3bucket(bucket_name:str, source_folder:str, name_from_config:str="lakefs"): + command = ["rclone", "sync", source_folder, f"{name_from_config}:{bucket_name}"] result = subprocess.run(command, capture_output=True) print("Rclone stderr:", result.stderr) assert result.returncode == 0 \ No newline at end of file From 4904235fc24ce841efe7394df0e9b6b42b9bfbd5 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Tue, 12 Sep 2023 14:38:39 +0200 Subject: [PATCH 16/23] run_eval does not have any influence on the training anymore. eval is done based on trainingsteps --- trainer/trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/trainer/trainer.py b/trainer/trainer.py index 78b6a95..f3b218e 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1619,8 +1619,8 @@ def eval_epoch(self) -> None: self.eval_samples, verbose=True, ) - if self.config.run_eval or not isinstance(self.config.run_eval_steps,type(None)) - else None + # if self.config.run_eval or not isinstance(self.config.run_eval_steps,type(None)) + # else None ) print("self.config.run_eval_steps",self.config.run_eval_steps) @@ -1763,7 +1763,7 @@ def _fit(self) -> None: dist.barrier() self.callbacks.on_epoch_start(self) self.keep_avg_train = KeepAverage() - self.keep_avg_eval = KeepAverage() if self.config.run_eval else None + self.keep_avg_eval = KeepAverage() #if self.config.run_eval else None self.epochs_done = epoch self.c_logger.print_epoch_start(epoch, self.config.epochs, self.output_path) if not self.skip_train_epoch and not self.start_with_eval: @@ -1775,7 +1775,7 @@ def _fit(self) -> None: self.c_logger.print_epoch_end( epoch, - self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values, + self.keep_avg_eval.avg_values #if self.config.run_eval else self.keep_avg_train.avg_values, ) if self.args.rank in [None, 0]: self.save_best_model() From e1acc82d2dfa32e02c7fd6def7b72b985b17e4d9 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Tue, 12 Sep 2023 15:54:00 +0200 Subject: [PATCH 17/23] delete mlflow logging stuff. this will be done at the end of the training at the end of the training flow --- trainer/io.py | 6 ------ trainer/trainer.py | 5 +++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/trainer/io.py b/trainer/io.py index 8496fea..144fa91 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -162,13 +162,7 @@ def save_checkpoint( save_func=save_func, **kwargs, ) - from TTS.tts.models.modelWrapper import MyModel - from TTS.utils.synthesizer import Synthesizer - synthesizer = Synthesizer(checkpoint_path, os.path.join(output_folder, 'config.json')) - model = MyModel(synthesizer=synthesizer) - - mlflow.pyfunc.log_model(python_model=model, artifact_path="models/TTS", code_path=[output_folder]) if save_n_checkpoints is not None: keep_n_checkpoints(output_folder, save_n_checkpoints) diff --git a/trainer/trainer.py b/trainer/trainer.py index f3b218e..3753458 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1500,6 +1500,7 @@ def train_epoch(self) -> None: if self.config.run_eval_steps is not None and (self.total_steps_done % self.config.run_eval_steps == 0): intermediate_eval = True self.eval_epoch() + self.test_run() if self.num_gpus > 1: self.model.module.train() else: @@ -1770,8 +1771,8 @@ def _fit(self) -> None: self.train_epoch() # if self.config.run_eval: # self.eval_epoch() - if epoch >= self.config.test_delay_epochs and self.args.rank <= 0: - self.test_run() + # if epoch >= self.config.test_delay_epochs and self.args.rank <= 0: + # self.test_run() self.c_logger.print_epoch_end( epoch, From e23a011c8a25a02350ffba2f8d343ea15c8ced4f Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 14 Sep 2023 15:13:11 +0200 Subject: [PATCH 18/23] ADD clone dir from minio --- trainer/utils/rclone.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/trainer/utils/rclone.py b/trainer/utils/rclone.py index 88854b5..a36c235 100644 --- a/trainer/utils/rclone.py +++ b/trainer/utils/rclone.py @@ -12,4 +12,10 @@ def sync_data2s3bucket(bucket_name:str, source_folder:str, name_from_config:str= command = ["rclone", "sync", source_folder, f"{name_from_config}:{bucket_name}"] result = subprocess.run(command, capture_output=True) print("Rclone stderr:", result.stderr) + assert result.returncode == 0 + +def copy_data_from_s3bucket(bucket_name:str, target_folder:str, name_from_config:str="lakefs", dir_oi:str=""): + command = ["rclone", "copy", f"{name_from_config}:{bucket_name}"+"" if dir_oi else f"/{dir_oi}", target_folder] + result = subprocess.run(command, capture_output=True) + print("Rclone stderr:", result.stderr) assert result.returncode == 0 \ No newline at end of file From 8a1a751248c07c26d37445ea9de90037ec48205a Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 14 Sep 2023 15:21:33 +0200 Subject: [PATCH 19/23] SIMPLIFIED copy data from s3bucket function in trainer rclone.py --- trainer/utils/rclone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trainer/utils/rclone.py b/trainer/utils/rclone.py index a36c235..990f4f3 100644 --- a/trainer/utils/rclone.py +++ b/trainer/utils/rclone.py @@ -14,8 +14,8 @@ def sync_data2s3bucket(bucket_name:str, source_folder:str, name_from_config:str= print("Rclone stderr:", result.stderr) assert result.returncode == 0 -def copy_data_from_s3bucket(bucket_name:str, target_folder:str, name_from_config:str="lakefs", dir_oi:str=""): - command = ["rclone", "copy", f"{name_from_config}:{bucket_name}"+"" if dir_oi else f"/{dir_oi}", target_folder] +def copy_data_from_s3bucket(bucket_uri:str, target_folder:str): + command = ["rclone", "copy", bucket_uri, target_folder] result = subprocess.run(command, capture_output=True) print("Rclone stderr:", result.stderr) assert result.returncode == 0 \ No newline at end of file From f1725a865ae14e23ac745143ad60574628751e63 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 14 Sep 2023 15:36:03 +0200 Subject: [PATCH 20/23] ADJUST copy method --- trainer/utils/rclone.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/trainer/utils/rclone.py b/trainer/utils/rclone.py index 990f4f3..202f41b 100644 --- a/trainer/utils/rclone.py +++ b/trainer/utils/rclone.py @@ -15,7 +15,12 @@ def sync_data2s3bucket(bucket_name:str, source_folder:str, name_from_config:str= assert result.returncode == 0 def copy_data_from_s3bucket(bucket_uri:str, target_folder:str): + splitted_bucket_uri = [s for s in bucket_uri.split("/") if s != ""] + if len(splitted_bucket_uri) > 1: + #assumption: only one folder is specified in the bucket_uri + target_folder = os.path.join(target_folder,splitted_bucket_uri[-1]) command = ["rclone", "copy", bucket_uri, target_folder] result = subprocess.run(command, capture_output=True) print("Rclone stderr:", result.stderr) - assert result.returncode == 0 \ No newline at end of file + assert result.returncode == 0 + return target_folder \ No newline at end of file From 6e69c68b7424116a3c6821ffb7822553b7a97ca2 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Thu, 14 Sep 2023 17:45:34 +0200 Subject: [PATCH 21/23] ADJUST to only copy from lakefs ... sync2copy --- trainer/utils/rclone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trainer/utils/rclone.py b/trainer/utils/rclone.py index 202f41b..09fdf57 100644 --- a/trainer/utils/rclone.py +++ b/trainer/utils/rclone.py @@ -2,7 +2,7 @@ import subprocess def get_data_from_lakefs(repo_name:str, branch_name:str, target_folder:str): - command = ["rclone", "sync", + command = ["rclone", "copy", f"lakefs:{repo_name}/{branch_name}", target_folder] result = subprocess.run(command, capture_output=True) print("Rclone stderr:", result.stderr) From a4118fbc19940918b5fe021873d5769348d2c30d Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 20 Sep 2023 16:40:45 +0200 Subject: [PATCH 22/23] FIX potential bug --- trainer/io.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/trainer/io.py b/trainer/io.py index 144fa91..8f2da2c 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -181,6 +181,10 @@ def save_best_model( save_func=None, **kwargs, ): + print("current_loss: ",current_loss) + print("best_loss: ",best_loss) + if isinstance(best_loss,dict): + best_loss=best_loss["eval_loss"] if current_loss < best_loss: best_model_name = f"best_model_{current_step}.pth" checkpoint_path = os.path.join(out_path, best_model_name) From a3d12c8064519954817e278a859e9d11decc7bb4 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 20 Sep 2023 17:44:36 +0200 Subject: [PATCH 23/23] FIX bug loss value, if not float --- trainer/io.py | 2 -- trainer/trainer.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/trainer/io.py b/trainer/io.py index 8f2da2c..8fc9a58 100644 --- a/trainer/io.py +++ b/trainer/io.py @@ -183,8 +183,6 @@ def save_best_model( ): print("current_loss: ",current_loss) print("best_loss: ",best_loss) - if isinstance(best_loss,dict): - best_loss=best_loss["eval_loss"] if current_loss < best_loss: best_model_name = f"best_model_{current_step}.pth" checkpoint_path = os.path.join(out_path, best_model_name) diff --git a/trainer/trainer.py b/trainer/trainer.py index 3753458..34b9f05 100644 --- a/trainer/trainer.py +++ b/trainer/trainer.py @@ -1717,7 +1717,7 @@ def _restore_best_loss(self): logger.info(" > Restoring best loss from %s ...", os.path.basename(self.args.best_path)) ch = load_fsspec(self.args.restore_path, map_location="cpu") if "model_loss" in ch: - self.best_loss = ch["model_loss"] + self.best_loss = ch["model_loss"]["eval_loss"] if isinstance(ch["model_loss"],dict) else ch["model_loss"] logger.info(" > Starting with loaded last best loss %f", self.best_loss) def test(self, model=None, test_samples=None) -> None: