From 123757cd6a78a1ae6efd2bf48f1b460b625db925 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 20 Dec 2023 17:06:26 +0100 Subject: [PATCH 01/28] Update environment.yml with newer torch version and less straining requirements --- environment.yml | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/environment.yml b/environment.yml index 3c2050ee..f7598ce6 100644 --- a/environment.yml +++ b/environment.yml @@ -1,17 +1,14 @@ # Simple install with # mamba env create -f environment.yml -name: myria3d +name: myria3d_upgrade channels: - conda-forge - anaconda dependencies: - python==3.9.* - pip - # cudatoolkit to specify the cuda driver in the conda env - - conda-forge::cudatoolkit=11.3.1 # single equal sign there, not a typo - - numba==0.55.1 # --------- data formats --------- # - - numpy==1.20 + # - numpy - h5py # --------- geo --------- # - pygeos @@ -38,25 +35,25 @@ dependencies: - pip: # --------- Deep Learning --------- # # Extra index may need to be on first line - - --extra-index-url https://download.pytorch.org/whl/cu113 - - torch==1.11.* + - --extra-index-url https://download.pytorch.org/whl/cu118 + - torch==2.1.* - torchvision - - pytorch-lightning==1.5.9 - - torchmetrics==0.7.* # Else, pytorch-lightning will install the latest - - comet_ml==3.31.* + - pytorch-lightning + - torchmetrics + - comet_ml==3.31.* # VErsion to update ! + - torch_geometric # Wheels for torch-geometric optionnal dependencies - - https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_cluster-1.6.0-cp39-cp39-linux_x86_64.whl - - https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_scatter-2.0.9-cp39-cp39-linux_x86_64.whl - - https://data.pyg.org/whl/torch-1.11.0%2Bcu113/torch_sparse-0.6.14-cp39-cp39-linux_x86_64.whl - - git+https://github.com/pyg-team/pytorch_geometric.git@2.1.0 + - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_cluster-1.6.3%2Bpt21cu118-cp39-cp39-linux_x86_64.whl + - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_scatter-2.1.2%2Bpt21cu118-cp39-cp39-linux_x86_64.whl + - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_sparse-0.6.18%2Bpt21cu118-cp39-cp39-linux_x86_64.whl # Nota: if libcusparse.so.11. errors occur, run # export LD_LIBRARY_PATH="/home/${USER}/miniconda/envs/lib:$LD_LIBRARY_PATH" # ou # export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH" # see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625 # --------- Visualization --------- # - - pandas==1.4.* - - matplotlib==3.5.* + - pandas + - matplotlib # --------- hydra configs --------- # - hydra-core==1.1.* - hydra-colorlog==1.1.* From 99257b5074eae2998d9fb1498b7e12e10b8cd393 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 20 Dec 2023 17:40:25 +0100 Subject: [PATCH 02/28] Fix all incorrect imports and invalid Trainer flag --- configs/experiment/DebugFineTune.yaml | 1 - .../RandLaNet_base_run_FR-MultiGPU.yaml | 4 +- configs/predict/default.yaml | 1 - configs/task/default.yaml | 3 +- configs/trainer/all_params.yaml | 50 ------------------- configs/trainer/default.yaml | 12 ++--- docs/source/apidoc/default_config.yml | 7 ++- docs/source/guides/train_new_model.md | 2 +- myria3d/callbacks/comet_callbacks.py | 4 +- myria3d/callbacks/logging_callbacks.py | 17 +++---- myria3d/train.py | 6 +-- myria3d/utils/utils.py | 2 +- tests/myria3d/test_train_and_predict.py | 21 +++++--- ...0_Myria3DV3.1.0_predict_config_V3.5.0.yaml | 9 +--- 14 files changed, 41 insertions(+), 98 deletions(-) delete mode 100755 configs/trainer/all_params.yaml diff --git a/configs/experiment/DebugFineTune.yaml b/configs/experiment/DebugFineTune.yaml index 1637344c..d90bc832 100644 --- a/configs/experiment/DebugFineTune.yaml +++ b/configs/experiment/DebugFineTune.yaml @@ -18,7 +18,6 @@ trainer: limit_test_batches: 1 max_epochs: 1 num_sanity_val_steps: 0 - # gpus: [1] callbacks: finetune: diff --git a/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml b/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml index 5a9e8727..cd8123f5 100755 --- a/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml +++ b/configs/experiment/RandLaNet_base_run_FR-MultiGPU.yaml @@ -8,7 +8,5 @@ logger: trainer: strategy: ddp_find_unused_parameters_false - # Replace by gpu to simulate multi-gpus training. accelerator: gpu - num_processes: 2 - gpus: 2 + devices: 2 diff --git a/configs/predict/default.yaml b/configs/predict/default.yaml index 9a4ca973..ed52b284 100644 --- a/configs/predict/default.yaml +++ b/configs/predict/default.yaml @@ -1,7 +1,6 @@ src_las: "/path/to/input.las" # Any glob pattern can be used to predict on multiple files. output_dir: "/path/to/output_dir/" # Predictions are saved in a new file which shares src_las basename. ckpt_path: "/path/to/lightning_model.ckpt" # Checkpoint of trained model. -gpus: 0 # 0 for none, 1 for one, [gpu_id] to specify which gpu to use e.g [1] # Probas interpolation parameters # subtile_overlap=25 to use a sliding window of inference of which predictions will be merged. diff --git a/configs/task/default.yaml b/configs/task/default.yaml index 45716d12..ef0d0e10 100644 --- a/configs/task/default.yaml +++ b/configs/task/default.yaml @@ -1,2 +1,3 @@ # Task at hand. Can be train or predict -task_name: fit # "fit" or "test" or "fit+test", or "predict", or "finetune" \ No newline at end of file +task_name: fit # "fit" or "test" or "fit+test", or "predict", or "finetune" +auto_lr_find: false # override with true to run the LR-range test in train.py. diff --git a/configs/trainer/all_params.yaml b/configs/trainer/all_params.yaml deleted file mode 100755 index 067c2953..00000000 --- a/configs/trainer/all_params.yaml +++ /dev/null @@ -1,50 +0,0 @@ -_target_: pytorch_lightning.Trainer - -# default values for all trainer parameters -checkpoint_callback: True -default_root_dir: null -gradient_clip_val: 0.0 -process_position: 0 -num_nodes: 1 -num_processes: 1 -gpus: null -auto_select_gpus: False -tpu_cores: null -log_gpu_memory: null -progress_bar_refresh_rate: 1 -overfit_batches: 0.0 -track_grad_norm: -1 -check_val_every_n_epoch: 1 -fast_dev_run: False -accumulate_grad_batches: 1 -max_epochs: 1 -min_epochs: 1 -max_steps: null -min_steps: null -limit_train_batches: 1.0 -limit_val_batches: 1.0 -limit_test_batches: 1.0 -val_check_interval: 1.0 -flush_logs_every_n_steps: 100 -log_every_n_steps: 50 -accelerator: null -sync_batchnorm: False -precision: 32 -weights_summary: "top" -weights_save_path: null -num_sanity_val_steps: 2 -truncated_bptt_steps: null -resume_from_checkpoint: null -profiler: null -benchmark: False -deterministic: False -reload_dataloaders_every_epoch: False -auto_lr_find: False -replace_sampler_ddp: True -terminate_on_nan: False -auto_scale_batch_size: False -prepare_data_per_node: True -plugins: null -amp_backend: "native" -amp_level: "O2" -move_metrics_to_cpu: False diff --git a/configs/trainer/default.yaml b/configs/trainer/default.yaml index b7968c1e..f8019e86 100755 --- a/configs/trainer/default.yaml +++ b/configs/trainer/default.yaml @@ -1,14 +1,10 @@ _target_: pytorch_lightning.Trainer -# set `1` to train on GPU, `0` to train on CPU only -gpus: 0 - min_epochs: 1 max_epochs: 1300 log_every_n_steps: 1 -weights_summary: null -progress_bar_refresh_rate: 1 - -auto_lr_find: false # override with true to run the LR-range test in train.py. - +# set to gpu for gpu training (if devices > 1, set ddp_find_unused_parameters_false: true) +accelerator: cpu +devices: 1 +num_nodes: 1 diff --git a/docs/source/apidoc/default_config.yml b/docs/source/apidoc/default_config.yml index ffef6ec8..eb6d004a 100644 --- a/docs/source/apidoc/default_config.yml +++ b/docs/source/apidoc/default_config.yml @@ -5,13 +5,11 @@ print_config: true ignore_warnings: true trainer: _target_: pytorch_lightning.Trainer - gpus: 0 + accelerator: cpu + devices: 1 min_epochs: 1 max_epochs: 1 log_every_n_steps: 1 - weights_summary: null - progress_bar_refresh_rate: 1 - auto_lr_find: false limit_train_batches: 1 limit_val_batches: 1 limit_test_batches: 1 @@ -253,6 +251,7 @@ logger: disabled: true task: task_name: fit + auto_lr_find: false predict: src_las: /path/to/input.las output_dir: /path/to/output_dir/ diff --git a/docs/source/guides/train_new_model.md b/docs/source/guides/train_new_model.md index 64bcead0..ccc22fc5 100644 --- a/docs/source/guides/train_new_model.md +++ b/docs/source/guides/train_new_model.md @@ -36,7 +36,7 @@ After training, you model best checkpoints and hydra config will be saved in a ` ### Optimized learning rate Pytorch Lightning support au [automated learning rate finder](https://pytorch-lightning.readthedocs.io/en/stable/common/trainer.html#auto-lr-find), by means of an Learning Rate-range test (see section 3.3 in [this paper](https://arxiv.org/pdf/1506.01186.pdf) for reference). -You can perfom this automatically before training by setting `trainer.auto_lr_find=true` when calling training on your dataset. The best learning rate will be logged and results saved as an image, so that you do not need to perform this test more than once. +You can perfom this automatically before training by setting `task.auto_lr_find=true` when calling training on your dataset. The best learning rate will be logged and results saved as an image, so that you do not need to perform this test more than once. ### Multi-GPUs diff --git a/myria3d/callbacks/comet_callbacks.py b/myria3d/callbacks/comet_callbacks.py index c1309ec6..a16f9a04 100755 --- a/myria3d/callbacks/comet_callbacks.py +++ b/myria3d/callbacks/comet_callbacks.py @@ -12,7 +12,7 @@ from typing import Optional from pytorch_lightning import Callback, Trainer -from pytorch_lightning.loggers import CometLogger, LoggerCollection +from pytorch_lightning.loggers import CometLogger from pytorch_lightning.utilities import rank_zero_only from myria3d.utils import utils @@ -27,7 +27,7 @@ def get_comet_logger(trainer: Trainer) -> Optional[CometLogger]: if isinstance(trainer.logger, CometLogger): return trainer.logger - if isinstance(trainer.logger, LoggerCollection): + if isinstance(trainer.logger, list): for logger in trainer.logger: if isinstance(logger, CometLogger): return logger diff --git a/myria3d/callbacks/logging_callbacks.py b/myria3d/callbacks/logging_callbacks.py index aa983a54..0d2329b0 100644 --- a/myria3d/callbacks/logging_callbacks.py +++ b/myria3d/callbacks/logging_callbacks.py @@ -5,7 +5,7 @@ from pytorch_lightning import Callback from pytorch_lightning.utilities.types import STEP_OUTPUT from torchmetrics import JaccardIndex -from torchmetrics.functional.classification.jaccard import _jaccard_from_confmat +from torchmetrics.functional.classification.jaccard import _jaccard_index_reduce from myria3d.utils import utils @@ -141,14 +141,13 @@ def __init__( ) def compute(self): - """Computes intersection over union (JaccardIndex)""" - - iou_no_reduction = _jaccard_from_confmat( - self.confmat, - self.num_classes, - self.ignore_index, - self.absent_score, - self.reduction, + """Computes intersection over union (JaccardIndex). + + Default behavior changed in torchmetrics and an absent class will yield a score of 0 instead of 1 + """ + + iou_no_reduction = _jaccard_index_reduce( + confmat=self.confmat, average=self.reduction, ignore_index=self.ignore_index ) class_of_interest_iou = iou_no_reduction[self.class_of_interest_idx] return class_of_interest_iou diff --git a/myria3d/train.py b/myria3d/train.py index e364990e..1bac4dae 100755 --- a/myria3d/train.py +++ b/myria3d/train.py @@ -19,7 +19,7 @@ Trainer, seed_everything, ) -from pytorch_lightning.loggers import LightningLoggerBase +from pytorch_lightning.loggers.logger import Logger from myria3d.models.model import Model from myria3d.utils import utils @@ -87,7 +87,7 @@ def train(config: DictConfig) -> Trainer: callbacks.append(hydra.utils.instantiate(cb_conf)) # Init lightning loggers - logger: List[LightningLoggerBase] = [] + logger: List[Logger] = [] if "logger" in config: for lg_conf in config.logger.values(): if "_target_" in lg_conf: @@ -111,7 +111,7 @@ def train(config: DictConfig) -> Trainer: task_name = config.task.get("task_name") if task_name == TASK_NAMES.FIT.value: - if config.trainer.auto_lr_find: + if config.task.auto_lr_find: log.info("Finding best lr with auto_lr_find!") # Run learn ing rate finder lr_finder = trainer.tuner.lr_find( diff --git a/myria3d/utils/utils.py b/myria3d/utils/utils.py index 141aa5d4..0364fcc0 100755 --- a/myria3d/utils/utils.py +++ b/myria3d/utils/utils.py @@ -113,7 +113,7 @@ def log_hyperparameters( datamodule: pl.LightningDataModule, trainer: pl.Trainer, callbacks: List[pl.Callback], - logger: List[pl.loggers.LightningLoggerBase], + logger: List[pl.logging.Logger], ) -> None: """This method controls which parameters from Hydra config are saved by Lightning loggers. diff --git a/tests/myria3d/test_train_and_predict.py b/tests/myria3d/test_train_and_predict.py index c0d1a7b0..3f4d6d87 100644 --- a/tests/myria3d/test_train_and_predict.py +++ b/tests/myria3d/test_train_and_predict.py @@ -63,7 +63,12 @@ def test_FrenchLidar_RandLaNetDebug_with_gpu(toy_dataset_hdf5_path, tmpdir_facto # Attention to concurrency with other processes using the GPU when running tests. gpu_id = 0 cfg_one_epoch = make_default_hydra_cfg( - overrides=["experiment=RandLaNetDebug", f"trainer.gpus=[{gpu_id}]"] + tmp_paths_overrides + overrides=[ + "experiment=RandLaNetDebug", + "trainer.accelerator=gpu", + f"trainer.devices=[{gpu_id}]", + ] + + tmp_paths_overrides ) train(cfg_one_epoch) @@ -110,7 +115,10 @@ def test_command_without_epsg(one_epoch_trained_RandLaNet_checkpoint, tmpdir): "+predict.interpolator.probas_to_save=[building,unclassified]", "task.task_name=predict", ] - assert "No EPSG provided, neither in the lidar file or as parameter" in run_hydra_decorated_command_with_return_error(command) + assert ( + "No EPSG provided, neither in the lidar file or as parameter" + in run_hydra_decorated_command_with_return_error(command) + ) def test_predict_on_single_point_cloud(one_epoch_trained_RandLaNet_checkpoint, tmpdir): @@ -177,10 +185,7 @@ def test_run_test_with_trained_model_on_toy_dataset_on_cpu( one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir ): _run_test_right_after_training( - one_epoch_trained_RandLaNet_checkpoint, - toy_dataset_hdf5_path, - tmpdir, - "null", + one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, 0 ) @@ -217,11 +222,13 @@ def _run_test_right_after_training( tmp_paths_overrides = _make_list_of_necesary_hydra_overrides_with_tmp_paths( toy_dataset_hdf5_path, tmpdir ) + accelerator = "cpu" if trainer_gpus == 0 else "gpu" cfg_test_using_trained_model = make_default_hydra_cfg( overrides=[ "experiment=test", # sets task.task_name to "test" f"model.ckpt_path={one_epoch_trained_RandLaNet_checkpoint}", - f"trainer.gpus={trainer_gpus}", + f"trainer.devices={trainer_gpus}", + f"trainer.accelerator={accelerator}", ] + tmp_paths_overrides ) diff --git a/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml b/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml index bfbc1e9a..a9970595 100644 --- a/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml +++ b/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml @@ -5,20 +5,14 @@ print_config: true ignore_warnings: true trainer: _target_: pytorch_lightning.Trainer - gpus: - - 0 - - 1 min_epochs: 100 max_epochs: 150 log_every_n_steps: 1 - weights_summary: null - progress_bar_refresh_rate: 1 - auto_lr_find: false num_sanity_val_steps: 2 accumulate_grad_batches: 3 strategy: ddp accelerator: gpu - num_processes: 2 + devices: 2 datamodule: transforms: preparations: @@ -264,6 +258,7 @@ logger: disabled: false task: task_name: predict + auto_lr_find: false predict: src_las: /path/to/input.las output_dir: /path/to/output_dir/ From 9b84ca0af9ba3e83a342a3c8a0f9237928395696 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 10:43:46 +0100 Subject: [PATCH 03/28] Fix logging --- configs/callbacks/default.yaml | 6 - configs/model/default.yaml | 9 +- myria3d/callbacks/logging_callbacks.py | 153 ------------------------- myria3d/models/model.py | 64 ++++++++--- myria3d/pctl/datamodule/hdf5.py | 4 +- 5 files changed, 50 insertions(+), 186 deletions(-) delete mode 100644 myria3d/callbacks/logging_callbacks.py diff --git a/configs/callbacks/default.yaml b/configs/callbacks/default.yaml index 1f221750..dcab6e52 100755 --- a/configs/callbacks/default.yaml +++ b/configs/callbacks/default.yaml @@ -12,12 +12,6 @@ lr_monitor: logging_interval: "step" log_momentum: true -# This logs IoU at validation and test time -# Predictions are aggregated and saved at test time in a way coherent with prediction logic. -log_iou_by_class: - _target_: myria3d.callbacks.logging_callbacks.LogIoUByClass - classification_dict: ${dataset_description.classification_dict} - model_checkpoint: _target_: pytorch_lightning.callbacks.ModelCheckpoint monitor: "val/loss_epoch" # name of the logged metric which determines when model is improving diff --git a/configs/model/default.yaml b/configs/model/default.yaml index d6d418d1..2223d6fd 100644 --- a/configs/model/default.yaml +++ b/configs/model/default.yaml @@ -3,6 +3,7 @@ _target_: myria3d.models.model.Model ## Inputs and outputs d_in: ${dataset_description.d_in} # XYZ (3) + Other features (N) num_classes: ${dataset_description.num_classes} +classification_dict: ${dataset_description.classification_dict} # Architecture defined in sub-configs ckpt_path: null # str, for resuming training and finetuning. @@ -13,14 +14,6 @@ neural_net_hparams: ??? interpolation_k: ${predict.interpolator.interpolation_k} # interpolation at eval time num_workers: 4 # for knn_interpolate -## Evaluation metric - partial for triple (train/val/test) init -iou: - _target_: functools.partial - _args_: - - "${get_method:torchmetrics.JaccardIndex}" - - ${model.num_classes} - absent_score: 1.0 # do not penalize if a class is absent from labels. - ## Optimization momentum: 0.9 # arbitrary monitor: "val/loss_epoch" diff --git a/myria3d/callbacks/logging_callbacks.py b/myria3d/callbacks/logging_callbacks.py deleted file mode 100644 index 0d2329b0..00000000 --- a/myria3d/callbacks/logging_callbacks.py +++ /dev/null @@ -1,153 +0,0 @@ -from typing import Any, Dict, Optional - -import pytorch_lightning as pl -import torch -from pytorch_lightning import Callback -from pytorch_lightning.utilities.types import STEP_OUTPUT -from torchmetrics import JaccardIndex -from torchmetrics.functional.classification.jaccard import _jaccard_index_reduce - -from myria3d.utils import utils - -log = utils.get_logger(__name__) - - -# Training was not lenghtend so we keep "as-is" for now, but this -# is not optimal at the moment, and a single class JaccardIndex by phase could -# be used # with specific class of interest specified before each logging. - - -class LogIoUByClass(Callback): - """ - A Callback to log JaccardIndex for each class. - """ - - def __init__(self, classification_dict: Dict[int, str]): - self.classification_names = classification_dict.values() - self.num_classes = len(classification_dict) - self.metric = SingleClassIoU - - def get_all_iou_by_class_object(self): - """Get a dict with schema {class_name:iou_for_class_name, ...}""" - iou_dict = { - name: self.metric(self.num_classes, idx) - for idx, name in enumerate(self.classification_names) - } - return iou_dict - - def on_fit_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule): - """Setup IoU torchmetrics objects for train and val phases.""" - self.train_iou_by_class_dict = self.get_all_iou_by_class_object() - self.val_iou_by_class_dict = self.get_all_iou_by_class_object() - - def on_test_start(self, trainer: pl.Trainer, pl_module: pl.LightningModule): - """Setup IoU torchmetrics objects for test phase.""" - self.test_iou_by_class_dict = self.get_all_iou_by_class_object() - - def on_init_end(self, trainer: pl.Trainer) -> None: - """Setup logging functionnalities.""" - self.experiment = trainer.logger.experiment[0] - - def on_train_batch_end( - self, - trainer: pl.Trainer, - pl_module: pl.LightningModule, - outputs: Optional[STEP_OUTPUT], - batch: Any, - batch_idx: int, - dataloader_idx: int, - ): - """Log IoU for each class.""" - logits = outputs["logits"] - targets = outputs["targets"] - self.log_iou(logits, targets, "train", self.train_iou_by_class_dict) - - def on_validation_batch_end( - self, - trainer: pl.Trainer, - pl_module: pl.LightningModule, - outputs: Optional[STEP_OUTPUT], - batch: Any, - batch_idx: int, - dataloader_idx: int, - ): - """Log IoU for each class.""" - logits = outputs["logits"] - targets = outputs["targets"] - self.log_iou(logits, targets, "val", self.val_iou_by_class_dict) - - def on_test_batch_end( - self, - trainer: pl.Trainer, - pl_module: pl.LightningModule, - outputs: Optional[STEP_OUTPUT], - batch: Any, - batch_idx: int, - dataloader_idx: int, - ): - """Log IoU for each class.""" - logits = outputs["logits"] - targets = outputs["targets"] - self.log_iou(logits, targets, "test", self.test_iou_by_class_dict) - - def log_iou(self, logits, targets, phase: str, iou_dict): - device = logits.device - preds = torch.argmax(logits, dim=1) - for class_name, class_iou in iou_dict.items(): - class_iou = class_iou.to(device) - class_iou(preds, targets) - metric_name = f"{phase}/iou_CLASS_{class_name}" - self.log( - metric_name, - class_iou, - on_step=False, - on_epoch=True, - metric_attribute=metric_name, - ) - - -class SingleClassIoU(JaccardIndex): - """ - Custom JaccardIndex metrics to log single class JaccardIndex using PytorchLighting log system. - This enables good computation of epoch-level JaccardIndex. - i.e. use the full confusion matrix instead of averaging many step-level JaccardIndex. - Default parameters of JaccardIndex are used except for absent_score set to 1.0 and none reduction. - - """ - - def __init__( - self, - num_classes: int, - class_of_interest_idx: int, - ignore_index: Optional[int] = None, - absent_score: float = 1.0, - threshold: float = 0.5, - reduction: str = "none", - compute_on_step: bool = True, - dist_sync_on_step: bool = False, - process_group: Optional[Any] = None, - ) -> None: - self.class_of_interest_idx = class_of_interest_idx - - super().__init__( - num_classes, - ignore_index, - absent_score, - threshold, - reduction, - compute_on_step, - dist_sync_on_step, - process_group, - ) - - def compute(self): - """Computes intersection over union (JaccardIndex). - - Default behavior changed in torchmetrics and an absent class will yield a score of 0 instead of 1 - """ - - iou_no_reduction = _jaccard_index_reduce( - confmat=self.confmat, average=self.reduction, ignore_index=self.ignore_index - ) - class_of_interest_iou = iou_no_reduction[self.class_of_interest_idx] - return class_of_interest_iou diff --git a/myria3d/models/model.py b/myria3d/models/model.py index 0f837f19..d37d0f13 100755 --- a/myria3d/models/model.py +++ b/myria3d/models/model.py @@ -8,6 +8,7 @@ from myria3d.models.modules.pyg_randla_net import PyGRandLANet from myria3d.utils import utils +from torchmetrics.classification import MulticlassJaccardIndex log = utils.get_logger(__name__) @@ -71,10 +72,10 @@ def __init__(self, **kwargs): def setup(self, stage: Optional[str]) -> None: """Setup stage: prepare to compute IoU and loss.""" if stage == "fit": - self.train_iou = self.hparams.iou() - self.val_iou = self.hparams.iou() + self.train_iou = MulticlassJaccardIndex(self.hparams.num_classes) + self.val_iou = MulticlassJaccardIndex(self.hparams.num_classes) if stage == "test": - self.test_iou = self.hparams.iou() + self.test_iou = MulticlassJaccardIndex(self.hparams.num_classes) def forward(self, batch: Batch) -> torch.Tensor: """Forward pass of neural network. @@ -116,7 +117,7 @@ def forward(self, batch: Batch) -> torch.Tensor: def on_fit_start(self) -> None: """On fit start: get the experiment for easier access.""" - self.experiment = self.logger.experiment[0] + # self.experiment = self.logger.experiment self.criterion = self.criterion.to(self.device) def training_step(self, batch: Batch, batch_idx: int) -> dict: @@ -140,15 +141,13 @@ def training_step(self, batch: Batch, batch_idx: int) -> dict: with torch.no_grad(): preds = torch.argmax(logits.detach(), dim=1) self.train_iou(preds, targets) - self.log( - "train/iou", - self.train_iou, - on_step=True, - on_epoch=True, - prog_bar=True, - ) + self.log("train/iou", self.train_iou, on_step=True, on_epoch=True, prog_bar=True) return {"loss": loss, "logits": logits, "targets": targets} + def on_train_epoch_end(self) -> None: + self.train_iou.compute() + self.log_all_ious(self.train_iou.confmat, "train") + def validation_step(self, batch: Batch, batch_idx: int) -> dict: """Validation step. @@ -182,6 +181,7 @@ def on_validation_epoch_end(self) -> None: """ self.val_iou.compute() + self.log_all_ious(self.val_iou.confmat, "val") def test_step(self, batch: Batch, batch_idx: int): """Test step. @@ -202,16 +202,20 @@ def test_step(self, batch: Batch, batch_idx: int): preds = torch.argmax(logits, dim=1) self.test_iou = self.test_iou.to(preds.device) self.test_iou(preds, targets) - self.log( - "test/iou", - self.test_iou, - on_step=False, - on_epoch=True, - prog_bar=True, - ) + self.log("test/iou", self.test_iou, on_step=False, on_epoch=True, prog_bar=True) return {"loss": loss, "logits": logits, "targets": targets} + def on_test_epoch_end(self) -> None: + """At the end of a validation epoch, compute the IoU. + + Args: + outputs : output of test + + """ + self.test_iou.compute() + self.log_all_ious(self.test_iou.confmat, "test") + def predict_step(self, batch: Batch) -> dict: """Prediction step. @@ -254,3 +258,27 @@ def _get_batch_tensor_by_enumeration(self, pos_x: torch.Tensor) -> torch.Tensor: from shape B,N,... to shape (N,...). """ return torch.cat([torch.full((len(sample_pos),), i) for i, sample_pos in enumerate(pos_x)]) + + def log_all_ious(self, confmat, phase: str): + ious = iou(confmat) + for class_iou, class_name in zip(ious, self.hparams.classification_dict.values()): + metric_name = f"{phase}/iou_CLASS_{class_name}" + self.log( + metric_name, class_iou, on_step=False, on_epoch=True, metric_attribute=metric_name + ) + + +def iou(confmat): + """Computes the Intersection over Union of each class in the + confusion matrix + + Return: + (iou, missing_class_mask) - iou for class as well as a mask + highlighting existing classes + """ + TP_plus_FN = confmat.sum(dim=0) + TP_plus_FP = confmat.sum(dim=1) + TP = confmat.diag() + union = TP_plus_FN + TP_plus_FP - TP + iou = 1e-8 + TP / (union + 1e-8) + return iou diff --git a/myria3d/pctl/datamodule/hdf5.py b/myria3d/pctl/datamodule/hdf5.py index 167744df..eb0e6da4 100644 --- a/myria3d/pctl/datamodule/hdf5.py +++ b/myria3d/pctl/datamodule/hdf5.py @@ -42,6 +42,8 @@ def __init__( transforms: Optional[Dict[str, TRANSFORMS_LIST]] = None, **kwargs, ): + super().__init__() + self.split_csv_path = split_csv_path self.data_dir = data_dir self.hdf5_file_path = hdf5_file_path @@ -86,7 +88,7 @@ def eval_transform(self) -> CustomCompose: def predict_transform(self) -> CustomCompose: return CustomCompose(self.preparation_predict_transform + self.normalization_transform) - def prepare_data(self, stage: Optional[str] = None): + def prepare_data_per_node(self, stage: Optional[str] = None): """Prepare dataset containing train, val, test data.""" if stage in ["fit", "test"] or stage is None: From cc510084a716f523496c3bf8fee12613281811ea Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 10:52:13 +0100 Subject: [PATCH 04/28] Fix loading of model at inference time --- configs/predict/default.yaml | 1 + myria3d/predict.py | 8 +++++--- tests/myria3d/test_train_and_predict.py | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/configs/predict/default.yaml b/configs/predict/default.yaml index ed52b284..4c078533 100644 --- a/configs/predict/default.yaml +++ b/configs/predict/default.yaml @@ -1,6 +1,7 @@ src_las: "/path/to/input.las" # Any glob pattern can be used to predict on multiple files. output_dir: "/path/to/output_dir/" # Predictions are saved in a new file which shares src_las basename. ckpt_path: "/path/to/lightning_model.ckpt" # Checkpoint of trained model. +gpus: 0 # Probas interpolation parameters # subtile_overlap=25 to use a sliding window of inference of which predictions will be merged. diff --git a/myria3d/predict.py b/myria3d/predict.py index bc71bfd5..399c57e9 100644 --- a/myria3d/predict.py +++ b/myria3d/predict.py @@ -36,7 +36,7 @@ def predict(config: DictConfig) -> str: """ # Those are the 2 needed inputs, in addition to the hydra config. - assert os.path.exists(config.predict.ckpt_path) + assert os.path.exists(config.model.ckpt_path) assert os.path.exists(config.predict.src_las) datamodule: LightningDataModule = hydra.utils.instantiate(config.datamodule) @@ -46,7 +46,7 @@ def predict(config: DictConfig) -> str: torch.set_grad_enabled(False) model: LightningModule = hydra.utils.instantiate(config.model) - model = model.load_from_checkpoint(config.predict.ckpt_path) + # model = model.load_from_checkpoint(config.predict.ckpt_path) device = utils.define_device_from_config_param(config.predict.gpus) model.to(device) model.eval() @@ -67,5 +67,7 @@ def predict(config: DictConfig) -> str: logits = model.predict_step(batch)["logits"] itp.store_predictions(logits, batch.idx_in_original_cloud) - out_f = itp.reduce_predictions_and_save(config.predict.src_las, config.predict.output_dir, config.datamodule.get("epsg")) + out_f = itp.reduce_predictions_and_save( + config.predict.src_las, config.predict.output_dir, config.datamodule.get("epsg") + ) return out_f diff --git a/tests/myria3d/test_train_and_predict.py b/tests/myria3d/test_train_and_predict.py index 3f4d6d87..389594c4 100644 --- a/tests/myria3d/test_train_and_predict.py +++ b/tests/myria3d/test_train_and_predict.py @@ -153,7 +153,7 @@ def test_RandLaNet_predict_with_invariance_checks(one_epoch_trained_RandLaNet_ch cfg_predict_using_trained_model = make_default_hydra_cfg( overrides=[ "experiment=predict", - f"predict.ckpt_path={one_epoch_trained_RandLaNet_checkpoint}", + f"model.ckpt_path={one_epoch_trained_RandLaNet_checkpoint}", f"datamodule.epsg={DEFAULT_EPSG}", f"predict.src_las={TOY_LAS_DATA}", f"predict.output_dir={tmpdir}", From 35492a2c3f6ce3cbe4d7aa76c09a17f31035f3d9 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 10:56:18 +0100 Subject: [PATCH 05/28] Do not change API: keep predict.ckpt_path as parameter for inference --- myria3d/predict.py | 4 ++-- tests/myria3d/test_train_and_predict.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/myria3d/predict.py b/myria3d/predict.py index 399c57e9..b7662a64 100644 --- a/myria3d/predict.py +++ b/myria3d/predict.py @@ -36,7 +36,7 @@ def predict(config: DictConfig) -> str: """ # Those are the 2 needed inputs, in addition to the hydra config. - assert os.path.exists(config.model.ckpt_path) + assert os.path.exists(config.predict.ckpt_path) assert os.path.exists(config.predict.src_las) datamodule: LightningDataModule = hydra.utils.instantiate(config.datamodule) @@ -44,7 +44,7 @@ def predict(config: DictConfig) -> str: # Do not require gradient for faster predictions torch.set_grad_enabled(False) - + config.model.ckpt_path = config.predict.ckpt_path model: LightningModule = hydra.utils.instantiate(config.model) # model = model.load_from_checkpoint(config.predict.ckpt_path) device = utils.define_device_from_config_param(config.predict.gpus) diff --git a/tests/myria3d/test_train_and_predict.py b/tests/myria3d/test_train_and_predict.py index 389594c4..3f4d6d87 100644 --- a/tests/myria3d/test_train_and_predict.py +++ b/tests/myria3d/test_train_and_predict.py @@ -153,7 +153,7 @@ def test_RandLaNet_predict_with_invariance_checks(one_epoch_trained_RandLaNet_ch cfg_predict_using_trained_model = make_default_hydra_cfg( overrides=[ "experiment=predict", - f"model.ckpt_path={one_epoch_trained_RandLaNet_checkpoint}", + f"predict.ckpt_path={one_epoch_trained_RandLaNet_checkpoint}", f"datamodule.epsg={DEFAULT_EPSG}", f"predict.src_las={TOY_LAS_DATA}", f"predict.output_dir={tmpdir}", From d527e5516f8faf7199375fa9ce51ff0830688ac8 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 11:06:56 +0100 Subject: [PATCH 06/28] Fix use of cpu accelerator in test using the new syntax --- tests/myria3d/test_train_and_predict.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tests/myria3d/test_train_and_predict.py b/tests/myria3d/test_train_and_predict.py index 3f4d6d87..221b4af0 100644 --- a/tests/myria3d/test_train_and_predict.py +++ b/tests/myria3d/test_train_and_predict.py @@ -185,7 +185,7 @@ def test_run_test_with_trained_model_on_toy_dataset_on_cpu( one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir ): _run_test_right_after_training( - one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, 0 + one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, "cpu" ) @@ -194,18 +194,12 @@ def test_run_test_with_trained_model_on_toy_dataset_on_gpu( one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir ): _run_test_right_after_training( - one_epoch_trained_RandLaNet_checkpoint, - toy_dataset_hdf5_path, - tmpdir, - "[0]", + one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, "gpu" ) def _run_test_right_after_training( - one_epoch_trained_RandLaNet_checkpoint, - toy_dataset_hdf5_path, - tmpdir, - trainer_gpus, + one_epoch_trained_RandLaNet_checkpoint, toy_dataset_hdf5_path, tmpdir, accelerator ): """Run test using the model that was just trained for one epoch. @@ -222,12 +216,12 @@ def _run_test_right_after_training( tmp_paths_overrides = _make_list_of_necesary_hydra_overrides_with_tmp_paths( toy_dataset_hdf5_path, tmpdir ) - accelerator = "cpu" if trainer_gpus == 0 else "gpu" + devices = "[0]" if accelerator == "gpu" else 1 cfg_test_using_trained_model = make_default_hydra_cfg( overrides=[ "experiment=test", # sets task.task_name to "test" f"model.ckpt_path={one_epoch_trained_RandLaNet_checkpoint}", - f"trainer.devices={trainer_gpus}", + f"trainer.devices={devices}", f"trainer.accelerator={accelerator}", ] + tmp_paths_overrides From 99e60f328ee77ca4494f76220585b4bb45cd7fb3 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 11:15:46 +0100 Subject: [PATCH 07/28] Refactor iou into a separate script for clarity --- myria3d/metrics/iou.py | 17 +++++++++++++++++ myria3d/models/model.py | 39 ++++++++++++--------------------------- 2 files changed, 29 insertions(+), 27 deletions(-) create mode 100644 myria3d/metrics/iou.py diff --git a/myria3d/metrics/iou.py b/myria3d/metrics/iou.py new file mode 100644 index 00000000..92a71b31 --- /dev/null +++ b/myria3d/metrics/iou.py @@ -0,0 +1,17 @@ +from torch import Tensor + + +def iou(confmat: Tensor): + """Computes the Intersection over Union of each class in the + confusion matrix + + Return: + (iou, missing_class_mask) - iou for class as well as a mask + highlighting existing classes + """ + TP_plus_FN = confmat.sum(dim=0) + TP_plus_FP = confmat.sum(dim=1) + TP = confmat.diag() + union = TP_plus_FN + TP_plus_FP - TP + iou = 1e-8 + TP / (union + 1e-8) + return iou diff --git a/myria3d/models/model.py b/myria3d/models/model.py index d37d0f13..36e204ea 100755 --- a/myria3d/models/model.py +++ b/myria3d/models/model.py @@ -5,6 +5,7 @@ from torch import nn from torch_geometric.data import Batch from torch_geometric.nn import knn_interpolate +from myria3d.metrics.iou import iou from myria3d.models.modules.pyg_randla_net import PyGRandLANet from myria3d.utils import utils @@ -77,6 +78,14 @@ def setup(self, stage: Optional[str]) -> None: if stage == "test": self.test_iou = MulticlassJaccardIndex(self.hparams.num_classes) + def log_all_class_ious(self, confmat, phase: str): + ious = iou(confmat) + for class_iou, class_name in zip(ious, self.hparams.classification_dict.values()): + metric_name = f"{phase}/iou_CLASS_{class_name}" + self.log( + metric_name, class_iou, on_step=False, on_epoch=True, metric_attribute=metric_name + ) + def forward(self, batch: Batch) -> torch.Tensor: """Forward pass of neural network. @@ -146,7 +155,7 @@ def training_step(self, batch: Batch, batch_idx: int) -> dict: def on_train_epoch_end(self) -> None: self.train_iou.compute() - self.log_all_ious(self.train_iou.confmat, "train") + self.log_all_class_ious(self.train_iou.confmat, "train") def validation_step(self, batch: Batch, batch_idx: int) -> dict: """Validation step. @@ -181,7 +190,7 @@ def on_validation_epoch_end(self) -> None: """ self.val_iou.compute() - self.log_all_ious(self.val_iou.confmat, "val") + self.log_all_class_ious(self.val_iou.confmat, "val") def test_step(self, batch: Batch, batch_idx: int): """Test step. @@ -214,7 +223,7 @@ def on_test_epoch_end(self) -> None: """ self.test_iou.compute() - self.log_all_ious(self.test_iou.confmat, "test") + self.log_all_class_ious(self.test_iou.confmat, "test") def predict_step(self, batch: Batch) -> dict: """Prediction step. @@ -258,27 +267,3 @@ def _get_batch_tensor_by_enumeration(self, pos_x: torch.Tensor) -> torch.Tensor: from shape B,N,... to shape (N,...). """ return torch.cat([torch.full((len(sample_pos),), i) for i, sample_pos in enumerate(pos_x)]) - - def log_all_ious(self, confmat, phase: str): - ious = iou(confmat) - for class_iou, class_name in zip(ious, self.hparams.classification_dict.values()): - metric_name = f"{phase}/iou_CLASS_{class_name}" - self.log( - metric_name, class_iou, on_step=False, on_epoch=True, metric_attribute=metric_name - ) - - -def iou(confmat): - """Computes the Intersection over Union of each class in the - confusion matrix - - Return: - (iou, missing_class_mask) - iou for class as well as a mask - highlighting existing classes - """ - TP_plus_FN = confmat.sum(dim=0) - TP_plus_FP = confmat.sum(dim=1) - TP = confmat.diag() - union = TP_plus_FN + TP_plus_FP - TP - iou = 1e-8 + TP / (union + 1e-8) - return iou From 6167fded2215bf2f8e7157fa891aec7217519825 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 11:20:18 +0100 Subject: [PATCH 08/28] Revert conda environment name to myria3d --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index f7598ce6..6a2ccdec 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # Simple install with # mamba env create -f environment.yml -name: myria3d_upgrade +name: myria3d channels: - conda-forge - anaconda From 83e34d1d78966a319602875b2880e9dc0c087d02 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 12:08:51 +0100 Subject: [PATCH 09/28] Load checkpoint via the parent class directly --- myria3d/predict.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/myria3d/predict.py b/myria3d/predict.py index b7662a64..50c03f62 100644 --- a/myria3d/predict.py +++ b/myria3d/predict.py @@ -8,6 +8,8 @@ from pytorch_lightning import LightningDataModule, LightningModule from tqdm import tqdm +from myria3d.models.model import Model + sys.path.append(osp.dirname(osp.dirname(__file__))) from myria3d.models.interpolation import Interpolator # noqa from myria3d.utils import utils # noqa @@ -44,9 +46,7 @@ def predict(config: DictConfig) -> str: # Do not require gradient for faster predictions torch.set_grad_enabled(False) - config.model.ckpt_path = config.predict.ckpt_path - model: LightningModule = hydra.utils.instantiate(config.model) - # model = model.load_from_checkpoint(config.predict.ckpt_path) + model = Model.load_from_checkpoint(config.predict.ckpt_path) device = utils.define_device_from_config_param(config.predict.gpus) model.to(device) model.eval() From d28b756af6d9fc88ca44be7cbfddf8e260e3efa2 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 12:10:47 +0100 Subject: [PATCH 10/28] Update version and changelog to V3.7.0 --- CHANGELOG.md | 5 ++++- package_metadata.yaml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d979ca51..f5864502 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,9 @@ # CHANGELOG -### 3.6.0 +## 3.7.0 +- Update all versions of Pytorch, Pytorch Lightning, and Pytorch Geometric. + +## 3.6.0 - Remove the "EPSG:2154" by default and use the metadata of the lidar file, unless a parameter is given. ### 3.5.2 diff --git a/package_metadata.yaml b/package_metadata.yaml index e6f0e165..a861d216 100644 --- a/package_metadata.yaml +++ b/package_metadata.yaml @@ -1,4 +1,4 @@ -__version__: "3.6.0" +__version__: "3.7.0" __name__: "myria3d" __url__: "https://github.com/IGNF/myria3d" __description__: "Deep Learning for the Semantic Segmentation of Aerial Lidar Point Clouds" From 86e497703b1a927a572c6ed5c382fc934b215229 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 2 Jan 2024 17:25:37 +0100 Subject: [PATCH 11/28] Downgrade to pytorch-lightning==2.0.8 to avoid error --- environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 2831bc43..6049cdee 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # Simple install with # mamba env create -f environment.yml -name: myria3d +name: myria3d_latest_pytorch channels: - conda-forge - anaconda @@ -38,7 +38,7 @@ dependencies: - --extra-index-url https://download.pytorch.org/whl/cu118 - torch==2.1.* - torchvision - - pytorch-lightning + - pytorch-lightning==2.0.8 - torchmetrics - comet_ml==3.31.* # VErsion to update ! - torch_geometric From 4c7c208d4ab683226ab5b36b80e3a8852509958f Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Mon, 8 Jan 2024 16:52:31 +0100 Subject: [PATCH 12/28] Manually reset the metrics after each end of epoch --- myria3d/models/model.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/myria3d/models/model.py b/myria3d/models/model.py index 36e204ea..df451b77 100755 --- a/myria3d/models/model.py +++ b/myria3d/models/model.py @@ -70,13 +70,13 @@ def __init__(self, **kwargs): self.softmax = nn.Softmax(dim=1) self.criterion = self.hparams.criterion - def setup(self, stage: Optional[str]) -> None: - """Setup stage: prepare to compute IoU and loss.""" - if stage == "fit": - self.train_iou = MulticlassJaccardIndex(self.hparams.num_classes) - self.val_iou = MulticlassJaccardIndex(self.hparams.num_classes) - if stage == "test": - self.test_iou = MulticlassJaccardIndex(self.hparams.num_classes) + def on_fit_start(self) -> None: + self.criterion = self.criterion.to(self.device) + self.train_iou = MulticlassJaccardIndex(self.hparams.num_classes).to(self.device) + self.val_iou = MulticlassJaccardIndex(self.hparams.num_classes).to(self.device) + + def on_test_start(self) -> None: + self.test_iou = MulticlassJaccardIndex(self.hparams.num_classes).to(self.device) def log_all_class_ious(self, confmat, phase: str): ious = iou(confmat) @@ -124,11 +124,6 @@ def forward(self, batch: Batch) -> torch.Tensor: targets = batch.copies["transformed_y_copy"].to(logits.device) return targets, logits - def on_fit_start(self) -> None: - """On fit start: get the experiment for easier access.""" - # self.experiment = self.logger.experiment - self.criterion = self.criterion.to(self.device) - def training_step(self, batch: Batch, batch_idx: int) -> dict: """Training step. @@ -156,6 +151,7 @@ def training_step(self, batch: Batch, batch_idx: int) -> dict: def on_train_epoch_end(self) -> None: self.train_iou.compute() self.log_all_class_ious(self.train_iou.confmat, "train") + self.train_iou.reset() def validation_step(self, batch: Batch, batch_idx: int) -> dict: """Validation step. @@ -191,6 +187,7 @@ def on_validation_epoch_end(self) -> None: """ self.val_iou.compute() self.log_all_class_ious(self.val_iou.confmat, "val") + self.val_iou.reset() def test_step(self, batch: Batch, batch_idx: int): """Test step. @@ -224,6 +221,7 @@ def on_test_epoch_end(self) -> None: """ self.test_iou.compute() self.log_all_class_ious(self.test_iou.confmat, "test") + self.test_iou.reset() def predict_step(self, batch: Batch) -> dict: """Prediction step. From 03d12c07d0d52686ed177e616cc61156734464f9 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Mon, 8 Jan 2024 16:53:18 +0100 Subject: [PATCH 13/28] Update signature of class LogLogsPath's setup hook --- myria3d/callbacks/comet_callbacks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/myria3d/callbacks/comet_callbacks.py b/myria3d/callbacks/comet_callbacks.py index a16f9a04..4446b83b 100755 --- a/myria3d/callbacks/comet_callbacks.py +++ b/myria3d/callbacks/comet_callbacks.py @@ -65,9 +65,9 @@ class LogLogsPath(Callback): """Logs run working directory to comet.ml""" @rank_zero_only - def on_init_end(self, trainer): + def setup(self, trainer, pl_module, stage): logger = get_comet_logger(trainer=trainer) if logger: log_path = os.getcwd() log.info(f"----------------\n LOGS DIR is {log_path}\n ----------------") - logger.experiment.log_parameter("experiment_logs_dirpath", log_path) + logger.experiment.log_parameter("experiment_logs_dirpath", log_path) \ No newline at end of file From 05c681c9c25c12e03f0631e95703c1e4f402b9c1 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 10 Jan 2024 12:00:20 +0100 Subject: [PATCH 14/28] Rename conda env to myria3d --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 6049cdee..2f6342b0 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # Simple install with # mamba env create -f environment.yml -name: myria3d_latest_pytorch +name: myria3d channels: - conda-forge - anaconda From ee70453de4d698377a84c3b5342ff23b43748497 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 10 Jan 2024 12:01:12 +0100 Subject: [PATCH 15/28] Remove dead comments in environment.yml --- environment.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/environment.yml b/environment.yml index 2f6342b0..7a3af4e6 100644 --- a/environment.yml +++ b/environment.yml @@ -8,7 +8,6 @@ dependencies: - python==3.9.* - pip # --------- data formats --------- # - # - numpy - h5py # --------- geo --------- # - pygeos @@ -28,9 +27,7 @@ dependencies: - python-dotenv # loading env variables from .env file - rich # beautiful text formatting in terminal - sh # for running bash commands in some tests - # - scikit-learn # used in some callbacks - seaborn # used in some callbacks - # - jupyterlab # better jupyter notebooks - pudb # debugger - pip: # --------- Deep Learning --------- # From 2c4edba6459cddb0c6394638db6030ea14534f7c Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 10 Jan 2024 14:59:55 +0100 Subject: [PATCH 16/28] Install with conda whenever possible in environment.yml --- environment.yml | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/environment.yml b/environment.yml index 7a3af4e6..399624c1 100644 --- a/environment.yml +++ b/environment.yml @@ -1,12 +1,23 @@ # Simple install with # mamba env create -f environment.yml -name: myria3d +name: myria3d_torch_2_1 channels: + - pytorch + - nvidia + - pyg + - comet_ml - conda-forge - anaconda dependencies: - - python==3.9.* + - python=3.9.* - pip + # --------- Deep Learning --------- # + - pytorch=2.1 + - pytorch-cuda=11.8 + - torchvision=0.16 + - lightning=2.0 + - torchmetrics=0.11 + - pyg=2.4 # --------- data formats --------- # - h5py # --------- geo --------- # @@ -14,6 +25,10 @@ dependencies: - shapely - python-pdal # --------- loggers --------- # + - comet_ml=3.35 + # --------- Visualization --------- # + - pandas + - matplotlib # --------- linters --------- # - pre-commit # hooks for applying linters on commit - black # code formatting @@ -30,15 +45,7 @@ dependencies: - seaborn # used in some callbacks - pudb # debugger - pip: - # --------- Deep Learning --------- # # Extra index may need to be on first line - - --extra-index-url https://download.pytorch.org/whl/cu118 - - torch==2.1.* - - torchvision - - pytorch-lightning==2.0.8 - - torchmetrics - - comet_ml==3.31.* # VErsion to update ! - - torch_geometric - urllib3<2 # To solve for https://github.com/GeneralMills/pytrends/issues/591 # Wheels for torch-geometric optionnal dependencies - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_cluster-1.6.3%2Bpt21cu118-cp39-cp39-linux_x86_64.whl @@ -49,9 +56,6 @@ dependencies: # ou # export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH" # see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625 - # --------- Visualization --------- # - - pandas - - matplotlib # --------- hydra configs --------- # - hydra-core==1.1.* - hydra-colorlog==1.1.* From dac04f2d06d29c1888050a1d9e1ea1132125e6ef Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Wed, 10 Jan 2024 15:00:12 +0100 Subject: [PATCH 17/28] Revert name of conda env to myria3d --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 399624c1..ca1957d0 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # Simple install with # mamba env create -f environment.yml -name: myria3d_torch_2_1 +name: myria3d channels: - pytorch - nvidia From 2300917d45e8f7883bf66b29b0a200374763eb27 Mon Sep 17 00:00:00 2001 From: Lea Vauchier Date: Thu, 11 Jan 2024 16:06:15 +0100 Subject: [PATCH 18/28] Update docker image to use mamba based image + use conda packages as much as possible --- .dockerignore | 14 +++++++++ Dockerfile | 63 ++++++++++++++--------------------------- environment.yml | 75 ++++++++++++++++++++++++++----------------------- 3 files changed, 75 insertions(+), 77 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..016b68f0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,14 @@ +.hydra +.vscode +htmlcov + +# Caches +**/__pycache__ +.pytest_cache + + +# Distribution / packaging +build/ +dist/ +*.egg-info +*.egg \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 7fc19daf..6d309ea8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,49 +1,28 @@ -FROM nvidia/cuda:11.3.1-base-ubuntu20.04 - -# set the IGN proxy, otherwise apt-get and other applications don't work -# Should be commented out outside of IGN -ENV http_proxy 'http://192.168.4.9:3128/' -ENV https_proxy 'http://192.168.4.9:3128/' - -# Remove any third-party apt sources to avoid issues with expiring keys. -RUN rm -f /etc/apt/sources.list.d/*.list - -# Install some basic utilities -RUN apt-get update && apt-get install -y \ - nano \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - && rm -rf /var/lib/apt/lists/* - -# Create a working directory -RUN mkdir /app - -# Set up the Conda environment and make python accessible via PATH. -ENV CONDA_AUTO_UPDATE_CONDA=false -ENV PATH=/miniconda:/miniconda/bin:$PATH -COPY environment.yml /app/environment.yml -RUN curl -sLo /miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh \ - && chmod +x /miniconda.sh \ - && /miniconda.sh -b -p /miniconda \ - && rm /miniconda.sh \ - && /miniconda/bin/conda env update -n base -f /app/environment.yml \ - && rm /app/environment.yml \ - && /miniconda/bin/conda clean -ya +FROM mambaorg/micromamba:focal-cuda-11.3.1 +# focal is Ubuntu 20.04 +WORKDIR /app + +# use chown to prevent permission issues +COPY --chown=$MAMBA_USER:$MAMBA_USER . . + +COPY environment.yml environment.yml + +RUN micromamba env create -f /app/environment.yml + +ENV PATH=$PATH:/opt/conda/envs/myria3d/bin/ # Need to export this for torch_geometric to find where cuda is. # See https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625 -ENV LD_LIBRARY_PATH="/miniconda/lib/:$LD_LIBRARY_PATH" +ENV LD_LIBRARY_PATH="/opt/conda/envs/myria3d/lib/:$LD_LIBRARY_PATH" -# Check succes of environment creation. +# Check success of environment creation. RUN python -c "import torch_geometric;" -# Copy the repository content in /app -WORKDIR /app -COPY . . +# locate proj +ENV PROJ_LIB=/opt/conda/envs/myria3d/share/proj/ + +# Check that myria3d can run +RUN python run.py task.task_name=predict --help -# Set the default command to bash for image inspection. -CMD ["bash"] +# # Set the default command to bash for image inspection. +# CMD ["bash"] diff --git a/environment.yml b/environment.yml index ca1957d0..40911fce 100644 --- a/environment.yml +++ b/environment.yml @@ -7,28 +7,41 @@ channels: - pyg - comet_ml - conda-forge - - anaconda dependencies: - python=3.9.* - pip # --------- Deep Learning --------- # - - pytorch=2.1 - - pytorch-cuda=11.8 - - torchvision=0.16 - - lightning=2.0 - - torchmetrics=0.11 - - pyg=2.4 + - pytorch::pytorch=2.1 + - pytorch::pytorch-cuda=11.8 + - pytorch::torchvision=0.16 + - conda-forge::lightning=2.0 + - conda-forge::torchmetrics=0.11 + - pyg::pyg=2.4 + - pyg::pytorch-cluster + - pyg::pytorch-scatter + - pyg::pytorch-sparse + # Troubleshooting: if libcusparse.so.11. errors occur, run + # export LD_LIBRARY_PATH="/home/${USER}/miniconda/envs/lib:$LD_LIBRARY_PATH" + # ou + # export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH" + # see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625 # --------- data formats --------- # + - numpy - h5py # --------- geo --------- # - - pygeos - - shapely + - pdal - python-pdal + - pyproj + # --------- Visualization --------- # + - pandas + - matplotlib # --------- loggers --------- # - - comet_ml=3.35 + - comet_ml::comet_ml=3.35 + - conda-forge::urllib3<2 # To solve for https://github.com/GeneralMills/pytrends/issues/591 # --------- Visualization --------- # - pandas - matplotlib + - seaborn # used in some callbacks # --------- linters --------- # - pre-commit # hooks for applying linters on commit - black # code formatting @@ -42,30 +55,22 @@ dependencies: - python-dotenv # loading env variables from .env file - rich # beautiful text formatting in terminal - sh # for running bash commands in some tests - - seaborn # used in some callbacks - pudb # debugger + # # --------- Documentation --------- # + - sphinx==4.5.* + - recommonmark==0.7.* + - sphinx_rtd_theme==1.0.* + - docutils==0.17 + - rstcheck==3.3.* # RST Linter - pip: - # Extra index may need to be on first line - - urllib3<2 # To solve for https://github.com/GeneralMills/pytrends/issues/591 - # Wheels for torch-geometric optionnal dependencies - - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_cluster-1.6.3%2Bpt21cu118-cp39-cp39-linux_x86_64.whl - - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_scatter-2.1.2%2Bpt21cu118-cp39-cp39-linux_x86_64.whl - - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_sparse-0.6.18%2Bpt21cu118-cp39-cp39-linux_x86_64.whl - # Nota: if libcusparse.so.11. errors occur, run - # export LD_LIBRARY_PATH="/home/${USER}/miniconda/envs/lib:$LD_LIBRARY_PATH" - # ou - # export LD_LIBRARY_PATH="/home/${USER}/anaconda3/envs/lib:$LD_LIBRARY_PATH" - # see https://github.com/pyg-team/pytorch_geometric/issues/2040#issuecomment-766610625 - # --------- hydra configs --------- # - - hydra-core==1.1.* - - hydra-colorlog==1.1.* - # # --------- Documentation --------- # - - sphinx==4.5.* - - sphinx_rtd_theme==1.0.* - - myst_parser==0.17.* - - sphinx_paramlinks==0.5.* - - recommonmark==0.7.* - - sphinxnotes-mock==1.0.0b0 # still a beta - - docutils==0.17 - - rstcheck==3.3.* # RST Linter - - pyproj==3.6.* + # --------- torch-geometric optionnal dependencies --------- + - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_cluster-1.6.3%2Bpt21cu118-cp39-cp39-linux_x86_64.whl + - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_scatter-2.1.2%2Bpt21cu118-cp39-cp39-linux_x86_64.whl + - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/torch_sparse-0.6.18%2Bpt21cu118-cp39-cp39-linux_x86_64.whl + # --------- hydra configs --------- # + - hydra-core==1.1.* + - hydra-colorlog==1.1.* + # --------- Documentation --------- # + - myst_parser==0.17.* + - sphinxnotes-mock==1.0.0b0 # still a beta + - sphinx_paramlinks==0.5.* \ No newline at end of file From f57d9af269402e27127f84dd7de22cbeb2215511 Mon Sep 17 00:00:00 2001 From: Lea Vauchier Date: Tue, 30 Jan 2024 15:04:08 +0100 Subject: [PATCH 19/28] Add proxy parameters in gh action --- .github/workflows/cicd.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml index 4d007cc1..8b9a5452 100644 --- a/.github/workflows/cicd.yaml +++ b/.github/workflows/cicd.yaml @@ -25,7 +25,7 @@ jobs: uses: actions/checkout@v3 - name: Build docker image - run: docker build -t myria3d . + run: docker build --build-arg http_proxy=${{ secrets.PROXY_URL }} --build-arg https_proxy=${{ secrets.PROXY_URL }} -t myria3d . - name: Run pytest run: > @@ -36,7 +36,7 @@ jobs: pytest -rA -v --ignore=actions-runner - # IMPORTANT: Always run images with --ipc=host and --shm-size=2gb (at least) to enable + # IMPORTANT: Always run images with --ipc=host and --shm-size=2gb (at least) to enable # sufficient shared memory when predicting on large files. - name: Example inference run via Docker with default config and checkpoint run: > @@ -77,12 +77,12 @@ jobs: run: docker run myria3d python -m flake8 # Everything ran so we tag the valid docker image to keep it - # This happens for push events, which are in particular + # This happens for push events, which are in particular # triggered when a pull request is merged. - name: Tag the docker image with branch name if: github.event_name == 'push' run: | - docker tag myria3d:latest myria3d:${{github.ref_name}} + docker tag myria3d:latest myria3d:${{github.ref_name}} docker run myria3d:${{github.ref_name}} bash # Run the new, tagged image at least once so that is it not prunned by mistake when using docker system prune # docker save myria3d:${{github.ref_name}} -o /var/data/cicd/CICD_github_assets/CICD_docker_images/myria3d_${github.ref_name}.tar # Save the docker image as myria3d_${github.ref_name}.tar @@ -101,7 +101,7 @@ jobs: - name: push main docker on nexus (tagged with a date) # we push on nexus an image from the main branch when it has been updated (push or accepted pull request) - if: ((github.ref_name == 'main') && (github.event_name == 'push')) + if: ((github.ref_name == 'main') && (github.event_name == 'push')) run: | docker tag myria3d $nexus_server/lidar_hd/myria3d:${{steps.tag.outputs.version}}-${{steps.tag.outputs.date}} docker login $nexus_server --username svc_lidarhd --password ${{ secrets.PASSWORD_SVC_LIDARHD }} From deecc641fe6237a6e96647ffdb2e402a91388ae9 Mon Sep 17 00:00:00 2001 From: Lea Vauchier Date: Tue, 30 Jan 2024 18:51:29 +0100 Subject: [PATCH 20/28] Use root user to build conda env in docker image --- Dockerfile | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6d309ea8..0b4c6fd3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,11 +3,13 @@ FROM mambaorg/micromamba:focal-cuda-11.3.1 WORKDIR /app -# use chown to prevent permission issues -COPY --chown=$MAMBA_USER:$MAMBA_USER . . - COPY environment.yml environment.yml +# Switching to root does not seem necessary in the general case, but the github ci/cd process +# does not seem to work without (rresults in a permission error when running pip packages +# installation similar to https://github.com/mamba-org/micromamba-docker/issues/356) +USER root + RUN micromamba env create -f /app/environment.yml ENV PATH=$PATH:/opt/conda/envs/myria3d/bin/ @@ -18,6 +20,9 @@ ENV LD_LIBRARY_PATH="/opt/conda/envs/myria3d/lib/:$LD_LIBRARY_PATH" # Check success of environment creation. RUN python -c "import torch_geometric;" +# use chown to prevent permission issues +COPY . . + # locate proj ENV PROJ_LIB=/opt/conda/envs/myria3d/share/proj/ From 9387ddb34c0c6e2b4edb11a1533d250aa0796af0 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Thu, 1 Feb 2024 10:18:54 +0100 Subject: [PATCH 21/28] No need to save criterion as a hyperparameter since already checkpointed --- myria3d/models/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/myria3d/models/model.py b/myria3d/models/model.py index df451b77..cc672150 100755 --- a/myria3d/models/model.py +++ b/myria3d/models/model.py @@ -62,7 +62,7 @@ def __init__(self, **kwargs): # this line ensures params passed to LightningModule will be saved to ckpt # it also allows to access params with 'self.hparams' attribute - self.save_hyperparameters() + self.save_hyperparameters(ignore=["criterion"]) neural_net_class = get_neural_net_class(self.hparams.neural_net_class_name) self.model = neural_net_class(**self.hparams.neural_net_hparams) From 9c6a1a6a06a67c40920ce30a7187f3b915f7e882 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Thu, 1 Feb 2024 10:22:17 +0100 Subject: [PATCH 22/28] Fix setting model.criterio, using kwargs instead of hparams. --- myria3d/models/model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/myria3d/models/model.py b/myria3d/models/model.py index cc672150..8e49b319 100755 --- a/myria3d/models/model.py +++ b/myria3d/models/model.py @@ -64,11 +64,11 @@ def __init__(self, **kwargs): # it also allows to access params with 'self.hparams' attribute self.save_hyperparameters(ignore=["criterion"]) - neural_net_class = get_neural_net_class(self.hparams.neural_net_class_name) - self.model = neural_net_class(**self.hparams.neural_net_hparams) + neural_net_class = get_neural_net_class(kwargs.get("neural_net_class_name")) + self.model = neural_net_class(**kwargs.get("neural_net_hparams")) self.softmax = nn.Softmax(dim=1) - self.criterion = self.hparams.criterion + self.criterion = kwargs.get("criterion") def on_fit_start(self) -> None: self.criterion = self.criterion.to(self.device) From 997b4b39e7c1adc8f09c30c932bcd67afd9f2773 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 6 Feb 2024 11:05:00 +0100 Subject: [PATCH 23/28] refactor: follow python conventions of lowercase/uppercase use --- myria3d/metrics/iou.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/myria3d/metrics/iou.py b/myria3d/metrics/iou.py index 92a71b31..f9281b37 100644 --- a/myria3d/metrics/iou.py +++ b/myria3d/metrics/iou.py @@ -1,5 +1,7 @@ from torch import Tensor +EPSILON = 1e-8 + def iou(confmat: Tensor): """Computes the Intersection over Union of each class in the @@ -9,9 +11,11 @@ def iou(confmat: Tensor): (iou, missing_class_mask) - iou for class as well as a mask highlighting existing classes """ - TP_plus_FN = confmat.sum(dim=0) - TP_plus_FP = confmat.sum(dim=1) - TP = confmat.diag() - union = TP_plus_FN + TP_plus_FP - TP - iou = 1e-8 + TP / (union + 1e-8) + true_positives_and_false_negatives = confmat.sum(dim=0) + true_positives_and_false_positives = confmat.sum(dim=1) + true_positives = confmat.diag() + union = ( + true_positives_and_false_negatives + true_positives_and_false_positives - true_positives + ) + iou = EPSILON + true_positives / (union + EPSILON) return iou From 03d4921aaaf7ca54126942cc77cf1ac700fe8b18 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 6 Feb 2024 15:40:39 +0100 Subject: [PATCH 24/28] dev: autofind available gpu in tests --- tests/myria3d/test_train_and_predict.py | 8 ++++---- tests/runif.py | 9 +++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/myria3d/test_train_and_predict.py b/tests/myria3d/test_train_and_predict.py index 221b4af0..0ef0c388 100644 --- a/tests/myria3d/test_train_and_predict.py +++ b/tests/myria3d/test_train_and_predict.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from lightning.pytorch.accelerators import find_usable_cuda_devices + from myria3d.pctl.dataset.toy_dataset import TOY_LAS_DATA from myria3d.pctl.dataset.utils import pdal_read_las_array @@ -59,9 +61,7 @@ def test_FrenchLidar_RandLaNetDebug_with_gpu(toy_dataset_hdf5_path, tmpdir_facto tmp_paths_overrides = _make_list_of_necesary_hydra_overrides_with_tmp_paths( toy_dataset_hdf5_path, tmpdir ) - # We will always use the first GPU id for tests, because it always exists if there are some GPUs. - # Attention to concurrency with other processes using the GPU when running tests. - gpu_id = 0 + gpu_id = find_usable_cuda_devices(1) cfg_one_epoch = make_default_hydra_cfg( overrides=[ "experiment=RandLaNetDebug", @@ -216,7 +216,7 @@ def _run_test_right_after_training( tmp_paths_overrides = _make_list_of_necesary_hydra_overrides_with_tmp_paths( toy_dataset_hdf5_path, tmpdir ) - devices = "[0]" if accelerator == "gpu" else 1 + devices = find_usable_cuda_devices(1) if accelerator == "gpu" else 1 cfg_test_using_trained_model = make_default_hydra_cfg( overrides=[ "experiment=test", # sets task.task_name to "test" diff --git a/tests/runif.py b/tests/runif.py index 8f17699e..ec5da504 100644 --- a/tests/runif.py +++ b/tests/runif.py @@ -1,5 +1,6 @@ import pytest import torch +from lightning.pytorch.accelerators import find_usable_cuda_devices """ Simplified from: @@ -35,8 +36,12 @@ def __new__( reasons = [] if min_gpus: - conditions.append(torch.cuda.device_count() < min_gpus) - reasons.append(f"GPUs>={min_gpus}") + try: + find_usable_cuda_devices(min_gpus) + conditions.append(False) + except (ValueError, RuntimeError) as _: + conditions.append(True) + reasons.append(f"GPUs>={min_gpus}") reasons = [rs for cond, rs in zip(conditions, reasons) if cond] return pytest.mark.skipif( From 1a1514302386c164416d193178dbfc7fe804ada0 Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 6 Feb 2024 15:43:18 +0100 Subject: [PATCH 25/28] Changelog: indicate refactor of single-class IoUs --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 579ce8d0..38146d48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## 3.7.0 - Update all versions of Pytorch, Pytorch Lightning, and Pytorch Geometric. +- Refactor logging of single-class IoUs to go from num_classes+1 torchmetrics instances to only 1. ### 3.6.1 - Set urllib3<2 for comet logging to function and add back seaborn for plotting optimal LR graph. From 9033a39bed8a702849f10dd08f02cb95e507085b Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 6 Feb 2024 16:12:06 +0100 Subject: [PATCH 26/28] Flake8 --- myria3d/callbacks/comet_callbacks.py | 2 +- myria3d/models/model.py | 6 ++---- myria3d/predict.py | 2 +- tests/runif.py | 3 +-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/myria3d/callbacks/comet_callbacks.py b/myria3d/callbacks/comet_callbacks.py index 4446b83b..84a82d0c 100755 --- a/myria3d/callbacks/comet_callbacks.py +++ b/myria3d/callbacks/comet_callbacks.py @@ -70,4 +70,4 @@ def setup(self, trainer, pl_module, stage): if logger: log_path = os.getcwd() log.info(f"----------------\n LOGS DIR is {log_path}\n ----------------") - logger.experiment.log_parameter("experiment_logs_dirpath", log_path) \ No newline at end of file + logger.experiment.log_parameter("experiment_logs_dirpath", log_path) diff --git a/myria3d/models/model.py b/myria3d/models/model.py index 8e49b319..67c2752d 100755 --- a/myria3d/models/model.py +++ b/myria3d/models/model.py @@ -1,15 +1,13 @@ -from typing import Optional - import torch from pytorch_lightning import LightningModule from torch import nn from torch_geometric.data import Batch from torch_geometric.nn import knn_interpolate -from myria3d.metrics.iou import iou +from torchmetrics.classification import MulticlassJaccardIndex +from myria3d.metrics.iou import iou from myria3d.models.modules.pyg_randla_net import PyGRandLANet from myria3d.utils import utils -from torchmetrics.classification import MulticlassJaccardIndex log = utils.get_logger(__name__) diff --git a/myria3d/predict.py b/myria3d/predict.py index 50c03f62..7c50219e 100644 --- a/myria3d/predict.py +++ b/myria3d/predict.py @@ -5,7 +5,7 @@ import hydra import torch from omegaconf import DictConfig -from pytorch_lightning import LightningDataModule, LightningModule +from pytorch_lightning import LightningDataModule from tqdm import tqdm from myria3d.models.model import Model diff --git a/tests/runif.py b/tests/runif.py index ec5da504..7a2ac5f6 100644 --- a/tests/runif.py +++ b/tests/runif.py @@ -1,5 +1,4 @@ import pytest -import torch from lightning.pytorch.accelerators import find_usable_cuda_devices """ @@ -39,7 +38,7 @@ def __new__( try: find_usable_cuda_devices(min_gpus) conditions.append(False) - except (ValueError, RuntimeError) as _: + except (ValueError, RuntimeError): conditions.append(True) reasons.append(f"GPUs>={min_gpus}") From ca919f2ff92863afbcb88b1a2c0118f76e6394bc Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 6 Feb 2024 16:16:31 +0100 Subject: [PATCH 27/28] Mention the retrocompatibility of changes and the need to update predict_config --- CHANGELOG.md | 1 + run.py | 3 ++- ...51_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml} | 0 3 files changed, 3 insertions(+), 1 deletion(-) rename trained_model_assets/{proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml => proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml} (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38146d48..0507c492 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## 3.7.0 - Update all versions of Pytorch, Pytorch Lightning, and Pytorch Geometric. + Changes are retrocompatible for models trained with older versions (with adjustment to the configuration file). - Refactor logging of single-class IoUs to go from num_classes+1 torchmetrics instances to only 1. ### 3.6.1 diff --git a/run.py b/run.py index e9731a7b..4e47aef8 100755 --- a/run.py +++ b/run.py @@ -22,7 +22,7 @@ TASK_NAME_DETECTION_STRING = "task.task_name=" DEFAULT_DIRECTORY = "trained_model_assets/" -DEFAULT_CONFIG_FILE = "proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml" +DEFAULT_CONFIG_FILE = "proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml" DEFAULT_CHECKPOINT = "proto151_V2.0_epoch_100_Myria3DV3.1.0.ckpt" DEFAULT_ENV = "placeholder.env" @@ -48,6 +48,7 @@ def launch_train( # Imports should be nested inside @hydra.main to optimize tab completion # Read more here: https://github.com/facebookresearch/hydra/issues/934 from myria3d.train import train + utils.extras(config) # Pretty print config using Rich library diff --git a/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml b/trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml similarity index 100% rename from trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.5.0.yaml rename to trained_model_assets/proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0.yaml From e5f8a64f8f3c814c5b89c6b3153e0392e402b61e Mon Sep 17 00:00:00 2001 From: Charles Gaydon Date: Tue, 6 Feb 2024 16:19:19 +0100 Subject: [PATCH 28/28] Update config used in cicd --- .github/workflows/cicd.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/cicd.yaml b/.github/workflows/cicd.yaml index 8b9a5452..b6cb2171 100644 --- a/.github/workflows/cicd.yaml +++ b/.github/workflows/cicd.yaml @@ -41,8 +41,8 @@ jobs: - name: Example inference run via Docker with default config and checkpoint run: > docker run - -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/inputs/:/inputs/ - -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/outputs/:/outputs/ + -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/inputs/:/inputs/ + -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/outputs/:/outputs/ --ipc=host --shm-size=2gb myria3d @@ -56,14 +56,14 @@ jobs: - name: Example inference run via Docker with inference-time subtiles overlap to smooth-out results. run: > docker run - -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/inputs/:/inputs/ - -v /var/data/cicd/CICD_github_assets/myria3d_V3.6.0/outputs/:/outputs/ + -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/inputs/:/inputs/ + -v /var/data/cicd/CICD_github_assets/myria3d_V3.7.0/outputs/:/outputs/ --ipc=host --shm-size=2gb myria3d python run.py --config-path /inputs/ - --config-name proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.6.0 + --config-name proto151_V2.0_epoch_100_Myria3DV3.1.0_predict_config_V3.7.0 predict.ckpt_path=/inputs/proto151_V2.0_epoch_100_Myria3DV3.1.0.ckpt datamodule.epsg=2154 predict.src_las=/inputs/792000_6272000_subset_buildings.las