IGNF · leavauchier · Apr 18, 2024 · Apr 17, 2024 · Apr 18, 2024 · CharlesGaydon
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,7 @@
 # main
 
+Save optimized thresholds as yaml instead of pickle to make it easier to read
+
 ### 1.10.2
 - Add support for metadata propagation through compound pdal pipelines:
   - fix epsg propagation

diff --git a/configs/building_validation/optimization/default.yaml b/configs/building_validation/optimization/default.yaml
@@ -13,7 +13,7 @@ paths:
   group_info_pickle_path: ${.results_output_dir}/group_info.pickle
   prepared_las_dir: ${.results_output_dir}/prepared/
   updated_las_dir: ${.results_output_dir}/updated/
-  building_validation_thresholds_pickle: ${.results_output_dir}/optimized_thresholds.pickle # Wher
+  building_validation_thresholds: ${.results_output_dir}/optimized_thresholds.yaml # Wher
 
 # CLASSIFICATION CODES of a dataset which was inspected
 # and labeled post TerraSolid macro

diff --git a/docs/source/guides/thresholds_optimization.md b/docs/source/guides/thresholds_optimization.md
@@ -37,7 +37,7 @@ building_validation.optimization.paths.results_output_dir=[path/to/save/results]
 
 ### Evaluation of optimized thresholds on a test set
 
-Once an optimal solution was found, you may want to evaluate the decision process on unseen data to evaluate generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the pickled decision thresholds from the previous step, and specify a different `results_output_dir` so that prepared data of test and val test are not pooled together.
+Once an optimal solution was found, you may want to evaluate the decision process on unseen data to evaluate generalization capability. For that, you will need another test folder of corrected data in the same format as before (a different `input_las_dir`). You need to specify that no optimization is required using the `todo` params. You also need to give the path to the decision thresholds file (yaml file) from the previous step, and specify a different `results_output_dir` so that prepared data of test and val test are not pooled together.
 
 
 ```bash
@@ -48,7 +48,7 @@ python lidar_prod/run.py \
 building_validation.optimization.todo='prepare+evaluate+update' \
 building_validation.optimization.paths.input_las_dir=[path/to/labelled/test/dataset/] \
 building_validation.optimization.paths.results_output_dir=[path/to/save/results] \
-building_validation.optimization.paths.building_validation_thresholds_pickle=[path/to/optimized_thresholds.pickle]
+building_validation.optimization.paths.building_validation_thresholds=[path/to/optimized_thresholds.yaml]
 ```
 
 ### Utils
@@ -57,4 +57,4 @@ Debug mode: to run on a single file during development, add a `+building_validat
 
 
 Reference:
-- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017)).
+- [Deb et al. (2002) - A fast and elitist multiobjective genetic algorithm\: NSGA-II](https://ieeexplore.ieee.org/document/996017).
diff --git a/lidar_prod/tasks/building_validation.py b/lidar_prod/tasks/building_validation.py
@@ -9,6 +9,7 @@
 import geopandas
 import numpy as np
 import pdal
+import yaml
 from tqdm import tqdm
 
 from lidar_prod.tasks.utils import (
@@ -378,3 +379,14 @@ class thresholds:
     min_frac_refutation: float
     min_entropy_uncertainty: float
     min_frac_entropy_uncertain: float
+
+    def dump(self, filename: str):
+        with open(filename, "w") as f:
+            yaml.safe_dump(self.__dict__, f)
+
+    @staticmethod
+    def load(filename: str):
+        with open(filename, "r") as f:
+            data = yaml.safe_load(f)
+
+        return thresholds(**data)
diff --git a/lidar_prod/tasks/building_validation_optimization.py b/lidar_prod/tasks/building_validation_optimization.py
@@ -185,22 +185,22 @@ def evaluate(self) -> dict:
 
         """
         clusters = self._load_clusters()
-        self._set_thresholds_from_pickle_if_available()
+        self._set_thresholds_from_file_if_available()
         decisions = np.array([self.bv._make_group_decision(c) for c in clusters])
         mts_gt = np.array([c.target for c in clusters])
         metrics_dict = self.evaluate_decisions(mts_gt, decisions)
         log.info(f"\n Results:\n{self._get_results_logs_str(metrics_dict)}")
         return metrics_dict
 
-    def _set_thresholds_from_pickle_if_available(self):
+    def _set_thresholds_from_file_if_available(self):
         try:
-            with open(self.paths.building_validation_thresholds_pickle, "rb") as f:
-                self.bv.thresholds = pickle.load(f)
+            self.bv.thresholds = thresholds.load(self.paths.building_validation_thresholds)
+
         except FileNotFoundError:
             warnings.warn(
                 "Using default thresholds from hydra config to perform decisions. "
-                "You may want to specify different thresholds via a pickled object by specifying "
-                "building_validation.optimization.paths.building_validation_thresholds_pickle",
+                "You may want to specify different thresholds via a yaml file by specifying "
+                "building_validation.optimization.paths.building_validation_thresholds",
                 UserWarning,
             )
 
@@ -213,7 +213,7 @@ def update(self):
 
         """
         log.info(f"Updated las will be saved in {self.paths.results_output_dir}")
-        self._set_thresholds_from_pickle_if_available()
+        self._set_thresholds_from_file_if_available()
         for prepared_las_path, target_las_path in tqdm(
             zip(self.prepared_las_filepaths, self.out_las_filepaths),
             total=len(self.prepared_las_filepaths),
@@ -354,11 +354,10 @@ def _select_best_rules(self, study):
         best_rules = thresholds(**best.params)
         return best_rules
 
-    def _dump_best_rules(self, best_trial_params):
-        """Serializes best thresholds."""
-        with open(self.paths.building_validation_thresholds_pickle, "wb") as f:
-            pickle.dump(best_trial_params, f)
-            log.info(f"Pickled best params to {self.paths.building_validation_thresholds_pickle}")
+    def _dump_best_rules(self, best_trial_params: thresholds):
+        """Saves best thresholds to a yaml file."""
+        best_trial_params.dump(self.paths.building_validation_thresholds)
+        log.info(f"Saved best params to {self.paths.building_validation_thresholds}")
 
     def _dump_clusters(self, clusters):
         """Serializes the list of cluster-level information objects."""

diff --git a/tests/files/building_optimization_data/preds/874000_6617000.subset.laz b/tests/files/building_optimization_data/preds/874000_6617000.subset.laz
diff --git a/tests/files/building_optimization_data/preds/876000_6617000.subset.laz b/tests/files/building_optimization_data/preds/876000_6617000.subset.laz
diff --git a/tests/lidar_prod/tasks/test_building_validation.py b/tests/lidar_prod/tasks/test_building_validation.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pytest
 
-from lidar_prod.tasks.building_validation import BuildingValidator
+from lidar_prod.tasks.building_validation import BuildingValidator, thresholds
 from lidar_prod.tasks.utils import BDUniConnectionParams, get_las_data_from_las
 from tests.conftest import (
     check_expected_classification,
@@ -171,3 +171,24 @@ def test_run(hydra_cfg):
             dims.candidate_buildings_flag,
         ],
     )
+
+
+def test_thresholds():
+    dump_file = str(TMP_DIR / "threshold_dump.yml")
+
+    th = thresholds(
+        min_confidence_confirmation=0.1,
+        min_frac_confirmation=0.2,
+        min_frac_confirmation_factor_if_bd_uni_overlay=0.3,
+        min_uni_db_overlay_frac=0.4,
+        min_confidence_refutation=0.5,
+        min_frac_refutation=0.6,
+        min_entropy_uncertainty=0.7,
+        min_frac_entropy_uncertain=0.8,
+    )
+
+    th.dump(dump_file)
+
+    th1 = th.load(dump_file)
+
+    assert th1 == th
diff --git a/tests/lidar_prod/tasks/test_building_validation_optimization.py b/tests/lidar_prod/tasks/test_building_validation_optimization.py
@@ -0,0 +1,47 @@
+import os
+import shutil
+from pathlib import Path
+
+import hydra
+
+from lidar_prod.tasks.building_validation import thresholds
+from lidar_prod.tasks.building_validation_optimization import (
+    BuildingValidationOptimizer,
+)
+from lidar_prod.tasks.utils import BDUniConnectionParams
+
+TMP_DIR = Path("tmp/lidar_prod/tasks/building_validation_optimization")
+
+
+def setup_module(module):
+    try:
+        shutil.rmtree(TMP_DIR)
+    except FileNotFoundError:
+        pass
+    TMP_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def test_BuildingValidationOptimizer_run(hydra_cfg):
+    config = hydra_cfg.copy()
+    opt_cfg = config.building_validation.optimization
+    opt_cfg.paths.input_las_dir = "tests/files/building_optimization_data/preds"
+    opt_cfg.paths.results_output_dir = str(TMP_DIR / "run")
+
+    bvo: BuildingValidationOptimizer = hydra.utils.instantiate(
+        config.building_validation.optimization
+    )
+
+    bd_uni_connection_params: BDUniConnectionParams = hydra.utils.instantiate(
+        hydra_cfg.bd_uni_connection_params
+    )
+    bvo.bv.bd_uni_connection_params = bd_uni_connection_params
+    bvo.run()
+
+    th_yaml = opt_cfg.paths.building_validation_thresholds
+
+    assert os.path.isfile(th_yaml)
+    assert isinstance(thresholds.load(th_yaml), thresholds)
+
+    for filename in os.listdir(opt_cfg.paths.input_las_dir):
+        assert (TMP_DIR / "run" / "prepared" / filename).is_file()
+        assert (TMP_DIR / "run" / "updated" / filename).is_file()