From e4b772876086ba084a602392aa9606cdfb2a89fd Mon Sep 17 00:00:00 2001
From: Matic Lubej
Date: Tue, 7 Nov 2023 16:16:28 +0100
Subject: [PATCH 01/10] Use trigger API to build internal docker (#297)

* use trigger API to build internal docker instead

* fix github action

---------

Co-authored-by: Matic Lubej
---
 .github/workflows/ci_action.yml  | 17 +++++++++++++++++
 .github/workflows/ci_trigger.yml | 30 ++++++++++--------------------
 .gitlab-ci.yml                   | 18 ------------------
 3 files changed, 27 insertions(+), 38 deletions(-)
 delete mode 100644 .gitlab-ci.yml

diff --git a/.github/workflows/ci_action.yml b/.github/workflows/ci_action.yml
index 0d012b74..c7ed56ad 100644
--- a/.github/workflows/ci_action.yml
+++ b/.github/workflows/ci_action.yml
@@ -113,3 +113,20 @@ jobs:
           files: coverage.xml
           fail_ci_if_error: true
           verbose: false
+
+  mirror-to-gitlab:
+    if: github.event_name == 'push'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v1
+      - name: Mirror + trigger CI
+        uses: SvanBoxel/gitlab-mirror-and-ci-action@master
+        with:
+          args: "https://git.sinergise.com/eo/code/eo-grow"
+        env:
+          FOLLOW_TAGS: "true"
+          GITLAB_HOSTNAME: "git.sinergise.com"
+          GITLAB_USERNAME: "github-action"
+          GITLAB_PASSWORD: ${{ secrets.GITLAB_PASSWORD }}
+          GITLAB_PROJECT_ID: "878"
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/ci_trigger.yml b/.github/workflows/ci_trigger.yml
index 24a21073..5bdd9419 100644
--- a/.github/workflows/ci_trigger.yml
+++ b/.github/workflows/ci_trigger.yml
@@ -1,29 +1,19 @@
-name: mirror_and_trigger
+name: trigger

 on:
-  pull_request:
-  push:
-    branches:
-      - "master"
-      - "develop"
-  workflow_call:
   release:
     types:
       - published

 jobs:
-  mirror-to-gitlab:
+  trigger:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v1
-      - name: Mirror + trigger CI
-        uses: SvanBoxel/gitlab-mirror-and-ci-action@master
-        with:
-          args: "https://git.sinergise.com/eo/code/eo-grow"
-        env:
-          FOLLOW_TAGS: "true"
-          GITLAB_HOSTNAME: "git.sinergise.com"
-          GITLAB_USERNAME: "github-action"
-          GITLAB_PASSWORD: ${{ secrets.GITLAB_PASSWORD }}
-          GITLAB_PROJECT_ID: "878"
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Trigger API
+        run: >
+          curl -X POST --fail \
+          -F token=${{ secrets.GITLAB_PIPELINE_TRIGGER_TOKEN }} \
+          -F ref=main \
+          -F variables[CUSTOM_RUN_TAG]=auto \
+          -F variables[LAYER_NAME]=dotai-eo \
+          https://git.sinergise.com/api/v4/projects/1031/trigger/pipeline
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
deleted file mode 100644
index 9f2b3c0a..00000000
--- a/.gitlab-ci.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-image: python:3.9
-
-stages:
-  - build
-
-build_docker_image:
-  stage: build
-  needs: []
-  rules:
-    - if: $CI_COMMIT_TAG # run only on releases
-      when: always
-      variables:
-        CUSTOM_RUN_TAG: auto # this will create images with the latest tag and the version tag
-        LAYER_NAME: dotai-eo
-    - when: manual
-  trigger:
-    project: eo/infra/docker
-  allow_failure: true
From f736a8bfe7b26c099b76fe73c59e333469b701f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com>
Date: Fri, 10 Nov 2023 12:47:39 +0100
Subject: [PATCH 02/10] Switch from ray worker types to a full resource request
 specification (#298)

* add option to pass full resources

* add dot in docstring

* forgot about type style restrictions

* forgot to add factory
---
 eogrow/core/pipeline.py |  3 +--
 eogrow/core/schemas.py  | 10 +++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/eogrow/core/pipeline.py b/eogrow/core/pipeline.py
index 5e8a2583..4552b89c 100644
--- a/eogrow/core/pipeline.py
+++ b/eogrow/core/pipeline.py
@@ -175,8 +175,7 @@ def run_execution(
         extra_kwargs = {}
         if execution_kind is ProcessingType.RAY:
             executor_class = RayExecutor
-            if self.config.ray_worker_type is not None:
-                extra_kwargs = {"ray_remote_kwargs": {"resources": {self.config.ray_worker_type: 0.001}}}
+            extra_kwargs = {"ray_remote_kwargs": self.config.ray_remote_kwargs}
         else:
             executor_class = EOExecutor
             executor_run_params["workers"] = self.config.workers
diff --git a/eogrow/core/schemas.py b/eogrow/core/schemas.py
index 804caa4f..4ce1d2f5 100644
--- a/eogrow/core/schemas.py
+++ b/eogrow/core/schemas.py
@@ -8,7 +8,7 @@
 from __future__ import annotations

 from inspect import isclass
-from typing import List, Optional, Union
+from typing import Any, Dict, List, Optional, Union

 from pydantic import BaseModel, Field
 from pydantic.fields import ModelField
@@ -48,11 +48,11 @@ class PipelineSchema(BaseSchema):
     workers: int = Field(
         1, description="Number of workers for parallel execution of workflows. Parameter does not affect ray clusters."
     )
-    ray_worker_type: Optional[str] = Field(
+    ray_remote_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
         description=(
-            "Restricts execution of parallelized tasks only to `ray` worker instances of the requested type. The worker"
-            " section of the `cluster.yaml` file should specify the custom resource with a matching name and the value"
-            " set to 1."
+            "Keyword arguments passed to ray tasks when executing via `RayExecutor`. The options are specified [here]"
+            "(https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html)."
         ),
     )
     use_ray: BoolOrAuto = Field(
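Note: the removed `ray_worker_type` option expanded internally to a fractional custom-resource request, so existing configs can be migrated by hand. A minimal sketch of the replacement config fragment — the `my-worker` resource name and the `num_cpus` value are illustrative, and any custom resource still has to be declared in the worker section of the cluster's `cluster.yaml`:

    {
      "ray_remote_kwargs": {
        "num_cpus": 2,
        "resources": { "my-worker": 0.001 }
      }
    }

The `0.001` mirrors the value the old code requested, so tasks are still routed only to workers that expose the resource, without reserving a full unit of it.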
From 4f8d2cb971d820a2c821ee74f1f6c2a2218b65db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com>
Date: Fri, 10 Nov 2023 13:01:15 +0100
Subject: [PATCH 03/10] make logging a tiny bit more customizable (#299)

---
 eogrow/core/logging.py  | 24 +++++++++++-------------
 eogrow/core/pipeline.py |  5 +++--
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/eogrow/core/logging.py b/eogrow/core/logging.py
index f63c6029..7c2bc2ab 100644
--- a/eogrow/core/logging.py
+++ b/eogrow/core/logging.py
@@ -117,7 +117,7 @@ def get_pipeline_logs_folder(self, pipeline_execution_name: str, full_path: bool
             return join_path(main_logs_folder, pipeline_execution_name)
         return fs.path.combine(main_logs_folder, pipeline_execution_name)

-    def start_logging(self, pipeline_execution_name: str) -> list[Handler]:
+    def start_logging(self, logger: logging.Logger, pipeline_execution_name: str, filename: str) -> list[Handler]:
         """Creates a folder for logs and sets up (and returns) logging handlers

         Supported handlers:
@@ -129,22 +129,21 @@ def start_logging(self, pipeline_execution_name: str) -> list[Handler]:
         self.storage.filesystem.makedirs(logs_folder, recreate=True)
         self._add_cluster_config_to_logs(logs_folder)

-        global_logger = logging.getLogger()
-        global_logger.setLevel(logging.DEBUG)
+        logger.setLevel(logging.DEBUG)

-        for default_handler in global_logger.handlers:
+        for default_handler in logger.handlers:
             default_handler.setLevel(logging.WARNING)

         new_handlers: list[Handler] = []

         if self.config.save_logs:
-            file_handler = self._create_file_handler(pipeline_execution_name)
-            global_logger.addHandler(file_handler)
+            file_handler = self._create_file_handler(pipeline_execution_name, filename)
+            logger.addHandler(file_handler)
             new_handlers.append(file_handler)

         if self.config.show_logs:
             stdout_handler = self._create_stdout_handler()
-            global_logger.addHandler(stdout_handler)
+            logger.addHandler(stdout_handler)
             new_handlers.append(stdout_handler)

         if self.config.capture_warnings:
@@ -160,9 +159,9 @@ def _add_cluster_config_to_logs(self, logs_folder: str) -> None:
             os_fs = OSFS(os_folder)  # the file is on the head node, might not be visible in storage.filesystem
             fs.copy.copy_file(os_fs, os_file, self.storage.filesystem, fs.path.join(logs_folder, "cluster.yaml"))

-    def _create_file_handler(self, pipeline_execution_name: str) -> Handler:
+    def _create_file_handler(self, pipeline_execution_name: str, filename: str) -> Handler:
         """Creates a logging handler to write a pipeline log to a file."""
-        logs_filename = fs.path.combine(self.get_pipeline_logs_folder(pipeline_execution_name), "pipeline.log")
+        logs_filename = fs.path.combine(self.get_pipeline_logs_folder(pipeline_execution_name), filename)
         file_handler = RegularBackupHandler(
             logs_filename,
             filesystem=self.storage.filesystem,
@@ -194,19 +193,18 @@ def _create_stdout_handler(self) -> Handler:

         return stdout_handler

-    def stop_logging(self, handlers: list[Handler]) -> None:
+    def stop_logging(self, logger: logging.Logger, handlers: list[Handler]) -> None:
         """Updates logs, removes pipeline handlers from the global logger and puts global logging level back to
         default"""
         if self.config.capture_warnings:
             logging.captureWarnings(False)

-        global_logger = logging.getLogger()
         for handler in handlers:
             handler.close()
-            global_logger.removeHandler(handler)
+            logger.removeHandler(handler)

-        global_logger.setLevel(logging.WARNING)
+        logger.setLevel(logging.WARNING)

     def update_pipeline_report(
         self,
diff --git a/eogrow/core/pipeline.py b/eogrow/core/pipeline.py
index 4552b89c..e20fafa9 100644
--- a/eogrow/core/pipeline.py
+++ b/eogrow/core/pipeline.py
@@ -222,7 +222,8 @@ def run(self) -> None:
         timestamp = current_timestamp()
         self.current_execution_name = self.get_pipeline_execution_name(timestamp)

-        handlers = self.logging_manager.start_logging(self.current_execution_name)
+        root_logger = logging.getLogger()
+        handlers = self.logging_manager.start_logging(root_logger, self.current_execution_name, "pipeline.log")
         try:
             self.logging_manager.update_pipeline_report(
                 pipeline_execution_name=self.current_execution_name,
@@ -259,7 +260,7 @@ def run(self) -> None:
                 pipeline_execution_name=self.current_execution_name, finished=finished, failed=failed
             )
         finally:
-            self.logging_manager.stop_logging(handlers)
+            self.logging_manager.stop_logging(root_logger, handlers)

     def run_procedure(self) -> tuple[list[str], list[str]]:
         """Execution procedure of pipeline. Can be overridden if needed.
From 2a7d14acd0f311ba51e663ab913afc8ffda83791 Mon Sep 17 00:00:00 2001
From: Matic Lubej
Date: Tue, 14 Nov 2023 09:42:26 +0100
Subject: [PATCH 04/10] remove "whether to" from docs (#301)

Co-authored-by: Matic Lubej
---
 eogrow/core/schemas.py               | 4 ++--
 eogrow/core/storage.py               | 2 +-
 eogrow/pipelines/batch_to_eopatch.py | 4 +---
 eogrow/pipelines/byoc.py             | 2 +-
 eogrow/pipelines/download.py         | 2 +-
 eogrow/pipelines/prediction.py       | 2 +-
 eogrow/tasks/spatial.py              | 2 +-
 eogrow/utils/filter.py               | 4 ++--
 eogrow/utils/general.py              | 2 +-
 9 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/eogrow/core/schemas.py b/eogrow/core/schemas.py
index 4ce1d2f5..7c446121 100644
--- a/eogrow/core/schemas.py
+++ b/eogrow/core/schemas.py
@@ -58,7 +58,7 @@ class PipelineSchema(BaseSchema):
     use_ray: BoolOrAuto = Field(
         "auto",
         description=(
-            "Whether to run the pipeline locally or using a (local or remote) ray cluster. When using `auto` the"
+            "Run the pipeline locally or using a (local or remote) ray cluster. When using `auto` the"
             " pipeline checks if it can connect to a cluster, and if none are available runs locally."
         ),
     )
@@ -77,7 +77,7 @@ class PipelineSchema(BaseSchema):
         ),
     )
     raise_on_temporal_mismatch: bool = Field(
-        False, description="Whether to treat `TemporalDimensionWarning` as an exception during EOExecution."
+        False, description="Treat `TemporalDimensionWarning` as an exception during EOExecution."
     )
diff --git a/eogrow/core/storage.py b/eogrow/core/storage.py
index ade608fc..d6b42b2d 100644
--- a/eogrow/core/storage.py
+++ b/eogrow/core/storage.py
@@ -42,7 +42,7 @@ class Schema(ManagerSchema, BaseSettings):
         geopandas_backend: Literal["fiona", "pyogrio"] = Field(
             "fiona", description="Which backend is used for IO operations when using geopandas."
         )
-        use_zarr: bool = Field(False, description="Whether to use the Zarr backend for EOPatch IO.")
+        use_zarr: bool = Field(False, description="Use the Zarr backend for EOPatch IO.")

         class Config(ManagerSchema.Config):
             case_sensitive = True
diff --git a/eogrow/pipelines/batch_to_eopatch.py b/eogrow/pipelines/batch_to_eopatch.py
index e6da9dfb..37ac0887 100644
--- a/eogrow/pipelines/batch_to_eopatch.py
+++ b/eogrow/pipelines/batch_to_eopatch.py
@@ -77,9 +77,7 @@ def check_nonempty_input(cls, value: list, values: RawSchemaDict) -> list:
             ), "At least one of `userdata_feature_name`, `userdata_timestamp_reader`, or `mapping` has to be set."
             return value

-    remove_batch_data: bool = Field(
-        True, description="Whether to remove the raw batch data after the conversion is complete"
-    )
+    remove_batch_data: bool = Field(True, description="Remove the raw batch data after the conversion is complete")

     config: Schema
diff --git a/eogrow/pipelines/byoc.py b/eogrow/pipelines/byoc.py
index 64323e76..e0850477 100644
--- a/eogrow/pipelines/byoc.py
+++ b/eogrow/pipelines/byoc.py
@@ -74,7 +74,7 @@ def _parse_sensing_time(cls, value: Optional[str], values: Dict[str, object]) ->
     cover_geometry: Optional[str] = Field(description="Specifies a geometry file describing the cover geometry.")
     _ensure_cover_geometry = ensure_defined_together("cover_geometry_folder_key", "cover_geometry")

-    reingest_existing: bool = Field(False, description="Whether to reingest or skip already ingested tiles.")
+    reingest_existing: bool = Field(False, description="Reingest or skip already ingested tiles.")

     config: Schema
diff --git a/eogrow/pipelines/download.py b/eogrow/pipelines/download.py
index f97414bd..89a59fea 100644
--- a/eogrow/pipelines/download.py
+++ b/eogrow/pipelines/download.py
@@ -242,7 +242,7 @@ class Schema(BaseDownloadPipeline.Schema, CommonDownloadFields, TimeDependantFie
         bands: Optional[List[str]] = Field(description="Names of bands to download")
         additional_data: List[Feature] = Field(default_factory=list, description="Additional data to download")
         use_dn: bool = Field(
-            False, description="Whether to save bands as float32 reflectance (default), or int16 digital numbers."
+            False, description="Save bands as float32 reflectance (default), or int16 digital numbers."
         )

     config: Schema
diff --git a/eogrow/pipelines/prediction.py b/eogrow/pipelines/prediction.py
index c665da34..65f53300 100644
--- a/eogrow/pipelines/prediction.py
+++ b/eogrow/pipelines/prediction.py
@@ -137,7 +137,7 @@ class Schema(BasePredictionPipeline.Schema):
         output_feature_name: str
         model_filename: str = Field(description="A filename of a regression model to be used for prediction.")
         clip_predictions: Optional[Tuple[float, float]] = Field(
-            description="Whether to clip values of predictions to specified interval"
+            description="Clip values of predictions to specified interval"
         )

     config: Schema
diff --git a/eogrow/tasks/spatial.py b/eogrow/tasks/spatial.py
index 0646662f..11bb127a 100644
--- a/eogrow/tasks/spatial.py
+++ b/eogrow/tasks/spatial.py
@@ -208,7 +208,7 @@ def get_array_slices(
     :param slice_bbox: A bounding box of array to be sliced.
     :param resolution: A working resolution in CRS units.
     :param size: A working size.
-    :param raise_misaligned: Whether to raise an error if the slice would be pixel misaligned the initial array.
+    :param raise_misaligned: Raise an error if the slice would be pixel misaligned the initial array.
     :param limit_x: If provided it will clip the horizontal slice to a given interval, should be used to clip slice_bbox
         to bbox.
     :param limit_y: If provided it will clip the vertical slice to a given interval, should be used to clip slice_bbox
diff --git a/eogrow/utils/filter.py b/eogrow/utils/filter.py
index bcdee236..6cd4dd94 100644
--- a/eogrow/utils/filter.py
+++ b/eogrow/utils/filter.py
@@ -54,8 +54,8 @@ def get_patches_with_missing_features(
     :param patches_folder: A path to folder with EOPatches, relative to `filesystem` object.
     :param patch_list: A list of EOPatch names.
     :param features: A list of EOPatch features.
-    :param check_bbox: Whether to make sure that the bbox is present.
-    :param check_timestamps: Whether to make sure that the timestamps are present.
+    :param check_bbox: Make sure that the bbox is present.
+    :param check_timestamps: Make sure that the timestamps are present.
     :return: A sublist of `patch_list` with only EOPatch names that are missing some features.
     """
     eopatch_paths = [fs.path.combine(patches_folder, eopatch) for eopatch, _ in patch_list]
diff --git a/eogrow/utils/general.py b/eogrow/utils/general.py
index 67610d41..4883a6f4 100644
--- a/eogrow/utils/general.py
+++ b/eogrow/utils/general.py
@@ -36,7 +36,7 @@ def convert_to_int(values: np.ndarray, raise_diff: bool, error: float = 1e-8) ->
     """Converts an array of floats into array of integers.

     :param values: An array of float values to be converted.
-    :param raise_diff: Whether to raise an error if float values differ from integer values for more than the expected
+    :param raise_diff: Raise an error if float values differ from integer values for more than the expected
         error.
     :param error: A joined maximal expected numerical error.
     """

From 19fd7137412ac2a9c7ef8f094cdfa390feb4fd02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com>
Date: Tue, 14 Nov 2023 10:46:32 +0100
Subject: [PATCH 05/10] Minor revisions (#302)

* update pre-commit

* remove some redundant ignores
---
 .pre-commit-config.yaml      |  6 ++--
 eogrow/core/area/base.py     |  2 +-
 eogrow/pipelines/sampling.py |  6 ++--
 eogrow/pipelines/testing.py  |  6 ++--
 pyproject.toml               | 68 +++++++++++++++++-------------------
 5 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6ee9de27..15eb88f0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,20 +13,20 @@ repos:
       - id: debug-statements

   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: "v3.0.3"
+    rev: "v3.1.0"
    hooks:
       - id: prettier
         exclude: "tests/(test_stats|test_project)/"
         types_or: [json]

   - repo: https://github.com/psf/black
-    rev: 23.10.1
+    rev: 23.11.0
     hooks:
       - id: black
         language_version: python3

   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: "v0.1.4"
+    rev: "v0.1.5"
     hooks:
       - id: ruff
diff --git a/eogrow/core/area/base.py b/eogrow/core/area/base.py
index 76d4c3bf..d889557a 100644
--- a/eogrow/core/area/base.py
+++ b/eogrow/core/area/base.py
@@ -114,7 +114,7 @@ def _save_grid(self, grid: dict[CRS, gpd.GeoDataFrame], grid_path: str) -> None:
         LOGGER.info("Saving grid to %s", grid_path)

         with LocalFile(grid_path, mode="w", filesystem=self.storage.filesystem) as local_file:
-            for _, crs_grid in grid.items():
+            for crs_grid in grid.values():
                 crs_grid.to_file(
                     local_file.path,
                     driver="GPKG",
diff --git a/eogrow/pipelines/sampling.py b/eogrow/pipelines/sampling.py
index f5f90f93..57e4ff8e 100644
--- a/eogrow/pipelines/sampling.py
+++ b/eogrow/pipelines/sampling.py
@@ -95,7 +95,7 @@ def _get_loading_node(self) -> EONode:
                 raise ValueError(f"Only spatial features can be sampled, but found {feature_type}: {feature_names}")

             for feature_name in feature_names:
-                load_features.append((feature_type, feature_name))
+                load_features.append((feature_type, feature_name))  # noqa: PERF401

         load_task = LoadTask(
             self.storage.get_folder(folder_name),
@@ -118,7 +118,7 @@ def _get_sampling_node(self, previous_node: EONode) -> EONode:
     def _get_features_to_sample(self) -> list[tuple[FeatureType, str, str]]:
         """Get a list of features that will be sampled, together with their new names"""
         features_to_sample = []
-        for _, features in self.config.apply_to.items():
+        for features in self.config.apply_to.values():
             for feature_type, feature_names in features.items():
                 for feature_name in feature_names:
                     if self.config.sampled_suffix is None:
@@ -169,7 +169,7 @@ def get_execution_arguments(self, workflow: EOWorkflow, patch_list: PatchList) -

         generator = np.random.default_rng(seed=self.config.seed)

-        for _, patch_args in exec_args.items():
+        for patch_args in exec_args.values():
             patch_args[sampling_node] = dict(seed=generator.integers(low=0, high=2**32))

         return exec_args
diff --git a/eogrow/pipelines/testing.py b/eogrow/pipelines/testing.py
index bd261b22..76b19248 100644
--- a/eogrow/pipelines/testing.py
+++ b/eogrow/pipelines/testing.py
@@ -161,14 +161,14 @@ def get_execution_arguments(self, workflow: EOWorkflow, patch_list: PatchList) -
         for node, node_seed in per_node_seeds.items():
             if isinstance(node.task, CreateEOPatchTask):
-                for _, patch_args in exec_args.items():
+                for patch_args in exec_args.values():
                     patch_args[node]["meta_info"] = self.config.meta_info
             if isinstance(node.task, GenerateTimestampsTask) and same_timestamps:
-                for _, patch_args in exec_args.items():
+                for patch_args in exec_args.values():
                     patch_args[node] = dict(seed=node_seed)
             elif isinstance(node.task, (GenerateRasterFeatureTask, GenerateTimestampsTask)):
                 node_rng = np.random.default_rng(seed=node_seed)
-                for _, patch_args in exec_args.items():
+                for patch_args in exec_args.values():
                     patch_args[node] = dict(seed=node_rng.integers(low=0, high=2**32))

         return exec_args
diff --git a/pyproject.toml b/pyproject.toml
index e7627035..bd2ab203 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,11 +60,7 @@ dependencies = [
 ]

 [project.optional-dependencies]
-ml = [
-    "joblib",
-    "lightgbm>=3.0.0",
-    "scikit-learn",
-]
+ml = ["joblib", "lightgbm>=3.0.0", "scikit-learn"]
 docs = [
     "autodoc_pydantic",
     "nbsphinx",
@@ -118,38 +114,39 @@ preview = true
 line-length = 120
 target-version = "py38"
 select = [
-    "F", # pyflakes
-    "E", # pycodestyle
-    "W", # pycodestyle
-    "C90", # mccabe
-    "N", # naming
-    "YTT", # flake-2020
-    "B", # bugbear
-    "A", # built-ins
-    "COM", # commas
-    "C4", # comprehensions
-    "T10", # debugger statements
-    "ISC", # implicit string concatenation
-    "ICN", # import conventions
-    "G", # logging format
-    "PIE", # flake8-pie
-    "T20", # print statements
-    "PT", # pytest style
-    "RET", # returns
-    "SLF", # private member access
-    "SIM", # simplifications
-    "ARG", # unused arguments
-    "PD", # pandas
-    "PGH", # pygrep hooks (useless noqa comments, eval statements etc.)
-    "FLY", # flynt
-    "RUF", # ruff rules
-    "NPY", # numpy
-    "I", # isort
+    "F",    # pyflakes
+    "E",    # pycodestyle
+    "W",    # pycodestyle
+    "C90",  # mccabe
+    "I",    # isort
+    "N",    # naming
+    "YTT",  # flake-2020
+    "B",    # bugbear
+    "A",    # built-ins
+    "COM",  # commas
+    "C4",   # comprehensions
+    "T10",  # debugger statements
+    "ISC",  # implicit string concatenation
+    "ICN",  # import conventions
+    "G",    # logging format
+    "PIE",  # flake8-pie
+    "T20",  # print statements
+    "PT",   # pytest style
+    "RET",  # returns
+    "SLF",  # private member access
+    "SIM",  # simplifications
+    "ARG",  # unused arguments
+    "PD",   # pandas
+    "PGH",  # pygrep hooks (useless noqa comments, eval statements etc.)
+    "FLY",  # flynt
+    "NPY",  # numpy
+    "PERF", # perflint, performance improvements
+    "RUF",  # ruff rules
 ]
 fix = true
 fixable = [
-    "I", # sort imports
-    "F401", # remove redundant imports
+    "I",    # sort imports
+    "F401", # remove redundant imports
 ]
 ignore = [
     "C408", # complains about `dict()` calls, we use them to avoid too many " in the code
     "A003", # complains when ATTRIBUTES shadow builtins, we have objects that implement `filter` and such
     "COM812", # trailing comma missing, fights with black
     "PD011", # suggests `.to_numpy` instead of `.values`, also does this for non-pandas objects...
-    "N805", # complains about first arguments not being self, but pydantic validators are like that...
     # potentially fixable
     "B904", # want `raise ... from None` instead of just `raise ...`
     "B028", # always demands a stacklevel argument when warning
     "PT011", # complains for `pytest.raises(ValueError)` but we use it a lot
-    "UP024", # wants to switch IOError with OSError
 ]
 per-file-ignores = { "__init__.py" = ["F401"] }
 exclude = [".git", "__pycache__", "build", "dist"]
From 0368c674778bbb289879ca083a07a91296708c27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com>
Date: Wed, 15 Nov 2023 11:01:54 +0100
Subject: [PATCH 06/10] Accept ray as the one and only saviour of our souls
 (#300)

* adjust core functions

* adjust implementations

* remove redundant functionality

* update tests to work again

* rename parameter

* Update eogrow/core/pipeline.py

Co-authored-by: Matic Lubej

* enable debug mode

* Update eogrow/pipelines/merge_samples.py

Co-authored-by: Matic Lubej

* Update eogrow/core/schemas.py

Co-authored-by: Matic Lubej

* some minor polish

* satisfy mypy

---------

Co-authored-by: Matic Lubej
---
 .github/workflows/ci_action.yml               |  3 ++
 docs/source/common-configuration-patterns.md  |  2 -
 eogrow/core/pipeline.py                       | 31 +++++-------
 eogrow/core/schemas.py                        | 13 +----
 eogrow/pipelines/download.py                  | 25 +++-------
 eogrow/pipelines/merge_samples.py             | 37 ++++++++++++--
 eogrow/pipelines/prediction.py                |  7 ---
 eogrow/tasks/prediction.py                    | 14 ++----
 eogrow/types.py                               | 11 +----
 eogrow/utils/ray.py                           | 29 -----------
 .../download_and_batch/download_custom.json   |  3 +-
 .../download_custom_collection.json           |  3 +-
 .../download_custom_raise.json                |  3 +-
 .../download_and_batch/download_dem.json      |  3 +-
 .../download_l1c_q1_dn.json                   |  3 +-
 .../download_l1c_q1_dn_rescaled.json          |  3 +-
 .../download_l1c_yearly.json                  |  1 -
 .../download_and_batch/download_q3.json       |  3 +-
 .../export_maps/export_maps_data.json         |  3 +-
 .../export_maps_data_compressed.json          |  1 -
 .../export_maps/export_maps_mask.json         |  3 +-
 .../export_maps_mask_local_copy.json          |  3 +-
 .../merge_samples/merge_features_samples.json |  2 +-
 .../merge_reference_samples.json              |  2 +-
 .../other/simple_config.json                  |  1 -
 .../rasterize_feature_with_resolution.json    |  1 -
 .../sampling/sampling_block_fraction.json     |  3 +-
 .../sampling/sampling_block_number.json       |  3 +-
 .../sampling/sampling_chain.json              |  3 +-
 .../sampling/sampling_fraction.json           |  3 +-
 .../sampling/sampling_grid.json               |  3 +-
 tests/utils/test_ray.py                       | 48 -------------------
 32 files changed, 76 insertions(+), 197 deletions(-)
 delete mode 100644 tests/utils/test_ray.py

diff --git a/.github/workflows/ci_action.yml b/.github/workflows/ci_action.yml
index c7ed56ad..1127a1bd 100644
--- a/.github/workflows/ci_action.yml
+++ b/.github/workflows/ci_action.yml
@@ -90,6 +90,9 @@ jobs:
           pip install -e .[DEV,ML]
           pip install gdal==$(gdal-config --version)

+      - name: Set up local cluster
+        run: ray start --head
+
       - name: Run fast tests
         if: ${{ !matrix.full_test_suite }}
         run: pytest -m "not integration"
diff --git a/docs/source/common-configuration-patterns.md b/docs/source/common-configuration-patterns.md
index 9abd4eec..ef3c094f 100644
--- a/docs/source/common-configuration-patterns.md
+++ b/docs/source/common-configuration-patterns.md
@@ -9,8 +9,6 @@ Invoking `eogrow-template "eogrow.pipelines.zipmap.ZipMapPipeline" "zipmap.json"
 {
   "pipeline": "eogrow.pipelines.zipmap.ZipMapPipeline",
   "pipeline_name": "<< Optional[str] >>",
-  "workers": "<< 1 : int >>",
-  "use_ray": "<< 'auto' : Union[Literal['auto'], bool] >>",
   "input_features": {
     "<< type >>": "List[InputFeatureSchema]",
     "<< nested schema >>": "",
diff --git a/eogrow/core/pipeline.py b/eogrow/core/pipeline.py
index e20fafa9..4e48fd40 100644
--- a/eogrow/core/pipeline.py
+++ b/eogrow/core/pipeline.py
@@ -7,13 +7,14 @@
 import uuid
 from typing import Any, TypeVar

+import ray
+
 from eolearn.core import CreateEOPatchTask, EOExecutor, EONode, EOWorkflow, LoadTask, SaveTask, WorkflowResults
 from eolearn.core.extra.ray import RayExecutor

-from ..types import ExecKwargs, PatchList, ProcessingType
+from ..types import ExecKwargs, PatchList
 from ..utils.general import current_timestamp
 from ..utils.meta import import_object
-from ..utils.ray import handle_ray_connection
 from .area.base import BaseAreaManager
 from .base import EOGrowObject
 from .config import RawConfig
@@ -148,13 +149,6 @@ def get_execution_arguments(self, workflow: EOWorkflow, patch_list: PatchList) -
             exec_kwargs[name] = patch_args
         return exec_kwargs

-    def _init_processing(self) -> ProcessingType:
-        """Figures out which execution mode is used and configures connection to Ray if required."""
-        is_connected = handle_ray_connection(self.config.use_ray)
-        if is_connected:
-            return ProcessingType.RAY
-        return ProcessingType.MULTI if self.config.workers > 1 else ProcessingType.SINGLE
-
     def run_execution(
         self,
         workflow: EOWorkflow,
@@ -169,18 +163,15 @@ def run_execution(
             self.patch_list will be used
         :return: Lists of successfully/unsuccessfully executed EOPatch names and the result of the EOWorkflow execution
         """
-        executor_class: type[EOExecutor]
-
-        execution_kind = self._init_processing()
-        extra_kwargs = {}
-        if execution_kind is ProcessingType.RAY:
-            executor_class = RayExecutor
-            extra_kwargs = {"ray_remote_kwargs": self.config.ray_remote_kwargs}
+        if self.config.debug:
+            executor_class: type[EOExecutor] = EOExecutor
+            executor_kwargs = {}
         else:
-            executor_class = EOExecutor
-            executor_run_params["workers"] = self.config.workers
+            ray.init(address="auto", ignore_reinit_error=True)
+            executor_class = RayExecutor
+            executor_kwargs = {"ray_remote_kwargs": self.config.ray_remote_kwargs}

-        LOGGER.info("Starting %s for %d EOPatches", executor_class.__name__, len(execution_kwargs))
+        LOGGER.info("Starting processing for %d EOPatches", len(execution_kwargs))

         # Unpacking manually to ensure order matches
         list_of_kwargs, execution_names = [], []
@@ -198,7 +189,7 @@ def run_execution(
             logs_filter=EOExecutionFilter(ignore_packages=self.logging_manager.config.eoexecution_ignore_packages),
             logs_handler_factory=EOExecutionHandler,
             raise_on_temporal_mismatch=self.config.raise_on_temporal_mismatch,
-            **extra_kwargs,
+            **executor_kwargs,
         )
         execution_results = executor.run(**executor_run_params)
diff --git a/eogrow/core/schemas.py b/eogrow/core/schemas.py
index 7c446121..0413bfa3 100644
--- a/eogrow/core/schemas.py
+++ b/eogrow/core/schemas.py
@@ -13,7 +13,7 @@
 from pydantic import BaseModel, Field
 from pydantic.fields import ModelField

-from ..types import BoolOrAuto, ImportPath
+from ..types import ImportPath
 from ..utils.validators import field_validator, validate_manager
 from .base import EOGrowObject
@@ -45,9 +45,6 @@ class PipelineSchema(BaseSchema):
     logging: ManagerSchema = Field(description="A schema of an implementation of LoggingManager class")
     validate_logging = field_validator("logging", validate_manager, pre=True)

-    workers: int = Field(
-        1, description="Number of workers for parallel execution of workflows. Parameter does not affect ray clusters."
-    )
     ray_remote_kwargs: Dict[str, Any] = Field(
         default_factory=dict,
         description=(
@@ -55,13 +52,6 @@ class PipelineSchema(BaseSchema):
             "(https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html)."
         ),
     )
-    use_ray: BoolOrAuto = Field(
-        "auto",
-        description=(
-            "Run the pipeline locally or using a (local or remote) ray cluster. When using `auto` the"
-            " pipeline checks if it can connect to a cluster, and if none are available runs locally."
-        ),
-    )

     test_subset: Optional[List[Union[int, str]]] = Field(
         description=(
@@ -79,6 +69,7 @@ class PipelineSchema(BaseSchema):
     raise_on_temporal_mismatch: bool = Field(
         False, description="Treat `TemporalDimensionWarning` as an exception during EOExecution."
     )
+    debug: bool = Field(False, description="Run pipeline without the `ray` wrapper to enable debugging.")
diff --git a/eogrow/pipelines/download.py b/eogrow/pipelines/download.py
index 89a59fea..146375b0 100644
--- a/eogrow/pipelines/download.py
+++ b/eogrow/pipelines/download.py
@@ -5,7 +5,6 @@
 import abc
 import datetime as dt
 import logging
-from contextlib import nullcontext
 from typing import Any, Callable, List, Optional, Tuple

 import fs
@@ -24,12 +23,11 @@
     SentinelHubSession,
     Unit,
 )
-from sentinelhub.download import SessionSharing, collect_shared_session

 from ..core.pipeline import Pipeline
 from ..core.schemas import BaseSchema
 from ..tasks.common import LinearFunctionTask
-from ..types import ExecKwargs, PatchList, ProcessingType, TimePeriod
+from ..types import ExecKwargs, PatchList, TimePeriod
 from ..utils.filter import get_patches_with_missing_features
 from ..utils.validators import (
     ensure_exactly_one_defined,
@@ -116,12 +114,10 @@ def _get_output_features(self) -> list[Feature]:
     def _get_download_node(self, session_loader: SessionLoaderType) -> EONode:
         """Provides node for downloading data."""

-    def _create_session_loader(self, execution_kind: ProcessingType) -> SessionLoaderType:
-        if execution_kind is ProcessingType.RAY:
-            session = SentinelHubSession(self.sh_config)
-            actor = RaySessionActor.remote(session)  # type: ignore[attr-defined]
-            return lambda: ray.get(actor.get_valid_session.remote())
-        return collect_shared_session if execution_kind is ProcessingType.MULTI else None
+    def _create_session_loader(self) -> SessionLoaderType:
+        session = SentinelHubSession(self.sh_config)
+        actor = RaySessionActor.remote(session)  # type: ignore[attr-defined]
+        return lambda: ray.get(actor.get_valid_session.remote())

     @staticmethod
     def get_postprocessing_node(postprocessing_config: PostprocessingRescale, previous_node: EONode) -> EONode:
@@ -176,18 +172,11 @@ def get_execution_arguments(self, workflow: EOWorkflow, patch_list: PatchList) -
         return exec_args

     def run_procedure(self) -> tuple[list[str], list[str]]:
-        execution_kind = self._init_processing()
-        session_loader = self._create_session_loader(execution_kind)
-
         patch_list = self.get_patch_list()
-        workflow = self.build_workflow(session_loader)
+        workflow = self.build_workflow(self._create_session_loader())
         exec_args = self.get_execution_arguments(workflow, patch_list)

-        context: SessionSharing | nullcontext = nullcontext()
-        if execution_kind is ProcessingType.MULTI:
-            context = SessionSharing(SentinelHubSession(self.sh_config))
-        with context:
-            finished, failed, _ = self.run_execution(workflow, exec_args)
+        finished, failed, _ = self.run_execution(workflow, exec_args)

         return finished, failed
diff --git a/eogrow/pipelines/merge_samples.py b/eogrow/pipelines/merge_samples.py
index c0b5da9c..67a314f1 100644
--- a/eogrow/pipelines/merge_samples.py
+++ b/eogrow/pipelines/merge_samples.py
@@ -9,10 +9,11 @@
 import numpy as np
 from pydantic import Field

-from eolearn.core import EOPatch, EOWorkflow, FeatureType, LoadTask, OutputTask, linearly_connect_tasks
+from eolearn.core import EOExecutor, EOPatch, EOWorkflow, FeatureType, LoadTask, OutputTask, linearly_connect_tasks
 from eolearn.core.types import Feature
 from eolearn.core.utils.fs import get_full_path

+from ..core.logging import EOExecutionFilter, EOExecutionHandler
 from ..core.pipeline import Pipeline
 from ..utils.validators import ensure_storage_key_presence
@@ -42,8 +43,7 @@ class Schema(Pipeline.Schema):
             )
         )
         suffix: str = Field("", description="String to append to array filenames")
-        workers: int = Field(1, description="Number of threads used to load data from EOPatches in parallel.")
-        use_ray: Literal[False] = Field(False, description="Pipeline does not parallelize properly.")
+        num_threads: int = Field(1, description="Number of threads used to load data from EOPatches in parallel.")
         skip_existing: Literal[False] = False

     config: Schema
@@ -59,9 +59,36 @@ def run_procedure(self) -> tuple[list[str], list[str]]:
         # It doesn't make sense to parallelize loading over a cluster, but it would make sense to parallelize over
         # features that have to be concatenated or, if we would concatenate into multiple files, parallelize creating
         # batches of features
-        successful, failed, results = self.run_execution(workflow, exec_args, multiprocess=False)
+        LOGGER.info("Starting processing for %d EOPatches", len(exec_args))
+
+        # Unpacking manually to ensure order matches
+        list_of_kwargs, execution_names = [], []
+        for exec_name, exec_kwargs in exec_args.items():
+            list_of_kwargs.append(exec_kwargs)
+            execution_names.append(exec_name)
+
+        executor = EOExecutor(
+            workflow,
+            list_of_kwargs,
+            execution_names=execution_names,
+            save_logs=self.logging_manager.config.save_logs,
+            logs_folder=self.logging_manager.get_pipeline_logs_folder(self.current_execution_name),
+            filesystem=self.storage.filesystem,
+            logs_filter=EOExecutionFilter(ignore_packages=self.logging_manager.config.eoexecution_ignore_packages),
+            logs_handler_factory=EOExecutionHandler,
+            raise_on_temporal_mismatch=self.config.raise_on_temporal_mismatch,
+        )
+        execution_results = executor.run(multiprocess=True, workers=self.config.num_threads)
+
+        successful = [execution_names[idx] for idx in executor.get_successful_executions()]
+        failed = [execution_names[idx] for idx in executor.get_failed_executions()]
+        LOGGER.info("EOExecutor finished with %d / %d success rate", len(successful), len(successful) + len(failed))
+
+        if self.logging_manager.config.save_logs:
+            executor.make_report(include_logs=self.logging_manager.config.include_logs_to_report)
+            LOGGER.info("Saved EOExecution report to %s", executor.get_report_path(full_path=True))

-        result_patches = [cast(EOPatch, result.outputs.get(self._OUTPUT_NAME)) for result in results]
+        result_patches = [cast(EOPatch, result.outputs.get(self._OUTPUT_NAME)) for result in execution_results]

         self.merge_and_save_features(result_patches)
diff --git a/eogrow/pipelines/prediction.py b/eogrow/pipelines/prediction.py
index 65f53300..ef910c36 100644
--- a/eogrow/pipelines/prediction.py
+++ b/eogrow/pipelines/prediction.py
@@ -66,11 +66,6 @@ class Schema(Pipeline.Schema):
     def _get_output_features(self) -> list[Feature]:
         """Lists all features that are to be saved upon the pipeline completion"""

-    @property
-    def _is_mp_lock_needed(self) -> bool:
-        """If a multiprocessing lock is needed when executing"""
-        return not self.config.use_ray and self.config.workers > 1
-
     def filter_patch_list(self, patch_list: PatchList) -> PatchList:
         """EOPatches are filtered according to existence of specified output features"""
         output_features = self._get_output_features()
@@ -154,7 +149,6 @@ def _get_prediction_node(self, previous_node: EONode) -> EONode:
             mask_feature=_optional_typed_feature(FeatureType.MASK_TIMELESS, self.config.prediction_mask_feature_name),
             output_feature=(FeatureType.DATA_TIMELESS, self.config.output_feature_name),
             output_dtype=self.config.dtype,
-            mp_lock=self._is_mp_lock_needed,
             clip_predictions=self.config.clip_predictions,
         )
         return EONode(prediction_task, inputs=[previous_node])
@@ -192,7 +186,6 @@ def _get_prediction_node(self, previous_node: EONode) -> EONode:
                 FeatureType.DATA_TIMELESS, self.config.output_probability_feature_name
             ),
             output_dtype=self.config.dtype,
-            mp_lock=self._is_mp_lock_needed,
             label_encoder_filename=self.config.label_encoder_filename,
         )
         return EONode(prediction_task, inputs=[previous_node])
diff --git a/eogrow/tasks/prediction.py b/eogrow/tasks/prediction.py
index dcbe2bdc..d57ca0f6 100644
--- a/eogrow/tasks/prediction.py
+++ b/eogrow/tasks/prediction.py
@@ -3,14 +3,14 @@
 from __future__ import annotations

 import abc
-from typing import Any, Callable, cast
+from typing import Any, Callable

 import fs
 import joblib
 import numpy as np
 from fs.base import FS

-from eolearn.core import EOPatch, EOTask, execute_with_mp_lock
+from eolearn.core import EOPatch, EOTask
 from eolearn.core.types import Feature
 from eolearn.core.utils.fs import pickle_fs, unpickle_fs
@@ -27,7 +27,6 @@ def __init__(
         mask_feature: Feature,
         output_feature: Feature,
         output_dtype: np.dtype | None,
-        mp_lock: bool,
     ):
         """
         :param model_path: A file path to the model. The path is relative to the filesystem object.
@@ -46,8 +45,6 @@ def __init__(
         self.output_feature = output_feature
         self.output_dtype = output_dtype

-        self.mp_lock = mp_lock
-
     def process_data(self, eopatch: EOPatch, mask: np.ndarray) -> np.ndarray:
         """Masks and reshapes data into a form suitable for the model"""
         all_features = []
@@ -82,12 +79,7 @@ def apply_predictor(
         if processed_features.shape[0] == 0 and return_on_empty is not None:
             return return_on_empty

-        if self.mp_lock:
-            predictions = execute_with_mp_lock(predictor, processed_features)
-        else:
-            predictions = predictor(processed_features)
-        predictions = cast(np.ndarray, predictions)
-
+        predictions: np.ndarray = predictor(processed_features)
         return predictions.astype(self.output_dtype) if self.output_dtype else predictions

     @abc.abstractmethod
diff --git a/eogrow/types.py b/eogrow/types.py
index 0ae80d7f..7f543dc2 100644
--- a/eogrow/types.py
+++ b/eogrow/types.py
@@ -3,8 +3,7 @@
 import datetime
 import sys
-from enum import Enum
-from typing import Any, Dict, List, Literal, Tuple, Union
+from typing import Any, Dict, List, Tuple

 if sys.version_info >= (3, 10):
     from typing import TypeAlias
@@ -20,13 +19,5 @@
 ImportPath: TypeAlias = str
 TimePeriod: TypeAlias = Tuple[datetime.date, datetime.date]

-BoolOrAuto: TypeAlias = Union[Literal["auto"], bool]
-
 JsonDict: TypeAlias = Dict[str, Any]
 RawSchemaDict: TypeAlias = Dict[str, Any]
-
-
-class ProcessingType(Enum):
-    RAY = "ray"
-    SINGLE = "single"
-    MULTI = "multi"
diff --git a/eogrow/utils/ray.py b/eogrow/utils/ray.py
index 1a29f14a..1c8026a4 100644
--- a/eogrow/utils/ray.py
+++ b/eogrow/utils/ray.py
@@ -8,41 +8,12 @@
 import os
 import subprocess

-import ray
-
-from ..types import BoolOrAuto
 from .general import current_timestamp

 LOGGER = logging.getLogger(__name__)
 CLUSTER_CONFIG_DIR = "~/.synced_configs"


-def handle_ray_connection(use_ray: BoolOrAuto = "auto") -> bool:
-    """According to the given parameter it will try to connect to an existing Ray cluster.
-
-    :param use_ray: Either a boolean flag or `"auto"` to define if the connection should be established or not.
-    :return: `True` if connection is established and `False` otherwise.
-    """
-    if use_ray == "auto":
-        try:
-            _try_connect_to_ray()
-            return True
-        except ConnectionError:
-            LOGGER.info("No Ray cluster found, will not use Ray.")
-            return False
-
-    if use_ray:
-        _try_connect_to_ray()
-        return True
-    return False
-
-
-def _try_connect_to_ray() -> None:
-    """Try connecting and log if successful."""
-    ray.init(address="auto", ignore_reinit_error=True)
-    LOGGER.info("Connected to an existing Ray cluster.")
-
-
 def is_cluster_running(cluster_yaml: str) -> bool:
     """Checks if cluster is running or not."""
     try:
diff --git a/tests/test_config_files/download_and_batch/download_custom.json b/tests/test_config_files/download_and_batch/download_custom.json
index def99425..6cda0a13 100644
--- a/tests/test_config_files/download_and_batch/download_custom.json
+++ b/tests/test_config_files/download_and_batch/download_custom.json
@@ -8,6 +8,5 @@
   "resolution": 10,
   "maxcc": 0.3,
   "time_difference": 120,
-  "additional_data": [["mask", "dataMask"]],
-  "workers": 2
+  "additional_data": [["mask", "dataMask"]]
 }
diff --git a/tests/test_config_files/download_and_batch/download_custom_collection.json b/tests/test_config_files/download_and_batch/download_custom_collection.json
index cb2ad1e0..5cc96a61 100644
--- a/tests/test_config_files/download_and_batch/download_custom_collection.json
+++ b/tests/test_config_files/download_and_batch/download_custom_collection.json
@@ -23,6 +23,5 @@
   "additional_data": [
     ["mask", "CLM"],
     ["mask", "dataMask"]
-  ],
-  "workers": 2
+  ]
 }
diff --git a/tests/test_config_files/download_and_batch/download_custom_raise.json b/tests/test_config_files/download_and_batch/download_custom_raise.json
index c1fc01d5..a738dac3 100644
--- a/tests/test_config_files/download_and_batch/download_custom_raise.json
+++ b/tests/test_config_files/download_and_batch/download_custom_raise.json
@@ -11,6 +11,5 @@
   "additional_data": [
     ["mask", "CLM"],
     ["mask", "dataMask"]
-  ],
-  "workers": 2
+  ]
 }
diff --git a/tests/test_config_files/download_and_batch/download_dem.json b/tests/test_config_files/download_and_batch/download_dem.json
index 44119c61..93d0d719 100644
--- a/tests/test_config_files/download_and_batch/download_dem.json
+++ b/tests/test_config_files/download_and_batch/download_dem.json
@@ -5,6 +5,5 @@
   "output_folder_key": "temp",
   "data_collection": "DEM",
   "feature_name": "DEM",
-  "resolution": 10,
-  "workers": 2
+  "resolution": 10
 }
diff --git a/tests/test_config_files/download_and_batch/download_l1c_q1_dn.json b/tests/test_config_files/download_and_batch/download_l1c_q1_dn.json
index e07a976d..52a7c6d3 100644
--- a/tests/test_config_files/download_and_batch/download_l1c_q1_dn.json
+++ b/tests/test_config_files/download_and_batch/download_l1c_q1_dn.json
@@ -12,6 +12,5 @@
     ["mask", "CLM"],
     ["mask", "dataMask"]
   ],
-  "use_dn": true,
-  "workers": 2
+  "use_dn": true
 }
diff --git a/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json b/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json
index 059b5fde..f6d12605 100644
--- a/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json
+++ b/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json
@@ -20,6 +20,5 @@
         "features_to_rescale": [["data", "BANDS-S2-L1C"]]
       }
     ]
-  },
-  "workers": 2
+  }
 }
diff --git a/tests/test_config_files/download_and_batch/download_l1c_yearly.json b/tests/test_config_files/download_and_batch/download_l1c_yearly.json
index 813e99a4..7cfb60f6 100644
--- a/tests/test_config_files/download_and_batch/download_l1c_yearly.json
+++ b/tests/test_config_files/download_and_batch/download_l1c_yearly.json
@@ -12,6 +12,5 @@
     ["mask", "CLM"],
     ["mask", "dataMask"]
   ],
-  "workers": 2,
   "threads_per_worker": null
 }
diff --git a/tests/test_config_files/download_and_batch/download_q3.json b/tests/test_config_files/download_and_batch/download_q3.json
index b67e3b08..4963df8e 100644
--- a/tests/test_config_files/download_and_batch/download_q3.json
+++ b/tests/test_config_files/download_and_batch/download_q3.json
@@ -10,6 +10,5 @@
   "additional_data": [
     ["mask", "CLM"],
     ["mask", "dataMask"]
-  ],
-  "workers": 2
+  ]
 }
diff --git a/tests/test_config_files/export_maps/export_maps_data.json b/tests/test_config_files/export_maps/export_maps_data.json
index af100ddf..0423e45b 100644
--- a/tests/test_config_files/export_maps/export_maps_data.json
+++ b/tests/test_config_files/export_maps/export_maps_data.json
@@ -14,6 +14,5 @@
   "band_indices": [0, 1],
   "cogify": true,
   "interim_results_suffix": "no-clash-plz",
-  "warp_resampling": "bilinear",
-  "workers": 1
+  "warp_resampling": "bilinear"
 }
diff --git a/tests/test_config_files/export_maps/export_maps_data_compressed.json b/tests/test_config_files/export_maps/export_maps_data_compressed.json
index 61434e57..8cc3acc6 100644
--- a/tests/test_config_files/export_maps/export_maps_data_compressed.json
+++ b/tests/test_config_files/export_maps/export_maps_data_compressed.json
@@ -12,6 +12,5 @@
   "map_name": "result.tiff",
   "map_dtype": "float32",
   "band_indices": [0],
-  "workers": 1,
   "split_per_timestamp": false
 }
diff --git a/tests/test_config_files/export_maps/export_maps_mask.json b/tests/test_config_files/export_maps/export_maps_mask.json
index eb614099..77aeae19 100644
--- a/tests/test_config_files/export_maps/export_maps_mask.json
+++ b/tests/test_config_files/export_maps/export_maps_mask.json
@@ -13,6 +13,5 @@
   "band_indices": [0],
   "split_per_timestamp": false,
   "cogify": true,
-  "cogification_resampling": "MODE",
-  "workers": 1
+  "cogification_resampling": "MODE"
 }
diff --git a/tests/test_config_files/export_maps/export_maps_mask_local_copy.json b/tests/test_config_files/export_maps/export_maps_mask_local_copy.json
index 72f94857..a6961a75 100644
--- a/tests/test_config_files/export_maps/export_maps_mask_local_copy.json
+++ b/tests/test_config_files/export_maps/export_maps_mask_local_copy.json
@@ -12,6 +12,5 @@
   "map_dtype": "uint16",
   "no_data_value": 0,
   "band_indices": [0],
-  "force_local_copies": true,
-  "workers": 1
+  "force_local_copies": true
 }
diff --git a/tests/test_config_files/merge_samples/merge_features_samples.json b/tests/test_config_files/merge_samples/merge_features_samples.json
index 6aafecb3..d259e09d 100644
--- a/tests/test_config_files/merge_samples/merge_features_samples.json
+++ b/tests/test_config_files/merge_samples/merge_features_samples.json
@@ -6,5 +6,5 @@
   "features_to_merge": [["data", "FEATURES"]],
   "id_filename": "PATCH_IDS",
   "suffix": "",
-  "workers": 2
+  "num_threads": 2
 }
diff --git a/tests/test_config_files/merge_samples/merge_reference_samples.json b/tests/test_config_files/merge_samples/merge_reference_samples.json
index 4173ac4a..83c1a2f1 100644
--- a/tests/test_config_files/merge_samples/merge_reference_samples.json
+++ b/tests/test_config_files/merge_samples/merge_reference_samples.json
@@ -6,5 +6,5 @@
   "features_to_merge": [["mask_timeless", "LULC_ID"]],
   "id_filename": "PATCH_IDS",
   "suffix": "",
-  "workers": 2
+  "num_threads": 2
 }
diff --git a/tests/test_config_files/other/simple_config.json b/tests/test_config_files/other/simple_config.json
index 1acc19de..1542b38f 100644
--- a/tests/test_config_files/other/simple_config.json
+++ b/tests/test_config_files/other/simple_config.json
@@ -4,7 +4,6 @@
   "**global_config": "${config_path}/../global_config.json",
   "test_param": 10,
   "test_subset": [0, "eopatch-id-1-col-0-row-1"],
-  "workers": 3,
   "logging": {
     "save_logs": true,
     "show_logs": true,
diff --git a/tests/test_config_files/rasterize/rasterize_feature_with_resolution.json b/tests/test_config_files/rasterize/rasterize_feature_with_resolution.json
index 9d88d546..295cc6cb 100644
--- a/tests/test_config_files/rasterize/rasterize_feature_with_resolution.json
+++ b/tests/test_config_files/rasterize/rasterize_feature_with_resolution.json
@@ -8,6 +8,5 @@
   "resolution": 10,
   "overlap_value": 0,
   "dtype": "int16",
-  "workers": 2,
   "**global_config": "${config_path}/../global_config.json"
 }
diff --git a/tests/test_config_files/sampling/sampling_block_fraction.json b/tests/test_config_files/sampling/sampling_block_fraction.json
index 5a1e8a89..49876826 100644
--- a/tests/test_config_files/sampling/sampling_block_fraction.json
+++ b/tests/test_config_files/sampling/sampling_block_fraction.json
@@ -14,6 +14,5 @@
   "sample_size": [25, 30],
   "fraction_of_samples": 0.001,
   "mask_of_samples_name": "MASK_OF_SAMPLES",
-  "seed": 100,
-  "workers": 2
+  "seed": 100
 }
diff --git a/tests/test_config_files/sampling/sampling_block_number.json b/tests/test_config_files/sampling/sampling_block_number.json
index e11a0d4d..87dae02a 100644
--- a/tests/test_config_files/sampling/sampling_block_number.json
+++ b/tests/test_config_files/sampling/sampling_block_number.json
@@ -13,6 +13,5 @@
   },
   "sample_size": [30, 25],
   "number_of_samples": 20,
-  "mask_of_samples_name": "MASK_OF_SAMPLES",
-  "workers": 2
+  "mask_of_samples_name": "MASK_OF_SAMPLES"
 }
diff --git a/tests/test_config_files/sampling/sampling_chain.json b/tests/test_config_files/sampling/sampling_chain.json
index dbddf50d..602bf2a0 100644
--- a/tests/test_config_files/sampling/sampling_chain.json
+++ b/tests/test_config_files/sampling/sampling_chain.json
@@ -14,6 +14,5 @@
   },
   "fraction_of_samples": 0.03,
   "exclude_values": [0],
-  "mask_of_samples_name": "MASK_OF_SAMPLES",
-  "workers": 2
+  "mask_of_samples_name": "MASK_OF_SAMPLES"
 }
diff --git a/tests/test_config_files/sampling/sampling_fraction.json b/tests/test_config_files/sampling/sampling_fraction.json
index 0a4baf56..e99bb67b 100644
--- a/tests/test_config_files/sampling/sampling_fraction.json
+++ b/tests/test_config_files/sampling/sampling_fraction.json
@@ -15,6 +15,5 @@
   },
   "exclude_values": [0, 1],
   "mask_of_samples_name": "MASK_OF_SAMPLES",
-  "seed": 21,
-  "workers": 2
+  "seed": 21
 }
diff --git a/tests/test_config_files/sampling/sampling_grid.json b/tests/test_config_files/sampling/sampling_grid.json
index 273731cb..28f6384f 100644
--- a/tests/test_config_files/sampling/sampling_grid.json
+++ b/tests/test_config_files/sampling/sampling_grid.json
@@ -13,6 +13,5 @@
   },
   "sample_size": [30, 25],
   "stride": [40, 20],
-  "mask_of_samples_name": "MASK_OF_SAMPLES",
-  "workers": 2
+  "mask_of_samples_name": "MASK_OF_SAMPLES"
 }
diff --git a/tests/utils/test_ray.py b/tests/utils/test_ray.py
deleted file mode 100644
index a2501f92..00000000
--- a/tests/utils/test_ray.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import pytest
-import ray
-
-from eogrow.utils.ray import handle_ray_connection
-
-
-@pytest.fixture(name="_ray_cluster", scope="class")
-def _ray_cluster_fixture():
-    ray.init(log_to_driver=False)
-    yield
-    ray.shutdown()
-
-
-class TestWithRayCluster:
-    """These are the tests that require a running Ray cluster while tests outside this class must not have it."""
-
-    @pytest.mark.parametrize(
-        ("use_ray", "expected_connection"),
-        [
-            ("auto", True),
-            (True, True),
-            (False, False),
-        ],
-    )
-    @pytest.mark.usefixtures("_ray_cluster")
-    def test_handle_ray_connection_with_cluster(self, use_ray, expected_connection):
-        is_connected = handle_ray_connection(use_ray)
-        assert is_connected is expected_connection
-
-        if is_connected:
-            assert ray.is_initialized()
-
-
-@pytest.mark.parametrize(
-    ("use_ray", "expected_connection"),
-    [
-        ("auto", False),
-        (True, ConnectionError),
-        (False, False),
-    ],
-)
-def test_handle_ray_connection_without_cluster(use_ray, expected_connection):
-    if isinstance(expected_connection, bool):
-        is_connected = handle_ray_connection(use_ray)
-        assert is_connected is expected_connection
-    else:
-        with pytest.raises(expected_connection):
-            handle_ray_connection(use_ray)
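Note: with execution now always going through `RayExecutor`, the `debug` field added in the patch above is the switch for running a pipeline in-process. A minimal sketch of a config fragment, with all other pipeline settings omitted (the pipeline path is only an example taken from the docs template above):

    {
      "pipeline": "eogrow.pipelines.zipmap.ZipMapPipeline",
      "debug": true
    }

With `"debug": true` the plain `EOExecutor` is used and `ray.init` is never called, so breakpoints and standard debuggers work in the driver process.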
From 9ff076ad6f577e1906d1118d7c11055680bb88fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com>
Date: Mon, 20 Nov 2023 10:27:20 +0100
Subject: [PATCH 07/10] Reduce the amount of download tests in the suite (#305)

* reduce the amount of download tests in the suite

* adjust threads parameter
---
 tests/pipelines/test_download.py              |  11 -
 tests/pipelines/test_features.py              |   2 -
 .../download_and_batch/download_custom.json   |  12 -
 .../download_custom_collection.json           |   2 +-
 .../download_custom_raise.json                |  15 -
 .../download_l1c_q1_dn.json                   |  16 --
 .../download_l1c_q1_dn_rescaled.json          |   4 +-
 .../download_l1c_yearly.json                  |   3 +-
 .../download_and_batch/download_q3.json       |  14 -
 .../features/features_mosaicking_custom.json  |  22 --
 .../features/features_on_rescaled_dn.json     |  23 --
 tests/test_config_files/global_config.json    |   2 -
 .../download_and_batch/download_custom.json   | 182 ------------
 .../download_custom_collection.json           | 258 ++++++++----------
 .../download_l1c_q1_dn.json                   | 240 ----------------
 .../download_and_batch/download_q3.json       | 256 -----------------
 .../features/features_interpolation.json      | 182 ------------
 .../features/features_mosaicking_custom.json  | 142 ----------
 .../features/features_on_rescaled_dn.json     | 160 -----------
 19 files changed, 116 insertions(+), 1430 deletions(-)
 delete mode 100644 tests/test_config_files/download_and_batch/download_custom.json
 delete mode 100644 tests/test_config_files/download_and_batch/download_custom_raise.json
 delete mode 100644 tests/test_config_files/download_and_batch/download_l1c_q1_dn.json
 delete mode 100644 tests/test_config_files/download_and_batch/download_q3.json
 delete mode 100644 tests/test_config_files/features/features_mosaicking_custom.json
 delete mode 100644 tests/test_config_files/features/features_on_rescaled_dn.json
 delete mode 100644 tests/test_stats/download_and_batch/download_custom.json
 delete mode 100644 tests/test_stats/download_and_batch/download_l1c_q1_dn.json
 delete mode 100644 tests/test_stats/download_and_batch/download_q3.json
 delete mode 100644 tests/test_stats/features/features_interpolation.json
 delete mode 100644 tests/test_stats/features/features_mosaicking_custom.json
 delete mode 100644 tests/test_stats/features/features_on_rescaled_dn.json

diff --git a/tests/pipelines/test_download.py b/tests/pipelines/test_download.py
index f75214fe..91dfd813 100644
--- a/tests/pipelines/test_download.py
+++ b/tests/pipelines/test_download.py
@@ -1,5 +1,4 @@
 import pytest
-from pydantic import ValidationError

 from eogrow.utils.testing import compare_content, run_config
@@ -16,11 +15,8 @@ def test_preparation():
 @pytest.mark.parametrize(
     "experiment_name",
     [
-        "download_l1c_q1_dn",
         "download_l1c_q1_dn_rescaled",
         "download_custom_collection",
-        "download_custom",
-        "download_q3",
         "download_dem",
         pytest.param("download_l1c_yearly", marks=pytest.mark.chain),
     ],
@@ -29,10 +25,3 @@ def test_download_pipeline(config_and_stats_paths, experiment_name):
     config_path, stats_path = config_and_stats_paths("download_and_batch", experiment_name)
     output_path = run_config(config_path)
     compare_content(output_path, stats_path)
-
-
-@pytest.mark.parametrize("experiment_name", ["download_custom_raise"])
-def test_validation_error(config_and_stats_paths, experiment_name):
-    config_path, _ = config_and_stats_paths("download_and_batch", experiment_name)
-    with pytest.raises(ValidationError):
-        run_config(config_path)
diff --git a/tests/pipelines/test_features.py b/tests/pipelines/test_features.py
index 6e2ded21..a6e776e3 100644
--- a/tests/pipelines/test_features.py
+++ b/tests/pipelines/test_features.py
@@ -9,8 +9,6 @@
 @pytest.mark.parametrize(
     "experiment_name",
     [
-        "features_mosaicking_custom",
-        "features_on_rescaled_dn",
         "features_mosaicking",
         "features_dtype",
         pytest.param("features_on_sampled_data", marks=pytest.mark.chain),
diff --git a/tests/test_config_files/download_and_batch/download_custom.json b/tests/test_config_files/download_and_batch/download_custom.json
deleted file mode 100644
index 6cda0a13..00000000
--- a/tests/test_config_files/download_and_batch/download_custom.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "pipeline": "eogrow.pipelines.download.DownloadPipeline",
-  "**global_config": "${config_path}/../global_config.json",
-  "output_folder_key": "data_custom_range",
-  "data_collection": "SENTINEL2_L1C",
-  "time_period": ["2017-11-14", "2018-03-5"],
-  "bands_feature_name": "BANDS-S2-L1C",
-  "resolution": 10,
-  "maxcc": 0.3,
-  "time_difference": 120,
-  "additional_data": [["mask", "dataMask"]]
-}
diff --git a/tests/test_config_files/download_and_batch/download_custom_collection.json b/tests/test_config_files/download_and_batch/download_custom_collection.json
index 5cc96a61..e8ef0ad1 100644
--- a/tests/test_config_files/download_and_batch/download_custom_collection.json
+++ b/tests/test_config_files/download_and_batch/download_custom_collection.json
@@ -15,7 +15,7 @@
     ],
     "has_cloud_coverage": true
   },
-  "time_period": ["season", 2018],
+  "time_period": ["Q3", 2020],
   "bands_feature_name": "BANDS-S2-L1C",
   "resolution": 10,
   "maxcc": 0.3,
diff --git a/tests/test_config_files/download_and_batch/download_custom_raise.json b/tests/test_config_files/download_and_batch/download_custom_raise.json
deleted file mode 100644
index a738dac3..00000000
--- a/tests/test_config_files/download_and_batch/download_custom_raise.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "pipeline": "eogrow.pipelines.download.DownloadPipeline",
-  "**global_config": "${config_path}/../global_config.json",
-  "output_folder_key": "data_custom_range",
-  "data_collection": "SENTINEL2_L1C",
-  "time_period": ["2018-11-28", "2018-04-28"],
-  "bands_feature_name": "BANDS-S2-L1C",
-  "resolution": 10,
-  "maxcc": 0.3,
-  "time_difference": 120,
-  "additional_data": [
-    ["mask", "CLM"],
-    ["mask", "dataMask"]
-  ]
-}
diff --git a/tests/test_config_files/download_and_batch/download_l1c_q1_dn.json b/tests/test_config_files/download_and_batch/download_l1c_q1_dn.json
deleted file mode 100644
index 52a7c6d3..00000000
--- a/tests/test_config_files/download_and_batch/download_l1c_q1_dn.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "pipeline": "eogrow.pipelines.download.DownloadPipeline",
-  "**global_config": "${config_path}/../global_config.json",
-  "output_folder_key": "temp",
-  "data_collection": "SENTINEL2_L1C",
-  "time_period": ["Q1", 2019],
-  "bands_feature_name": "BANDS-S2-L1C",
-  "resolution": 10,
-  "maxcc": 0.3,
-  "time_difference": 120,
-  "additional_data": [
-    ["mask", "CLM"],
-    ["mask", "dataMask"]
-  ],
-  "use_dn": true
-}
diff --git a/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json b/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json
index f6d12605..e46bbe39 100644
--- a/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json
+++ b/tests/test_config_files/download_and_batch/download_l1c_q1_dn_rescaled.json
@@ -1,11 +1,11 @@
 {
   "pipeline": "eogrow.pipelines.download.DownloadPipeline",
   "**global_config": "${config_path}/../global_config.json",
-  "output_folder_key": "data_2019",
+  "output_folder_key": "temp",
   "data_collection": "SENTINEL2_L1C",
   "time_period": ["Q1", 2019],
   "bands_feature_name": "BANDS-S2-L1C",
-  "resolution": 10,
+  "size": [264, 121],
   "maxcc": 0.3,
   "time_difference": 120,
   "additional_data": [
diff --git a/tests/test_config_files/download_and_batch/download_l1c_yearly.json b/tests/test_config_files/download_and_batch/download_l1c_yearly.json
index 7cfb60f6..c4f8d25b 100644
--- a/tests/test_config_files/download_and_batch/download_l1c_yearly.json
+++ b/tests/test_config_files/download_and_batch/download_l1c_yearly.json
@@ -11,6 +11,5 @@
   "additional_data": [
     ["mask", "CLM"],
     ["mask", "dataMask"]
-  ],
-  "threads_per_worker": null
+  ]
 }
diff --git a/tests/test_config_files/download_and_batch/download_q3.json b/tests/test_config_files/download_and_batch/download_q3.json
deleted file mode 100644
index 4963df8e..00000000
--- a/tests/test_config_files/download_and_batch/download_q3.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-  "pipeline": "eogrow.pipelines.download.DownloadPipeline",
-  "**global_config": "${config_path}/../global_config.json",
-  "output_folder_key": "temp",
-  "data_collection": "SENTINEL2_L1C",
-  "time_period": ["Q3", 2018],
-  "bands_feature_name": "BANDS-S2-L1C",
-  "size": [264, 121],
-  "maxcc": 0.3,
-  "additional_data": [
-    ["mask", "CLM"],
-    ["mask", "dataMask"]
-  ]
-}
diff --git a/tests/test_config_files/features/features_mosaicking_custom.json b/tests/test_config_files/features/features_mosaicking_custom.json
deleted file mode 100644
index 052dfec1..00000000
--- a/tests/test_config_files/features/features_mosaicking_custom.json
+++ /dev/null
@@ -1,22 +0,0 @@
-{
-  "pipeline": "eogrow.pipelines.features.MosaickingFeaturesPipeline",
-  "**global_config": "${config_path}/../global_config.json",
-  "input_folder_key": "data_custom_range",
-  "output_folder_key": "temp",
-  "bands_feature_name": "BANDS-S2-L1C",
-  "data_preparation": {
-    "validity_threshold": 0.8,
-    "valid_data_feature_name": "dataMask"
-  },
-  "ndis": {
-    "NDVI": [7, 3],
-    "NDWI": [2, 7],
-    "NDBI": [11, 7]
-  },
-  "mosaicking": {
-    "time_period": ["2017-11-28", "2018-02-21"],
-    "n_mosaics": 3,
-    "max_ndi_indices": [7, 3]
-  },
-  "output_feature_name": "FEATURES"
-}
diff --git a/tests/test_config_files/features/features_on_rescaled_dn.json b/tests/test_config_files/features/features_on_rescaled_dn.json
deleted file mode 100644
index 314a86fd..00000000
--- a/tests/test_config_files/features/features_on_rescaled_dn.json
+++ /dev/null
@@ -1,23 +0,0 @@
-{
-  "pipeline": "eogrow.pipelines.features.MosaickingFeaturesPipeline",
-  "**global_config": "${config_path}/../global_config.json",
-  "input_folder_key": "data_2019",
-  "output_folder_key": "temp",
-  "bands_feature_name": "BANDS-S2-L1C",
-  "data_preparation": {
-    "validity_threshold": 0.8,
-    "cloud_mask_feature_name": "CLM",
-    "valid_data_feature_name": "dataMask"
-  },
-  "ndis": {
-    "NDVI": [7, 3],
-    "NDWI": [2, 7],
-    "NDBI": [11, 7]
-  },
-  "mosaicking": {
-    "time_period": ["yearly", 2019],
-    "n_mosaics": 12,
-    "max_ndi_indices": [7, 3]
-  },
-  "output_feature_name": "FEATURES"
-}
diff --git a/tests/test_config_files/global_config.json b/tests/test_config_files/global_config.json
index f6282d2f..2e47a409 100644
--- a/tests/test_config_files/global_config.json
+++ b/tests/test_config_files/global_config.json
@@ -24,8 +24,6 @@
   "structure": {
     "data": "data",
     "batch_data": "batch-data",
-    "data_2019": "data-2019",
-    "data_custom_range": "data-custom-range",
     "data_sampled": "data-sampled",
     "features": "features",
     "features_sampled": "features-sampled",
diff --git a/tests/test_stats/download_and_batch/download_custom.json b/tests/test_stats/download_and_batch/download_custom.json
deleted file mode 100644
index 7c8b6649..00000000
--- a/tests/test_stats/download_and_batch/download_custom.json
+++ /dev/null
@@ -1,182 +0,0 @@
-{
-  "eopatch-id-0-col-0-row-0": {
-    "bbox": "BBox(((729480.0, 4390045.0), (732120.0, 4391255.0)), crs=CRS('32638'))",
-    "data": {
-      "BANDS-S2-L1C": {
-        "array_shape": [
-          6,
-          121,
-          264,
-          13
-        ],
-        "basic_stats": {
-          "max": 1.5058,
-          "mean": 0.232859,
-          "median": 0.164,
-          "min": 0.0008,
-          "std": 0.199445
-        },
-        "counts": {
-          "infinite": 0,
-          "nan": 0
-        },
-        "dtype": "float32",
-        "histogram": {
-          "counts": [
-            1434939,
-            572096,
-            310934,
-            98388,
-            47934,
-            19983,
-            6304,
-            1054
-          ],
-          "edges": [
-            0.0008,
-            0.188925,
-            0.37705,
-            0.565175,
-            0.7533,
-            0.941425,
-            1.12955,
-            1.31767,
-            1.5058
-          ]
-        },
-        "random_values": [
-          0.1464,
-          0.1423,
-          0.1292,
-          0.4518,
-          0.0052,
-          0.1192,
-          0.0018,
-          0.1074
-        ],
-        "subsample_basic_stats": {
-          "max": 1.4967,
-          "mean": 0.232848,
-          "median": 0.1643,
-          "min": 0.0008,
-          "std": 0.199061
-        }
-      }
-    },
-    "mask": {
-      "dataMask": {
-        "array_shape": [
-          6,
-          121,
-          264,
-          1
-        ],
-        "dtype": "bool",
-        "values": [
-          {
-            "count": 191664,
-            "value": true
-          }
-        ]
-      }
-    },
-    "timestamps": [
-      "2017-11-25T07:42:33",
-      "2017-12-10T07:43:05",
-      "2017-12-30T07:43:14",
-      "2018-01-19T07:42:27",
-      "2018-02-28T07:46:50",
-      "2018-03-05T07:38:03"
-    ]
-  },
-  "eopatch-id-1-col-0-row-1": {
-    "bbox": "BBox(((729480.0, 4391145.0), (732120.0, 4392355.0)), crs=CRS('32638'))",
-    "data": {
-      "BANDS-S2-L1C": {
-        "array_shape": [
-          6,
-          121,
-          264,
-          13
-        ],
-        "basic_stats": {
-          "max": 0.8772,
-          "mean": 0.181311,
-          "median": 0.1387,
-          "min": 0.0006,
-          "std": 0.147656
-        },
-        "counts": {
-          "infinite": 0,
-          "nan": 0
-        },
-        "dtype": "float32",
-        "histogram": {
-          "counts": [
-            906857,
-            913889,
-            270390,
-            168454,
-            161654,
-            58985,
-            10589,
-            814
-          ],
-          "edges": [
-            0.0006,
-            0.110175,
-            0.21975,
-            0.329325,
-            0.4389,
-            0.548475,
-            0.65805,
-            0.767625,
-            0.8772
-          ]
-        },
-        "random_values": [
-          0.1414,
-          0.1423,
-          0.1896,
-          0.0644,
-          0.0012,
-          0.1066,
-          0.0019,
-          0.0876
-        ],
-        "subsample_basic_stats": {
-          "max": 0.868,
-          "mean": 0.181959,
-          "median": 0.1389,
-          "min": 0.0006,
-          "std": 0.148293
-        }
-      }
-    },
-    "mask": {
-      "dataMask": {
-        "array_shape": [
-          6,
-          121,
-          264,
-          1
-        ],
-        "dtype": "bool",
-        "values": [
-          {
-            "count": 191664,
-            "value": true
-          }
-        ]
-      }
-    },
-    "timestamps": [
-      "2017-11-25T07:42:33",
-      "2017-12-10T07:43:05",
-      "2017-12-30T07:43:14",
"2018-01-19T07:42:27", - "2018-02-28T07:46:50", - "2018-03-05T07:38:03" - ] - } -} diff --git a/tests/test_stats/download_and_batch/download_custom_collection.json b/tests/test_stats/download_and_batch/download_custom_collection.json index 8409d1ed..26c4bdfd 100644 --- a/tests/test_stats/download_and_batch/download_custom_collection.json +++ b/tests/test_stats/download_and_batch/download_custom_collection.json @@ -4,17 +4,17 @@ "data": { "BANDS-S2-L1C": { "array_shape": [ - 31, + 14, 121, 264, 13 ], "basic_stats": { - "max": 1.5058, - "mean": 0.187434, - "median": 0.1576, + "max": 0.9018, + "mean": 0.163821, + "median": 0.1526, "min": 0.0008, - "std": 0.130618 + "std": 0.0905536 }, "counts": { "infinite": 0, @@ -23,79 +23,79 @@ "dtype": "float32", "histogram": { "counts": [ - 7695264, - 4252928, - 729376, - 119504, - 48952, - 20040, - 6314, - 1054 + 1604683, + 2738829, + 1285730, + 170551, + 13740, + 264, + 6, + 5 ], "edges": [ 0.0008, - 0.188925, - 0.37705, - 0.565175, - 0.7533, - 0.941425, - 1.12955, - 1.31767, - 1.5058 + 0.113425, + 0.22605, + 0.338675, + 0.4513, + 0.563925, + 0.67655, + 0.789175, + 0.9018 ] }, "random_values": [ - 0.3464, - 0.1154, - 0.1409, - 0.1483, - 0.2114, - 0.1741, - 0.2759, - 0.047 + 0.0434, + 0.2996, + 0.2374, + 0.1175, + 0.106, + 0.1537, + 0.3004, + 0.2527 ], "subsample_basic_stats": { - "max": 1.4977, - "mean": 0.187302, - "median": 0.1576, + "max": 0.7431, + "mean": 0.163852, + "median": 0.1527, "min": 0.0008, - "std": 0.130275 + "std": 0.0904792 } } }, "mask": { "CLM": { "array_shape": [ - 31, + 14, 121, 264, 1 ], "dtype": "uint8", "random_values": [ - 1, 0, 0, 0, - 1, - 1, + 0, + 0, + 0, 0, 0 ], "values": [ { - "count": 830205, + "count": 444240, "value": 0 }, { - "count": 160059, + "count": 2976, "value": 1 } ] }, "dataMask": { "array_shape": [ - 31, + 14, 121, 264, 1 @@ -103,44 +103,27 @@ "dtype": "bool", "values": [ { - "count": 990264, + "count": 447216, "value": true } ] } }, "timestamps": [ - "2017-09-01T07:43:08", - "2017-09-06T07:40:42", - "2017-10-11T07:46:43", - "2017-10-31T07:41:37", - "2017-11-05T07:41:49", - "2017-11-25T07:42:33", - "2017-12-10T07:43:05", - "2017-12-30T07:43:14", - "2018-01-19T07:42:27", - "2018-02-28T07:46:50", - "2018-03-05T07:38:03", - "2018-03-25T07:44:17", - "2018-04-09T07:38:40", - "2018-04-19T07:43:26", - "2018-04-24T07:46:03", - "2018-05-09T07:36:10", - "2018-05-19T07:45:29", - "2018-06-08T07:43:25", - "2018-06-13T07:43:56", - "2018-06-18T07:42:58", - "2018-06-23T07:46:07", - "2018-06-28T07:45:41", - "2018-07-03T07:36:13", - "2018-07-08T07:42:00", - "2018-07-13T07:44:13", - "2018-08-02T07:36:12", - "2018-08-07T07:41:24", - "2018-08-17T07:47:30", - "2018-08-22T07:45:09", - "2018-08-27T07:39:47", - "2018-09-01T07:36:10" + "2020-07-07T07:48:41", + "2020-07-12T07:48:44", + "2020-07-17T07:48:41", + "2020-07-27T07:48:42", + "2020-08-01T07:48:45", + "2020-08-06T07:48:42", + "2020-08-21T07:48:45", + "2020-08-26T07:48:43", + "2020-08-31T07:48:44", + "2020-09-05T07:48:42", + "2020-09-10T07:48:43", + "2020-09-15T07:48:55", + "2020-09-25T07:48:42", + "2020-09-30T07:48:59" ] }, "eopatch-id-1-col-0-row-1": { @@ -148,17 +131,17 @@ "data": { "BANDS-S2-L1C": { "array_shape": [ - 31, + 14, 121, 264, 13 ], "basic_stats": { - "max": 0.8772, - "mean": 0.172886, - "median": 0.1478, - "min": 0.0006, - "std": 0.11216 + "max": 1.0959, + "mean": 0.16227, + "median": 0.1465, + "min": 0.0007, + "std": 0.0935485 }, "counts": { "infinite": 0, @@ -167,79 +150,79 @@ "dtype": "float32", "histogram": { "counts": [ - 3580443, - 5541700, - 2605796, - 
777273, - 287143, - 69162, - 11101, - 814 + 2577422, + 2467267, + 732425, + 36447, + 174, + 36, + 23, + 14 ], "edges": [ - 0.0006, - 0.110175, - 0.21975, - 0.329325, - 0.4389, - 0.548475, - 0.65805, - 0.767625, - 0.8772 + 0.0007, + 0.1376, + 0.2745, + 0.4114, + 0.5483, + 0.6852, + 0.8221, + 0.959, + 1.0959 ] }, "random_values": [ - 0.4662, - 0.1101, - 0.1369, - 0.1479, - 0.067, - 0.2655, - 0.1844, - 0.0576 + 0.049, + 0.2174, + 0.2445, + 0.1581, + 0.1284, + 0.1484, + 0.3089, + 0.2643 ], "subsample_basic_stats": { - "max": 0.8542, - "mean": 0.17285, - "median": 0.1479, + "max": 0.9663, + "mean": 0.162315, + "median": 0.1465, "min": 0.0007, - "std": 0.112033 + "std": 0.0934971 } } }, "mask": { "CLM": { "array_shape": [ - 31, + 14, 121, 264, 1 ], "dtype": "uint8", "random_values": [ - 1, 0, 0, 0, - 1, - 1, + 0, + 0, + 0, 0, 0 ], "values": [ { - "count": 860015, + "count": 431200, "value": 0 }, { - "count": 130249, + "count": 16016, "value": 1 } ] }, "dataMask": { "array_shape": [ - 31, + 14, 121, 264, 1 @@ -247,44 +230,27 @@ "dtype": "bool", "values": [ { - "count": 990264, + "count": 447216, "value": true } ] } }, "timestamps": [ - "2017-09-01T07:43:08", - "2017-09-06T07:40:42", - "2017-10-11T07:46:43", - "2017-10-31T07:41:37", - "2017-11-05T07:41:49", - "2017-11-25T07:42:33", - "2017-12-10T07:43:05", - "2017-12-30T07:43:14", - "2018-01-19T07:42:27", - "2018-02-28T07:46:50", - "2018-03-05T07:38:03", - "2018-03-25T07:44:17", - "2018-04-09T07:38:40", - "2018-04-19T07:43:26", - "2018-04-24T07:46:03", - "2018-05-09T07:36:10", - "2018-05-19T07:45:29", - "2018-06-08T07:43:25", - "2018-06-13T07:43:56", - "2018-06-18T07:42:58", - "2018-06-23T07:46:07", - "2018-06-28T07:45:41", - "2018-07-03T07:36:13", - "2018-07-08T07:42:00", - "2018-07-13T07:44:13", - "2018-08-02T07:36:12", - "2018-08-07T07:41:24", - "2018-08-17T07:47:30", - "2018-08-22T07:45:09", - "2018-08-27T07:39:47", - "2018-09-01T07:36:10" + "2020-07-07T07:48:41", + "2020-07-12T07:48:44", + "2020-07-17T07:48:41", + "2020-07-27T07:48:42", + "2020-08-01T07:48:45", + "2020-08-06T07:48:42", + "2020-08-21T07:48:45", + "2020-08-26T07:48:43", + "2020-08-31T07:48:44", + "2020-09-05T07:48:42", + "2020-09-10T07:48:43", + "2020-09-15T07:48:55", + "2020-09-25T07:48:42", + "2020-09-30T07:48:59" ] } } diff --git a/tests/test_stats/download_and_batch/download_l1c_q1_dn.json b/tests/test_stats/download_and_batch/download_l1c_q1_dn.json deleted file mode 100644 index b87e3ba2..00000000 --- a/tests/test_stats/download_and_batch/download_l1c_q1_dn.json +++ /dev/null @@ -1,240 +0,0 @@ -{ - "eopatch-id-0-col-0-row-0": { - "bbox": "BBox(((729480.0, 4390045.0), (732120.0, 4391255.0)), crs=CRS('32638'))", - "data": { - "BANDS-S2-L1C": { - "array_shape": [ - 6, - 121, - 264, - 13 - ], - "basic_stats": { - "max": 14480, - "mean": 2383.5, - "median": 1693.0, - "min": 6, - "std": 1845.29 - }, - "counts": { - "infinite": 0, - "nan": 0 - }, - "dtype": "uint16", - "histogram": { - "counts": [ - 1334562, - 587753, - 276379, - 281290, - 11578, - 65, - 3, - 2 - ], - "edges": [ - 6.0, - 1815.25, - 3624.5, - 5433.75, - 7243.0, - 9052.25, - 10861.5, - 12670.75, - 14480.0 - ] - }, - "random_values": [ - 1442, - 1624, - 1270, - 2412, - 24, - 1217, - 13, - 844 - ], - "subsample_basic_stats": { - "max": 10483, - "mean": 2384.45, - "median": 1690.0, - "min": 6, - "std": 1848.58 - } - } - }, - "mask": { - "CLM": { - "array_shape": [ - 6, - 121, - 264, - 1 - ], - "dtype": "uint8", - "random_values": [ - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0 - ], - "values": [ - { - "count": 99288, - 
"value": 0 - }, - { - "count": 92376, - "value": 1 - } - ] - }, - "dataMask": { - "array_shape": [ - 6, - 121, - 264, - 1 - ], - "dtype": "bool", - "values": [ - { - "count": 191664, - "value": true - } - ] - } - }, - "timestamps": [ - "2019-01-04T07:48:37", - "2019-01-24T07:48:39", - "2019-02-13T07:48:39", - "2019-02-18T07:48:36", - "2019-02-23T07:49:38", - "2019-03-05T07:55:53" - ] - }, - "eopatch-id-1-col-0-row-1": { - "bbox": "BBox(((729480.0, 4391145.0), (732120.0, 4392355.0)), crs=CRS('32638'))", - "data": { - "BANDS-S2-L1C": { - "array_shape": [ - 6, - 121, - 264, - 13 - ], - "basic_stats": { - "max": 9331, - "mean": 2411.87, - "median": 1828.0, - "min": 6, - "std": 1785.95 - }, - "counts": { - "infinite": 0, - "nan": 0 - }, - "dtype": "uint16", - "histogram": { - "counts": [ - 679578, - 835647, - 377607, - 185812, - 250728, - 157069, - 5118, - 73 - ], - "edges": [ - 6.0, - 1171.625, - 2337.25, - 3502.875, - 4668.5, - 5834.125, - 6999.75, - 8165.375, - 9331.0 - ] - }, - "random_values": [ - 1384, - 1416, - 2159, - 2076, - 27, - 1246, - 10, - 919 - ], - "subsample_basic_stats": { - "max": 8563, - "mean": 2412.53, - "median": 1827.0, - "min": 9, - "std": 1787.84 - } - } - }, - "mask": { - "CLM": { - "array_shape": [ - 6, - 121, - 264, - 1 - ], - "dtype": "uint8", - "random_values": [ - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0 - ], - "values": [ - { - "count": 95832, - "value": 0 - }, - { - "count": 95832, - "value": 1 - } - ] - }, - "dataMask": { - "array_shape": [ - 6, - 121, - 264, - 1 - ], - "dtype": "bool", - "values": [ - { - "count": 191664, - "value": true - } - ] - } - }, - "timestamps": [ - "2019-01-04T07:48:37", - "2019-01-24T07:48:39", - "2019-02-13T07:48:39", - "2019-02-18T07:48:36", - "2019-02-23T07:49:38", - "2019-03-05T07:55:53" - ] - } -} diff --git a/tests/test_stats/download_and_batch/download_q3.json b/tests/test_stats/download_and_batch/download_q3.json deleted file mode 100644 index 5fef20a0..00000000 --- a/tests/test_stats/download_and_batch/download_q3.json +++ /dev/null @@ -1,256 +0,0 @@ -{ - "eopatch-id-0-col-0-row-0": { - "bbox": "BBox(((729480.0, 4390045.0), (732120.0, 4391255.0)), crs=CRS('32638'))", - "data": { - "BANDS-S2-L1C": { - "array_shape": [ - 14, - 121, - 264, - 13 - ], - "basic_stats": { - "max": 1.0719, - "mean": 0.177753, - "median": 0.1597, - "min": 0.0007, - "std": 0.104689 - }, - "counts": { - "infinite": 0, - "nan": 0 - }, - "dtype": "float32", - "histogram": { - "counts": [ - 1982164, - 2846206, - 780396, - 195213, - 9703, - 88, - 26, - 12 - ], - "edges": [ - 0.0007, - 0.1346, - 0.2685, - 0.4024, - 0.5363, - 0.6702, - 0.8041, - 0.938, - 1.0719 - ] - }, - "random_values": [ - 0.0609, - 0.277, - 0.2412, - 0.1191, - 0.3477, - 0.1637, - 0.2381, - 0.2496 - ], - "subsample_basic_stats": { - "max": 0.8409, - "mean": 0.17768, - "median": 0.1596, - "min": 0.0007, - "std": 0.104563 - } - } - }, - "mask": { - "CLM": { - "array_shape": [ - 14, - 121, - 264, - 1 - ], - "dtype": "uint8", - "random_values": [ - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0 - ], - "values": [ - { - "count": 415272, - "value": 0 - }, - { - "count": 31944, - "value": 1 - } - ] - }, - "dataMask": { - "array_shape": [ - 14, - 121, - 264, - 1 - ], - "dtype": "bool", - "values": [ - { - "count": 447216, - "value": true - } - ] - } - }, - "timestamps": [ - "2018-07-03T07:36:13", - "2018-07-08T07:42:00", - "2018-07-13T07:44:13", - "2018-08-02T07:36:12", - "2018-08-07T07:41:24", - "2018-08-17T07:47:30", - "2018-08-22T07:45:09", - "2018-08-27T07:39:47", - "2018-09-01T07:36:10", - 
"2018-09-06T07:40:44", - "2018-09-11T07:44:09", - "2018-09-16T07:41:35", - "2018-09-21T07:46:13", - "2018-09-26T07:47:11" - ] - }, - "eopatch-id-1-col-0-row-1": { - "bbox": "BBox(((729480.0, 4391145.0), (732120.0, 4392355.0)), crs=CRS('32638'))", - "data": { - "BANDS-S2-L1C": { - "array_shape": [ - 14, - 121, - 264, - 13 - ], - "basic_stats": { - "max": 0.6062, - "mean": 0.174605, - "median": 0.1542, - "min": 0.0007, - "std": 0.104046 - }, - "counts": { - "infinite": 0, - "nan": 0 - }, - "dtype": "float32", - "histogram": { - "counts": [ - 925034, - 1913984, - 1286673, - 1035422, - 427864, - 149440, - 65238, - 10153 - ], - "edges": [ - 0.0007, - 0.0763875, - 0.152075, - 0.227762, - 0.30345, - 0.379137, - 0.454825, - 0.530513, - 0.6062 - ] - }, - "random_values": [ - 0.0793, - 0.2678, - 0.2193, - 0.1869, - 0.3052, - 0.1578, - 0.2916, - 0.2484 - ], - "subsample_basic_stats": { - "max": 0.6021, - "mean": 0.174572, - "median": 0.1541, - "min": 0.0007, - "std": 0.103997 - } - } - }, - "mask": { - "CLM": { - "array_shape": [ - 14, - 121, - 264, - 1 - ], - "dtype": "uint8", - "random_values": [ - 0, - 0, - 0, - 0, - 1, - 0, - 0, - 0 - ], - "values": [ - { - "count": 414989, - "value": 0 - }, - { - "count": 32227, - "value": 1 - } - ] - }, - "dataMask": { - "array_shape": [ - 14, - 121, - 264, - 1 - ], - "dtype": "bool", - "values": [ - { - "count": 447216, - "value": true - } - ] - } - }, - "timestamps": [ - "2018-07-03T07:36:13", - "2018-07-08T07:42:00", - "2018-07-13T07:44:13", - "2018-08-02T07:36:12", - "2018-08-07T07:41:24", - "2018-08-17T07:47:30", - "2018-08-22T07:45:09", - "2018-08-27T07:39:47", - "2018-09-01T07:36:10", - "2018-09-06T07:40:44", - "2018-09-11T07:44:09", - "2018-09-16T07:41:35", - "2018-09-21T07:46:13", - "2018-09-26T07:47:11" - ] - } -} diff --git a/tests/test_stats/features/features_interpolation.json b/tests/test_stats/features/features_interpolation.json deleted file mode 100644 index ea96884a..00000000 --- a/tests/test_stats/features/features_interpolation.json +++ /dev/null @@ -1,182 +0,0 @@ -{ - "eopatch-id-0-col-0-row-0": { - "bbox": "BBox(((729480.0, 4390045.0), (732120.0, 4391255.0)), crs=CRS('32638'))", - "data": { - "FEATURES": { - "array_shape": [ - 23, - 121, - 264, - 16 - ], - "basic_stats": { - "max": 1.01031, - "mean": 0.12296, - "median": 0.1381, - "min": -0.68999, - "std": 0.16146 - }, - "counts": { - "infinite": 0, - "nan": 1022208 - }, - "dtype": "float64", - "histogram": { - "counts": [ - 70357, - 360698, - 527532, - 5750765, - 3691296, - 271950, - 60530, - 56 - ], - "edges": [ - -0.68999, - -0.47746, - -0.26492, - -0.05238, - 0.16016, - 0.3727, - 0.58524, - 0.79777, - 1.01031 - ] - }, - "random_values": [ - 0.11596750169992447, - 0.11788000017404557, - 0.04343999996781349, - 0.29084000587463377, - 0.2957000136375427, - 0.15138444304466248, - -0.29289673125873433, - 0.09044000059366226 - ], - "subsample_basic_stats": { - "max": 0.8824, - "mean": 0.12279, - "median": 0.13798, - "min": -0.67464, - "std": 0.16146 - } - } - }, - "timestamps": [ - "2018-01-01T00:00:00", - "2018-01-17T00:00:00", - "2018-02-02T00:00:00", - "2018-02-18T00:00:00", - "2018-03-06T00:00:00", - "2018-03-22T00:00:00", - "2018-04-07T00:00:00", - "2018-04-23T00:00:00", - "2018-05-09T00:00:00", - "2018-05-25T00:00:00", - "2018-06-10T00:00:00", - "2018-06-26T00:00:00", - "2018-07-12T00:00:00", - "2018-07-28T00:00:00", - "2018-08-13T00:00:00", - "2018-08-29T00:00:00", - "2018-09-14T00:00:00", - "2018-09-30T00:00:00", - "2018-10-16T00:00:00", - "2018-11-01T00:00:00", - 
"2018-11-17T00:00:00", - "2018-12-03T00:00:00", - "2018-12-19T00:00:00" - ] - }, - "eopatch-id-1-col-0-row-1": { - "bbox": "BBox(((729480.0, 4391145.0), (732120.0, 4392355.0)), crs=CRS('32638'))", - "data": { - "FEATURES": { - "array_shape": [ - 23, - 121, - 264, - 16 - ], - "basic_stats": { - "max": 0.81978, - "mean": 0.12201, - "median": 0.13432, - "min": -0.69726, - "std": 0.16636 - }, - "counts": { - "infinite": 0, - "nan": 1022208 - }, - "dtype": "float64", - "histogram": { - "counts": [ - 40645, - 334992, - 422805, - 1570713, - 6706826, - 1430576, - 192046, - 34581 - ], - "edges": [ - -0.69726, - -0.50763, - -0.318, - -0.12837, - 0.06126, - 0.25089, - 0.44052, - 0.63015, - 0.81978 - ] - }, - "random_values": [ - 0.14778499752283097, - 0.1179600015282631, - 0.04780000001192093, - 0.23574000000953674, - 0.2442999929189682, - 0.1496311108271281, - -0.13448036655585383, - 0.10887333220905727 - ], - "subsample_basic_stats": { - "max": 0.80812, - "mean": 0.12186, - "median": 0.13432, - "min": -0.68514, - "std": 0.1665 - } - } - }, - "timestamps": [ - "2018-01-01T00:00:00", - "2018-01-17T00:00:00", - "2018-02-02T00:00:00", - "2018-02-18T00:00:00", - "2018-03-06T00:00:00", - "2018-03-22T00:00:00", - "2018-04-07T00:00:00", - "2018-04-23T00:00:00", - "2018-05-09T00:00:00", - "2018-05-25T00:00:00", - "2018-06-10T00:00:00", - "2018-06-26T00:00:00", - "2018-07-12T00:00:00", - "2018-07-28T00:00:00", - "2018-08-13T00:00:00", - "2018-08-29T00:00:00", - "2018-09-14T00:00:00", - "2018-09-30T00:00:00", - "2018-10-16T00:00:00", - "2018-11-01T00:00:00", - "2018-11-17T00:00:00", - "2018-12-03T00:00:00", - "2018-12-19T00:00:00" - ] - } -} diff --git a/tests/test_stats/features/features_mosaicking_custom.json b/tests/test_stats/features/features_mosaicking_custom.json deleted file mode 100644 index 650b64f3..00000000 --- a/tests/test_stats/features/features_mosaicking_custom.json +++ /dev/null @@ -1,142 +0,0 @@ -{ - "eopatch-id-0-col-0-row-0": { - "bbox": "BBox(((729480.0, 4390045.0), (732120.0, 4391255.0)), crs=CRS('32638'))", - "data": { - "FEATURES": { - "array_shape": [ - 3, - 121, - 264, - 16 - ], - "basic_stats": { - "max": 1.4967, - "mean": 0.116054, - "median": 0.1293, - "min": -0.650452, - "std": 0.148757 - }, - "counts": { - "infinite": 0, - "nan": 0 - }, - "dtype": "float32", - "histogram": { - "counts": [ - 18478, - 87475, - 890206, - 506506, - 27170, - 2554, - 853, - 70 - ], - "edges": [ - -0.650452, - -0.382058, - -0.113664, - 0.15473, - 0.423124, - 0.691518, - 0.959912, - 1.22831, - 1.4967 - ] - }, - "random_values": [ - 0.1391, - 0.1274, - 0.0854, - 0.1211, - -0.420511, - 0.0959, - 0.2489, - -0.43993 - ], - "subsample_basic_stats": { - "max": 1.312, - "mean": 0.115822, - "median": 0.1292, - "min": -0.640796, - "std": 0.148814 - } - } - }, - "timestamps": [ - "2017-12-12T08:00:00", - "2018-01-10T00:00:00", - "2018-02-07T16:00:00" - ] - }, - "eopatch-id-1-col-0-row-1": { - "bbox": "BBox(((729480.0, 4391145.0), (732120.0, 4392355.0)), crs=CRS('32638'))", - "data": { - "FEATURES": { - "array_shape": [ - 3, - 121, - 264, - 16 - ], - "basic_stats": { - "max": 0.815004, - "mean": 0.115278, - "median": 0.125, - "min": -0.678533, - "std": 0.156894 - }, - "counts": { - "infinite": 0, - "nan": 0 - }, - "dtype": "float32", - "histogram": { - "counts": [ - 4401, - 47662, - 56669, - 224620, - 1045002, - 118026, - 32786, - 4146 - ], - "edges": [ - -0.678533, - -0.49184, - -0.305148, - -0.118456, - 0.0682357, - 0.254928, - 0.44162, - 0.628312, - 0.815004 - ] - }, - "random_values": [ - 0.1648, - 0.1047, - 
0.0987, - 0.0943, - -0.386683, - 0.1183, - 0.2601, - -0.505994 - ], - "subsample_basic_stats": { - "max": 0.803261, - "mean": 0.11499, - "median": 0.1248, - "min": -0.644115, - "std": 0.156825 - } - } - }, - "timestamps": [ - "2017-12-12T08:00:00", - "2018-01-10T00:00:00", - "2018-02-07T16:00:00" - ] - } -} diff --git a/tests/test_stats/features/features_on_rescaled_dn.json b/tests/test_stats/features/features_on_rescaled_dn.json deleted file mode 100644 index 8d7bebed..00000000 --- a/tests/test_stats/features/features_on_rescaled_dn.json +++ /dev/null @@ -1,160 +0,0 @@ -{ - "eopatch-id-0-col-0-row-0": { - "bbox": "BBox(((729480.0, 4390045.0), (732120.0, 4391255.0)), crs=CRS('32638'))", - "data": { - "FEATURES": { - "array_shape": [ - 12, - 121, - 264, - 16 - ], - "basic_stats": { - "max": 1.4479999542, - "mean": 0.108596, - "median": 0.1234, - "min": -0.67189434738, - "std": 0.136154 - }, - "counts": { - "infinite": 0, - "nan": 4599936 - }, - "dtype": "float64", - "histogram": { - "counts": [ - 13829, - 77985, - 671287, - 743047, - 25889, - 1266, - 6, - 3 - ], - "edges": [ - -0.67189434738, - -0.40690755968, - -0.14192077198, - 0.12306601572, - 0.38805280342, - 0.65303959112, - 0.91802637882, - 1.1830131665, - 1.4479999542 - ] - }, - "random_values": [ - 0.13699999452, - 0.12610000372, - 0.072599999607, - 0.090199999511, - -0.14273166234, - 0.06400000304, - 0.14849999547, - -0.33542911455 - ], - "subsample_basic_stats": { - "max": 0.80012730644, - "mean": 0.108478, - "median": 0.123395, - "min": -0.65681234286, - "std": 0.136007 - } - } - }, - "timestamps": [ - "2019-01-16T05:00:00", - "2019-02-15T15:00:00", - "2019-03-18T01:00:00", - "2019-04-17T11:00:00", - "2019-05-17T21:00:00", - "2019-06-17T07:00:00", - "2019-07-17T17:00:00", - "2019-08-17T03:00:00", - "2019-09-16T13:00:00", - "2019-10-16T23:00:00", - "2019-11-16T09:00:00", - "2019-12-16T19:00:00" - ] - }, - "eopatch-id-1-col-0-row-1": { - "bbox": "BBox(((729480.0, 4391145.0), (732120.0, 4392355.0)), crs=CRS('32638'))", - "data": { - "FEATURES": { - "array_shape": [ - 12, - 121, - 264, - 16 - ], - "basic_stats": { - "max": 0.93309998512, - "mean": 0.108924, - "median": 0.1206, - "min": -0.64661968155, - "std": 0.14887 - }, - "counts": { - "infinite": 0, - "nan": 4599936 - }, - "dtype": "float64", - "histogram": { - "counts": [ - 11109, - 52730, - 67229, - 843730, - 508527, - 40518, - 9401, - 68 - ], - "edges": [ - -0.64661968155, - -0.44915472322, - -0.25168976488, - -0.05422480655, - 0.14324015178, - 0.34070511012, - 0.53817006845, - 0.73563502679, - 0.93309998512 - ] - }, - "random_values": [ - 0.13840000331, - 0.10580000281, - 0.17399999499, - 0.094400003552, - -0.20961925772, - 0.099100001156, - 0.12319999933, - -0.41377244705 - ], - "subsample_basic_stats": { - "max": 0.76345023131, - "mean": 0.108618, - "median": 0.1206, - "min": -0.62982314109, - "std": 0.148924 - } - } - }, - "timestamps": [ - "2019-01-16T05:00:00", - "2019-02-15T15:00:00", - "2019-03-18T01:00:00", - "2019-04-17T11:00:00", - "2019-05-17T21:00:00", - "2019-06-17T07:00:00", - "2019-07-17T17:00:00", - "2019-08-17T03:00:00", - "2019-09-16T13:00:00", - "2019-10-16T23:00:00", - "2019-11-16T09:00:00", - "2019-12-16T19:00:00" - ] - } -} From 9ebcbb307fc1584f58db21fd85847ff0979c0e78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com> Date: Tue, 21 Nov 2023 13:01:36 +0100 Subject: [PATCH 08/10] Better chain configs (#304) * change config collection API * separate pipeline chain execution * add remote 
execution capabilities * move file * make mypy happy * Update eogrow/utils/pipeline_chain.py Co-authored-by: Matic Lubej * rename parameters * streamline names * add simple tests for pipeline chain * reenable test suite (hopefully) * update docs * add default description to docs --------- Co-authored-by: Matic Lubej --- .github/workflows/ci_action.yml | 6 ++- docs/source/config-language.md | 23 +++++++---- eogrow/cli.py | 56 +++++++++++++++----------- eogrow/core/config.py | 26 ++++++------ eogrow/core/pipeline.py | 2 +- eogrow/core/schemas.py | 2 +- eogrow/utils/pipeline_chain.py | 49 +++++++++++++++++++++++ eogrow/utils/testing.py | 3 +- tests/core/test_config.py | 35 +++++++++-------- tests/utils/test_pipeline_chain.py | 63 ++++++++++++++++++++++++++++++ 10 files changed, 199 insertions(+), 66 deletions(-) create mode 100644 eogrow/utils/pipeline_chain.py create mode 100644 tests/utils/test_pipeline_chain.py diff --git a/.github/workflows/ci_action.yml b/.github/workflows/ci_action.yml index 1127a1bd..f1f1c0cb 100644 --- a/.github/workflows/ci_action.yml +++ b/.github/workflows/ci_action.yml @@ -90,8 +90,10 @@ jobs: pip install -e .[DEV,ML] pip install gdal==$(gdal-config --version) - - name: Set up local cluster - run: ray start --head + - name: Set up local cluster # we need to install async-timeout until ray 2.9.0 fixes the issue + run: | + pip install async-timeout + ray start --head - name: Run fast tests if: ${{ !matrix.full_test_suite }} diff --git a/docs/source/config-language.md b/docs/source/config-language.md index fd401cd9..9ca6336d 100644 --- a/docs/source/config-language.md +++ b/docs/source/config-language.md @@ -26,19 +26,28 @@ Additional notes: ### Pipeline chains -A typical configuration is a dictionary with pipeline parameters. However, it can also be a list of dictionaries. In this case each dictionary must contain parameters of a single pipeline. The order of dictionaries defines the consecutive order in which pipelines will be run. Example: +A typical configuration is a dictionary with pipeline parameters. However, it can also be a list of pipeline-execution dictionaries that specify: +- `pipeline_config`: a configuration for a single pipeline, +- `pipeline_resources` (optional): a dictionary that is passed to `ray.remote` to configure which resources the main pipeline process will request from the cluster (see [here](https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html) for options). The pipeline requests 1 CPU by default (and nothing else). + +The order of dictionaries defines the consecutive order in which pipelines will be run. Example: ``` [ { - "pipeline": "FirstPipeline", - "param1": "value1", - ... + "pipeline_config": { + "pipeline": "FirstPipeline", + "param1": "value1", + ... + }, }, { - "pipeline": "SecondPipeline", - "param2": "value2", - ... + "pipeline_config": { + "pipeline": "SecondPipeline", + "param2": "value2", + ... + }, + "pipeline_resources": {"num_cpus": 2} }, ... 
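]

As a quick illustration of how these pieces fit together, the sketch below drives a pipeline chain from Python instead of the CLI. It is a minimal, hedged example and not part of the patch: it assumes a hypothetical `chain.json` file holding a list of pipeline-execution dictionaries like the one above, and it uses the helpers introduced in this patch (`collect_configs_from_path`, which now returns either a single dictionary or a list, and the new `eogrow.utils.pipeline_chain` module).

```python
# Hedged sketch: run a pipeline chain programmatically, mirroring what the
# updated `run_pipeline` CLI command (changed below) does for list configs.
# "chain.json" is a hypothetical file containing a list of pipeline-execution
# dictionaries; a reachable Ray instance is assumed.
from eogrow.core.config import collect_configs_from_path, interpret_config_from_dict
from eogrow.utils.pipeline_chain import run_pipeline_chain, validate_pipeline_chain

crude_configs = collect_configs_from_path("chain.json")  # a list for chain files
pipeline_chain = [interpret_config_from_dict(config) for config in crude_configs]

validate_pipeline_chain(pipeline_chain)  # parse every schema first, so errors surface early
run_pipeline_chain(pipeline_chain)  # runs entries in order, each as its own Ray task
```

Note that each `pipeline_resources` entry is forwarded to Ray's `.options(...)`, so anything accepted there (for example `{"num_cpus": 2}` or custom resources) can be requested per pipeline.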
diff --git a/eogrow/cli.py b/eogrow/cli.py
index 94f4538f..4d73a46d 100644
--- a/eogrow/cli.py
+++ b/eogrow/cli.py
@@ -1,20 +1,23 @@
 """Implements the command line interface for `eo-grow`."""
 
+from __future__ import annotations
+
 import json
 import os
 import re
 import subprocess
 from tempfile import NamedTemporaryFile
-from typing import Optional, Tuple
+from typing import Iterable
 
 import click
 
-from .core.config import collect_configs_from_path, interpret_config_from_dict
+from .core.config import CrudeConfig, RawConfig, collect_configs_from_path, interpret_config_from_dict
 from .core.logging import CLUSTER_FILE_LOCATION_ON_HEAD
 from .core.schemas import build_schema_template
 from .pipelines.testing import TestPipeline
 from .utils.general import jsonify
 from .utils.meta import collect_schema, import_object, load_pipeline_class
+from .utils.pipeline_chain import run_pipeline_chain, validate_pipeline_chain
 from .utils.ray import generate_cluster_config_path, start_cluster_if_needed
 
 variables_option = click.option(
@@ -42,7 +45,7 @@
 @click.argument("config_path", type=click.Path())
 @variables_option
 @test_patches_option
-def run_pipeline(config_path: str, cli_variables: Tuple[str, ...], test_patches: Tuple[int, ...]) -> None:
+def run_pipeline(config_path: str, cli_variables: tuple[str, ...], test_patches: tuple[int, ...]) -> None:
     """Execute eo-grow pipeline using CLI.
 
     \b
@@ -50,20 +53,19 @@ def run_pipeline(config_path: str, cli_variables: Tuple[str, ...], test_patches:
     eogrow config_files/config.json
     """
-    raw_configs = collect_configs_from_path(config_path)
+    crude_config = collect_configs_from_path(config_path)
     cli_variable_mapping = dict(_parse_cli_variable(cli_var) for cli_var in cli_variables)
 
-    pipelines = []
-    for raw_config in raw_configs:
-        config = interpret_config_from_dict(raw_config, cli_variable_mapping)
-        if test_patches:
-            config["test_subset"] = list(test_patches)
-
-        pipelines.append(load_pipeline_class(config).from_raw_config(config))
-
-    for pipeline in pipelines:
+    if isinstance(crude_config, dict):
+        config = _prepare_config(crude_config, cli_variable_mapping, test_patches)
+        pipeline = load_pipeline_class(config).from_raw_config(config)
         pipeline.run()
 
+    else:
+        pipeline_chain = [_prepare_config(config, cli_variable_mapping, test_patches) for config in crude_config]
+        validate_pipeline_chain(pipeline_chain)
+        run_pipeline_chain(pipeline_chain)
+
 
 @click.command()
 @click.argument("cluster_yaml", type=click.Path())
@@ -85,8 +87,8 @@ def run_pipeline_on_cluster(
     cluster_yaml: str,
     start_cluster: bool,
     use_tmux: bool,
-    cli_variables: Tuple[str, ...],
-    test_patches: Tuple[int, ...],
+    cli_variables: tuple[str, ...],
+    test_patches: tuple[int, ...],
 ) -> None:
     """Command for running an eo-grow pipeline on a remote Ray cluster of AWS EC2 instances. 
The provided config is fully constructed and uploaded to the cluster head in the `~/.synced_configs/` directory, where it is then @@ -99,11 +101,9 @@ def run_pipeline_on_cluster( if start_cluster: start_cluster_if_needed(cluster_yaml) - raw_configs = [interpret_config_from_dict(config) for config in collect_configs_from_path(config_path)] remote_path = generate_cluster_config_path(config_path) - with NamedTemporaryFile(mode="w", delete=True, suffix=".json") as local_path: - json.dump(raw_configs, local_path) + json.dump(collect_configs_from_path(config_path), local_path) local_path.flush() # without this the sync can happen before the file content is written subprocess.run(f"ray rsync_up {cluster_yaml} {local_path.name!r} {remote_path!r}", shell=True) @@ -156,7 +156,7 @@ def run_pipeline_on_cluster( ) def make_template( import_path: str, - template_path: Optional[str], + template_path: str | None, force_override: bool, template_format: str, required_only: bool, @@ -203,9 +203,12 @@ def validate_config(config_path: str) -> None: Example: eogrow-validate config_files/config.json """ - for config in collect_configs_from_path(config_path): - raw_config = interpret_config_from_dict(config) - load_pipeline_class(config).Schema.parse_obj(raw_config) + config = collect_configs_from_path(config_path) + if isinstance(config, dict): + pipeline_config = _prepare_config(config, {}, ()) + collect_schema(load_pipeline_class(pipeline_config)).parse_obj(pipeline_config) + else: + validate_pipeline_chain([_prepare_config(run_config, {}, ()) for run_config in config]) click.echo("Config validation succeeded!") @@ -226,7 +229,14 @@ def run_test_pipeline(config_path: str) -> None: pipeline.run() -def _parse_cli_variable(mapping_str: str) -> Tuple[str, str]: +def _prepare_config(config: CrudeConfig, variables: dict[str, str], test_patches: Iterable[int]) -> RawConfig: + raw_config = interpret_config_from_dict(config, variables) + if test_patches: + raw_config["test_subset"] = list(test_patches) + return raw_config + + +def _parse_cli_variable(mapping_str: str) -> tuple[str, str]: """Checks that the input is of shape `name:value` and then splits it into a tuple""" match = re.match(r"(?P.+?):(?P.+)", mapping_str) if match is None: diff --git a/eogrow/core/config.py b/eogrow/core/config.py index 7761046e..e688a222 100644 --- a/eogrow/core/config.py +++ b/eogrow/core/config.py @@ -5,7 +5,7 @@ import copy import re from functools import reduce -from typing import Any, Callable, NewType, cast +from typing import Any, Callable, List, NewType, Union, cast import fs.path import rapidjson @@ -16,7 +16,7 @@ RawConfig = NewType("RawConfig", dict) -def collect_configs_from_path(path: str, used_config_paths: set[str] | None = None) -> list[CrudeConfig]: +def collect_configs_from_path(path: str, used_config_paths: set[str] | None = None) -> CrudeConfig | list[CrudeConfig]: """Loads and builds a list of config dictionaries defined by the parameters stored in files This function performs the 1st stage of language interpretation as described in @@ -38,11 +38,9 @@ def collect_configs_from_path(path: str, used_config_paths: set[str] | None = No config = _recursive_config_build(config, used_config_paths) - if isinstance(config, dict): - config = [config] - if isinstance(config, list): - return config - raise ValueError(f"When interpreting config from {path} a dictionary or list was expected, got {type(config)}.") + if not isinstance(config, (dict, list)): + raise ValueError(f"When interpreting config from {path} a dictionary or 
list was expected, got {type(config)}.") + return cast(Union[CrudeConfig, List[CrudeConfig]], config) def _recursive_config_build(config: object, used_config_paths: set[str]) -> object: @@ -65,13 +63,13 @@ def _recursive_config_build(config: object, used_config_paths: set[str]) -> obje if value in used_config_paths: raise ValueError("Detected a cyclic import of configs") - imported_config_list = collect_configs_from_path(value, used_config_paths=used_config_paths) - if len(imported_config_list) != 1: + imported_config = collect_configs_from_path(value, used_config_paths=used_config_paths) + if not isinstance(imported_config, dict): raise ValueError( "Config lists cannot be imported inside configs. Found a config list when resolving key" f" {key} for path {value}" ) - imported_configs.append(imported_config_list[0]) + imported_configs.append(imported_config) else: joint_config[key] = _recursive_config_build(value, used_config_paths) @@ -113,10 +111,10 @@ def interpret_config_from_dict(config: CrudeConfig, external_variables: dict[str def interpret_config_from_path(path: str) -> RawConfig: """Loads from path in applies both steps of the config language.""" - configs = collect_configs_from_path(path) - if len(configs) != 1: - raise ValueError(f"The JSON file {path} was expected to contain a single dictionary, got {len(configs)}") - return interpret_config_from_dict(configs[0]) + config = collect_configs_from_path(path) + if isinstance(config, dict): + return interpret_config_from_dict(config) + raise ValueError(f"The JSON file {path} was expected to contain a single dictionary, got {len(config)}") def _resolve_config_paths(config_str: str, config_path: str) -> str: diff --git a/eogrow/core/pipeline.py b/eogrow/core/pipeline.py index 4e48fd40..2749c44c 100644 --- a/eogrow/core/pipeline.py +++ b/eogrow/core/pipeline.py @@ -169,7 +169,7 @@ def run_execution( else: ray.init(address="auto", ignore_reinit_error=True) executor_class = RayExecutor - executor_kwargs = {"ray_remote_kwargs": self.config.ray_remote_kwargs} + executor_kwargs = {"ray_remote_kwargs": self.config.worker_resources} LOGGER.info("Starting processing for %d EOPatches", len(execution_kwargs)) diff --git a/eogrow/core/schemas.py b/eogrow/core/schemas.py index 0413bfa3..075d4c62 100644 --- a/eogrow/core/schemas.py +++ b/eogrow/core/schemas.py @@ -45,7 +45,7 @@ class PipelineSchema(BaseSchema): logging: ManagerSchema = Field(description="A schema of an implementation of LoggingManager class") validate_logging = field_validator("logging", validate_manager, pre=True) - ray_remote_kwargs: Dict[str, Any] = Field( + worker_resources: Dict[str, Any] = Field( default_factory=dict, description=( "Keyword arguments passed to ray tasks when executing via `RayExecutor`. The options are specified [here]" diff --git a/eogrow/utils/pipeline_chain.py b/eogrow/utils/pipeline_chain.py new file mode 100644 index 00000000..ecde54de --- /dev/null +++ b/eogrow/utils/pipeline_chain.py @@ -0,0 +1,49 @@ +"""Module implementing utilities for chained configs.""" + +from __future__ import annotations + +from typing import Any, Dict + +import ray +from pydantic import Field, ValidationError + +from ..core.config import RawConfig +from ..core.schemas import BaseSchema +from .meta import collect_schema, load_pipeline_class + + +class PipelineRunSchema(BaseSchema): + pipeline_config: dict + pipeline_resources: Dict[str, Any] = Field( + default_factory=dict, + description=( + "Keyword arguments passed to ray when executing the main pipeline process. 
The options are specified [here]" + "(https://docs.ray.io/en/latest/ray-core/api/doc/ray.remote_function.RemoteFunction.options.html)." + ), + ) + + +def validate_pipeline_chain(pipeline_chain: list[RawConfig]) -> None: + for i, run_config in enumerate(pipeline_chain): + try: + run_schema = PipelineRunSchema.parse_obj(run_config) + except ValidationError as e: + raise TypeError( + f"Pipeline-chain element {i} should be a dictionary with the fields `pipeline_config` and the optional" + " `pipeline_resources`." + ) from e + + pipeline_schema = collect_schema(load_pipeline_class(run_schema.pipeline_config)) + pipeline_schema.parse_obj(run_schema.pipeline_config) + + +def run_pipeline_chain(pipeline_chain: list[RawConfig]) -> None: + for run_config in pipeline_chain: + run_schema = PipelineRunSchema.parse_obj(run_config) + runner = _pipeline_runner.options(**run_schema.pipeline_resources) # type: ignore[attr-defined] + ray.get(runner.remote(run_schema.pipeline_config)) + + +@ray.remote +def _pipeline_runner(config: RawConfig) -> None: + return load_pipeline_class(config).from_raw_config(config).run() diff --git a/eogrow/utils/testing.py b/eogrow/utils/testing.py index 436789b1..ca870cd0 100644 --- a/eogrow/utils/testing.py +++ b/eogrow/utils/testing.py @@ -269,7 +269,8 @@ def run_config( :param check_logs: If pipeline logs should be checked after the run completes. If EOWorkflows were used, the function fails if there were unsuccessful executions. """ - crude_configs = collect_configs_from_path(config_path) + collected_configs = collect_configs_from_path(config_path) + crude_configs = collected_configs if isinstance(collected_configs, list) else [collected_configs] raw_configs = [interpret_config_from_dict(config) for config in crude_configs] for config in raw_configs: diff --git a/tests/core/test_config.py b/tests/core/test_config.py index 5e3832d7..c25610ce 100644 --- a/tests/core/test_config.py +++ b/tests/core/test_config.py @@ -14,25 +14,26 @@ CONFIG_LIST = [CONFIG_DICT, CONFIG_DICT] -@pytest.mark.parametrize("config_object", [CONFIG_DICT, CONFIG_LIST]) -def test_config_from_file(config_object, temp_folder): +def test_config_from_file_single(temp_folder): path = os.path.join(temp_folder, "config.json") with open(path, "w") as fp: - json.dump(config_object, fp) - - config_list = list(map(interpret_config_from_dict, collect_configs_from_path(path))) - if isinstance(config_object, dict): - directly_loaded_config = interpret_config_from_path(path) - assert len(config_list) == 1 - assert isinstance(directly_loaded_config, dict) - assert isinstance(config_list[0], dict) - assert directly_loaded_config == config_object - assert config_list[0] == config_object - - else: - assert isinstance(config_list, list) - assert all(isinstance(config, dict) for config in config_list) - assert config_list == config_object + json.dump(CONFIG_DICT, fp) + + directly_loaded_config = interpret_config_from_path(path) + assert isinstance(directly_loaded_config, dict) + assert directly_loaded_config == CONFIG_DICT + assert directly_loaded_config == interpret_config_from_dict(collect_configs_from_path(path)) + + +def test_config_from_file_chain(temp_folder): + path = os.path.join(temp_folder, "config.json") + with open(path, "w") as fp: + json.dump(CONFIG_LIST, fp) + + config_list = collect_configs_from_path(path) + assert isinstance(config_list, list) + assert all(isinstance(config, dict) for config in config_list) + assert config_list == CONFIG_LIST def test_missing_config_loading(): diff --git 
a/tests/utils/test_pipeline_chain.py b/tests/utils/test_pipeline_chain.py new file mode 100644 index 00000000..9b623c62 --- /dev/null +++ b/tests/utils/test_pipeline_chain.py @@ -0,0 +1,63 @@ +import os + +import pytest +from pydantic import ValidationError + +from eogrow.core.config import collect_configs_from_path +from eogrow.utils.pipeline_chain import run_pipeline_chain, validate_pipeline_chain + + +@pytest.fixture(name="global_config") +def global_config_fixture(config_folder): + return collect_configs_from_path(os.path.join(config_folder, "global_config.json")) + + +@pytest.fixture(name="some_valid_pipeline_config") +def some_valid_pipeline_config_fixture(global_config): + return { + "pipeline": "eogrow.pipelines.import_tiff.ImportTiffPipeline", + "tiff_folder_key": "input_data", + "output_folder_key": "temp", + "output_feature": ["data", "ImportedData"], + "input_filename": "import_test.tiff", + **global_config, + } + + +def test_validate_pipeline_chain(some_valid_pipeline_config): + good_chain = [ + {"pipeline_config": some_valid_pipeline_config}, + {"pipeline_config": some_valid_pipeline_config, "pipeline_resources": {"num_cpus": 1}}, + ] + + validate_pipeline_chain(good_chain) + + +def test_validate_pipeline_chain_fail(some_valid_pipeline_config): + bad_config_in_chain = [ + {"pipeline_config": {"nonexisting_param": "quack quack", **some_valid_pipeline_config}}, + ] + with pytest.raises(ValidationError): + validate_pipeline_chain(bad_config_in_chain) + + bad_chain = [ + {"pipeline_config": some_valid_pipeline_config}, + {"pipeline_config": some_valid_pipeline_config, "PipelineResources": {"num_cpus": 1}}, + ] + with pytest.raises(TypeError): + validate_pipeline_chain(bad_chain) + + +def test_run_pipeline_chain(global_config): + pipeline_config = { + "pipeline": "eogrow.pipelines.testing.GenerateDataPipeline", + "output_folder_key": "temp", + "seed": 42, + "timestamps": {"time_period": ["2021-06-15", "2022-09-05"], "num_timestamps": 10}, + **global_config, + } + chain = [ + {"pipeline_config": pipeline_config}, + {"pipeline_config": pipeline_config, "pipeline_resources": {"num_cpus": 1}}, + ] + run_pipeline_chain(chain) From 4df35702dbe63cba127e96aad49c50a611f6d3c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:49:02 +0100 Subject: [PATCH 09/10] Remove TestPipeline and remove outdated example (#306) * adjust docstrings a bit * replace value errors with type errors where appropriate * remove TestPipeline * switch import format in a file * clean up tests * remove outdated example --- eogrow/cli.py | 17 - eogrow/core/config.py | 18 +- eogrow/core/pipeline.py | 9 +- eogrow/pipelines/export_maps.py | 2 +- eogrow/pipelines/sampling.py | 2 +- eogrow/pipelines/testing.py | 51 +- examples/workshop/bohinj-aoi.geojson | 29 - examples/workshop/cluster.yaml | 97 --- examples/workshop/configs/download.json | 12 - examples/workshop/configs/global_config.json | 32 - .../workshop/configs/water_detection.json | 4 - examples/workshop/workshop.ipynb | 762 ------------------ tests/test_cli.py | 1 - 13 files changed, 15 insertions(+), 1021 deletions(-) delete mode 100644 examples/workshop/bohinj-aoi.geojson delete mode 100644 examples/workshop/cluster.yaml delete mode 100644 examples/workshop/configs/download.json delete mode 100644 examples/workshop/configs/global_config.json delete mode 100644 examples/workshop/configs/water_detection.json delete mode 100644 examples/workshop/workshop.ipynb diff --git 
a/eogrow/cli.py b/eogrow/cli.py index 4d73a46d..5ff9e7a6 100644 --- a/eogrow/cli.py +++ b/eogrow/cli.py @@ -14,7 +14,6 @@ from .core.config import CrudeConfig, RawConfig, collect_configs_from_path, interpret_config_from_dict from .core.logging import CLUSTER_FILE_LOCATION_ON_HEAD from .core.schemas import build_schema_template -from .pipelines.testing import TestPipeline from .utils.general import jsonify from .utils.meta import collect_schema, import_object, load_pipeline_class from .utils.pipeline_chain import run_pipeline_chain, validate_pipeline_chain @@ -213,22 +212,6 @@ def validate_config(config_path: str) -> None: click.echo("Config validation succeeded!") -@click.command() -@click.argument("config_path", type=click.Path()) -def run_test_pipeline(config_path: str) -> None: - """Runs a test pipeline that only makes sure the managers work correctly. This can be used to select best - area manager parameters. - - \b - Example: - eogrow-test any_pipeline_config.json - """ - for crude_config in collect_configs_from_path(config_path): - raw_config = interpret_config_from_dict(crude_config) - pipeline = TestPipeline.with_defaults(raw_config) - pipeline.run() - - def _prepare_config(config: CrudeConfig, variables: dict[str, str], test_patches: Iterable[int]) -> RawConfig: raw_config = interpret_config_from_dict(config, variables) if test_patches: diff --git a/eogrow/core/config.py b/eogrow/core/config.py index e688a222..715bd409 100644 --- a/eogrow/core/config.py +++ b/eogrow/core/config.py @@ -39,7 +39,7 @@ def collect_configs_from_path(path: str, used_config_paths: set[str] | None = No config = _recursive_config_build(config, used_config_paths) if not isinstance(config, (dict, list)): - raise ValueError(f"When interpreting config from {path} a dictionary or list was expected, got {type(config)}.") + raise TypeError(f"When interpreting config from {path} a dictionary or list was expected, got {type(config)}.") return cast(Union[CrudeConfig, List[CrudeConfig]], config) @@ -57,7 +57,7 @@ def _recursive_config_build(config: object, used_config_paths: set[str]) -> obje for key, value in config.items(): if not isinstance(key, str): - raise ValueError(f"Dictionary keys should always be strings, but found: {key}") + raise TypeError(f"Dictionary keys should always be strings, but found: {key}") if key.startswith("**"): if value in used_config_paths: @@ -89,6 +89,9 @@ def interpret_config_from_dict(config: CrudeConfig, external_variables: dict[str """ _recursive_check_config(config) + if not isinstance(config, dict): + raise TypeError(f"Can only interpret dictionary objects, got {type(config)}.") + config = cast(CrudeConfig, config.copy()) variable_mapping = config.pop("variables", {}) if external_variables: @@ -101,10 +104,6 @@ def interpret_config_from_dict(config: CrudeConfig, external_variables: dict[str config_with_variables = _recursive_apply_to_strings( config, lambda config_str: _resolve_variables(config_str, variable_mapping) ) - if not isinstance(config_with_variables, dict): - raise ValueError( - f"Interpretation resulted in object of type {type(config_with_variables)} but a dictionary was expected." - ) return cast(RawConfig, config_with_variables) @@ -153,14 +152,11 @@ def _recursive_apply_to_strings(config: object, function: Callable) -> object: def _recursive_check_config(config: object) -> None: - """Recursively checks if the config satisfies basic conditions for being JSON serializable. 
- - :raises: ValueError - """ + """Recursively checks if the config satisfies basic conditions for being JSON serializable.""" if isinstance(config, dict): for key, value in config.items(): if not isinstance(key, str): - raise ValueError(f"Config keys should be strings but {key} found") + raise TypeError(f"Config keys should be strings but {key} found") _recursive_check_config(value) elif isinstance(config, list): diff --git a/eogrow/core/pipeline.py b/eogrow/core/pipeline.py index 2749c44c..93bf92aa 100644 --- a/eogrow/core/pipeline.py +++ b/eogrow/core/pipeline.py @@ -81,7 +81,7 @@ def _new_pipeline_id() -> str: def _load_manager(manager_config: ManagerSchema, **manager_params: Any) -> Any: """Loads a manager class and back-propagates parsed config - :param manager_key: A config key name of a sub-config with manager parameters + :param manager_config: A sub-config with manager parameters :param manager_params: Other parameters to initialize a manager class """ if manager_config.manager is None: @@ -94,7 +94,7 @@ def get_pipeline_execution_name(self, pipeline_timestamp: str) -> str: return f"{pipeline_timestamp}-{self._pipeline_name}-{self.pipeline_id}" def get_patch_list(self) -> PatchList: - """Method which at the initialization prepares the list of EOPatches which will be used""" + """Method that prepares the list of EOPatches for which to run the pipeline execution.""" patch_list = self.area_manager.get_patch_list() if self.config.test_subset is not None: @@ -130,7 +130,7 @@ def get_execution_arguments(self, workflow: EOWorkflow, patch_list: PatchList) - """Prepares execution arguments for each eopatch from a list of patches. The output should be a dictionary of form `{execution_name: {node: node_kwargs}}`. Execution names are usually - names of EOPatches, but can be anything. + names of EOPatches, but can be anything. :param workflow: A workflow for which arguments will be prepared """ @@ -262,8 +262,7 @@ def run_procedure(self) -> tuple[list[str], list[str]]: """ if not hasattr(self, "build_workflow"): raise NotImplementedError( - "Default implementation of the `run_procedure` method requires implementation of the `build_workflow`" - " method." + "Implementation of the `run_procedure` method requires implementation of the `build_workflow` method." 
) workflow = self.build_workflow() patch_list = self.get_patch_list() diff --git a/eogrow/pipelines/export_maps.py b/eogrow/pipelines/export_maps.py index c627edc5..e07f65d5 100644 --- a/eogrow/pipelines/export_maps.py +++ b/eogrow/pipelines/export_maps.py @@ -131,7 +131,7 @@ def run_procedure(self) -> tuple[list[str], list[str]]: successful, failed, _ = self.run_execution(workflow, exec_args) if not successful: - raise ValueError("Failed to extract tiff files from any of EOPatches.") + raise RuntimeError("Failed to extract tiff files from any of EOPatches.") feature_type, _ = self.config.feature output_folder = self.storage.get_folder(self.config.output_folder_key) diff --git a/eogrow/pipelines/sampling.py b/eogrow/pipelines/sampling.py index 57e4ff8e..2d744784 100644 --- a/eogrow/pipelines/sampling.py +++ b/eogrow/pipelines/sampling.py @@ -92,7 +92,7 @@ def _get_loading_node(self) -> EONode: feature_type = FeatureType(feature_type_str) if not feature_type.is_spatial(): - raise ValueError(f"Only spatial features can be sampled, but found {feature_type}: {feature_names}") + raise TypeError(f"Only spatial features can be sampled, but found {feature_type}: {feature_names}") for feature_name in feature_names: load_features.append((feature_type, feature_name)) # noqa: PERF401 diff --git a/eogrow/pipelines/testing.py b/eogrow/pipelines/testing.py index 76b19248..dd4c82c8 100644 --- a/eogrow/pipelines/testing.py +++ b/eogrow/pipelines/testing.py @@ -2,8 +2,7 @@ from __future__ import annotations -import logging -from typing import List, Literal, Optional, Tuple, TypeVar, Union +from typing import List, Literal, Optional, Tuple, Union import numpy as np from pydantic import Field @@ -11,58 +10,12 @@ from eolearn.core import CreateEOPatchTask, EONode, EOWorkflow, OverwritePermission, SaveTask from eolearn.core.types import Feature -from ..core.config import RawConfig, recursive_config_join from ..core.pipeline import Pipeline from ..core.schemas import BaseSchema -from ..tasks.testing import ( - GenerateRasterFeatureTask, - GenerateTimestampsTask, - NormalDistribution, - UniformDistribution, -) +from ..tasks.testing import GenerateRasterFeatureTask, GenerateTimestampsTask, NormalDistribution, UniformDistribution from ..types import ExecKwargs, PatchList, TimePeriod from ..utils.validators import ensure_storage_key_presence, field_validator, parse_dtype, parse_time_period -Self = TypeVar("Self", bound="TestPipeline") -LOGGER = logging.getLogger(__name__) - - -class TestPipeline(Pipeline): - """Pipeline that just tests if all managers works correctly. It can be used to check if area manager creates a - correct grid. 
- """ - - class Schema(Pipeline.Schema): - class Config: - extra = "allow" - - _DEFAULT_CONFIG_PARAMS = { # noqa: RUF012 - "pipeline": "eogrow.pipelines.testing.TestPipeline", - "logging": {"manager": "eogrow.logging.LoggingManager", "show_logs": True}, - } - - @classmethod - def with_defaults(cls: type[Self], config: RawConfig) -> Self: - config = recursive_config_join(config, cls._DEFAULT_CONFIG_PARAMS) # type: ignore[assignment] - return cls.from_raw_config(config) - - def run_procedure(self) -> tuple[list, list]: - """Performs basic tests of managers""" - if self.storage.filesystem.exists("/"): - LOGGER.info("Project folder %s exists", self.storage.config.project_folder) - else: - LOGGER.info("Project folder %s does not exist", self.storage.config.project_folder) - - self.area_manager.get_area_geometry() - grid = self.area_manager.get_grid() - num_patches = sum(map(len, grid.values())) - LOGGER.info("Grid has %d EOPatches and is split over %d CRS zones", num_patches, len(grid)) - - patch_list = self.area_manager.get_patch_list() - LOGGER.info("The first EOPatch has a name %s", patch_list[0][0]) - - return [], [] - class UniformDistributionSchema(BaseSchema): kind: Literal["uniform"] diff --git a/examples/workshop/bohinj-aoi.geojson b/examples/workshop/bohinj-aoi.geojson deleted file mode 100644 index fe169fc7..00000000 --- a/examples/workshop/bohinj-aoi.geojson +++ /dev/null @@ -1,29 +0,0 @@ -{ - "type": "FeatureCollection", - "name": "aoi", - "crs": { - "type": "name", - "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } - }, - "features": [ - { - "type": "Feature", - "properties": { "fid": 0 }, - "geometry": { - "type": "MultiPolygon", - "coordinates": [ - [ - [ - [13.834332897646069, 46.302927445541421], - [13.806601314318312, 46.282448248973182], - [13.846812110143556, 46.26567595392445], - [13.903835178361259, 46.269390261729221], - [13.887196228364607, 46.300173379452758], - [13.834332897646069, 46.302927445541421] - ] - ] - ] - } - } - ] -} diff --git a/examples/workshop/cluster.yaml b/examples/workshop/cluster.yaml deleted file mode 100644 index 6f54116c..00000000 --- a/examples/workshop/cluster.yaml +++ /dev/null @@ -1,97 +0,0 @@ -# A configuration of ray cluster for GEM project -# For info about parameters check https://docs.ray.io/en/latest/cluster/config.html#full-configuration - -cluster_name: workshop-cluster - -max_workers: 4 # Max number of worker instances -upscaling_speed: 1.0 -idle_timeout_minutes: 5 - -docker: - image: ".dkr.ecr.eu-central-1.amazonaws.com/" # Edit this! - container_name: "gem_container" - pull_before_run: True - run_options: - - --privileged # Because of s3fs-fuse - -provider: - type: aws - region: eu-central-1 - availability_zone: eu-central-1a,eu-central-1b,eu-central-1c - cache_stopped_nodes: False # Change for terminating instances - -auth: - ssh_user: ubuntu - -available_node_types: - ray.head: - min_workers: 0 - max_workers: 0 - node_config: - InstanceType: m5.xlarge - ImageId: ami- # Edit this! - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 20 - resources: {"CPU": 1} - ray.worker: - min_workers: 0 - max_workers: 4 # Max number of workers of this type - node_config: - InstanceType: m5.xlarge - ImageId: ami- # Edit this! 
- InstanceMarketOptions: - MarketType: spot # always try using spot because it is cheaper - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 20 -# resources: {"CPU": 1} - -head_node_type: ray.head - -file_mounts: {} -cluster_synced_files: [] -file_mounts_sync_continuously: False -rsync_exclude: - - "**/.git" - - "**/.git/**" -rsync_filter: - - ".gitignore" - -initialization_commands: - - aws ecr get-login-password | docker login --username AWS --password-stdin .dkr.ecr.eu-central-1.amazonaws.com - -setup_commands: - # Set credentials here: - - aws --profile workshop configure set aws_access_key_id - - aws --profile workshop configure set aws_secret_access_key - - aws --profile workshop configure set region eu-central-1 - # Mounting an S3 bucket (useful just for prototyping): - - cat .aws/credentials | grep -m 2 access | awk '{print $3}' | xargs | sed 's/ /:/g' > ~/.passwd-s3fs - - chmod 600 ~/.passwd-s3fs - - s3fs eogrow-workshop ~/data -o umask=0000 | true - - - git -C packages/sentinelhub-py pull - - git -C packages/eo-learn pull - - git -C packages/eo-grow pull - # This is temporal: - - git -C packages/eo-grow checkout -b | true - - git -C packages/eo-grow pull origin - -head_setup_commands: - - pip install jupyter - -worker_setup_commands: [] - -head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml - -worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 - -head_node: {} -worker_nodes: {} diff --git a/examples/workshop/configs/download.json b/examples/workshop/configs/download.json deleted file mode 100644 index b2547dc3..00000000 --- a/examples/workshop/configs/download.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "pipeline": "eogrow.pipelines.download.DownloadPipeline", - "**global_config": "${config_path}/global_config.json", - "output_folder_key": "data", - "time_period": ["${var:month}-01", "${var:next_month}-01"], - "data_collection": "SENTINEL2_L1C", - "resolution": 10, - "bands_feature_name": "BANDS", - "maxcc": 1.0, - "time_difference": 120, - "threads_per_worker": 5 -} diff --git a/examples/workshop/configs/global_config.json b/examples/workshop/configs/global_config.json deleted file mode 100644 index f0e10adc..00000000 --- a/examples/workshop/configs/global_config.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "variables": { - "month": "2021-06", - "next_month": "2021-07" - }, - "storage": { - "manager": "eogrow.core.storage.StorageManager", - "project_folder": "s3://eogrow-workshop/project/", - "aws_profile": "workshop", - "structure": { - "data": "data/${var:month}", - "results": "results/${var:month}", - "vector_results": "vector-results/${var:month}" - } - }, - "area": { - "manager": "eogrow.core.area.UtmZoneAreaManager", - "area_filename": "bohinj-aoi.geojson", - "area_buffer": 0.01, - "patch_size_x": 2500, - "patch_size_y": 2500 - }, - "eopatch": { - "manager": "eogrow.core.eopatch.EOPatchManager" - }, - "logging": { - "manager": "eogrow.core.logging.LoggingManager", - "save_logs": true, - "show_logs": true - }, - "use_ray": "auto" -} diff --git a/examples/workshop/configs/water_detection.json b/examples/workshop/configs/water_detection.json deleted file mode 100644 index d74898a2..00000000 --- a/examples/workshop/configs/water_detection.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "**global_config": "${config_path}/global_config.json", - "threshold": 0.1 -} diff --git 
a/examples/workshop/workshop.ipynb b/examples/workshop/workshop.ipynb deleted file mode 100644 index 95b42f39..00000000 --- a/examples/workshop/workshop.ipynb +++ /dev/null @@ -1,762 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "id": "mobile-manor", - "metadata": {}, - "source": [ - "# `eo-grow` Workshop\n", - "\n", - "`eo-grow` is a framework for large-scale processing of EO data. In this workshop we'll learn:\n", - "\n", - "- how to run an `eo-grow` pipeline,\n", - "- how to scale up a pipeline,\n", - "- how to write a new pipeline.\n", - "\n", - "The framework can run:\n", - "\n", - "- completely locally on a laptop,\n", - "- local processing with data storage on S3\n", - " * use only for small data transfers!\n", - "- processing on EC2 instances with data storage on S3.\n", - "\n", - "For this workshop we'll use 2nd and 3rd option.\n", - "\n", - "\n", - "## 0. Prerequisites\n", - "\n", - "The package requires Python version `>=3.8`. You can choose between:\n", - "\n", - "- installing `eo-grow` from PyPI:\n", - " \n", - " ```\n", - " pip install eo-grow\n", - " ```\n", - "\n", - "- or installing `eo-grow` from the current branch with:\n", - "\n", - " ```\n", - " pip install -e .\n", - " ```\n", - " \n", - "This workshop also requires an access to an AWS S3 bucket with data:\n", - "\n", - "```\n", - "aws configure --profile workshop\n", - "```\n", - "\n", - "Additionally you have to set `sentinelhub-py` OAuth credentials.\n", - "\n", - " \n", - "## 1. How to use `eo-grow`?\n", - "\n", - "The core `eo-grow` structure looks like this:\n", - "\n", - "![](../eo-grow.png)\n", - "\n", - "- A `Pipeline` obtains configuration parameters and uses managers as helpers.\n", - "- Configuration parameters can be read from JSON files or Python dictionaries. They are parsed with a special [config language](../config-language.md) and wrapped with an object specific `Schema` class.\n", - "- Storage structure and credentials are handled by a `StorageManager`.\n", - "- AOI is buffered and split into a tiling grid with different implementations of `AreaManager`.\n", - "- EOPatch naming conventions are defined in an `EOPatchManager`.\n", - "- Logging is controlled with a `LoggingManager`.\n", - "\n", - "Pipeline and manager classes all inherit from a base `EOGrowObject` and are similar in a ways that:\n", - "\n", - "- they all contain their own `Schema` class that defines which config parameters they use,\n", - "- they are all meant to be inherited and customized for any use case.\n", - "\n", - "\n", - "The most basic procedure of using `eo-grow` is:\n", - "\n", - "1. set up a project folder for storage,\n", - "2. implement a new pipeline or use one of the basic pipelines in `eogrow.pipelines`,\n", - "3. prepare a config file,\n", - "4. run a pipeline.\n", - "\n", - "### Exercise 1\n", - "\n", - "- As a storage we will use a project folder in an AWS S3 bucket `s3://eogrow-workshop/project/`.\n", - "\n", - "- We will run a basic download pipeline (`eogrow.pipelines.download.DownloadPipeline`) for AOI defined in a file `s3://eogrow-workshop/project/input-data/bohinj_aoi.geojson`.\n", - "\n", - "- We will buffer AOI by `0.01` and split AOI into a UTM grid with a patch size `250x250` pixels on `10m` resolution.\n", - "\n", - "\n", - "For now we will only use CLI commands to run the pipeline. 
`eo-grow` offers the following commands:\n", - "\n", - "- `eogrow` - run a pipeline\n", - "- `eogrow-template` - create a template config for a\n", - "- `eogrow-validate` - validate a pipeline config\n", - "- `eogrow-test` - test managers on a dummy pipeline\n", - "- `eogrow-ray` - run a pipeline on a cluster\n", - "\n", - "Note: names of these commands are defined in `setup.py`.\n", - "\n", - "A command `eogrow-template` can help us write a config file. Let's check what templates we get for different objects:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "wooden-school", - "metadata": {}, - "outputs": [], - "source": [ - "!eogrow-template eogrow.pipelines.download.DownloadPipeline\n", - "# !eogrow-template eogrow.pipelines.download.DownloadPipeline download_template_openapi.json -f\n", - "\n", - "# !eogrow-template eogrow.core.storage.StorageManager\n", - "# !eogrow-template eogrow.core.area.UtmZoneAreaManager\n", - "# !eogrow-template eogrow.core.eopatch.EOPatchManager\n", - "# !eogrow-template eogrow.core.logging.LoggingManager" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "revised-ethnic", - "metadata": {}, - "source": [ - "We can use config language to:\n", - "\n", - "- split config parameters into multiple files,\n", - "- avoid parameter duplications,\n", - "- reference:\n", - " * relative file paths,\n", - " * package import paths,\n", - " * environmental variables" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "quarterly-collector", - "metadata": {}, - "source": [ - "If we would like to just check if the config file contains correct parameters without running a pipeline we can do that with:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "anticipated-charleston", - "metadata": {}, - "outputs": [], - "source": [ - "!eogrow-validate configs/download.json" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "infinite-clinton", - "metadata": {}, - "source": [ - "Before we run the pipeline let's check if all managers are working correctly:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "posted-meeting", - "metadata": {}, - "outputs": [], - "source": [ - "!eogrow-test configs/download.json" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "prescribed-freedom", - "metadata": {}, - "source": [ - "This ran a simple `TestPipeline` that only checked all managers. The pipeline produced\n", - "\n", - "- logs\n", - "- cached area manager buffered shape and grid\n", - "\n", - "Let's download cached data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "liable-wedding", - "metadata": {}, - "outputs": [], - "source": [ - "!aws s3 sync s3://eogrow-workshop/project/cache/ ./cache --profile workshop" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "great-finger", - "metadata": {}, - "source": [ - "To test if the download pipeline will produce correct results we can first run it for a single patch in the grid:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "antique-theme", - "metadata": {}, - "outputs": [], - "source": [ - "!eogrow configs/download.json -t 0" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "continuing-investigation", - "metadata": {}, - "source": [ - "Now we are ready to run it for the entire grid with a command:\n", - "\n", - "```\n", - "eogrow download.json\n", - "```\n", - "\n", - "But before we do this, let's switch to a Ray cluster." 
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "continued-bearing", - "metadata": {}, - "source": [ - "## 2. How to scale up?\n", - "\n", - "In `eo-grow` parallelization can be achieved with:\n", - "\n", - "- multiprocessing on a single machine (for simple use cases),\n", - "- Ray parallelization on:\n", - " * a single machine\n", - " * a **cluster of AWS EC2 instances**.\n", - "\n", - "Ray cluster can be fully configured with a single YAML file as described in [Ray documentation](https://docs.ray.io/en/latest/cluster/config.html).\n", - "\n", - "Once we prepared the YAML file we can spawn a ray cluster:\n", - "\n", - "```bash\n", - "ray up cluster.yaml -y\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "collaborative-invitation", - "metadata": {}, - "source": [ - "We can attach to it with:\n", - "\n", - "```bash\n", - "ray attach cluster.yaml\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "recent-prairie", - "metadata": {}, - "source": [ - "We can upload any local files to the cluster.\n", - "\n", - "```bash\n", - "ray rsync_up cluster.yaml '/local/path' '/full/absolute/path/on/cluster'\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "desperate-amount", - "metadata": {}, - "source": [ - "Note: Alternativelly, we could commit local files and let the cluster pull them from a git repository." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "convenient-gibson", - "metadata": {}, - "source": [ - "On a cluster we can then simply run the pipeline with:\n", - " \n", - "```bash\n", - "eogrow download.json\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "rough-intake", - "metadata": {}, - "source": [ - "An even easier option is simply run a pipeline on a cluster using your local config to a cluster with a command:\n", - "\n", - "```bash\n", - "eogrow-ray cluster.yaml configs/download.json\n", - "```\n", - "\n", - "This command also has a few useful optional flags:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "painful-montana", - "metadata": {}, - "outputs": [], - "source": [ - "!eogrow-ray --help" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "formal-laundry", - "metadata": {}, - "source": [ - "Cluster CPU and memory usage can be monitored from a Ray dashboard. We can connect to it with:\n", - "\n", - "```bash\n", - "ray dashboard cluster.yaml\n", - "```\n", - "\n", - "The dashboard will become available at `localhost:8265`." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "published-florence", - "metadata": {}, - "source": [ - "When we are done processing, let's make sure that we shut down the cluster:\n", - "\n", - "```bash\n", - "ray down cluster.yaml -y\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "excessive-programming", - "metadata": {}, - "source": [ - "## 3. How to implement a new pipeline?\n", - "\n", - "Let's start from a typical workflow, which can be created in a prototype phase. 
The following workflow performs a simple water detection algorithm on a stack of data that we downloaded:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "strong-emerald", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "\n", - "from eolearn.core import (\n", - " EOWorkflow,\n", - " FeatureType,\n", - " LoadTask,\n", - " MapFeatureTask,\n", - " OutputTask,\n", - " OverwritePermission,\n", - " SaveTask,\n", - " linearly_connect_tasks,\n", - ")\n", - "from eolearn.core.utils.fs import get_aws_credentials\n", - "from eolearn.features import NormalizedDifferenceIndexTask\n", - "\n", - "config = get_aws_credentials(aws_profile=\"workshop\")\n", - "\n", - "bands_feature = FeatureType.DATA, \"BANDS\"\n", - "ndwi_feature = FeatureType.DATA, \"NDWI\"\n", - "water_feature = FeatureType.MASK_TIMELESS, \"WATER\"\n", - "\n", - "load_task = LoadTask(\"s3://eogrow-workshop/project/data/2021-06/\", config=config)\n", - "\n", - "ndwi_task = NormalizedDifferenceIndexTask(bands_feature, ndwi_feature, bands=[2, 7])\n", - "\n", - "\n", - "class ThresholdWater(MapFeatureTask):\n", - " def map_method(self, ndwi, threshold):\n", - " max_ndwi = np.max(ndwi, axis=0)\n", - " return max_ndwi > threshold\n", - "\n", - "\n", - "threshold_task = ThresholdWater(ndwi_feature, water_feature, threshold=0.1)\n", - "\n", - "output_task = OutputTask(name=\"result_eop\")\n", - "\n", - "nodes = linearly_connect_tasks(load_task, ndwi_task, threshold_task, output_task)\n", - "workflow = EOWorkflow(nodes)\n", - "\n", - "workflow_results = workflow.execute({nodes[0]: {\"eopatch_folder\": \"eopatch-id-08-col-3-row-1\"}})\n", - "\n", - "eop = workflow_results.outputs[\"result_eop\"]\n", - "\n", - "eop" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "blond-friendly", - "metadata": {}, - "outputs": [], - "source": [ - "ndwi = eop[ndwi_feature]\n", - "\n", - "fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(20, 15))\n", - "for index in range(12):\n", - " ax = axes[index // 4][index % 4]\n", - " ax.imshow(ndwi[index, ...], vmin=0.1, vmax=0.5)\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - "\n", - "fig.subplots_adjust(wspace=0, hspace=0);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "compressed-peeing", - "metadata": {}, - "outputs": [], - "source": [ - "fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))\n", - "\n", - "water = eop[water_feature]\n", - "\n", - "ax.imshow(water)\n", - "ax.set_xticks([])\n", - "ax.set_yticks([]);" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "appreciated-minutes", - "metadata": {}, - "source": [ - "Now let's put this process into a pipeline. The minimum that we have to do is:\n", - "\n", - "- Create a class that inherits from `Pipeline` class.\n", - "- In case you want to have custom config parameters, add `Schema` subclass that inherits from `Pipeline.Schema`.\n", - "- Implement `build_workflow` method." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "impossible-overall", - "metadata": {}, - "outputs": [], - "source": [ - "from eogrow.core.pipeline import Pipeline\n", - "\n", - "\n", - "class WaterDetectionPipeline(Pipeline):\n", - " class Schema(Pipeline.Schema):\n", - " threshold: float\n", - "\n", - " def build_workflow(self):\n", - " bands_feature = FeatureType.DATA, \"BANDS\"\n", - " ndwi_feature = FeatureType.DATA, \"NDWI\"\n", - " water_feature = FeatureType.MASK_TIMELESS, \"WATER\"\n", - "\n", - " load_task = LoadTask(self.storage.get_folder(\"data\", full_path=True), config=self.sh_config)\n", - "\n", - " ndwi_task = NormalizedDifferenceIndexTask(bands_feature, ndwi_feature, bands=[2, 7])\n", - "\n", - " threshold_task = ThresholdWater(ndwi_feature, water_feature, threshold=self.config.threshold)\n", - "\n", - " save_task = SaveTask(\n", - " self.storage.get_folder(\"results\", full_path=True),\n", - " features=[water_feature, FeatureType.BBOX],\n", - " compress_level=1,\n", - " overwrite_permission=OverwritePermission.OVERWRITE_FEATURES,\n", - " config=self.sh_config,\n", - " )\n", - "\n", - " nodes = linearly_connect_tasks(load_task, ndwi_task, threshold_task, save_task)\n", - " return EOWorkflow(nodes)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "played-command", - "metadata": {}, - "source": [ - "This time we cannot run `WaterPipeline` with CLI because the pipeline is implemented in a notebook and we cannot reference its import path. But we can run it from Python. Let's create a config for it. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "crazy-rehabilitation", - "metadata": {}, - "outputs": [], - "source": [ - "from eogrow.core.config import interpret_config_from_path\n", - "\n", - "config = interpret_config_from_path(\"./configs/water_detection.json\")\n", - "config" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "historical-client", - "metadata": {}, - "source": [ - "Let's initialize the pipeline and check some of its basic functionalities:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "suitable-advance", - "metadata": {}, - "outputs": [], - "source": [ - "pipeline = WaterDetectionPipeline.from_raw_config(config)\n", - "\n", - "pipeline\n", - "\n", - "# pipeline.config\n", - "# pipeline.sh_config\n", - "\n", - "# pipeline.storage\n", - "# pipeline.storage.filesystem\n", - "# pipeline.storage.get_folder('data')\n", - "\n", - "# pipeline.area_manager\n", - "# pipeline.area_manager.get_grid()[0]\n", - "\n", - "# pipeline.eopatch_manager\n", - "# pipeline.eopatch_manager.get_eopatch_filenames()\n", - "# pipeline.patch_list\n", - "\n", - "# pipeline.logging_manager\n", - "# pipeline.logging_manager.get_pipeline_logs_folder('pipeline-name')\n", - "# pipeline.get_pipeline_execution_name('2021-10-19')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "absolute-richardson", - "metadata": {}, - "source": [ - "During `Pipeline` class initialization only config is validated and parsed according to schema and managers are initialized. No computation is done yet. 
Let's run the pipeline for a single `EOPatch`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "orange-camel", - "metadata": {}, - "outputs": [], - "source": [ - "config = interpret_config_from_path(\"./configs/water_detection.json\")\n", - "\n", - "config[\"patch_list\"] = [8] # References EOPatch 'eopatch-id-08-col-3-row-1'\n", - "\n", - "pipeline = WaterDetectionPipeline.from_raw_config(config)\n", - "\n", - "pipeline.run()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "proper-stick", - "metadata": {}, - "source": [ - "Before we run the pipeline for all EOPatches let's write another pipeline. This one will not be limited by `EOWorkflow` execution. After all, a pipeline can implement any process!\n", - "\n", - "In this example we will create a pipeline that vectorizes water masks, joins vectors from all EOPatches and saves them into a single file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "abstract-collection", - "metadata": {}, - "outputs": [], - "source": [ - "from eolearn.geometry import RasterToVectorTask\n", - "\n", - "r2v_task = RasterToVectorTask(water_feature, values=[1], raster_dtype=np.uint8)\n", - "\n", - "eop = r2v_task.execute(eop)\n", - "\n", - "eop.vector_timeless[\"WATER\"].plot();" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "adverse-armor", - "metadata": {}, - "source": [ - "This time we also have to implement `run_procedure` method. This is the main method that is triggered by `Pipeline.run` and its default implementation only runs an EOWorkflow." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "baking-introduction", - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "\n", - "import fs\n", - "\n", - "from eogrow.core.pipeline import Pipeline\n", - "from eogrow.utils.fs import LocalFile\n", - "from eogrow.utils.vector import concat_gdf\n", - "\n", - "LOGGER = logging.getLogger(__name__)\n", - "\n", - "\n", - "class WaterExportPipeline(Pipeline):\n", - " water_feature = FeatureType.MASK_TIMELESS, \"WATER\"\n", - " vector_water_feature = FeatureType.VECTOR_TIMELESS, \"WATER\"\n", - "\n", - " def run_procedure(self):\n", - " workflow = self.build_workflow()\n", - " exec_args = self.get_execution_arguments(workflow)\n", - "\n", - " successful, failed, execution_results = self.run_execution(workflow, exec_args)\n", - "\n", - " gdf_list = []\n", - " for result in execution_results:\n", - " eopatch = result.outputs.get(\"water-vectors\")\n", - " if not eopatch:\n", - " continue\n", - "\n", - " gdf_list.append(eopatch[self.vector_water_feature])\n", - "\n", - " if not gdf_list:\n", - " return successful, failed\n", - "\n", - " LOGGER.info(\"Preparing joined vector dataset\")\n", - " joined_gdf = concat_gdf(gdf_list) # This assumes all dataframes are in the same CRS!\n", - "\n", - " path = fs.path.combine(self.storage.get_folder(\"vector_results\"), \"water-vectors.gpkg\")\n", - " with LocalFile(path, mode=\"w\", filesystem=self.storage.filesystem) as local_file:\n", - " joined_gdf.to_file(local_file.path, driver=\"GPKG\", encoding=\"utf-8\")\n", - " LOGGER.info(\"Saved stats to %s\", path)\n", - "\n", - " return successful, failed\n", - "\n", - " def build_workflow(self):\n", - " load_task = LoadTask(\n", - " self.storage.get_folder(\"results\", full_path=True), lazy_loading=True, config=self.sh_config\n", - " )\n", - "\n", - " r2v_task = RasterToVectorTask(self.water_feature, values=[1], raster_dtype=np.uint8)\n", - "\n", - " 
output_task = OutputTask(name=\"water-vectors\", features=[self.vector_water_feature])\n", - "\n", - " nodes = linearly_connect_tasks(load_task, r2v_task, output_task)\n", - " return EOWorkflow(nodes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "executed-stream", - "metadata": {}, - "outputs": [], - "source": [ - "# In our implementation the pipeline doesn't need any additional parameters\n", - "config = interpret_config_from_path(\"./configs/global_config.json\")\n", - "\n", - "config.patch_list = [8] # References EOPatch 'eopatch-id-08-col-3-row-1'\n", - "\n", - "pipeline = WaterExportPipeline.from_raw_config(config)\n", - "\n", - "pipeline.run()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "acceptable-investigator", - "metadata": {}, - "source": [ - "Finally, let's run these new pipelines on a cluster. We can do this by uploading files to the Ray head node, starting Jupyter and run the notebook. We also create configs folder on the head done becuase it doesn't exist yet.\n", - "\n", - "```bash\n", - "ray rsync_up cluster.yaml eogrow-workshop.ipynb /home/ray/eogrow-workshop.ipynb\n", - "\n", - "ray exec cluster.yaml 'mkdir configs'\n", - "ray rsync_up cluster.yaml configs/global_config.json /home/ray/configs/global_config.json\n", - "ray rsync_up cluster.yaml configs/water_detection.json /home/ray/configs/water_detection.json\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "individual-fisher", - "metadata": {}, - "source": [ - "Jupyter can be started with the following command:\n", - "\n", - "```bash\n", - "ray exec cluster.yaml --port-forward=8889 'docker exec -it gem_container /bin/bash -c \"jupyter notebook --port=8889\"'\n", - "```\n", - "\n", - "Then go to `localhost:8889` and run the relevant cells in the notebook copy." 
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 221c81d9..a95ad636 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -14,7 +14,6 @@
         "eogrow-ray",
         "eogrow-template",
         "eogrow-validate",
-        "eogrow-test",
     ],
 )
 def test_help(command):

From 1d1b9ff9a655c2d17c140e57451437b9bdd73934 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=BDiga=20Luk=C5=A1i=C4=8D?= <31988337+zigaLuksic@users.noreply.github.com>
Date: Wed, 22 Nov 2023 13:28:29 +0100
Subject: [PATCH 10/10] Prepare for release (#307)

* increase version and add changelog

* update pre-commit

* add more docs

* Update docs/source/common-configuration-patterns.md

Co-authored-by: Matic Lubej

---------

Co-authored-by: Matic Lubej
---
 .pre-commit-config.yaml                      |  2 +-
 CHANGELOG.md                                 | 11 ++++
 docs/source/common-configuration-patterns.md | 59 +++++++++++++++-----
 eogrow/__init__.py                           |  2 +-
 4 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 15eb88f0..2962cf28 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
         language_version: python3

   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: "v0.1.5"
+    rev: "v0.1.6"
     hooks:
       - id: ruff

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5c3e754..0ed26f54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## [Version 1.7.0] - 2023-11-22
+With this release we push `eo-grow` towards a more `ray`-centered execution model.
+
+- The local `EOExecutor` modes with multiprocessing/multithreading have been removed. (Most) pipelines no longer have the `use_ray` and `workers` parameters. In order to run pipelines locally one has to set up a local cluster (via `ray start --head`). We included a `debug` parameter that uses `EOExecutor` instead of `RayExecutor` so that IDE breakpoints work in most pipelines.
+- Pipeline chain configs have been adjusted. The user can now specify what kind of resources the main pipeline process requires. This also allows one to run pipelines entirely on worker instances.
+- The `ray_worker_type` field was replaced with `worker_resources`, which allows for precise resource request specifications.
+- Fixed a bug where CLI variables were not applied to config chains.
+- Removed `TestPipeline` and the `eogrow-test` command.
+- Some `ValueError` exceptions were changed to `TypeError`.
+
+
 ## [Version 1.6.3] - 2023-11-07

 - Pipelines can request specific type of worker when run on a ray cluster with the `ray_worker_type` field.

diff --git a/docs/source/common-configuration-patterns.md b/docs/source/common-configuration-patterns.md
index ef3c094f..ece144dc 100644
--- a/docs/source/common-configuration-patterns.md
+++ b/docs/source/common-configuration-patterns.md
@@ -102,11 +102,11 @@ In certain use cases we have multiple pipelines that are meant to be run in a ce
 But the user still needs to run them in the correct order and by hand.
This we can automate with a simple pipeline chain that links them together:

 ```
 [ // end_to_end_run.json
-    {"**download": "${config_path}/01_download.json"},
-    {"**preprocess": "${config_path}/02_preprocess_data.json"},
-    {"**predict": "${config_path}/03_use_model.json"},
-    {"**export": "${config_path}/04_export_maps.json"},
-    {"**ingest": "${config_path}/05_ingest_byoc.json"},
+    {"pipeline_config": {"**download": "${config_path}/01_download.json"}},
+    {"pipeline_config": {"**preprocess": "${config_path}/02_preprocess_data.json"}},
+    {"pipeline_config": {"**predict": "${config_path}/03_use_model.json"}},
+    {"pipeline_config": {"**export": "${config_path}/04_export_maps.json"}},
+    {"pipeline_config": {"**ingest": "${config_path}/05_ingest_byoc.json"}}
 ]
 ```

@@ -119,28 +119,59 @@ In experimentation we often want to run the same pipeline for multiple parameter
 ```
 [ // run_threshold_experiments.json
     {
-        "variables": {"threshold": 0.1},
-        "**pipeline": "${config_path}/extract_trees.json"
+        "pipeline_config": {
+            "variables": {"threshold": 0.1},
+            "**pipeline": "${config_path}/extract_trees.json"
+        }
     },
     {
-        "variables": {"threshold": 0.2},
-        "**pipeline": "${config_path}/extract_trees.json"
+        "pipeline_config": {
+            "variables": {"threshold": 0.2},
+            "**pipeline": "${config_path}/extract_trees.json"
+        }
     },
     {
-        "variables": {"threshold": 0.3},
-        "**pipeline": "${config_path}/extract_trees.json"
+        "pipeline_config": {
+            "variables": {"threshold": 0.3},
+            "**pipeline": "${config_path}/extract_trees.json"
+        }
     },
     {
-        "variables": {"threshold": 0.4},
-        "**pipeline": "${config_path}/extract_trees.json"
+        "pipeline_config": {
+            "variables": {"threshold": 0.4},
+            "**pipeline": "${config_path}/extract_trees.json"
+        }
     }
 ]
 ```

-### Using variables with pipelines
+### Using variables with pipeline chains

 While there is no syntactic sugar for specifying pipeline-chain-wide variables in JSON files, one can do that through the CLI. Running `eogrow end_to_end_run.json -v "year:2019"` will set the variable `year` to 2019 for all pipelines in the chain.

+### Specifying resources for pipeline execution
+
+Pipeline chains also allow the user to specify resources needed by the main process of each pipeline, in a similar way to how a pipeline config can specify resources needed by its workers.
+
+```
+[ // end_to_end_run.json
+    {
+        "pipeline_config": {"**download": "${config_path}/01_download.json"}
+    },
+    {
+        "pipeline_config": {"**predict": "${config_path}/03_use_model.json"},
+        "pipeline_resources": {"memory": 2e9} // ~ 2GB RAM reserved for the main process
+    },
+    {
+        "pipeline_config": {"**export": "${config_path}/04_export_maps.json"}
+    }
+]
+```
+
+This also allows us to run certain pipelines on specially tagged workers. When setting up the cluster, one can tag workers with custom resources, for instance an `r5.4xlarge` worker with `big_RAM_worker: 1`. If we set `"pipeline_resources": {"resources": {"big_RAM_worker": 1}}`, then the pipeline will run ONLY on such workers, and the whole worker instance will be assigned to it. This is great for pipelines which have a large workload in the main process.
+
+Pipeline chains can be just one pipeline long, so this can also be used with a single pipeline.
+
 ## Path modification via variables

 In some cases one wants fine grained control over path specifications. The following is a simplified example of how one can provide separate download paths for a large amount of batch pipelines.
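The `big_RAM_worker` tagging described in the documentation above lives on the cluster side. Below is a minimal sketch of such a worker type in a Ray `cluster.yaml`, modeled on the workshop cluster config removed earlier in this series; the node-type name, instance type, worker counts, and AMI are illustrative placeholders, not part of any shipped config:

```
# Sketch only: a worker type tagged with a custom resource. The resource name
# must match the one requested via "pipeline_resources".
available_node_types:
  ray.worker.big_ram:
    min_workers: 0
    max_workers: 2
    node_config:
      InstanceType: r5.4xlarge
      ImageId: ami- # Edit this!
    # A value of 1 means a pipeline requesting {"big_RAM_worker": 1} claims the whole node.
    resources: {"big_RAM_worker": 1}
```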
diff --git a/eogrow/__init__.py b/eogrow/__init__.py index 036d6d67..b2967b72 100644 --- a/eogrow/__init__.py +++ b/eogrow/__init__.py @@ -1,3 +1,3 @@ """The main module of the eo-grow package.""" -__version__ = "1.6.3" +__version__ = "1.7.0"
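Taken together, the 1.7.0 release lets a single chain entry combine `pipeline_config`, the new `worker_resources` field (the replacement for `ray_worker_type` noted in the changelog), and `pipeline_resources` for the main process. The sketch below is illustrative only: the `worker_resources` values are assumed to follow the `ray` remote options linked from the pipeline schema, and exact field shapes should be checked against the released schemas.

```
[ // hypothetical chain entry combining the 1.7.0 fields described above
    {
        "pipeline_config": {
            "**pipeline": "${config_path}/03_use_model.json",
            // per-task requests passed to ray workers (assumed to be ray remote options)
            "worker_resources": {"num_cpus": 2, "memory": 4e9}
        },
        // reserve a tagged worker for the pipeline's main process
        "pipeline_resources": {"resources": {"big_RAM_worker": 1}}
    }
]
```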