From 59a1e103be3155416c746d3d617934d377f80f1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=98yvind=20Lind-Johansen?= <47847084+lindjoha@users.noreply.github.com> Date: Thu, 27 Oct 2022 10:56:26 +0200 Subject: [PATCH] Metadata functionality in `EnsembleTableProvider` (#1135) --- .github/workflows/subsurface.yml | 1 + .../test_ensemble_table_provider.py | 26 +++++++++++++++++++ webviz_subsurface/_providers/__init__.py | 1 + .../ensemble_table_provider/__init__.py | 2 +- .../_field_metadata.py | 24 +++++++++++++++++ .../ensemble_table_provider.py | 15 +++++++++++ .../ensemble_table_provider_impl_arrow.py | 15 ++++++++++- 7 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 webviz_subsurface/_providers/ensemble_table_provider/_field_metadata.py diff --git a/.github/workflows/subsurface.yml b/.github/workflows/subsurface.yml index f9a849cd9..619f893af 100644 --- a/.github/workflows/subsurface.yml +++ b/.github/workflows/subsurface.yml @@ -55,6 +55,7 @@ jobs: pip install "scipy<1.9.3" # breaking change in scipy==1.9.3 pip install "pytest<7.2.0" pip install "pytest-xdist<3.0" + pip install "xtgeo<2.20.2" pip install . 
# Testing against our latest release (including pre-releases) diff --git a/tests/unit_tests/provider_tests/test_ensemble_table_provider.py b/tests/unit_tests/provider_tests/test_ensemble_table_provider.py index 914c4fb5e..93d683259 100644 --- a/tests/unit_tests/provider_tests/test_ensemble_table_provider.py +++ b/tests/unit_tests/provider_tests/test_ensemble_table_provider.py @@ -4,6 +4,7 @@ import pandas as pd from webviz_subsurface._providers import ( + ColumnMetadata, EnsembleTableProvider, EnsembleTableProviderFactory, ) @@ -58,6 +59,8 @@ def test_synthetic_get_column_data(testdata_folder: Path) -> None: assert df.shape == (4, 2) assert df.columns.tolist() == ["REAL", "STR"] + assert model.column_metadata("REAL") is None + def test_create_from_aggregated_csv_file_smry_csv( testdata_folder: Path, tmp_path: Path @@ -85,6 +88,10 @@ def test_create_from_aggregated_csv_file_smry_csv( assert valdf.columns[1] == "YEARS" assert valdf["REAL"].nunique() == 3 + # No metadata in csv files + meta: Optional[ColumnMetadata] = provider.column_metadata("FOPR") + assert meta is None + def test_create_from_per_realization_csv_file( testdata_folder: Path, tmp_path: Path @@ -110,6 +117,10 @@ def test_create_from_per_realization_csv_file( assert valdf["CONIDX"].nunique() == 24 assert sorted(valdf["CONIDX"].unique()) == list(range(1, 25)) + # No metadata in csv files + meta: Optional[ColumnMetadata] = provider.column_metadata("CONIDX") + assert meta is None + def test_create_from_per_realization_arrow_file( testdata_folder: Path, tmp_path: Path @@ -126,6 +137,11 @@ def test_create_from_per_realization_arrow_file( assert "FOPT" in valdf.columns assert valdf["REAL"].nunique() == 100 + # Test metadata + meta: Optional[ColumnMetadata] = provider.column_metadata("FOPR") + assert meta is not None + assert meta.unit == "SM3/DAY" + def test_create_from_per_realization_parameter_file( testdata_folder: Path, tmp_path: Path @@ -140,6 +156,12 @@ def 
test_create_from_per_realization_parameter_file( assert "GLOBVAR:FAULT_SEAL_SCALING" in valdf.columns assert valdf["REAL"].nunique() == 100 + # No metadata in parameter files + meta: Optional[ColumnMetadata] = provider.column_metadata( + "GLOBVAR:FAULT_SEAL_SCALING" + ) + assert meta is None + def test_create_provider_set_from_aggregated_csv_file(tmp_path: Path) -> None: """This tests importing a csv file with an ensemble column with multiple @@ -165,3 +187,7 @@ def test_create_provider_set_from_aggregated_csv_file(tmp_path: Path) -> None: "STOIIP_OIL", "SOURCE", }.issubset(set(provider.column_names())) + + # No metadata in csv files + meta: Optional[ColumnMetadata] = provider.column_metadata("ZONE") + assert meta is None diff --git a/webviz_subsurface/_providers/__init__.py b/webviz_subsurface/_providers/__init__.py index 618834d54..ad2dd4513 100644 --- a/webviz_subsurface/_providers/__init__.py +++ b/webviz_subsurface/_providers/__init__.py @@ -27,6 +27,7 @@ SurfaceServer, ) from .ensemble_table_provider import ( + ColumnMetadata, EnsembleTableProvider, EnsembleTableProviderFactory, EnsembleTableProviderImplArrow, diff --git a/webviz_subsurface/_providers/ensemble_table_provider/__init__.py b/webviz_subsurface/_providers/ensemble_table_provider/__init__.py index bf0cf3a0d..c039f52e2 100644 --- a/webviz_subsurface/_providers/ensemble_table_provider/__init__.py +++ b/webviz_subsurface/_providers/ensemble_table_provider/__init__.py @@ -1,3 +1,3 @@ -from .ensemble_table_provider import EnsembleTableProvider +from .ensemble_table_provider import ColumnMetadata, EnsembleTableProvider from .ensemble_table_provider_factory import EnsembleTableProviderFactory from .ensemble_table_provider_impl_arrow import EnsembleTableProviderImplArrow diff --git a/webviz_subsurface/_providers/ensemble_table_provider/_field_metadata.py b/webviz_subsurface/_providers/ensemble_table_provider/_field_metadata.py new file mode 100644 index 000000000..2ae72dfd9 --- /dev/null +++ 
b/webviz_subsurface/_providers/ensemble_table_provider/_field_metadata.py @@ -0,0 +1,24 @@ +from typing import Optional + +import pyarrow as pa + +from .ensemble_table_provider import ColumnMetadata + + +def create_column_metadata_from_field_meta( + field: pa.Field, +) -> Optional[ColumnMetadata]: + """Create ColumnMetadata from keywords stored in the field's metadata""" + + meta_dict = field.metadata + if not meta_dict: + return None + + try: + unit_bytestr = meta_dict[b"unit"] + except KeyError: + return ColumnMetadata(unit=None) + + return ColumnMetadata( + unit=unit_bytestr.decode("ascii"), + ) diff --git a/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider.py b/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider.py index c0cb46584..60817e914 100644 --- a/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider.py +++ b/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider.py @@ -1,9 +1,15 @@ import abc +from dataclasses import dataclass from typing import List, Optional, Sequence import pandas as pd +@dataclass(frozen=True) +class ColumnMetadata: + unit: Optional[str] + + class EnsembleTableProvider(abc.ABC): @abc.abstractmethod def column_names(self) -> List[str]: @@ -18,3 +24,12 @@ def get_column_data( self, column_names: Sequence[str], realizations: Optional[Sequence[int]] = None ) -> pd.DataFrame: ... + + @abc.abstractmethod + def column_metadata(self, column_name: str) -> Optional[ColumnMetadata]: + """Returns metadata for the specified column. + + Returns None if no metadata is found for the column. + Returns an empty ColumnMetadata object if there is metadata, but it + contains none of the fields specified in ColumnMetadata.
+ """ diff --git a/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider_impl_arrow.py b/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider_impl_arrow.py index 2e8424a89..1f33c7065 100644 --- a/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider_impl_arrow.py +++ b/webviz_subsurface/_providers/ensemble_table_provider/ensemble_table_provider_impl_arrow.py @@ -13,7 +13,8 @@ add_per_vector_min_max_to_table_schema_metadata, find_min_max_for_numeric_table_columns, ) -from .ensemble_table_provider import EnsembleTableProvider +from ._field_metadata import create_column_metadata_from_field_meta +from .ensemble_table_provider import ColumnMetadata, EnsembleTableProvider # Since PyArrow's actual compute functions are not seen by pylint # pylint: disable=no-member @@ -170,6 +171,13 @@ def from_backing_store( return None + def _get_or_read_schema(self) -> pa.Schema: + if self._cached_reader: + return self._cached_reader.schema + + source = pa.memory_map(self._arrow_file_name, "r") + return pa.ipc.RecordBatchFileReader(source).schema + def column_names(self) -> List[str]: return self._column_names @@ -215,3 +223,8 @@ def get_column_data( ) return df + + def column_metadata(self, column_name: str) -> Optional[ColumnMetadata]: + schema = self._get_or_read_schema() + field = schema.field(column_name) + return create_column_metadata_from_field_meta(field)