JCSDA-internal · danholdaway · Jun 5, 2024 · May 14, 2024 · May 14, 2024 · May 14, 2024
diff --git a/.gitattributes b/.gitattributes
@@ -3,3 +3,5 @@
 *.res filter=lfs diff=lfs merge=lfs -text
 *.odb filter=lfs diff=lfs merge=lfs -text
 *.ipynb filter=lfs diff=lfs merge=lfs -text
+*.satbias filter=lfs diff=lfs merge=lfs -text
+*.tlapse filter=lfs diff=lfs merge=lfs -text
diff --git a/src/eva/data/csv_space.py b/src/eva/data/csv_space.py
@@ -98,9 +98,6 @@ def execute(self, dataset_config, data_collections, timing):
         data_collections.create_or_add_to_collection(collection_name, ds)
         ds.close()
 
-        # Display the contents of the collections for helping the user with making plots
-        data_collections.display_collections()
-
     # ----------------------------------------------------------------------------------------------
 
     def generate_default_config(self, filenames, collection_name):

diff --git a/src/eva/data/cubed_sphere_restart.py b/src/eva/data/cubed_sphere_restart.py
@@ -142,10 +142,6 @@ def execute(self, dataset_config, data_collections, timing):
         # -------------------------
         data_collections.nan_float_values_outside_threshold(threshold)
 
-        # Display the contents of the collections for helping the user with making plots
-        # -------------------------
-        data_collections.display_collections()
-
     # ----------------------------------------------------------------------------------------------
 
     def generate_default_config(self, filenames, collection_name):

diff --git a/src/eva/data/data_collections.py b/src/eva/data/data_collections.py
@@ -32,7 +32,7 @@ class DataCollections:
 
     """Manage collections of xarray Datasets with variable manipulations."""
 
-    def __init__(self):
+    def __init__(self, time_series=False):
 
         """Initialize the DataCollections instance."""
 
@@ -42,6 +42,9 @@ def __init__(self):
         # Create a logger
         self.logger = Logger('DataCollections')
 
+        # Store the time_series flag as a strict boolean
+        self.time_series = False if not time_series else True
+
     # ----------------------------------------------------------------------------------------------
 
     def create_or_add_to_collection(self, collection_name, collection, concat_dimension=None):
@@ -61,6 +64,11 @@ def create_or_add_to_collection(self, collection_name, collection, concat_dimens
             ValueError: If concatenation dimension is missing or invalid.
         """
 
+        # When time_series is set, the collection name must be 'time_series'
+        if self.time_series and collection_name != 'time_series':
+            self.logger.abort('In create_or_add_to_collection: time_series collection must ' +
+                              'be \'time_series\'')
+
         # Collections should only be xarray datasets
         if not isinstance(collection, Dataset):
             self.logger.abort('In add_collection: collection must be an xarray.Dataset')
@@ -149,6 +157,11 @@ def add_variable_to_collection(self, collection_name, group_name, variable_name,
             ValueError: If variable is not an xarray DataArray.
         """
 
+        # When time_series is set, the collection name must be 'time_series'
+        if self.time_series and collection_name != 'time_series':
+            self.logger.abort('In add_variable_to_collection: time_series collection must ' +
+                              'be \'time_series\'')
+
         # Assert that new variable is an xarray Dataarray
         if not isinstance(variable, DataArray):
             self.logger.abort('In add_variable_to_collection: variable must be xarray.DataArray')
@@ -197,6 +210,11 @@ def get_variable_data_array(self, collection_name, group_name, variable_name,
                         is missing.
         """
 
+        # When time_series is set, the collection name must be 'time_series'
+        if self.time_series and collection_name != 'time_series':
+            self.logger.abort('In get_variable_data_array: time_series collection must ' +
+                              'be \'time_series\'')
+
         group_variable_name = group_name + '::' + variable_name
         data_array = self._collections[collection_name][group_variable_name]
 
@@ -274,6 +292,11 @@ def get_variable_data(self, collection_name, group_name, variable_name,
             ndarray: The selected variable data as a NumPy array.
         """
 
+        # When time_series is set, the collection name must be 'time_series'
+        if self.time_series and collection_name != 'time_series':
+            self.logger.abort('In get_variable_data: time_series collection must ' +
+                              'be \'time_series\'')
+
         variable_array = self.get_variable_data_array(collection_name, group_name, variable_name,
                                                       channels, levels, datatypes)
 
@@ -378,6 +401,7 @@ def display_collections(self):
             'float32': '{:+.4e}',
             'int64': '{:+11d}',
             'int32': '{:+11d}',
+            'datetime64[ns]': '{}'
         }
 
         # Display a list of variables that are available in the collection
@@ -388,7 +412,7 @@ def display_collections(self):
             self.logger.info('Collection name: ' + fcol.underline + collection + fcol.end)
             self.logger.info('\n Dimensions:')
             for dim in list(self._collections[collection].dims):
-                dim_value = self._collections[collection].dims[dim]
+                dim_value = self._collections[collection].sizes[dim]
                 self.logger.info(f'  {dim}: {dim_value}')
             self.logger.info('\n Coordinates:')
             for coord in list(self._collections[collection].coords):
@@ -411,8 +435,25 @@ def display_collections(self):
                         rms = np.sqrt(np.nanmean(data_var_value**2))
                         rms_string = ', RMS=' + minmaxrms_format.format(rms)
                     minmaxrms_string = ' | ' + min_string + ', ' + max_string + rms_string
-                self.logger.info('  ' + data_var.ljust(max_name_len) + ' (' +
-                                 str(data_var_value.dtype).ljust(7) + ')' + minmaxrms_string)
+                    full_str = '  ' + data_var.ljust(max_name_len) + ' (' + \
+                        str(data_var_value.dtype)[0:7].ljust(7) + ')' + minmaxrms_string
+                else:
+                    # No min/max
+                    min_string = ''
+                    max_string = ''
+                    minmaxrms_string = ' | ' + min_string + ', ' + max_string
+                    full_str = '  ' + data_var.ljust(max_name_len) + ' (' + \
+                        str(data_var_value.dtype)[0:7].ljust(7) + ')' + minmaxrms_string
+                self.logger.info(full_str)
+
+        # Add the raw xarray display of the collection for more information about coords/dims
+        self.logger.info(' ')
+        self.logger.info('/'*80)
+        self.logger.info(' ')
+        self.logger.info(f'Raw xarray display of the {fcol.underline + collection + fcol.end} ' +
+                         'collection:')
+        self.logger.info(' ')
+        self.logger.info(str(self._collections[collection]))
         self.logger.info('-'*80)
 
     # ----------------------------------------------------------------------------------------------
diff --git a/src/eva/data/data_driver.py b/src/eva/data/data_driver.py
@@ -11,55 +11,40 @@
 # --------------------------------------------------------------------------------------------------
 
 
-from eva.utilities.config import get
 from eva.data.eva_dataset_base import EvaDatasetFactory
 
-import importlib
-import os
-
 
 # --------------------------------------------------------------------------------------------------
 
-def data_driver(config, data_collections, timing, logger):
+def data_driver(dataset_config, data_collections, timing, logger):
 
     """
     Driver for executing data processing.
 
     Args:
-        config (dict): Configuration settings for data processing.
+        dataset_config (dict): Configuration for a single dataset; must contain a 'type' key.
         data_collections (DataCollections): Instance of the DataCollections class.
         timing (Timing): Timing instance for performance measurement.
         logger (Logger): Logger instance for logging messages.
 
     """
 
-    # Get list of dataset dictionaries
-    datasets = get(config, logger, 'datasets')
-
-    # Loop over datasets
-    for dataset in datasets:
+    # Check if the dataset_config contains the 'type' key
+    logger.assert_abort('type' in dataset_config, 'Each dataset must have a \'type\' key')
 
-        # Extract name for this diagnostic data type
-        try:
-            eva_data_class_name = dataset['type']
-        except Exception as e:
-            msg = '\'type\' key not found. \'diagnostic_data_config\': ' \
-                f'{diagnostic_data_config}, error: {e}'
-            raise KeyError(msg)
+    # Extract name for this diagnostic data type
+    eva_data_class_name = dataset_config['type']
 
-        # Create the data object
-        creator = EvaDatasetFactory()
-        timing.start('DataObjectConstructor')
-        eva_data_object = creator.create_eva_object(eva_data_class_name,
-                                                    'data',
-                                                    logger,
-                                                    timing)
-        timing.stop('DataObjectConstructor')
+    # Create the data object
+    creator = EvaDatasetFactory()
+    timing.start('DataObjectConstructor')
+    eva_data_object = creator.create_eva_object(eva_data_class_name, 'data', logger, timing)
+    timing.stop('DataObjectConstructor')
 
-        # Prepare diagnostic data
-        logger.info(f'Running execute for {eva_data_object.name}')
-        timing.start('DataObjectExecute')
-        eva_data_object.execute(dataset, data_collections, timing)
-        timing.stop('DataObjectExecute')
+    # Prepare diagnostic data
+    logger.info(f'Running execute for {eva_data_object.name}')
+    timing.start('DataObjectExecute')
+    eva_data_object.execute(dataset_config, data_collections, timing)
+    timing.stop('DataObjectExecute')
 
 # --------------------------------------------------------------------------------------------------
diff --git a/src/eva/data/geoval_space.py b/src/eva/data/geoval_space.py
@@ -86,9 +86,6 @@ def execute(self, dataset_config, data_collections, timing):
         # Nan out unphysical values
         data_collections.nan_float_values_outside_threshold(threshold)
 
-        # Display the contents of the collections for helping the user with making plots
-        data_collections.display_collections()
-
     def generate_default_config(self, filenames, collection_name):
 
         """

diff --git a/src/eva/data/gsi_obs_space.py b/src/eva/data/gsi_obs_space.py
@@ -297,9 +297,6 @@ def execute(self, dataset_config, data_collections, timeing):
         # Change the channel dimension name
         data_collections.adjust_channel_dimension_name('nchans')
 
-        # Display the contents of the collections for helping the user with making plots
-        data_collections.display_collections()
-
     # ----------------------------------------------------------------------------------------------
 
     def generate_default_config(self, filenames, collection_name):

diff --git a/src/eva/data/ioda_obs_space.py b/src/eva/data/ioda_obs_space.py
@@ -277,9 +277,6 @@ def execute(self, dataset_config, data_collections, timing):
         # Nan out unphysical values
         data_collections.nan_float_values_outside_threshold(threshold)
 
-        # Display the contents of the collections for helping the user with making plots
-        data_collections.display_collections()
-
     def generate_default_config(self, filenames, collection_name):
 
         """

diff --git a/src/eva/data/jedi_log.py b/src/eva/data/jedi_log.py
@@ -120,9 +120,6 @@ def execute(self, dataset_config, data_collections, timing):
                 # Add to the Eva dataset
                 data_collections.create_or_add_to_collection(collection_name, convergence_ds)
 
-        # Write out all the collections
-        data_collections.display_collections()
-
     # ----------------------------------------------------------------------------------------------
 
     def get_from_log(self, search_term, separator, position, custom_log=None):