From 2e35bed5247f203c2225ded2c4f7b7512e013218 Mon Sep 17 00:00:00 2001
From: Xylar Asay-Davis
Date: Sun, 27 May 2018 20:50:29 +0200
Subject: [PATCH 1/2] Update all multifile datasets to use ncrcat

Some observational and preprocessed datasets were being opened with
xarray's open_mfdataset() function, which sometimes led to "too many
open files" errors.
---
 ...ot_depth_integrated_time_series_subtask.py | 34 ++++---
 mpas_analysis/ocean/time_series_sst.py        | 17 ++--
 mpas_analysis/sea_ice/time_series.py          | 79 ++++++++--------
 mpas_analysis/shared/time_series/__init__.py  |  3 +-
 .../shared/time_series/time_series.py         | 89 +++++++++++++++++++
 5 files changed, 162 insertions(+), 60 deletions(-)

diff --git a/mpas_analysis/ocean/plot_depth_integrated_time_series_subtask.py b/mpas_analysis/ocean/plot_depth_integrated_time_series_subtask.py
index bd54eb1d1..d060c28c8 100644
--- a/mpas_analysis/ocean/plot_depth_integrated_time_series_subtask.py
+++ b/mpas_analysis/ocean/plot_depth_integrated_time_series_subtask.py
@@ -18,17 +18,18 @@
 from mpas_analysis.shared.plot.plotting import timeseries_analysis_plot
 
-from mpas_analysis.shared.generalized_reader import open_multifile_dataset
 from mpas_analysis.shared.io import open_mpas_dataset, write_netcdf
 
 from mpas_analysis.shared.timekeeping.utility import date_to_days, \
     days_to_datetime
 
-from mpas_analysis.shared.io.utility import build_config_full_path
+from mpas_analysis.shared.io.utility import build_config_full_path, \
+    make_directories
 
 from mpas_analysis.shared.html import write_image_xml
 
-from mpas_analysis.shared.time_series import compute_moving_avg
+from mpas_analysis.shared.time_series import compute_moving_avg, \
+    combine_time_series_with_ncrcat
 
 
 class PlotDepthIntegratedTimeSeriesSubtask(AnalysisTask):
@@ -223,7 +224,12 @@ def setup_and_check(self):  # {{{
         baseDirectory = build_config_full_path(
             config, 'output', 'timeSeriesSubdirectory')
 
-        self.preprocessedFileName = '{}/preprocessed_{}'.format(
+        make_directories('{}/preprocessed'.format(baseDirectory))
+
+        self.preprocessedIntermediateFileName = \
+            '{}/preprocessed/intermediate_{}'.format(baseDirectory,
+                                                     self.inFileName)
+        self.preprocessedFileName = '{}/preprocessed/{}'.format(
             baseDirectory, self.inFileName)
 
         if not os.path.isabs(self.inFileName):
@@ -337,18 +343,20 @@ def run_task(self):  # {{{
             preprocessedInputDirectory = config.get(
                 'oceanPreprocessedReference', 'baseDirectory')
 
-            self.logger.info('   Load in OHC from preprocessed reference '
-                             'run...')
+            self.logger.info('   Load in preprocessed reference data...')
             preprocessedFilePrefix = config.get(self.sectionName,
                                                 'preprocessedFilePrefix')
             inFilesPreprocessed = '{}/{}.{}.year*.nc'.format(
                 preprocessedInputDirectory, preprocessedFilePrefix,
                 preprocessedReferenceRunName)
-            dsPreprocessed = open_multifile_dataset(
-                fileNames=inFilesPreprocessed,
-                calendar=calendar,
-                config=config,
-                timeVariableName='xtime')
+
+            combine_time_series_with_ncrcat(
+                inFilesPreprocessed, self.preprocessedIntermediateFileName,
+                logger=self.logger)
+            dsPreprocessed = open_mpas_dataset(
+                fileName=self.preprocessedIntermediateFileName,
+                calendar=calendar,
+                timeVariableNames='xtime')
 
             yearStart = days_to_datetime(ds.Time.min(), calendar=calendar).year
             yearEnd = days_to_datetime(ds.Time.max(), calendar=calendar).year
@@ -378,7 +386,7 @@ def run_task(self):  # {{{
         if preprocessedReferenceRunName != 'None':
             color = 'purple'
             title = '{} \n {} (purple)'.format(title,
-                                              preprocessedReferenceRunName)
+                                               preprocessedReferenceRunName)
             preprocessedFieldPrefix = 
config.get(self.sectionName, 'preprocessedFieldPrefix') @@ -389,7 +397,7 @@ def run_task(self): # {{{ suffixes = ['tot'] + ['{}m'.format(depth) for depth in divisionDepths] + ['btm'] - # these preprocessed data are OHC *anomalies* + # these preprocessed data are already anomalies dsPreprocessed = compute_moving_avg(dsPreprocessed, movingAveragePoints) for rangeIndex in range(len(suffixes)): diff --git a/mpas_analysis/ocean/time_series_sst.py b/mpas_analysis/ocean/time_series_sst.py index 61e15e2c7..2afad9bf1 100644 --- a/mpas_analysis/ocean/time_series_sst.py +++ b/mpas_analysis/ocean/time_series_sst.py @@ -13,7 +13,7 @@ from mpas_analysis.shared.plot.plotting import timeseries_analysis_plot -from mpas_analysis.shared.generalized_reader import open_multifile_dataset +from mpas_analysis.shared.time_series import combine_time_series_with_ncrcat from mpas_analysis.shared.io import open_mpas_dataset from mpas_analysis.shared.timekeeping.utility import date_to_days, \ @@ -200,11 +200,16 @@ def run_task(self): # {{{ 'run...') inFilesPreprocessed = '{}/SST.{}.year*.nc'.format( preprocessedInputDirectory, preprocessedReferenceRunName) - dsPreprocessed = open_multifile_dataset( - fileNames=inFilesPreprocessed, - calendar=calendar, - config=config, - timeVariableName='xtime') + + outFolder = '{}/preprocessed'.format(outputDirectory) + make_directories(outFolder) + outFileName = '{}/sst.nc'.format(outFolder) + + combine_time_series_with_ncrcat(inFilesPreprocessed, + outFileName, logger=self.logger) + dsPreprocessed = open_mpas_dataset(fileName=outFileName, + calendar=calendar, + timeVariableNames='xtime') yearEndPreprocessed = days_to_datetime(dsPreprocessed.Time.max(), calendar=calendar).year if yearStart <= yearEndPreprocessed: diff --git a/mpas_analysis/sea_ice/time_series.py b/mpas_analysis/sea_ice/time_series.py index 4980502f8..2ccd025f1 100644 --- a/mpas_analysis/sea_ice/time_series.py +++ b/mpas_analysis/sea_ice/time_series.py @@ -24,7 +24,7 @@ from mpas_analysis.shared.timekeeping.MpasRelativeDelta import \ MpasRelativeDelta -from mpas_analysis.shared.generalized_reader import open_multifile_dataset +from mpas_analysis.shared.time_series import combine_time_series_with_ncrcat from mpas_analysis.shared.io import open_mpas_dataset, write_netcdf from mpas_analysis.shared.mpas_xarray.mpas_xarray import subset_variables @@ -260,25 +260,6 @@ def run_task(self): # {{{ timeEnd = date_to_days(year=yearEnd, month=12, day=31, calendar=calendar) - if preprocessedReferenceRunName != 'None': - inFilesPreprocessed = '{}/icevol.{}.year*.nc'.format( - preprocessedReferenceDirectory, preprocessedReferenceRunName) - dsPreprocessed = open_multifile_dataset( - fileNames=inFilesPreprocessed, - calendar=calendar, - config=config, - timeVariableName='xtime') - preprocessedYearEnd = days_to_datetime(dsPreprocessed.Time.max(), - calendar=calendar).year - if yearStart <= preprocessedYearEnd: - dsPreprocessedTimeSlice = \ - dsPreprocessed.sel(Time=slice(timeStart, timeEnd)) - else: - self.logger.warning('Preprocessed time series ends before the ' - 'timeSeries startYear and will not be ' - 'plotted.') - preprocessedReferenceRunName = 'None' - if self.refConfig is not None: dsTimeSeriesRef = {} @@ -349,22 +330,31 @@ def run_task(self): # {{{ key = (hemisphere, variableName) if compareWithObservations: - dsObs = open_multifile_dataset( - fileNames=obsFileNames['iceArea'][hemisphere], - calendar=calendar, - config=config, - timeVariableName='xtime') + + outFolder = '{}/obs'.format(outputDirectory) + 
make_directories(outFolder) + outFileName = '{}/iceArea{}.nc'.format(outFolder, hemisphere) + + combine_time_series_with_ncrcat( + obsFileNames['iceArea'][hemisphere], + outFileName, logger=self.logger) + dsObs = open_mpas_dataset(fileName=outFileName, + calendar=calendar, + timeVariableNames='xtime') key = (hemisphere, 'iceArea') obs[key] = self._replicate_cycle(plotVars[key], dsObs.IceArea, calendar) key = (hemisphere, 'iceVolume') if hemisphere == 'NH': - dsObs = open_multifile_dataset( - fileNames=obsFileNames['iceVolume'][hemisphere], - calendar=calendar, - config=config, - timeVariableName='xtime') + outFileName = '{}/iceVolume{}.nc'.format(outFolder, + hemisphere) + combine_time_series_with_ncrcat( + obsFileNames['iceVolume'][hemisphere], + outFileName, logger=self.logger) + dsObs = open_mpas_dataset(fileName=outFileName, + calendar=calendar, + timeVariableNames='xtime') obs[key] = self._replicate_cycle(plotVars[key], dsObs.IceVol, calendar) @@ -372,14 +362,20 @@ def run_task(self): # {{{ obs[key] = None if preprocessedReferenceRunName != 'None': + outFolder = '{}/preprocessed'.format(outputDirectory) + make_directories(outFolder) inFilesPreprocessed = '{}/icearea.{}.year*.nc'.format( preprocessedReferenceDirectory, preprocessedReferenceRunName) - dsPreprocessed = open_multifile_dataset( - fileNames=inFilesPreprocessed, - calendar=calendar, - config=config, - timeVariableName='xtime') + + outFileName = '{}/iceArea{}.nc'.format(outFolder, hemisphere) + + combine_time_series_with_ncrcat(inFilesPreprocessed, + outFileName, + logger=self.logger) + dsPreprocessed = open_mpas_dataset(fileName=outFileName, + calendar=calendar, + timeVariableNames='xtime') dsPreprocessedTimeSlice = dsPreprocessed.sel( Time=slice(timeStart, timeEnd)) key = (hemisphere, 'iceArea') @@ -389,11 +385,14 @@ def run_task(self): # {{{ inFilesPreprocessed = '{}/icevol.{}.year*.nc'.format( preprocessedReferenceDirectory, preprocessedReferenceRunName) - dsPreprocessed = open_multifile_dataset( - fileNames=inFilesPreprocessed, - calendar=calendar, - config=config, - timeVariableName='xtime') + outFileName = '{}/iceVolume{}.nc'.format(outFolder, hemisphere) + + combine_time_series_with_ncrcat(inFilesPreprocessed, + outFileName, + logger=self.logger) + dsPreprocessed = open_mpas_dataset(fileName=outFileName, + calendar=calendar, + timeVariableNames='xtime') dsPreprocessedTimeSlice = dsPreprocessed.sel( Time=slice(timeStart, timeEnd)) key = (hemisphere, 'iceVolume') diff --git a/mpas_analysis/shared/time_series/__init__.py b/mpas_analysis/shared/time_series/__init__.py index 627ff0408..4062e3102 100644 --- a/mpas_analysis/shared/time_series/__init__.py +++ b/mpas_analysis/shared/time_series/__init__.py @@ -1,4 +1,5 @@ -from mpas_analysis.shared.time_series.time_series import cache_time_series +from mpas_analysis.shared.time_series.time_series import cache_time_series, \ + combine_time_series_with_ncrcat from mpas_analysis.shared.time_series.mpas_time_series_task import \ MpasTimeSeriesTask diff --git a/mpas_analysis/shared/time_series/time_series.py b/mpas_analysis/shared/time_series/time_series.py index 3f2b2c187..f3a28175f 100644 --- a/mpas_analysis/shared/time_series/time_series.py +++ b/mpas_analysis/shared/time_series/time_series.py @@ -18,10 +18,99 @@ import xarray as xr import numpy import os +from distutils.spawn import find_executable +import glob +import subprocess +from six import string_types from mpas_analysis.shared.timekeeping.utility import days_to_datetime +def combine_time_series_with_ncrcat(inFileNames, 
outFileName,
+                                    variableList=None, logger=None):
+    # {{{
+    '''
+    Uses ncrcat to extract a time series from a series of files
+
+    Parameters
+    ----------
+    inFileNames : str or list of str
+        A file name with wildcard(s) or a list of input files from which to
+        extract the time series.
+
+    outFileName : str
+        The output NetCDF file where the time series should be written.
+
+    variableList : list of str, optional
+        A list of variables to include. All variables are included by
+        default.
+
+    logger : ``logging.Logger``, optional
+        A logger to which ncrcat output should be redirected
+
+    Raises
+    ------
+    OSError
+        If ``ncrcat`` is not in the system path.
+
+    Author
+    ------
+    Xylar Asay-Davis
+    '''
+
+    if find_executable('ncrcat') is None:
+        raise OSError('ncrcat not found. Make sure the latest nco '
+                      'package is installed: \n'
+                      'conda install nco\n'
+                      'Note: this presumes use of the conda-forge '
+                      'channel.')
+
+    if os.path.exists(outFileName):
+        return
+
+    if isinstance(inFileNames, string_types):
+        inFileNames = sorted(glob.glob(inFileNames))
+
+    args = ['ncrcat', '-4', '--record_append', '--no_tmp_fl']
+
+    if variableList is not None:
+        args.extend(['-v', ','.join(variableList)])
+
+    printCommand = '{} {} ... {} {}'.format(' '.join(args), inFileNames[0],
+                                            inFileNames[-1],
+                                            outFileName)
+    args.extend(inFileNames)
+    args.append(outFileName)
+
+    if logger is None:
+        print('running: {}'.format(printCommand))
+    else:
+        logger.info('running: {}'.format(printCommand))
+        for handler in logger.handlers:
+            handler.flush()
+    process = subprocess.Popen(args, stdout=subprocess.PIPE,
+                               stderr=subprocess.PIPE)
+    stdout, stderr = process.communicate()
+
+    if stdout:
+        stdout = stdout.decode('utf-8')
+        for line in stdout.split('\n'):
+            if logger is None:
+                print(line)
+            else:
+                logger.info(line)
+    if stderr:
+        stderr = stderr.decode('utf-8')
+        for line in stderr.split('\n'):
+            if logger is None:
+                print(line)
+            else:
+                logger.error(line)
+
+    if process.returncode != 0:
+        raise subprocess.CalledProcessError(process.returncode,
+                                            ' '.join(args))
+
+    # }}}
+
+
 def cache_time_series(timesInDataSet, timeSeriesCalcFunction, cacheFileName,
                       calendar, yearsPerCacheUpdate=1, logger=None):  # {{{

From 34e574a99775ed9375583a64d098e6feaff28a21 Mon Sep 17 00:00:00 2001
From: Xylar Asay-Davis
Date: Wed, 30 May 2018 23:33:29 +0200
Subject: [PATCH 2/2] Fix check for preprocessed run out of range

---
 mpas_analysis/sea_ice/time_series.py | 31 +++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/mpas_analysis/sea_ice/time_series.py b/mpas_analysis/sea_ice/time_series.py
index 2ccd025f1..ec84f8528 100644
--- a/mpas_analysis/sea_ice/time_series.py
+++ b/mpas_analysis/sea_ice/time_series.py
@@ -260,6 +260,32 @@ def run_task(self):  # {{{
         timeEnd = date_to_days(year=yearEnd, month=12, day=31,
                                calendar=calendar)
 
+        if preprocessedReferenceRunName != 'None':
+            # determine if we're beyond the end of the preprocessed data
+            # (and go ahead and cache the data set while we're checking)
+            outFolder = '{}/preprocessed'.format(outputDirectory)
+            make_directories(outFolder)
+            inFilesPreprocessed = '{}/icevol.{}.year*.nc'.format(
+                preprocessedReferenceDirectory, preprocessedReferenceRunName)
+            outFileName = '{}/iceVolume.nc'.format(outFolder)
+
+            combine_time_series_with_ncrcat(inFilesPreprocessed,
+                                            outFileName,
+                                            logger=self.logger)
+            dsPreprocessed = open_mpas_dataset(fileName=outFileName,
+                                               calendar=calendar,
+                                               timeVariableNames='xtime')
+            preprocessedYearEnd = days_to_datetime(dsPreprocessed.Time.max(),
+                                                   calendar=calendar).year
+ if yearStart <= preprocessedYearEnd: + dsPreprocessedTimeSlice = \ + dsPreprocessed.sel(Time=slice(timeStart, timeEnd)) + else: + self.logger.warning('Preprocessed time series ends before the ' + 'timeSeries startYear and will not be ' + 'plotted.') + preprocessedReferenceRunName = 'None' + if self.refConfig is not None: dsTimeSeriesRef = {} @@ -363,12 +389,11 @@ def run_task(self): # {{{ if preprocessedReferenceRunName != 'None': outFolder = '{}/preprocessed'.format(outputDirectory) - make_directories(outFolder) inFilesPreprocessed = '{}/icearea.{}.year*.nc'.format( preprocessedReferenceDirectory, preprocessedReferenceRunName) - outFileName = '{}/iceArea{}.nc'.format(outFolder, hemisphere) + outFileName = '{}/iceArea.nc'.format(outFolder) combine_time_series_with_ncrcat(inFilesPreprocessed, outFileName, @@ -385,7 +410,7 @@ def run_task(self): # {{{ inFilesPreprocessed = '{}/icevol.{}.year*.nc'.format( preprocessedReferenceDirectory, preprocessedReferenceRunName) - outFileName = '{}/iceVolume{}.nc'.format(outFolder, hemisphere) + outFileName = '{}/iceVolume.nc'.format(outFolder) combine_time_series_with_ncrcat(inFilesPreprocessed, outFileName,
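
Note for reviewers: a minimal usage sketch of the pattern these patches
introduce. The input/output paths, run name, variable list, and calendar
below are hypothetical placeholders; combine_time_series_with_ncrcat and
open_mpas_dataset are the functions actually touched by this series.

    from mpas_analysis.shared.time_series import \
        combine_time_series_with_ncrcat
    from mpas_analysis.shared.io import open_mpas_dataset

    # Concatenate the yearly files into a single cached NetCDF file with
    # ncrcat (a no-op if the cache file already exists), then open that one
    # file.  This keeps a single file handle open instead of one handle per
    # input file, which is what triggered the "too many open files" errors
    # with xarray's open_mfdataset().
    combine_time_series_with_ncrcat('preprocessed/icevol.myRun.year*.nc',
                                    'cache/preprocessed/iceVolume.nc')
    dsPreprocessed = open_mpas_dataset(
        fileName='cache/preprocessed/iceVolume.nc',
        calendar='gregorian_noleap',
        timeVariableNames='xtime')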